import logging
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from .Dataset import Dataset


class Plotter:
    """Draw exploratory plots for the dataframe held by a ``Dataset``.

    The dataframe is fetched once in ``__init__`` via
    ``dataset.get_dataframe()`` and stored on ``self.df``; the dataset itself
    is kept on ``self.ds``.

    Every plotting method validates its column arguments against
    ``self.df.columns`` and accepts an optional ``styling_params`` dict.
    The only styling key currently honoured is ``"title"``.
    """

    def __init__(self, dataset: Dataset):
        """Store the dataset and its dataframe.

        Args:
            dataset (Dataset): source of the dataframe to plot.

        Raises:
            ValueError: if ``dataset`` is not a ``Dataset`` instance.
        """
        if not isinstance(dataset, Dataset):
            logging.error("dataset parameter is not of type Dataset")
            raise ValueError(f"{dataset} is not of type Dataset")
        self.ds = dataset
        self.df = dataset.get_dataframe()

    # ------------------------------------------------------------------
    # validation helpers
    # ------------------------------------------------------------------
    def _require_column(self, param_name: str, value) -> None:
        """Log and raise ValueError unless ``value`` is a column name of self.df.

        ``param_name`` is the caller-facing parameter name used in the
        error message.
        """
        if not isinstance(value, str):
            message = f"parameter {param_name} should be a string."
            logging.error(message)
            raise ValueError(message)
        if value not in self.df.columns:
            message = f"parameter {param_name} cannot be found in the dataset."
            logging.error(message)
            raise ValueError(message)

    @staticmethod
    def _resolve_styling_params(styling_params) -> dict:
        """Return a usable styling dict; ``None`` means "no styling".

        Raises ValueError (after logging) for any other non-dict value.
        """
        if styling_params is None:
            return {}
        if not isinstance(styling_params, dict):
            message = "parameter styling params should be a dict."
            logging.error(message)
            raise ValueError(message)
        return styling_params

    # ------------------------------------------------------------------
    # styling
    # ------------------------------------------------------------------
    def customize_plot(self, fig, ax, styling_params) -> None:
        """Apply the supported styling options to an axes.

        Args:
            fig: figure owning ``ax`` (currently unused, kept for callers).
            ax: matplotlib axes to style.
            styling_params (dict): styling options; only a truthy
                ``"title"`` entry is applied. ``None`` or an empty dict
                leaves the axes untouched.

        Returns:
            None
        """
        if not styling_params:
            return
        title = styling_params.get("title")
        if title:
            ax.set_title(title)

    # ------------------------------------------------------------------
    # plots
    # ------------------------------------------------------------------
    def distribution_plot(
        self, target: str, styling_params: Optional[dict] = None
    ) -> None:
        """Plot how many rows fall into each value of ``target``.

        Draws a horizontal bar chart on the *current* matplotlib axes with
        bars sorted by ascending count, and prints the counts sorted in
        descending order to stdout.

        This class previously contained two definitions of this method; the
        later one silently replaced the earlier one, which disabled the
        validation and the ``styling_params`` argument. Both are merged here.

        Args:
            target (str): column of the dataset to count.
            styling_params (dict, optional): styling options, see
                ``customize_plot``. A ``"title"`` entry overrides the
                default ``"Distribution of <target>"`` title.

        Returns:
            None

        Raises:
            ValueError: if ``target`` is not a string, is not a column of the
                dataset, or ``styling_params`` is not a dict.
        """
        self._require_column("target", target)
        styling = self._resolve_styling_params(styling_params)

        counts = self.df.groupby(target).size()
        ascending_counts = counts.sort_values(ascending=True)

        # Use the current axes (as plt.barh did) so callers that prepared a
        # figure beforehand keep getting the plot where they expect it.
        ax = plt.gca()
        ax.barh(ascending_counts.index, ascending_counts.values)
        print(counts.sort_values(ascending=False))

        ax.set_xlabel("Size")
        ax.set_ylabel(target)
        ax.set_title(f"Distribution of {target}")
        # Applied last so a user-supplied title wins over the default one.
        self.customize_plot(ax.figure, ax, styling)

    def plot_categorical_bar_chart(
        self, category1, category2, styling_params: Optional[dict] = None
    ) -> None:
        """Plot the percentage split of ``category2`` within each ``category1``.

        Builds a cross-tabulation of the two columns, converts every row to
        percentages (each ``category1`` group sums to 100) and draws it as a
        grouped bar chart on a new figure.

        Args:
            category1 (str): column used for the x-axis groups.
            category2 (str): column whose values become the bars per group.
            styling_params (dict, optional): styling options, see
                ``customize_plot``.

        Returns:
            None

        Raises:
            ValueError: if either category is not a string, is not a column
                of the dataset, or ``styling_params`` is not a dict.
        """
        self._require_column("category1", category1)
        self._require_column("category2", category2)
        styling = self._resolve_styling_params(styling_params)

        ct = pd.crosstab(self.df[category1], self.df[category2])
        # Row-wise percentages. The previous apply(..., axis=0) normalised
        # each *column*, contradicting the documented "by row" intent.
        ct_percent = ct.div(ct.sum(axis=1), axis=0) * 100

        fig, ax = plt.subplots()
        ct_percent.plot(kind="bar", ax=ax)
        self.customize_plot(fig, ax, styling)

    def plot_categorical_boxplot(
        self, target, category, styling_params: Optional[dict] = None
    ) -> None:
        """Plot a boxplot of ``target`` for every value of ``category``.

        Args:
            target (str): column plotted on the y-axis.
            category (str): column whose values form the x-axis groups.
            styling_params (dict, optional): styling options, see
                ``customize_plot``.

        Returns:
            None

        Raises:
            ValueError: if ``target`` or ``category`` is not a string, is not
                a column of the dataset, or ``styling_params`` is not a dict.
        """
        self._require_column("target", target)
        self._require_column("category", category)
        styling = self._resolve_styling_params(styling_params)

        fig, ax = plt.subplots()
        # Pass ax explicitly instead of relying on it being the current axes.
        sns.boxplot(
            x=category, y=target, data=self.df, palette="rainbow", ax=ax
        )
        self.customize_plot(fig, ax, styling)

    def plot_categorical_histplot(
        self, target, category, styling_params: Optional[dict] = None, bins=30
    ) -> None:
        """Plot overlaid, per-group normalised histograms of ``target``.

        One semi-transparent histogram is drawn per value returned by
        ``self.ds.get_unique_column_values(category)``. Each histogram is
        weighted by ``1 / group_size`` so its bar heights sum to 1, making
        groups of different sizes comparable.

        Args:
            target (str): column whose values are binned.
            category (str): column used to split the data into groups.
            styling_params (dict, optional): styling options, see
                ``customize_plot``.
            bins: forwarded to ``Axes.hist``; defaults to 30.

        Returns:
            None

        Raises:
            ValueError: if ``target`` or ``category`` is not a string, is not
                a column of the dataset, or ``styling_params`` is not a dict.
        """
        self._require_column("target", target)
        self._require_column("category", category)
        styling = self._resolve_styling_params(styling_params)

        uniques = self.ds.get_unique_column_values(category)
        fig, ax = plt.subplots()
        for val in uniques:
            group_values = self.df.loc[self.df[category] == val, target]
            # A value with no matching rows (e.g. NaN, which never compares
            # equal) would otherwise produce 1/0 weights.
            if group_values.empty:
                continue
            weights = np.ones(len(group_values)) / len(group_values)
            ax.hist(
                group_values,
                weights=weights,
                bins=bins,
                alpha=0.5,
                # Labelled so callers can tell the groups apart via
                # ax.legend() / plt.legend().
                label=str(val),
            )
        self.customize_plot(fig, ax, styling)

    def plot_scatterplot(
        self, target1, target2, styling_params: Optional[dict] = None
    ) -> None:
        """Plot ``target2`` against ``target1`` as a scatterplot.

        Args:
            target1 (str): column plotted on the x-axis.
            target2 (str): column plotted on the y-axis.
            styling_params (dict, optional): styling options, see
                ``customize_plot``.

        Returns:
            None

        Raises:
            ValueError: if either target is not a string, is not a column of
                the dataset, or ``styling_params`` is not a dict.
        """
        self._require_column("target1", target1)
        self._require_column("target2", target2)
        styling = self._resolve_styling_params(styling_params)

        fig, ax = plt.subplots()
        ax.scatter(self.df[target1], self.df[target2])
        self.customize_plot(fig, ax, styling)