diff --git a/docs/class_diagram.drawio b/docs/class_diagram.drawio new file mode 100644 index 0000000000000000000000000000000000000000..51066a5cef382326c8c3e1c8477dd81a52567f4d --- /dev/null +++ b/docs/class_diagram.drawio @@ -0,0 +1 @@ +<mxfile host="Electron" modified="2023-07-12T16:27:44.192Z" agent="5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/15.8.7 Chrome/91.0.4472.164 Electron/13.6.2 Safari/537.36" etag="cKdbLBWRQm2rR1pnQ_-2" version="15.8.7" type="device"><diagram id="C5RBs43oDa-KdzZeNtuy" name="Page-1">7ZtRc5s4EIB/jWfubsYZAzZxHkPSpNf6btKm196bRwEZ1AhEJRGb/PpbYYFNAIckdrhJmEmn1iIktN8uWu3aA+ssXF1yFAd/MQ/TgTnyVgPrfGCahjk24D8lSdeSqWWvBT4nnu60EVyTe6yFIy1NiIdFqaNkjEoSl4UuiyLsypIMcc6W5W4LRsuzxsjHFcG1i2hV+oN4MtCrMI838o+Y+EE+s2GfrK+EKO+sVyIC5LHllsj6MLDOOGNy/SlcnWGqlJfr5cef6Q86u7UvP30Rv9A/zudvf38frge7eMotxRI4juSzh76/XVx8/D7++W98dfFlObo8vfk01LeM7hBNtL6uKJMSc71kmeZ6FEsSUhRBy1mwSF7rK6AFB1HiR/DZhceDOy3nDnNJAMGpviBZDFI3INSboZQlahFCIvc2bzkB4+QehkUULhkggMtcamsy7VKPa3UniEcg5VhAn6tcM0YhmiEhdR+XUYpiQW6yB1ZdQsR9EjmwUBbmA7Ek8rCnWwXqrCE5uy2MR93fkofmprSBV1vWqPlcYhZiyVPooq9aGkeau5BuLzeGa+V+FWwZrTk2tcNoZ/GLoYvZvoJzocgHHRTTVeYbV+cz7Lr57PJ0iAL3CEnsKC2KbTOED1sr3Ygy46w3VP/rcvht4juzz8tfwQStHOv79dCsGCrMY52C6BxJJLAcqIdS2snxbNkuqF9u2SnFC9lopSJGLon8WdbnfLyRfNXrVyIG9y5oZiEB8TwcZRYk4UnWRqaeImYkkpmCJg78gRrPRkeTwQQe6AzaxqYNf6o7l2csAmNDJLMqDBa8xMqKnQY7Kptgs39X7S0tc3zUvuxm8yqBfipVq0p1saYao8hD4kjBXXAU4h7v0/FOzNfDW/t44wreCjtKsi0lkOHmzf/wdfsI2BAQqeFykt8U6POhUaFtVWlbNWQpusH0igkiCVPj83XfB8TbQW1HsNUeoaEeT9tBnR6I6aSGqaXit0TAbgo77zyGCOK339dufMdgsLfpr/tHO+36dWzUsHXUUoHo3COgDHKTKK8oMY6pPLogfsKxUuW6fbrC4o1y3/972jBe8UVdC/54F3gXAjsfom9gNL9BfO4GEJr38PcF32r5Qj8Y/Glr+Gyl/vW+v0/89rhj/Cdt8QewAfTs98n+pO35/mAbfjX/swVfAH042PfM93nkNroO8sYVVtjzcZ7PA5UEzGcRoh820gfZsU2fGcvOWuqo9RNLmeqEHUokKx/q1nOqiRqVuSuFJljCXfx4rC0R9/EuOMZxPR2OKZLkrvx0e1e9Wc13BDCJOWJwCK6jMlNH0bImH022FsfhLBHa6B3nu4xcJ+P1zYMiBb7Nq9m4GvObw9GRYZn6BJmWZmwNQA9+pRaz1YUtFlkS8ACZx2oW4493Amtq29N9wtKjFNvOK8CrpiuM9wAP2I2NcYndcF/wzAPA2/GyLsErkv0PEDZUp0Z9deqxrbV9dWp8XC4XWZN2kYx1bD5qZDXVqcnkf1mdqrXUPLG9XcgoKhcN9Yw3FjDvMFH9Th3V2uZuz2+dO33NUla9CVRjuwrj917seDrWtiWsPVQ76g/Idg1VdUCGY8bcZeENEPXmKFoRLFM4MDOO4aj81ny7MXppSbwRr915AqQp8634EjGPEHeJEEQo2jTPgjiM0R5yW8jdl7Wa6loaMvhxjCVRQXFP+bmUuy9iGU2FDIW5CMeKTKb3hoOxw1HuvFplNNUrFGUBOoH9GJw4CaMt0teYkz5J/QTMnVelzKbKhMKcRORXgjXmedZF5LSzZEXPuSXnzitQ5q69WVcfUyCdgC57h3426FctO9Wfj5u+OBZzHHPmYiGyTVpg+WCLvnjDW/QhMiH5l8G726KbUEuOkZyzRFKCuXhHmA/h0XbnkVjd13sfePQySGOK0h71i1CfdB2N5We7Cuq5x1kM8RhWtFG2U6uYrPftFwG3zK7DMqMum62AcxyClucRi5Dr4lgCgbn6kV4P/EXAx4cLz6C5+Y3iuuC1+aWn9eE/</diagram></mxfile> \ No newline at end of file diff --git a/docs/class_diagram.png b/docs/class_diagram.png new file mode 100644 index 0000000000000000000000000000000000000000..47fb58150dfebd880bcfdd8a60dfc09ad90c8635 Binary files /dev/null and b/docs/class_diagram.png differ diff --git a/src/Dataset.py b/src/Dataset.py index 72cb721bc4450a5a21a3a99f195ec3819c670188..fc3c33476a6967cb44c571c2fb2433efff939ecc 100644 --- a/src/Dataset.py +++ b/src/Dataset.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import logging class Dataset: @@ -11,8 +12,23 @@ class Dataset: dataset_filename (str): the path of the dataset, relative to the location of the file calling this function. """ - raw_dataframe = pd.read_csv(dataset_filename, encoding="windows-1254") + if type(dataset_filename) != str: + logging.error("parameter `dataset_filename` is not a string") + raise ValueError(f"{dataset_filename} is not a string") + + if not dataset_filename.endswith(".csv"): + logging.error("dataset filename should be CSV") + raise OSError(f"{dataset_filename} is not a CSV file.") + try: + raw_dataframe = pd.read_csv( + dataset_filename, encoding="windows-1254" + ) + logging.info("Dataframe successfully loaded") + except FileNotFoundError as e: + logging.error("CSV file not found") + raise e self.dataframe = self.preprocess_dataset(raw_dataframe) + logging.info("Dataset class successfully initialised") def preprocess_dataset(self, raw_dataframe: pd.DataFrame) -> pd.DataFrame: """preprocess dataframe immediately after loading it. @@ -25,6 +41,12 @@ class Dataset: Returns: pd.DataFrame: resulting preprocessed dataframe. """ + if type(raw_dataframe) != pd.DataFrame: + logging.error( + "parameter `raw_dataframe` is not a pandas DataFrame" + ) + raise ValueError(f"{raw_dataframe} is not a pandas DataFrame") + dataframe = self._drop_unnecessary_columns( raw_dataframe ) # for conveneince @@ -39,6 +61,10 @@ class Dataset: return dataframe def get_is_competitive_col(self, dataframe: pd.DataFrame): + if type(dataframe) != pd.DataFrame: + logging.error("parameter `dataframe` is not a pandas DataFrame") + raise ValueError(f"{dataframe} is not a pandas DataFrame") + is_competitive_col = np.zeros(shape=len(dataframe)) is_competitive_col[ (dataframe["whyplay"] == "improving") @@ -64,6 +90,10 @@ class Dataset: Returns: pd.DataFrame: the dataframe. """ + + if type(dataframe) != pd.DataFrame: + logging.error("parameter `dataframe` is not a pandas DataFrame") + raise ValueError(f"{dataframe} is not a pandas DataFrame") rows_to_drop = ( [ "League", @@ -92,6 +122,10 @@ class Dataset: Returns: pd.DataFrame: the dataframe. """ + if type(dataframe) != pd.DataFrame: + logging.error("parameter `dataframe` is not a pandas DataFrame") + raise ValueError(f"{dataframe} is not a pandas DataFrame") + # drop rows where users did not accept to having their data used dataframe = dataframe.drop( dataframe[dataframe["accept"] != "Accept"].index, @@ -108,6 +142,10 @@ class Dataset: Returns: pd.Series: the Is_competitive column. """ + if type(dataframe) != pd.DataFrame: + logging.error("parameter `dataframe` is not a pandas DataFrame") + raise ValueError(f"{dataframe} is not a pandas DataFrame") + dataframe["whyplay"] = dataframe["whyplay"].str.lower() most_common_whyplay_reasons = list( dataframe.groupby("whyplay") @@ -122,7 +160,7 @@ class Dataset: is_competitive_col = self.get_is_competitive_col(dataframe) return is_competitive_col - def treat_outliers(self, df, colname): + def treat_outliers(self, df, colname) -> pd.DataFrame: q = df[colname].quantile(0.99) return df[df[colname] < q] @@ -138,6 +176,10 @@ class Dataset: Returns: pd.Series: the anxiety score column. """ + if type(dataframe) != pd.DataFrame: + logging.error("parameter `dataframe` is not a pandas DataFrame") + raise ValueError(f"{dataframe} is not a pandas DataFrame") + gad_max = 21 gad_min = 0 gad_normalised = (dataframe["GAD_T"] - gad_min) / gad_max @@ -161,6 +203,10 @@ class Dataset: Returns: pd.Series: the boolean narcissist column. """ + if type(dataframe) != pd.DataFrame: + logging.error("parameter `dataframe` is not a pandas DataFrame") + raise ValueError(f"{dataframe} is not a pandas DataFrame") + return np.where(dataframe["Narcissism"] <= 1.0, True, False) def get_dataframe(self) -> pd.DataFrame: @@ -183,6 +229,13 @@ class Dataset: Returns: pd.Series: The sorted column. """ + if type(colname) != str: + logging.error("parameter `colname` is not a string") + raise ValueError(f"{colname} is not a string") + if type(colname) not in (None, bool): + logging.error("parameter `ascending` is not a bool or None") + raise ValueError(f"{colname} is not a bool or None") + return self.dataframe[colname].sort_values(ascending=ascending) def get_unique_column_values(self, colname: str): @@ -195,6 +248,10 @@ class Dataset: string array: an array of strings containing the unique values present in the column """ + if type(colname) != str: + logging.error("parameter `colname` is not a string") + raise ValueError(f"{colname} is not a string") + return self.dataframe[colname].explode().unique() def get_category_counts( @@ -210,6 +267,13 @@ class Dataset: Returns: pd.Series: the counted categories. """ + if type(colname) != str: + logging.error("parameter `colname` is not a string") + raise ValueError(f"{colname} is not a string") + if type(colname) not in (None, bool): + logging.error("parameter `ascending` is not a bool or None") + raise ValueError(f"{colname} is not a bool or None") + grouped_size = self.dataframe.groupby(colname).size() return ( grouped_size diff --git a/src/Plotter.py b/src/Plotter.py index c9055befddaf2a526378850a681c37e187803df6..c8d9051a3b7e2319a463cf91aba90b003908e7a9 100644 --- a/src/Plotter.py +++ b/src/Plotter.py @@ -3,18 +3,61 @@ import matplotlib.pyplot as plt import pandas as pd import seaborn as sns from .Dataset import Dataset - +import logging class Plotter: def __init__(self, dataset: Dataset): + if type(dataset) != Dataset: + logging.error("dataset parameter is not of type Dataset") + raise ValueError(f"{dataset} is not of type Dataset") + self.ds = dataset self.df = dataset.get_dataframe() - def customize_plot(self, fig, ax, styling_params): + def customize_plot(self, fig, ax, styling_params) -> None: + """ customize_plot + + Args: + fig (plt.figure.Figure), + ax (plt.axes.Axes), + styling_params (dict) + + + Returns: + None + """ if styling_params.get("title"): ax.set_title(styling_params["title"]) - def distribution_plot(self, target): + def distribution_plot(self, target, styling_params = {}) -> None: + """ plot a distribution plot. + + Args: + target (str, must be present as a column in the dataset), + styling_params (dict) + + + Returns: + None + """ + + # implementing sensible logging and error catching + if (type(target) != str): + logging.error("parameter target should be a string.") + raise ValueError("parameter target should be a string.") + + if not (target in self.df.columns): + logging.error("parameter target cannot be found in the dataset.") + raise ValueError( + "parameter target cannot be found in the dataset." + ) + + if (type(styling_params) != dict): + logging.error("parameter styling params should be a dict.") + raise ValueError("parameter styling params should be a dict.") + + # plotting the plot + grouped_data = self.df.groupby(target).size() plt.barh(grouped_data.index, grouped_data.values) print( @@ -28,7 +71,44 @@ class Plotter: def plot_categorical_bar_chart( self, category1, category2, styling_params={} - ): + ) -> None: + """ plot a categorical bar chart. + + Args: + category1 (str, must be present as a column in the dataset), + category2 (str, must be present as a column in the dataset), + styling_params (dict) + + + Returns: + None + """ + # implementing sensible logging and error catching + if (type(category1) != str): + logging.error("parameter category1 should be a string.") + raise ValueError("parameter category1 should be a string.") + + if not (category1 in self.df.columns): + logging.error("parameter category1 cannot be found in the dataset.") + raise ValueError( + "parameter category1 cannot be found in the dataset." + ) + + if (type(category2) != str): + logging.error("parameter category2 should be a string.") + raise ValueError("parameter category2 should be a string.") + + if not (category2 in self.df.columns): + logging.error("parameter category2 cannot be found in the dataset.") + raise ValueError( + "parameter category2 cannot be found in the dataset." + ) + + if (type(styling_params) != dict): + logging.error("parameter styling params should be a dict.") + raise ValueError("parameter styling params should be a dict.") + + # plotting the plot ct = pd.crosstab(self.df[category1], self.df[category2]) # Calculate percentages by row ct_percent = ct.apply(lambda r: r / r.sum() * 100, axis=0) @@ -36,14 +116,91 @@ class Plotter: self.customize_plot(fig, ax, styling_params) ct_percent.plot(kind="bar", ax=ax) - def plot_categorical_boxplot(self, target, category, styling_params={}): + def plot_categorical_boxplot( + self, target, category, styling_params={} + ) -> None: + """ plot a categorical boxplot. + + Args: + target (str, must be present as a column in the dataset), + category (str, must be present as a column in the dataset), + styling_params (dict) + + + Returns: + None + """ + # implementing sensible logging and error catching + if (type(target) != str): + logging.error("parameter target should be a string.") + raise ValueError("parameter target should be a string.") + + if not (target in self.df.columns): + logging.error("parameter target cannot be found in the dataset.") + raise ValueError( + "parameter target cannot be found in the dataset." + ) + + if (type(category) != str): + logging.error("parameter category should be a string.") + raise ValueError("parameter category should be a string.") + + if not (category in self.df.columns): + logging.error("parameter category cannot be found in the dataset.") + raise ValueError( + "parameter category cannot be found in the dataset." + ) + + if (type(styling_params) != dict): + logging.error("parameter styling params should be a dict.") + raise ValueError("parameter styling params should be a dict.") + + # plotting the plot fig, ax = plt.subplots() self.customize_plot(fig, ax, styling_params) sns.boxplot(x=category, y=target, data=self.df, palette="rainbow") def plot_categorical_histplot( self, target, category, styling_params={}, bins=30 - ): + ) -> None: + """ plot a categorical hisplot. + + Args: + target (str, must be present as a column in the dataset), + category (str, must be present as a column in the dataset), + styling_params (dict) + + + Returns: + None + """ + # implementing sensible logging and error catching + if (type(target) != str): + logging.error("parameter target should be a string.") + raise ValueError("parameter target should be a string.") + + if not (target in self.df.columns): + logging.error("parameter target cannot be found in the dataset.") + raise ValueError( + "parameter target cannot be found in the dataset." + ) + + if (type(category) != str): + logging.error("parameter category should be a string.") + raise ValueError("parameter category should be a string.") + + if not (category in self.df.columns): + logging.error("parameter category cannot be found in the dataset.") + raise ValueError( + "parameter category cannot be found in the dataset." + ) + + if (type(styling_params) != dict): + logging.error("parameter styling params should be a dict.") + raise ValueError("parameter styling params should be a dict.") + + # plotting the plot + uniques = self.ds.get_unique_column_values(category) fig, ax = plt.subplots() self.customize_plot(fig, ax, styling_params) @@ -57,7 +214,45 @@ class Plotter: alpha=0.5, ) - def plot_scatterplot(self, target1, target2, styling_params={}): + def plot_scatterplot(self, target1, target2, styling_params={}) -> None: + """ plot a scatterplot. + + Args: + target1 (str, must be present as a column in the dataset), + target2 (str, must be present as a column in the dataset), + styling_params (dict) + + + Returns: + None + """ + + # implementing sensible logging and error catching + if (type(target1) != str): + logging.error("parameter target1 should be a string.") + raise ValueError("parameter target1 should be a string.") + + if not (target1 in self.df.columns): + logging.error("parameter target1 cannot be found in the dataset.") + raise ValueError( + "parameter target1 cannot be found in the dataset." + ) + + if (type(target2) != str): + logging.error("parameter target2 should be a string.") + raise ValueError("parameter target2 should be a string.") + + if not (target2 in self.df.columns): + logging.error("parameter target2 cannot be found in the dataset.") + raise ValueError( + "parameter target2 cannot be found in the dataset." + ) + + if (type(styling_params) != dict): + logging.error("parameter styling params should be a dict.") + raise ValueError("parameter styling params should be a dict.") + + # plotting the plot fig, ax = plt.subplots() self.customize_plot(fig, ax, styling_params) - ax.scatter(self.df[target1], self.df[target2]) + ax.scatter(self.df[target1], self.df[target2]) \ No newline at end of file