# Dataset.py

import numpy as np
import pandas as pd


class Dataset:
    """A CSV-backed dataset held as a preprocessed pandas dataframe.

    On construction the CSV file is read and passed through
    preprocess_dataset(), which drops the "League" column and adds the
    derived "Anxiety_score" and "Is_narcissist" columns.
    """

    def __init__(self, dataset_filename: str) -> None:
        """Load the CSV file and store its preprocessed dataframe.

        Args:
            dataset_filename (str):
                path of the CSV file; it is decoded as windows-1254.
        """
        raw_dataframe = pd.read_csv(dataset_filename, encoding="windows-1254")
        self.dataframe = self.preprocess_dataset(raw_dataframe)

    def preprocess_dataset(self, raw_dataframe: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the dataframe immediately after loading it.

        The "League" column is removed and the derived columns
        "Anxiety_score" and "Is_narcissist" are added. The input dataframe
        is left untouched because drop() returns a new dataframe.

        Args:
            raw_dataframe (pd.DataFrame):
                raw dataframe as read from pd.read_csv().

        Returns:
            pd.DataFrame: resulting preprocessed dataframe.
        """
        dataframe = raw_dataframe.drop(["League"], axis="columns")
        dataframe["Anxiety_score"] = self.get_combined_anxiety_score(dataframe)
        dataframe["Is_narcissist"] = self.get_is_narcissist_col(dataframe)
        # more preprocessing goes here
        return dataframe

    @staticmethod
    def _min_max_normalise(
        column: pd.Series, lower: float, upper: float
    ) -> pd.Series:
        """Rescale a column so that `lower` maps to 0 and `upper` maps to 1."""
        return (column - lower) / (upper - lower)

    def get_combined_anxiety_score(self, dataframe: pd.DataFrame) -> pd.Series:
        """Get the combined anxiety score, as a column.

        This score is based on the GAD, SPIN and SWL metrics, read from the
        "GAD_T", "SPIN_T" and "SWL_T" columns. Each of the three columns is
        first min-max normalised using its hard-coded bounds, the SWL value
        is flipped (1 - value), then the mean of the three is returned.

        Args:
            dataframe (pd.DataFrame): the dataframe.

        Returns:
            pd.Series: the anxiety score column.
        """
        gad_min, gad_max = 0, 21
        spin_min, spin_max = 0, 68
        swl_min, swl_max = 5, 35

        gad_normalised = self._min_max_normalise(
            dataframe["GAD_T"], gad_min, gad_max
        )
        spin_normalised = self._min_max_normalise(
            dataframe["SPIN_T"], spin_min, spin_max
        )
        # Divide by the range (max - min), not by max: SWL starts at 5, so
        # dividing by 35 would cap the normalised value at 30/35 instead of 1.
        swl_normalised = self._min_max_normalise(
            dataframe["SWL_T"], swl_min, swl_max
        )
        # SWL is inverted so that it points the same way as the other two
        # terms before averaging.
        swl_flipped = 1 - swl_normalised

        return (gad_normalised + spin_normalised + swl_flipped) / 3

    def get_is_narcissist_col(self, dataframe: pd.DataFrame) -> np.ndarray:
        """Get a boolean flag per row derived from the "Narcissism" column.

        Args:
            dataframe (pd.DataFrame): the dataframe.

        Returns:
            np.ndarray: True where "Narcissism" is <= 1.0, False otherwise.
        """
        # NOTE(review): rows with Narcissism <= 1.0 are the ones flagged as
        # narcissist; confirm this matches the direction of the scale.
        return (dataframe["Narcissism"] <= 1.0).to_numpy()

    def get_dataframe(self) -> pd.DataFrame:
        """A getter function for the dataframe.

        Returns:
            pd.DataFrame: the dataset.
        """
        return self.dataframe

    def draw_histogram(self):
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError

    def get_dataset_columns(self) -> pd.Index:
        """Get the column labels of the preprocessed dataframe.

        Returns:
            pd.Index: the column labels.
        """
        return self.dataframe.columns

    def get_plottable_columns(self) -> list:
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError

    def get_sorted_column(
        self, colname: str, ascending: bool = True
    ) -> pd.Series:
        """Returns a single column, sorted either ascending or descending.

        Args:
            colname (str): the column name (see get_dataset_columns()).
            ascending (bool, optional): Sorting order. Defaults to True.

        Returns:
            pd.Series: The sorted column.
        """
        return self.dataframe[colname].sort_values(ascending=ascending)