# Dataset.py

from typing import Optional

import numpy as np
import pandas as pd


class Dataset:
    """A wrapper class for the pandas dataframe.

    The CSV file is read once in ``__init__``, cleaned by
    ``preprocess_dataset`` and the result is kept in ``self.dataframe``.
    """

    # Lower-cased "whyplay" answers that count as a competitive motivation.
    _COMPETITIVE_REASONS = ("improving", "winning", "all of the above")
    # Lower-cased "whyplay" answers that count as a non-competitive motivation.
    _CASUAL_REASONS = ("having fun", "relaxing")

    def __init__(self, dataset_filename: str) -> None:
        """Load and preprocess the dataset, then store it in self.dataframe.

        Args:
            dataset_filename (str): the path of the CSV dataset. It is passed
              unchanged to ``pd.read_csv``, so a relative path is resolved
              against the current working directory of the process.
        """
        raw_dataframe = pd.read_csv(dataset_filename, encoding="windows-1254")
        self.dataframe = self.preprocess_dataset(raw_dataframe)

    def preprocess_dataset(self, raw_dataframe: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the dataframe immediately after loading it.

        Steps, in order: drop unused columns, drop non-consenting rows,
        trim outliers of the "Hours" column, then add the derived columns
        "Anxiety_score", "Is_narcissist" and "Is_competitive".

        Args:
            raw_dataframe (pd.DataFrame):
                raw dataframe as read from pd.read_csv().
                This dataframe is discarded afterwards.

        Returns:
            pd.DataFrame: resulting preprocessed dataframe.
        """
        dataframe = self._drop_unnecessary_columns(raw_dataframe)
        dataframe = self.remove_nonaccepting_rows(dataframe)
        dataframe = self.treat_outliers(dataframe, "Hours")

        dataframe["Anxiety_score"] = self.get_combined_anxiety_score(dataframe)
        dataframe["Is_narcissist"] = self.get_is_narcissist_col(dataframe)

        # Also normalises the "whyplay" column of `dataframe` in place.
        dataframe["Is_competitive"] = self.preprocess_whyplay(dataframe)
        # more preprocessing goes here
        return dataframe

    def get_is_competitive_col(self, dataframe: pd.DataFrame) -> np.ndarray:
        """Classify each row's "whyplay" reason as competitive or not.

        Args:
            dataframe (pd.DataFrame): the dataframe; its "whyplay" column is
              expected to hold lower-cased reasons.

        Returns:
            np.ndarray: float array aligned positionally with the rows:
            1.0 for competitive reasons, 0.0 for non-competitive reasons and
            NaN for everything that cannot be classified ("other", missing
            values or any unlisted reason).
        """
        reasons = dataframe["whyplay"]
        # Start from NaN so that unlisted reasons stay "unknown" instead of
        # being silently counted as non-competitive.
        is_competitive_col = np.full(len(dataframe), np.nan)
        competitive = reasons.isin(self._COMPETITIVE_REASONS).to_numpy()
        casual = reasons.isin(self._CASUAL_REASONS).to_numpy()
        is_competitive_col[competitive] = 1.0
        is_competitive_col[casual] = 0.0
        return is_competitive_col

    def _drop_unnecessary_columns(
        self, dataframe: pd.DataFrame
    ) -> pd.DataFrame:
        """Drop unnecessary columns from the dataset.

        The per-item GAD, SWL and SPIN columns are removed; columns not
        listed here (such as the "_T" totals read later) are kept.

        Args:
            dataframe (pd.DataFrame): the dataframe.

        Returns:
            pd.DataFrame: a new dataframe without the dropped columns.

        Raises:
            KeyError: if any of the listed columns is missing.
        """
        columns_to_drop = (
            [
                "League",
                "S. No.",
                "Timestamp",
                "highestleague",
                "earnings",
                "Birthplace",
                "Birthplace_ISO3",
            ]
            + [f"GAD{i}" for i in range(1, 8)]
            + ["GADE"]
            + [f"SWL{i}" for i in range(1, 6)]
            + [f"SPIN{i}" for i in range(1, 18)]
        )
        return dataframe.drop(columns_to_drop, axis="columns")

    def remove_nonaccepting_rows(
        self, dataframe: pd.DataFrame
    ) -> pd.DataFrame:
        """Removes rows where participants did not consent to data processing.

        Only rows whose "accept" value is exactly "Accept" are kept; the
        "accept" column itself is dropped afterwards.

        Args:
            dataframe (pd.DataFrame): the dataframe.

        Returns:
            pd.DataFrame: a new dataframe with consenting rows only.
        """
        # Filter with a boolean mask rather than dropping by index label, so
        # that accepted rows survive even when index labels are duplicated.
        accepted = dataframe[dataframe["accept"] == "Accept"]
        return accepted.drop(columns=["accept"])

    def preprocess_whyplay(self, dataframe: pd.DataFrame) -> np.ndarray:
        """Preprocesses the whyplay column, and returns a Is_competitive col.

        The "whyplay" column of ``dataframe`` is modified in place: values
        are lower-cased, and every value outside the five most frequent
        reasons (including missing values) is replaced by "other".

        Args:
            dataframe (pd.DataFrame): the dataframe.

        Returns:
            np.ndarray: the Is_competitive column
            (see get_is_competitive_col()).
        """
        dataframe["whyplay"] = dataframe["whyplay"].str.lower()
        most_common_whyplay_reasons = list(
            dataframe.groupby("whyplay")
            .size()
            .sort_values(ascending=False)
            .head(5)
            .index
        )
        dataframe.loc[
            ~dataframe["whyplay"].isin(most_common_whyplay_reasons), "whyplay"
        ] = "other"
        return self.get_is_competitive_col(dataframe)

    def treat_outliers(
        self, df: pd.DataFrame, colname: str, quantile: float = 0.99
    ) -> pd.DataFrame:
        """Remove the rows holding the largest values of a column.

        Rows are kept only when their value is strictly below the requested
        quantile of the column; rows with a missing value are removed too,
        because the comparison is False for NaN.

        Args:
            df (pd.DataFrame): the dataframe.
            colname (str): name of the numeric column to trim.
            quantile (float, optional): quantile used as the exclusive upper
              bound. Defaults to 0.99.

        Returns:
            pd.DataFrame: an independent copy containing the kept rows.
        """
        upper_bound = df[colname].quantile(quantile)
        # Copy so that later column assignments act on a standalone frame
        # rather than on a selection of `df`.
        return df[df[colname] < upper_bound].copy()

    def get_combined_anxiety_score(self, dataframe: pd.DataFrame) -> pd.Series:
        """Get the combined anxiety score, as a column.
        This score is based on the GAD, SPIN and SWL metrics.
        Each of the three columns is first min-max normalised to [0, 1]
          using the bounds hard-coded below (SWL is additionally flipped,
          so that a higher value raises the score), then the mean is returned.

        Args:
            dataframe (pd.DataFrame): the dataframe; the "GAD_T", "SPIN_T"
              and "SWL_T" columns are read.

        Returns:
            pd.Series: the anxiety score column.
        """
        gad_max = 21
        gad_min = 0
        gad_normalised = (dataframe["GAD_T"] - gad_min) / (gad_max - gad_min)
        spin_max = 68
        spin_min = 0
        spin_normalised = (dataframe["SPIN_T"] - spin_min) / (
            spin_max - spin_min
        )
        swl_max = 35
        swl_min = 5
        # Divide by the range (max - min), not by max, so that the maximum
        # SWL total maps to exactly 1 before flipping.
        swl_flipped = 1 - (dataframe["SWL_T"] - swl_min) / (swl_max - swl_min)
        combined = (gad_normalised + spin_normalised + swl_flipped) / 3
        return combined

    def get_is_narcissist_col(self, dataframe: pd.DataFrame) -> np.ndarray:
        """Get a boolean narcissist column.
        The Narcissism score of 1.0 is considered Not a Narcissist,
        while all values above that are considered Narcissist.
        Missing scores are reported as False.

        Args:
            dataframe (pd.DataFrame): the dataframe

        Returns:
            np.ndarray: the boolean narcissist column, aligned positionally
            with the rows of the dataframe.
        """
        return (dataframe["Narcissism"] > 1.0).to_numpy()

    def get_dataframe(self) -> pd.DataFrame:
        """A getter function for the dataframe.

        Returns:
            pd.DataFrame: the dataset.
        """
        return self.dataframe

    def get_sorted_column(
        self, colname: str, ascending: bool = True
    ) -> pd.Series:
        """Returns a single column, sorted either ascending or descending.

        Args:
            colname (str): the column name.
            ascending (bool, optional): Sorting order. Defaults to True.

        Returns:
            pd.Series: The sorted column.
        """
        return self.dataframe[colname].sort_values(ascending=ascending)

    def get_unique_column_values(self, colname: str) -> np.ndarray:
        """Returns the distinct values present in a column.

        List-like cells are expanded into individual values first, so each
        element of such a cell is reported separately.

        Args:
            colname (str): the column name.

        Returns:
            np.ndarray: an array containing the unique values
            present in the column
        """
        return self.dataframe[colname].explode().unique()

    def get_category_counts(
        self, colname: str, ascending: Optional[bool] = None
    ) -> pd.Series:
        """Returns a count of categorical values in the dataset.

        Args:
            colname (str): the column name.
            ascending (bool, optional): Direction to sort results.
              If set to None, the results are not sorted by count.
              Defaults to None.

        Returns:
            pd.Series: the counted categories, indexed by category.
        """
        grouped_size = self.dataframe.groupby(colname).size()
        if ascending is None:
            return grouped_size
        return grouped_size.sort_values(ascending=ascending)