refactored Dataset class columns

64b16115 · Sortofamudkip · dc023d15 · 64b16115
Commit 64b16115 authored 1 year ago by Sortofamudkip
--- a/src/Dataset.py
+++ b/src/Dataset.py
@@ -20,27 +20,21 @@ class Dataset:
        Args:
            raw_dataframe (pd.DataFrame):
                raw dataframe as read from pd.read_csv().
+                This dataframe is discarded afterwards.

        Returns:
            pd.DataFrame: resulting preprocessed dataframe.
        """
-        dataframe = raw_dataframe  # for conveneince
-        rows_to_drop = [
-            "League",
-            "S. No.",
-            "Timestamp",
-            "highestleague",
-            "earnings",
-        ]
+        dataframe = self._drop_unnecessary_columns(
+            raw_dataframe
+        )  # for convenience
        # rows_to_drop = []
-        dataframe = raw_dataframe.drop(rows_to_drop, axis="columns")
        dataframe = self.remove_nonaccepting_rows(dataframe)

        dataframe["Anxiety_score"] = self.get_combined_anxiety_score(dataframe)
        dataframe["Is_narcissist"] = self.get_is_narcissist_col(dataframe)

-        self.preprocess_whyplay(dataframe)
-        dataframe["Is_competitive"] = self.get_is_competitive_col(dataframe)
+        dataframe["Is_competitive"] = self.preprocess_whyplay(dataframe)
        # more preprocessing goes here
        return dataframe

@@ -60,6 +54,16 @@ class Dataset:

        return is_competitive_col

+    def _drop_unnecessary_columns(self, dataframe: pd.DataFrame):  # drops the columns listed below and returns the result
+        rows_to_drop = [  # NOTE: despite the name, these are column labels (see axis="columns" below)
+            "League",
+            "S. No.",
+            "Timestamp",
+            "highestleague",
+            "earnings",
+        ]
+        return dataframe.drop(rows_to_drop, axis="columns")  # result of drop() is handed back to the caller
+
    def remove_nonaccepting_rows(self, dataframe: pd.DataFrame):
        # drop rows where users did not accept to having their data used
        dataframe = dataframe.drop(
@@ -80,7 +84,8 @@ class Dataset:
        dataframe.loc[
            ~dataframe["whyplay"].isin(most_common_whyplay_reasons), "whyplay"
        ] = "other"
-        return most_common_whyplay_reasons
+        is_competitive_col = self.get_is_competitive_col(dataframe)
+        return is_competitive_col

    def get_combined_anxiety_score(self, dataframe: pd.DataFrame) -> pd.Series:
        """Get the combined anxiety score, as a column.
@@ -140,10 +145,8 @@ class Dataset:
            pd.Series: The sorted column.
        """
        return self.dataframe[colname].sort_values(ascending=ascending)
-    
-    
-    def get_unique_column_values(
-            self, colname: str):
+
+    def get_unique_column_values(self, colname: str):
        """Returns a count of categorical values in the dataset.

        Args:
@@ -155,7 +158,6 @@ class Dataset:
        """
        return self.dataframe[colname].explode().unique()

-
    def get_category_counts(
        self, colname: str, ascending: bool = None
    ) -> pd.Series: