basic preprocess_dataset() and get_sorted_column()

7bdb14cf · Sortofamudkip · 5129400f · 7bdb14cf
Commit 7bdb14cf authored 1 year ago by Sortofamudkip
--- a/src/Dataset.py
+++ b/src/Dataset.py
@@ -3,12 +3,30 @@ import pandas as pd

 class Dataset:
    def __init__(self, dataset_filename: str) -> None:
-        raw_dataset = pd.read_csv(dataset_filename, encoding="windows-1254")
-        self.dataset = self.preprocess_dataset(raw_dataset)
+        raw_dataframe = pd.read_csv(dataset_filename, encoding="windows-1254")
+        self.dataframe = self.preprocess_dataset(raw_dataframe)  # only the preprocessed frame is stored

-    def preprocess_dataset(self, raw_dataframe):
-        # preprocessing goes here
-        return raw_dataframe
+    def preprocess_dataset(self, raw_dataframe: pd.DataFrame) -> pd.DataFrame:
+        """Drop the "League" column from the freshly loaded dataframe.
+
+        Args:
+            raw_dataframe (pd.DataFrame):
+                raw dataframe as read from pd.read_csv(); it is not modified.
+
+        Returns:
+            pd.DataFrame: a new dataframe without the "League" column.
+        """
+        dataframe = raw_dataframe.drop(["League"], axis="columns")
+        # TODO: further preprocessing steps go here
+        return dataframe
+
+    def get_dataframe(self) -> pd.DataFrame:
+        """Return the preprocessed dataframe held by this instance.
+
+        Returns:
+            pd.DataFrame: the object stored in self.dataframe (not a copy).
+        """
+        return self.dataframe

    def draw_histogram(self):
        raise NotImplementedError
@@ -18,3 +36,15 @@ class Dataset:

    def get_plottable_columns(self) -> list:
        raise NotImplementedError
+
+    def get_sorted_column(self, colname: str, ascending=True) -> pd.Series:
+        """Return a single column of the dataframe, sorted by its values.
+
+        Args:
+            colname (str): the column name (see get_dataset_columns()).
+            ascending (bool, optional): Sorting order. Defaults to True.
+
+        Returns:
+            pd.Series: a new sorted Series; original index labels are kept.
+        """
+        return self.dataframe[colname].sort_values(ascending=ascending)