# Dataset.py
import pandas as pd


class Dataset:
    """Holds a CSV-backed pandas dataframe and small accessors around it.

    The CSV file is read once at construction time, passed through
    preprocess_dataset(), and the result is stored in ``self.dataframe``.
    """

    def __init__(self, dataset_filename: str) -> None:
        """Load the CSV file and store its preprocessed form.

        Args:
            dataset_filename (str): the CSV file handed to pd.read_csv().
        """
        # The file is decoded as windows-1254 rather than pandas' default
        # UTF-8; the dataset is expected to be saved in that code page.
        raw_dataframe = pd.read_csv(dataset_filename, encoding="windows-1254")
        self.dataframe = self.preprocess_dataset(raw_dataframe)

    def preprocess_dataset(self, raw_dataframe: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the dataframe immediately after loading it.

        Currently the only step is removing the "League" column. The input
        dataframe is not modified; drop() returns a new dataframe.

        Args:
            raw_dataframe (pd.DataFrame):
                raw dataframe as read from pd.read_csv().

        Returns:
            pd.DataFrame: resulting preprocessed dataframe.

        Raises:
            KeyError: if ``raw_dataframe`` has no "League" column.
        """
        dataframe = raw_dataframe.drop(["League"], axis="columns")
        # more preprocessing goes here
        return dataframe

    def get_dataframe(self) -> pd.DataFrame:
        """A getter function for the dataframe.

        Returns:
            pd.DataFrame: the preprocessed dataset (the stored object itself,
            not a copy).
        """
        return self.dataframe

    def draw_histogram(self) -> None:
        """Placeholder for histogram drawing; not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def get_dataset_columns(self) -> list:
        """Returns the column names of the preprocessed dataframe.

        These are the names accepted by get_sorted_column().

        Returns:
            list: the column labels, in dataframe order.
        """
        return list(self.dataframe.columns)

    def get_plottable_columns(self) -> list:
        """Placeholder for listing plottable columns; not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def get_sorted_column(self, colname: str, ascending: bool = True) -> pd.Series:
        """Returns a single column, sorted either ascending or descending.

        The returned series keeps the original row index labels, reordered
        to follow the sorted values.

        Args:
            colname (str): the column name (see get_dataset_columns()).
            ascending (bool, optional): Sorting order. Defaults to True.

        Returns:
            pd.Series: The sorted column.

        Raises:
            KeyError: if ``colname`` is not a column of the dataframe.
        """
        return self.dataframe[colname].sort_values(ascending=ascending)