# NOTE: stray "Newer" / "Older" page-navigation labels were captured with this file; they are not code.
import pandas as pd
class Dataset:
    """Wrapper around a CSV-backed pandas DataFrame.

    The CSV file is read once at construction time, passed through
    ``preprocess_dataset()``, and the result is kept in ``self.dataframe``.
    """

    def __init__(self, dataset_filename: str) -> None:
        """Load the dataset from a CSV file and preprocess it.

        Args:
            dataset_filename (str): path of the CSV file to read. The file
                is decoded as windows-1254.
        """
        raw_dataframe = pd.read_csv(dataset_filename, encoding="windows-1254")
        self.dataframe = self.preprocess_dataset(raw_dataframe)

    def preprocess_dataset(self, raw_dataframe: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the dataframe immediately after loading it.

        Currently the only step is removing the "League" column. The input
        frame is not modified; ``DataFrame.drop`` returns a new object.

        Args:
            raw_dataframe (pd.DataFrame):
                raw dataframe as read from pd.read_csv().

        Returns:
            pd.DataFrame: resulting preprocessed dataframe.

        Raises:
            KeyError: if ``raw_dataframe`` has no "League" column.
        """
        dataframe = raw_dataframe.drop(["League"], axis="columns")
        # more preprocessing goes here
        return dataframe

    def get_dataframe(self) -> pd.DataFrame:
        """A getter function for the dataframe.

        Returns:
            pd.DataFrame: the dataset (the same object stored on the
            instance, not a copy).
        """
        return self.dataframe

    def draw_histogram(self) -> None:
        """Not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def get_dataset_columns(self) -> list:
        """Not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def get_plottable_columns(self) -> list:
        """Not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def get_sorted_column(self, colname: str, ascending: bool = True) -> pd.Series:
        """Returns a single column, sorted either ascending or descending.

        The returned Series keeps the original row index labels, so they
        can be used to map sorted values back to their rows.

        Args:
            colname (str): the column name (see get_dataset_columns()).
            ascending (bool, optional): Sorting order. Defaults to True.

        Returns:
            pd.Series: The sorted column.

        Raises:
            KeyError: if ``colname`` is not a column of the dataset.
        """
        return self.dataframe[colname].sort_values(ascending=ascending)