def preprocess_whyplay(self, dataframe: pd.DataFrame, top_n: int = 5) -> list:
    """Normalise the "whyplay" column in place and bucket rare answers.

    The answers are lower-cased, the ``top_n`` most frequent ones are
    kept, and every other answer (including missing values) is replaced by
    the literal string "other".

    Only the "whyplay" column is modified; all other columns of
    ``dataframe`` are left untouched.

    Args:
        dataframe: Frame holding a string column named "whyplay". It is
            mutated in place.
        top_n: How many of the most frequent answers to keep as-is.

    Returns:
        The kept answers, ordered from most to least frequent.
    """
    dataframe["whyplay"] = dataframe["whyplay"].str.lower()

    # groupby() sorts the keys and drops missing values; the stable sort keeps
    # that key order among equally frequent answers, so ties are resolved
    # deterministically.
    answer_counts = dataframe.groupby("whyplay").size()
    most_common_whyplay_reasons = list(
        answer_counts.sort_values(ascending=False, kind="stable")
        .head(top_n)
        .index
    )

    # Restrict the assignment to the "whyplay" column. A bare boolean-mask
    # assignment on the frame (dataframe[mask] = ...) would overwrite *every*
    # column of the selected rows.
    is_rare = ~dataframe["whyplay"].isin(most_common_whyplay_reasons)
    dataframe.loc[is_rare, "whyplay"] = "other"

    return most_common_whyplay_reasons