def preprocess_whyplay(self, dataframe: pd.DataFrame, top_n: int = 5) -> list:
    """Lower-case the "whyplay" column in place and bucket rare answers.

    The "whyplay" column is lower-cased, the ``top_n`` most frequent
    values are determined, and every row whose value is not among them
    (including missing values) has its "whyplay" entry replaced with
    the string "other". Only the "whyplay" column is modified; all
    other columns are left untouched.

    Args:
        dataframe: Frame containing a string-valued "whyplay" column.
            It is mutated in place.
        top_n: Number of most frequent answers to keep verbatim.
            Defaults to 5.

    Returns:
        The kept answers (lower-cased), ordered from most to least
        frequent.
    """
    dataframe["whyplay"] = dataframe["whyplay"].str.lower()
    most_common_whyplay_reasons = list(
        dataframe.groupby("whyplay")
        .size()
        .sort_values(ascending=False)
        .head(top_n)
        .index
    )
    # Restrict the assignment to the "whyplay" column. Indexing the frame
    # with the boolean mask alone would overwrite every column of the
    # matching rows with "other".
    is_rare = ~dataframe["whyplay"].isin(most_common_whyplay_reasons)
    dataframe.loc[is_rare, "whyplay"] = "other"
    return most_common_whyplay_reasons