From 6a682ba87b61147cd7768f6c6dfe53d2888722dd Mon Sep 17 00:00:00 2001
From: Sortofamudkip <wishyutp0328@gmail.com>
Date: Mon, 10 Jul 2023 17:41:22 +0200
Subject: [PATCH] preprocess_whyplay

---
 src/Dataset.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/Dataset.py b/src/Dataset.py
index 2714500..e4fd4be 100644
--- a/src/Dataset.py
+++ b/src/Dataset.py
@@ -27,9 +27,25 @@ class Dataset:
         dataframe = raw_dataframe.drop(["League"], axis="columns")
         dataframe["Anxiety_score"] = self.get_combined_anxiety_score(dataframe)
         dataframe["Is_narcissist"] = self.get_is_narcissist_col(dataframe)
+
+        self.preprocess_whyplay(dataframe)
         # more preprocessing goes here
         return dataframe
 
+    def preprocess_whyplay(self, dataframe: pd.DataFrame):
+        """Lowercase "whyplay" in place and keep only its 5 most common values.
+        All other values become "other"; returns the kept values as a list."""
+        dataframe["whyplay"] = dataframe["whyplay"].str.lower()
+        counts = dataframe.groupby("whyplay").size()
+        most_common_whyplay_reasons = list(
+            counts.sort_values(ascending=False).head(5).index
+        )
+        # Use .loc with the column name: a bare boolean-mask assignment
+        # would overwrite every column of the matching rows with "other".
+        is_rare = ~dataframe["whyplay"].isin(most_common_whyplay_reasons)
+        dataframe.loc[is_rare, "whyplay"] = "other"
+        return most_common_whyplay_reasons
+
     def get_combined_anxiety_score(self, dataframe: pd.DataFrame) -> pd.Series:
         """Get the combined anxiety score, as a column.
         This score is based on the GAN, SPIN and SWL metrics.
-- 
GitLab