diff --git a/src/Dataset.py b/src/Dataset.py
index 271450048e73f9ade1810db20e389d95cfd6831a..e4fd4be5fcfccb186401b5ce6a9ff439df021e8c 100644
--- a/src/Dataset.py
+++ b/src/Dataset.py
@@ -27,9 +27,25 @@ class Dataset:
         dataframe = raw_dataframe.drop(["League"], axis="columns")
         dataframe["Anxiety_score"] = self.get_combined_anxiety_score(dataframe)
         dataframe["Is_narcissist"] = self.get_is_narcissist_col(dataframe)
+
+        self.preprocess_whyplay(dataframe)
         # more preprocessing goes here
         return dataframe
 
+    def preprocess_whyplay(self, dataframe: pd.DataFrame):
+        """Normalise the "whyplay" column in place and return its 5 top values.
+
+        Values are lower-cased; any value outside the 5 most frequent ones
+        (missing values included) is replaced by "other".
+        """
+        dataframe["whyplay"] = dataframe["whyplay"].str.lower()
+        # value_counts() ignores missing values and sorts by descending count.
+        top_reasons = list(dataframe["whyplay"].value_counts().index[:5])
+        is_rare = ~dataframe["whyplay"].isin(top_reasons)
+        # Target only "whyplay": a row-only mask would overwrite every column.
+        dataframe.loc[is_rare, "whyplay"] = "other"
+        return top_reasons
+
     def get_combined_anxiety_score(self, dataframe: pd.DataFrame) -> pd.Series:
         """Get the combined anxiety score, as a column.
         This score is based on the GAN, SPIN and SWL metrics.