Merge branch 'master' of...

Merge branch 'master' of gitup.uni-potsdam.de:nifranz/rse-23-group-assignment-shervud-pitawanik-franz into 18-write-tests

Merge branch 'master' of...
0af064ff · Sortofamudkip · bdad7027 · f17dbf97 · 0af064ff · 0af064ff
Commit 0af064ff authored 1 year ago by Sortofamudkip
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ pyproject.toml
 # for testing
 dump.ipynb
+Bullshit.py
 DUMP_ds.py

--- a/Notebook.ipynb
+++ b/Notebook.ipynb
@@ -8,9 +8,11 @@
    "\n",
    "# Overview \n",
    "\n",
-    "In this project we decided to analyze anxiety in Gamers. We picked the dataset from kaggle because it intersected our personal interests. The data can be found [here](https://www.kaggle.com/datasets/divyansh22/online-gaming-anxiety-data)\n",
+    "In this project we decided to analyze anxiety in Gamers. We picked the dataset from kaggle because it intersected our personal interests. The data and survey can be found [here](https://www.kaggle.com/datasets/divyansh22/online-gaming-anxiety-data)\n",
    "\n",
-    "The data was acquired by a survey published and shared online. This way everyone could participate. For us that also means analyzing and di"
+    "The data was acquired by a survey published and shared online. This way everyone could participate. For us that also means taking into account that the distribution and answers can be scewed. \n",
+    "\n",
+    "## Motivation - "
   ]
  },
  {
@@ -19,10 +21,14 @@
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
+     "ename": "ModuleNotFoundError",
-     "output_type": "stream",
+     "evalue": "No module named 'src.Dataset'",
-     "text": [
+     "output_type": "error",
-      "<src.Dataset.Dataset object at 0x000001CA32F35BD0>\n"
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msrc\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mDataset\u001b[39;00m \u001b[39mimport\u001b[39;00m Dataset \n\u001b[0;32m      3\u001b[0m dataset \u001b[39m=\u001b[39m Dataset(\u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\\\u001b[39m\u001b[39mGamingStudy_data.csv\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m      4\u001b[0m \u001b[39mprint\u001b[39m(dataset)\n",
+      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'src.Dataset'"
     ]
    }
   ],
@@ -30,7 +36,7 @@
    "from src.Dataset import Dataset \n",
    "\n",
    "dataset = Dataset(\"data\\GamingStudy_data.csv\")\n",
-    "print(dataset)"
+    "print(dataset)\n"
   ]
  },
  {
@@ -39,14 +45,14 @@
   "source": [
    "# Data Exploration\n",
    "\n",
-    "\n",
-    "\n",
    "Because the data was accumulated in a semi-professional way for a pre-study we had to clean it up and make some changes. \n",
    "\n",
    "Some columns could be answered with an open text field. Naturally the answeres in those columns are very diversified and hard to analyze. \n",
-    "+ Example\n",
+    "\n",
-    "+ Example\n",
+    "#### Affected Columns\n",
-    "+ Example\n",
+    "+ Whyplay\n",
+    "+ Earnings \n",
+    "+ League\n",
    "\n",
    "In the following we will explain if and how we used these columns. \n",
    "\n",
@@ -56,7 +62,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -132,8 +138,7 @@
    "A more detailed interpretation can be found [here](http://labs.psychology.illinois.edu/~ediener/Documents/Understanding%20SWLS%20Scores.pdf).\n",
    "\n",
    "Residents of developed nations (e.g. DE) usually score 20-24.\n",
-    "### Take it yourself\n",
+    "#### Questions \n",
-    "\n",
    "____ In most ways my life is close to my ideal.<br>\n",
    "____ The conditions of my life are excellent.<br>\n",
    "____ I am satisfied with my life.<br>\n",
@@ -148,13 +153,24 @@
   "metadata": {},
   "source": [
    "# Analysis\n",
-    "Explained new columns and why we did that (\"Is_narcissist, \"Anxiety_score\")\n",
+    "\n",
-    "## Normalizing the Data \n"
+    "## Preprocessing \n",
+    "* Explained new columns and why we did that *\n",
+    "\n",
+    "Some columns gave the options to write individual responses. Naturally those are not useful in data analysis. In some cases we cleaned the columns and changes the unusual cases to \"Other\"/\"NA\"\n",
+    "### Cleaned Columns\n",
+    "+ \"Whyplay\" \n",
+    "+ Accept \n",
+    "## Normalizing the Data \n",
+    "\n",
+    "### \"Is_narcissist,\n",
+    "### \"Anxiety_score\"\n",
+    "### \"Is_competetive"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
@@ -171,10 +187,10 @@
       "13461    0.125210\n",
       "13462    0.591783\n",
       "13463    0.243231\n",
-       "Length: 13464, dtype: float64"
+       "Length: 13050, dtype: float64"
      ]
     },
-     "execution_count": 2,
+     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -196,18 +212,25 @@
    "\n",
    "### Women vs Men \n",
    "\n",
-    "Explanation "
+    "Explanation \n",
+    "![Example Plot](https://cdn.discordapp.com/attachments/806128836332879924/1127988009627832320/Unbenannt.png)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
+    "\n",
    "# SIDE BY SIDE PLOTS \n",
    "# LEFT = LINE Graph distribution of Anxiety Score Related to Group\n",
-    "# RIGHT = Stacked Bars comparing the GROUP with = 1. 2. 3. "
+    "# RIGHT = Stacked Bars comparing the GROUP with = \n",
+    "# 1.[Work] - 4 Bars\n",
+    "# 2.[Degree] - 5 Bars\n",
+    "# 3.[Whyplay ] - 4 Bars (Everything until \"All of them\")\n",
+    "\n",
+    "#"
   ]
  },
  {
@@ -226,26 +249,10 @@
   "source": [
    "# SIDE BY SIDE PLOTS \n",
    "# LEFT = LINE Graph distribution of Anxiety Score Related to Group\n",
-    "# RIGHT = Stacked Bars comparing the GROUP with = 1. 2. 3. "
+    "# RIGHT = Stacked Bars comparing the GROUP with = \n",
-   ]
+    "# 1.[Work] - 4 Bars\n",
-  },
+    "# 2.[Degree] - 5 Bars\n",
-  {
+    "# 3.[Whyplay ] - 4 Bars (Everything until \"All of them\")"
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### High Education vs Lower Education\n",
-    "Explanation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# SIDE BY SIDE PLOTS \n",
-    "# LEFT = LINE Graph distribution of Anxiety Score Related to Group\n",
-    "# RIGHT = Stacked Bars comparing the GROUP with = 1. 2. 3. "
   ]
  },
  {
@@ -263,14 +270,17 @@
   "source": [
    "# SIDE BY SIDE PLOTS \n",
    "# LEFT = LINE Graph distribution of Anxiety Score Related to Group\n",
-    "# RIGHT = Stacked Bars comparing the GROUP with = 1. 2. 3. "
+    "# RIGHT = Stacked Bars comparing the GROUP with = \n",
+    "# 1.[Work] - 4 Bars\n",
+    "# 2.[Degree] - 5 Bars\n",
+    "# 3.[Whyplay ] - 4 Bars (Everything until \"All of them\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Q2 - Correlations betweeet played hours and ones well being. "
+    "## Q2 - Correlations between played hours and one's well being. "
   ]
  },
  {
@@ -291,7 +301,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -308,7 +318,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -335,6 +345,13 @@
    "2. Is the amount of educated players similar "
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Scatter](https://cdn.discordapp.com/attachments/1127973734884581386/1127973829344493679/image.png)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -344,11 +361,11 @@
    "#### Analyze the countries amounting to Top 7 or 90% of the survey. \n",
    "\n",
    "\n",
-    "#Q4. MAP PLOT = Most played game per country (Dont do it if its League everywhere. )\n",
+    "#Q4.MAP PLOT = Most played game per country (Dont do it if its League everywhere. )\n",
    "#Q4 MAP PLOT = Heat Map with redder areas for more Anxiety in the country. \n",
    "#Q1.2 Grouped Bar Chart with the top game next to the \"Anxiety Score\"\n",
    "\n",
-    "#2 Grouped Bars with Education per Country IF POSSIBLE grouped with a bar for the anxiety "
+    "#2 Scatter PLot like in the example "
   ]
  }
 ],

 %% Cell type:markdown id: tags:
 # Anxiety in Computer-Gamers: differences, similarities and learnings
 # Overview
-In this project we decided to analyze anxiety in Gamers. We picked the dataset from kaggle because it intersected our personal interests. The data can be found [here](https://www.kaggle.com/datasets/divyansh22/online-gaming-anxiety-data)
+In this project we decided to analyze anxiety in Gamers. We picked the dataset from kaggle because it intersected our personal interests. The data and survey can be found [here](https://www.kaggle.com/datasets/divyansh22/online-gaming-anxiety-data)
-The data was acquired by a survey published and shared online. This way everyone could participate. For us that also means analyzing and di
+The data was acquired by a survey published and shared online. This way everyone could participate. For us that also means taking into account that the distribution and answers can be skewed.
+## Motivation -
 %% Cell type:code id: tags:
 ``` python
 from src.Dataset import Dataset
 dataset = Dataset("data\GamingStudy_data.csv")
 print(dataset)
 ```
 %% Output
-    <src.Dataset.Dataset object at 0x000001CA32F35BD0>
+    ---------------------------------------------------------------------------
+    ModuleNotFoundError                       Traceback (most recent call last)
+Cell     In[1], line 1
+    ----> 1 from src.Dataset import Dataset
+          3 dataset = Dataset("data\GamingStudy_data.csv")
+          4 print(dataset)
+    ModuleNotFoundError: No module named 'src.Dataset'
 %% Cell type:markdown id: tags:
 # Data Exploration
 Because the data was accumulated in a semi-professional way for a pre-study we had to clean it up and make some changes.
 Some columns could be answered with an open text field. Naturally the answers in those columns are very diverse and hard to analyze.
-+ Example
-+ Example
+#### Affected Columns
-+ Example
+ Whyplay
+ Earnings
+ League
 In the following we will explain if and how we used these columns.
 Stuff like deleted columns, general overview of the distribution (men women, games, platform) and problems with it
 %% Cell type:code id: tags:
 ``` python
 # BAR PLOTS for Gender distribution
 # BAR PLOTS FOR Plattform
 # BAR PLOTS for GAMES
 # DIFFERENT Colored Bars
 ```
 %% Cell type:markdown id: tags:
 ## Explanation of technical terms
 ### SPIN
 SPIN stands for Social Phobia Inventory
 The SPIN is a standardized set of 17 questions. After answering the questionnaire a “SPIN” value is calculated, which is effective for screening for and measuring the severity of social anxiety disorder.
 1. I am afraid of people in authority.
 2. I am bothered by blushing in front of people.
 3. Parties and social events scare me.
 4. I avoid talking to people I don’t know.
 5. Being criticized scares me a lot.
 6. I avoid doing things or speaking to people for fear of embarrassment.
 7. Sweating in front of people causes me distress.
 8. I avoid going to parties.
 9. I avoid activities in which I am the center of attention.
 10. Talking to strangers scares me.
 11. I avoid having to give speeches.
 12. I would do anything to avoid being criticized.
 13. Heart palpitations bother me when I am around people.
 14. I am afraid of doing things when people might be watching.
 15. Being embarrassed or looking stupid are among my worst fears.
 16. I avoid speaking to anyone in authority.
 17. Trembling or shaking in front of others is distressing to me.
 ### GAD
 GAD (generalized anxiety disorder) is a mental and behavioral disorder, specifically an anxiety disorder characterized by excessive, uncontrollable and often irrational worry about events or activities. There are specific questionnaires you can use to evaluate the disorder. In the questionnaire the minimum score is 0 and the maximum is 21.
 #### Worries of concern
 - Health
 - Finances
 - Death
 - Family
 - Relationships
 - Work
 #### Symptoms
 - Excessive worry
 - Restlessness,
 - Low Concentration
 - Trouble sleeping
 - Exhaustion / Fatigability
 - Irritability
 - Sweating
 - Trembling (Muscle contraction)
 In the questionnaire the questions target these symptoms and worries and summarize them into a score between 0 and 21.
 ### SWL
 #### Explanation
 The survey has 5 questions. You fill it in yourself (not a psychiatrist).
 For each question, you choose any integer between 1 (highly disagree) to 7 (highly agree).
 In general, lower numbers mean you are less satisfied with life in a certain way.
 This means you can score between 5 (least satisfied) to 35 (most satisfied).
 #### Interpretation
 The (total) SWL score can be interpreted as:
 - 31 - 35 Extremely satisfied
 - 26 - 30 Satisfied
 - 21 - 25 Slightly satisfied
 - 20 Neutral
 - 15 - 19 Slightly dissatisfied
 - 10 - 14 Dissatisfied
 - 5 - 9 Extremely dissatisfied
 A more detailed interpretation can be found [here](http://labs.psychology.illinois.edu/~ediener/Documents/Understanding%20SWLS%20Scores.pdf).
 Residents of developed nations (e.g. DE) usually score 20-24.
-### Take it yourself
+#### Questions
 ____ In most ways my life is close to my ideal.<br>
 ____ The conditions of my life are excellent.<br>
 ____ I am satisfied with my life.<br>
 ____ So far I have gotten the important things I want in life.<br>
 ____ If I could live my life over, I would change almost nothing.<br>
 ---
 %% Cell type:markdown id: tags:
 # Analysis
-Explained new columns and why we did that ("Is_narcissist, "Anxiety_score")
+## Preprocessing
+* Explained new columns and why we did that *
+Some columns gave the option to write individual responses. Naturally those are not useful in data analysis. In some cases we cleaned the columns and changed the unusual cases to "Other"/"NA".
+### Cleaned Columns
+ "Whyplay"
+ Accept
 ## Normalizing the Data
+### "Is_narcissist"
+### "Anxiety_score"
+### "Is_competitive"
 %% Cell type:code id: tags:
 ``` python
 # Executing and showing new columns
 dataset.get_combined_anxiety_score(dataset.get_dataframe())
 ```
 %% Output
    0        0.202288
    1        0.517320
    2        0.497993
    3        0.272969
    4        0.533567
               ...
    13459    0.212092
    13460    0.601914
    13461    0.125210
    13462    0.591783
    13463    0.243231
-    Length: 13464, dtype: float64
+    Length: 13050, dtype: float64
 %% Cell type:markdown id: tags:
 ## Q1 -  Which gamers are more anxiety prone ?
 Text .......
 We compare
 ### Women vs Men
 Explanation
+![Example Plot](https://cdn.discordapp.com/attachments/806128836332879924/1127988009627832320/Unbenannt.png)
 %% Cell type:code id: tags:
 ``` python
-# SIDE BY SIDE PLOTS
-# LEFT = LINE Graph distribution of Anxiety Score Related to Group
-# RIGHT = Stacked Bars comparing the GROUP with = 1. 2. 3.
-```
-%% Cell type:markdown id: tags:
-### Competetive vs Easy Going Players
-Explanation
-%% Cell type:code id: tags:
-``` python
 # SIDE BY SIDE PLOTS
 # LEFT = LINE Graph distribution of Anxiety Score Related to Group
-# RIGHT = Stacked Bars comparing the GROUP with = 1. 2. 3.
+# RIGHT = Stacked Bars comparing the GROUP with =
+# 1.[Work] - 4 Bars
+# 2.[Degree] - 5 Bars
+# 3.[Whyplay ] - 4 Bars (Everything until "All of them")
+#
 ```
 %% Cell type:markdown id: tags:
-### High Education vs Lower Education
+### Competitive vs Easy Going Players
 Explanation
 %% Cell type:code id: tags:
 ``` python
 # SIDE BY SIDE PLOTS
 # LEFT = LINE Graph distribution of Anxiety Score Related to Group
-# RIGHT = Stacked Bars comparing the GROUP with = 1. 2. 3.
+# RIGHT = Stacked Bars comparing the GROUP with =
+# 1.[Work] - 4 Bars
+# 2.[Degree] - 5 Bars
+# 3.[Whyplay ] - 4 Bars (Everything until "All of them")
 ```
 %% Cell type:markdown id: tags:
 ### Narcissist vs Non-Narcissist
 %% Cell type:code id: tags:
 ``` python
 # SIDE BY SIDE PLOTS
 # LEFT = LINE Graph distribution of Anxiety Score Related to Group
-# RIGHT = Stacked Bars comparing the GROUP with = 1. 2. 3.
+# RIGHT = Stacked Bars comparing the GROUP with =
+# 1.[Work] - 4 Bars
+# 2.[Degree] - 5 Bars
+# 3.[Whyplay ] - 4 Bars (Everything until "All of them")
 ```
 %% Cell type:markdown id: tags:
-## Q2 - Correlations betweeet played hours and ones well being.
+## Q2 - Correlations between played hours and one's well being.
 %% Cell type:code id: tags:
 ``` python
 ####
 ```
 %% Cell type:markdown id: tags:
 ## Q3 - Effect of the reason for playing on the satisfaction with life
 %% Cell type:code id: tags:
 ``` python
 # Horizontal bar chart, one row for every reason for with top width
 # Anxiety colored in for the amount of anxiety in that group
 ```
 %% Cell type:markdown id: tags:
 ### Effects of income level, earnings, education on the reason to play
 %% Cell type:code id: tags:
 ``` python
 #Overlaying Histogram
 # Histogram for the income level Y = %, X = low to high
 # One in Green for the income
 # One in Red for the Anxiety for those people
 ```
 %% Cell type:markdown id: tags:
 ## Q4 - Gamers from different countries
 %% Cell type:markdown id: tags:
 1. Do they play different games ?
    1. Are they reacting differently to those games
 2. Is the amount of educated players similar
+%% Cell type:markdown id: tags:
+![Scatter](https://cdn.discordapp.com/attachments/1127973734884581386/1127973829344493679/image.png)
 %% Cell type:code id: tags:
 ``` python
 #### Analyze the countries amounting to Top 7 or 90% of the survey.
-#Q4. MAP PLOT = Most played game per country (Dont do it if its League everywhere. )
+#Q4.MAP PLOT = Most played game per country (Dont do it if its League everywhere. )
 #Q4 MAP PLOT = Heat Map with redder areas for more Anxiety in the country.
 #Q1.2 Grouped Bar Chart with the top game next to the "Anxiety Score"
-#2 Grouped Bars with Education per Country IF POSSIBLE grouped with a bar for the anxiety
+#2 Scatter plot like in the example
 ```

--- a/src/Dataset.py
+++ b/src/Dataset.py
@@ -5,7 +5,7 @@ import pandas as pd
 class Dataset:
    def __init__(self, dataset_filename: str) -> None:
        """A wrapper class for the pandas dataframe.
-        Loads the dataset
+        Loads and preprocesses the dataset, then stores it in self.dataframe.
        Args:
            dataset_filename (str): the path of the dataset,
@@ -20,16 +20,99 @@ class Dataset:
        Args:
            raw_dataframe (pd.DataFrame):
                raw dataframe as read from pd.read_csv().
+                This dataframe is discarded afterwards.
        Returns:
            pd.DataFrame: resulting preprocessed dataframe.
        """
-        dataframe = raw_dataframe.drop(["League"], axis="columns")
+        dataframe = self._drop_unnecessary_columns(
+            raw_dataframe
+        )  # for conveneince
+        dataframe = self.remove_nonaccepting_rows(dataframe)
        dataframe["Anxiety_score"] = self.get_combined_anxiety_score(dataframe)
        dataframe["Is_narcissist"] = self.get_is_narcissist_col(dataframe)
+        dataframe["Is_competitive"] = self.preprocess_whyplay(dataframe)
        # more preprocessing goes here
        return dataframe
+    def get_is_competitive_col(self, dataframe: pd.DataFrame):
+        is_competitive_col = np.zeros(shape=len(dataframe))
+        is_competitive_col[
+            (dataframe["whyplay"] == "improving")
+            | (dataframe["whyplay"] == "winning")
+            | (dataframe["whyplay"] == "all of the above")
+        ] = True
+        is_competitive_col[
+            (dataframe["whyplay"] == "having fun")
+            | (dataframe["whyplay"] == "relaxing")
+        ] = False
+        is_competitive_col[(dataframe["whyplay"] == "other")] = None
+        return is_competitive_col
+    def _drop_unnecessary_columns(
+        self, dataframe: pd.DataFrame
+    ) -> pd.DataFrame:
+        """Drop unnecessary rows from the dataset.
+        Args:
+            dataframe (pd.DataFrame): the dataframe.
+        Returns:
+            pd.DataFrame: the dataframe.
+        """
+        rows_to_drop = [
+            "League",
+            "S. No.",
+            "Timestamp",
+            "highestleague",
+            "earnings",
+        ]
+        return dataframe.drop(rows_to_drop, axis="columns")
+    def remove_nonaccepting_rows(
+        self, dataframe: pd.DataFrame
+    ) -> pd.DataFrame:
+        """Removes rows where participants did not consent to data processing.
+        Args:
+            dataframe (pd.DataFrame): the dataframe.
+        Returns:
+            pd.DataFrame: the dataframe.
+        """
+        # drop rows where users did not accept to having their data used
+        dataframe = dataframe.drop(
+            dataframe[dataframe["accept"] != "Accept"].index,
+        )
+        dataframe = dataframe.drop(["accept"], axis=1)
+        return dataframe
+    def preprocess_whyplay(self, dataframe: pd.DataFrame) -> pd.Series:
+        """Preprocesses the whyplay column, and returns a Is_competitive col.
+        Args:
+            dataframe (pd.DataFrame): the dataframe.
+        Returns:
+            pd.Series: the Is_competitive column.
+        """
+        dataframe["whyplay"] = dataframe["whyplay"].str.lower()
+        most_common_whyplay_reasons = list(
+            dataframe.groupby("whyplay")
+            .size()
+            .sort_values(ascending=False)
+            .head(5)
+            .index
+        )
+        dataframe.loc[
+            ~dataframe["whyplay"].isin(most_common_whyplay_reasons), "whyplay"
+        ] = "other"
+        is_competitive_col = self.get_is_competitive_col(dataframe)
+        return is_competitive_col
    def get_combined_anxiety_score(self, dataframe: pd.DataFrame) -> pd.Series:
        """Get the combined anxiety score, as a column.
        This score is based on the GAN, SPIN and SWL metrics.
@@ -88,3 +171,35 @@ class Dataset:
            pd.Series: The sorted column.
        """
        return self.dataframe[colname].sort_values(ascending=ascending)
+    def get_unique_column_values(self, colname: str):
+        """Returns a count of categorical values in the dataset.
+        Args:
+            colname (str): the column name.
+        Returns:
+            string array: an array of strings containing the unique values
+            present in the column
+        """
+        return self.dataframe[colname].explode().unique()
+    def get_category_counts(
+        self, colname: str, ascending: bool = None
+    ) -> pd.Series:
+        """Returns a count of categorical values in the dataset.
+        Args:
+            colname (str): the column name.
+            ascending (bool, optional): Direction to sort results.
+              If set to None, the results are not sorted. Defaults to None.
+        Returns:
+            pd.Series: the counted categories.
+        """
+        grouped_size = self.dataframe.groupby(colname).size()
+        return (
+            grouped_size
+            if ascending is None
+            else grouped_size.sort_values(ascending=ascending)
+        )
--- a/src/Plotter.py
+++ b/src/Plotter.py
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib
+import pandas as pd
+import seaborn as sns
+from .Dataset import Dataset
+class Plotter:
+    def __init__(self, dataset: Dataset):
+        self.ds = dataset
+        self.df = dataset.get_dataframe()
+    def customize_plot(self, fig, ax, styling_params):
+        if styling_params.get('title'):
+            ax.set_title(styling_params["title"])
+    def plot_categorical_bar_chart(self, category1, category2, styling_params = {}):
+        ct = pd.crosstab(self.df[category1], self.df[category2])
+        # Calculate percentages by row
+        ct_percent = ct.apply(lambda r: r/r.sum() * 100, axis=0)                
+        fig, ax = plt.subplots()
+        self.customize_plot(fig, ax, styling_params)
+        ct_percent.plot(kind='bar', ax=ax)
+    def plot_categorical_boxplot(self, target, category, styling_params = {}):
+        fig, ax = plt.subplots()
+        self.customize_plot(fig, ax, styling_params)
+        sns.boxplot(x=category,y=target,data=self.df, palette='rainbow')
+    def plot_categorical_histplot(self, target, category, styling_params = {}, bins= 30):
+        uniques = self.ds.get_unique_column_values(category)
+        fig, ax = plt.subplots()
+        self.customize_plot(fig, ax, styling_params)
+        for val in uniques:
+            anx_score = self.df[self.df[category] == val][target]
+            anx_score_weights = np.ones(len(anx_score)) / len(anx_score)
+            ax.hist(
+                anx_score,
+                weights=anx_score_weights,
+                bins = bins,
+                alpha=0.5,
+            )
\ No newline at end of file