diff --git a/.gitignore b/.gitignore index ae717087c38c236a3a8eddcdc3269277d9c399c8..4aa0a2c7b7965bef8c476dfbf60f3aa25ebdc23e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .idea results -src/__* \ No newline at end of file +src/__* +*.zip \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..d01dc8274b72bab0e40deb641932fa225a7599e8 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,9 @@ +stages: + - lint + +pylint: + image: "python:latest" + stage: lint + script: + - pip install pylint + - pylint src/main.py \ No newline at end of file diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000000000000000000000000000000000000..89d57ce8fff907ce37522e937ba9f12114ba0435 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,3 @@ +[MESSAGES CONTROL] + +disable = too-few-public-methods, too-many-branches, too-many-instance-attributes, import-error \ No newline at end of file diff --git a/src/main.py b/src/main.py index f2202b0a931453a6f8f4b5aa2419d56fac438037..d090159ee0eca24693b105b45662f8e474290fd0 100644 --- a/src/main.py +++ b/src/main.py @@ -1,5 +1,9 @@ +""" Main file to compute findings in the raw data """ + import os.path +import time +from enum import Enum import pandas as pd import seaborn as sns import numpy as np @@ -7,49 +11,69 @@ import matplotlib.pyplot as plt from matplotlib import ticker from scipy.stats import pearsonr -from enum import Enum from kPOD import k_pod -import time ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + + class Categories(Enum): - Dashboard = "Dashboard" + """ + Enum for all product categories evaluated in this study + """ + DASHBOARD = "Dashboard" LMS = "Learning Plattform" - Games = "Games" + GAMES = "Games" VR = "Virtual Reality" class Interviews: """ This class shows data points of every saved interview. 
- In this data frame is also saved how much time was taken for the questionnaire and some additional variables as you - can see here: https://www.soscisurvey.de/help/doku.php/en:results:variables + In this data frame is also saved how much time was taken for the questionnaire and some + additional variables as you can see + here: https://www.soscisurvey.de/help/doku.php/en:results:variables """ def __init__(self) -> None: + """ init Interviews and read out data_ueq.csv""" super().__init__() - - self.df = pd.read_csv(os.path.join(ROOT_DIR, '..', 'data', 'data_ueq.csv'), delimiter=';', quotechar='"', encoding='utf-16') + self.data = pd.read_csv(os.path.join(ROOT_DIR, '..', 'data', 'data_ueq.csv'), + delimiter=';', + quotechar='"', + encoding='utf-16') def __str__(self) -> str: - return f'{self.df.shape[0]} interviews' - - def get_items_per_product_category(self, product_category_items): - return self.df[product_category_items] + """ shows how many interviews are in this instance """ + return f'{self.data.shape[0]} interviews' def get_filtered(self, all_categories): + """ + Removes unwanted or incomplete interviews, rename columns from .*_21 -> .*_01 + + :param all_categories: columns names for the categories + :return: Dataframe with cleaned interviews + """ if isinstance(all_categories, ProductCategories): - cat_games = all_categories.get_items(Categories.Games) + # Get the category names related to games and LMS from the input object + cat_games = all_categories.get_items(Categories.GAMES) cat_lms = all_categories.get_items(Categories.LMS) + # remove interviews which are not complete - unwanted_interviews = self.df[self.df[cat_games[0]].isnull() | self.df[cat_lms[0]].isnull()] - print(f'{unwanted_interviews.shape[0]} interviews have not completed the questionnaire.') - df_nonnull_interviews = self.df.drop(unwanted_interviews.index) + unwanted_interviews = self.data[ + self.data[cat_games[0]].isnull() | self.data[cat_lms[0]].isnull()] + print( + 
f'{unwanted_interviews.shape[0]} interviews have not completed the questionnaire.') + + df_nonnull_interviews = self.data.drop(unwanted_interviews.index) + # remove interviews which are, based on indicators, not well answered df_too_fast_interviews = df_nonnull_interviews[ - (df_nonnull_interviews['TIME_RSI'] > 2.0) | (df_nonnull_interviews['DEG_TIME'] > 150)] - print(f'{df_too_fast_interviews.shape[0]} interviews have completed too fast or too weird.') + (df_nonnull_interviews['TIME_RSI'] > 2.0) | ( + df_nonnull_interviews['DEG_TIME'] > 150)] + print(f'{df_too_fast_interviews.shape[0]} ' + f'interviews have completed too fast or too weird.') result = df_nonnull_interviews.drop(df_too_fast_interviews.index) + # Rename certain columns in the resulting DataFrame result = result.rename(columns={"EX08_21": "EX08_01", "EX08_22": "EX08_02", "EX08_23": "EX08_03", "EX08_24": "EX08_04", "EX08_25": "EX08_05", "EX08_26": "EX08_06", @@ -63,107 +87,153 @@ class Interviews: }) print(f'{result.shape[0]} filtered interviews.') return result + return self.data class Rankings: + """ + Wrapper class to have the scales in order of importance for their use cases + """ def __init__(self): - self.author_dashboard = ['Nützlichkeit', 'Inhaltsqualität', 'Übersichtlichkeit', 'Vertrauenswürdigkeit', - 'Vertrauen', 'Effizienz', 'Durchschaubarkeit', 'Anpassbarkeit', 'Steuerbarkeit', - 'Intuitive Bedienung', 'Schönheit', 'Stimulation', 'Verbundenheit', 'Attraktivität', - 'Wertigkeit', 'Originalität', 'Immersion', 'Identität', 'Haptik', 'Akustik'] - self.author_VR = ['Stimulation', 'Attraktivität', 'Immersion', 'Steuerbarkeit', 'Intuitive Bedienung', + """ + Init wrapper class with important scales + """ + # scales from the author of the paper for dashboard + self.author_dashboard = ['Nützlichkeit', 'Inhaltsqualität', 'Übersichtlichkeit', + 'Vertrauenswürdigkeit', + 'Vertrauen', 'Effizienz', 'Durchschaubarkeit', 'Anpassbarkeit', + 'Steuerbarkeit', + 'Intuitive Bedienung', 'Schönheit', 
'Stimulation', 'Verbundenheit', + 'Attraktivität', + 'Wertigkeit', 'Originalität', 'Immersion', 'Identität', 'Haptik', + 'Akustik'] + + # scales from the author of the paper for VR + self.author_vr = ['Stimulation', 'Attraktivität', 'Immersion', 'Steuerbarkeit', + 'Intuitive Bedienung', 'Akustik', 'Durchschaubarkeit', 'Übersichtlichkeit', - 'Inhaltsqualität', 'Schönheit', 'Vertrauenswürdigkeit', 'Vertrauen', 'Haptik', - 'Nützlichkeit', 'Originalität', 'Anpassbarkeit', 'Wertigkeit', 'Effizienz', + 'Inhaltsqualität', 'Schönheit', 'Vertrauenswürdigkeit', 'Vertrauen', + 'Haptik', + 'Nützlichkeit', 'Originalität', 'Anpassbarkeit', 'Wertigkeit', + 'Effizienz', 'Verbundenheit', 'Identität'] - self.paper_LMS = ['Inhaltsqualität', 'Nützlichkeit', 'Durchschaubarkeit', 'Übersichtlichkeit', 'Effizienz', - 'Intuitive Bedienung', 'Vertrauen', 'Steuerbarkeit', 'Stimulation', 'Anpassbarkeit', + + # scales from the paper of Winter et al. 2017 for LMS + self.paper_lms = ['Inhaltsqualität', 'Nützlichkeit', 'Durchschaubarkeit', + 'Übersichtlichkeit', 'Effizienz', + 'Intuitive Bedienung', 'Vertrauen', 'Steuerbarkeit', 'Stimulation', + 'Anpassbarkeit', 'Wertigkeit', 'visuelle Ästhetik', 'Immersion', 'Originalität', 'Identität', 'Verbundenheit'] - self.handbook_LMS = ['Inhaltsqualität', 'Vertrauenswürdigkeit', 'Nützlichkeit', 'Übersichtlichkeit', + + # scales from the Handbook of UEQ+v2 for LMS + self.handbook_lms = ['Inhaltsqualität', 'Vertrauenswürdigkeit', 'Nützlichkeit', + 'Übersichtlichkeit', 'Durchschaubarkeit', 'Effizienz', 'Vertrauen', 'Steuerbarkeit'] - self.latest_paper_LMS = ['Inhaltsqualität', 'Nützlichkeit', 'Übersichtlichkeit', 'Durchschaubarkeit', + + # scales from the paper of Kollmorgen et al. 
2021 for LMS + self.latest_paper_lms = ['Inhaltsqualität', 'Nützlichkeit', 'Übersichtlichkeit', + 'Durchschaubarkeit', 'Effizienz', 'Steuerbarkeit', 'Intuitive Bedienung', - 'Vertrauen', 'Wertigkeit', 'Stimulation', 'Anpassbarkeit', 'visuelle Ästhetik', + 'Vertrauen', 'Wertigkeit', 'Stimulation', 'Anpassbarkeit', + 'visuelle Ästhetik', 'Originalität'] - self.new_LMS = ['Inhaltsqualität', 'Inhaltsseriosität', 'Vertrauen', 'Effizienz', 'Nützlichkeit', + + # scales from the paper of the authors for LMS + self.new_lms = ['Inhaltsqualität', 'Inhaltsseriosität', 'Vertrauen', 'Effizienz', + 'Nützlichkeit', 'Durchschaubarkeit', 'Steuerbarkeit', 'Übersichtlichkeit', - 'Intuitive Bedienung', 'Anpassbarkeit', 'Wertigkeit', 'Attraktivität', 'visuelle Ästhetik', + 'Intuitive Bedienung', 'Anpassbarkeit', 'Wertigkeit', 'Attraktivität', + 'visuelle Ästhetik', 'Stimulation', 'Akustik', 'Identität', 'Verbundenheit', 'Originalität', 'Haptik', 'Immersion'] - self.paper_games = ['Immersion', 'Stimulation', 'visuelle Ästhetik', 'Originalität', 'Steuerbarkeit', + # scales from the paper of Winter et al. 2017 for Games + self.paper_games = ['Immersion', 'Stimulation', 'visuelle Ästhetik', 'Originalität', + 'Steuerbarkeit', 'Durchschaubarkeit', 'Intuitive Bedienung', - 'Verbundenheit', 'Übersichtlichkeit', 'Effizienz', 'Wertigkeit', 'Anpassbarkeit', + 'Verbundenheit', 'Übersichtlichkeit', 'Effizienz', 'Wertigkeit', + 'Anpassbarkeit', 'Vertrauen', 'Identität', 'Inhaltsqualität', 'Nützlichkeit'] + + # scales from the Handbook of UEQ+v2 for Games self.handbook_games = ['Immersion', 'Stimulation', 'Originalität', 'Intuitive Bedienung'] - self.latest_paper_games = ['Stimulation', 'Originalität', 'visuelle Ästhetik', 'Steuerbarkeit', + + # scales from the paper of Kollmorgen et al. 
2021 for Games + self.latest_paper_games = ['Stimulation', 'Originalität', 'visuelle Ästhetik', + 'Steuerbarkeit', 'Durchschaubarkeit', 'Intuitive Bedienung', - 'Originalität', 'Effizienz', 'Wertigkeit', 'Vertrauen', 'Anpassbarkeit', + 'Originalität', 'Effizienz', 'Wertigkeit', 'Vertrauen', + 'Anpassbarkeit', 'Inhaltsqualität', 'Nützlichkeit'] - self.new_games = ['Stimulation', 'Immersion', 'Vertrauen', 'Steuerbarkeit', 'Durchschaubarkeit', + + # scales from the paper of the authors for Games + self.new_games = ['Stimulation', 'Immersion', 'Vertrauen', 'Steuerbarkeit', + 'Durchschaubarkeit', 'Attraktivität', 'Intuitive Bedienung', 'Originalität', - 'visuelle Ästhetik', 'Effizienz', 'Übersichtlichkeit', 'Wertigkeit', 'Anpassbarkeit', + 'visuelle Ästhetik', 'Effizienz', 'Übersichtlichkeit', 'Wertigkeit', + 'Anpassbarkeit', 'Inhaltsseriosität', 'Akustik', 'Inhaltsqualität', 'Verbundenheit', 'Haptik', 'Identität', 'Nützlichkeit'] def get_all_rankings(self): - return {'LMS from paper': self.paper_LMS, - 'LMS from handbook': self.handbook_LMS, - 'LMS from latest paper': self.latest_paper_LMS, - 'LMS from our study': self.new_LMS, + """ + gets all rankings + + :return: key-value list with all scales + """ + return {'LMS from paper': self.paper_lms, + 'LMS from handbook': self.handbook_lms, + 'LMS from latest paper': self.latest_paper_lms, + 'LMS from our study': self.new_lms, 'games from paper': self.paper_games, 'games from handbook': self.handbook_games, 'games from latest paper': self.latest_paper_games, 'games from our study': self.new_games} -class Variables: - """ - Gives an explanation of every variable used in the questionnaire. 
- """ - - def __init__(self) -> None: - super().__init__() - self.df = pd.read_csv(os.path.join(ROOT_DIR, '..', 'data', 'variables_ueq.csv'), delimiter=';', quotechar='"', encoding='utf-16') - - def get_groups(self): - return self.df[['VAR', 'LABEL']][7:12] - - class Values: """ Gives an explanation of every given value in the interviews """ def __init__(self) -> None: + """ + init values and read out variables_ueq.csv + """ super().__init__() - self.df = pd.read_csv(os.path.join(ROOT_DIR, '..', 'data', 'variables_ueq.csv'), delimiter=';', quotechar='"', encoding='utf-16') + self.data = pd.read_csv(os.path.join(ROOT_DIR, '..', 'data', 'variables_ueq.csv'), + delimiter=';', quotechar='"', + encoding='utf-16') class ProductCategories: """ Product categories are the core of this evaluation. The goal is to evaluate which items are important for the product categories. - In the created dict in items every product category is connected with the used code and names of their items. + In the created dict in items every product category is connected with the used code and names + of their items. 
""" def __init__(self) -> None: + """ + init product categories and build item codes to filter them in the raw data + """ super().__init__() self.items = { - Categories.Dashboard: { + Categories.DASHBOARD: { 'code': 'EX01' }, - Categories.Games: { + Categories.GAMES: { 'code': 'EX05' }, Categories.LMS: { @@ -175,23 +245,42 @@ class ProductCategories: } # create the tokens to access every item in a product category - for key in self.items.keys(): - self.items[key]['items'] = [self.items[key]['code'] + "_" + "{:02d}".format(i) for i in range(1, 21)] + for key in self.items.keys(): # pylint: disable=consider-iterating-dictionary, consider-using-dict-items + self.items[key]['items'] = [f"{self.items[key]['code']}_{i:02d}" for i in range(1, 21)] + + def get_items(self, product_cat): + """ + get items by a specific product category + + :param product_cat: product category which is used for filtering the items + :return: all items used for this product category + """ + return self.items[product_cat]['items'] - def get_items(self, key): - return self.items[key]['items'] +def save_plot(plot, plot_name): + """ + saves the plot into results folder -def save_plot(plot, path): + :param plot: plot data object + :param plot_name: filename in which this plot is saved + """ dir_path = os.path.abspath(os.path.join(ROOT_DIR, '..', 'results')) if not os.path.exists(dir_path): os.mkdir(dir_path) - saved_path = os.path.join(dir_path, path + '.png') + saved_path = os.path.join(dir_path, plot_name + '.png') print("saved plot to " + saved_path) plot.savefig(fname=saved_path, dpi='figure', format='png') def replace_cryptic_names(columns): + """ + replace the cryptic short keys from the raw data table into a human-readable scale name + + :param columns: columns with cryptics_names + :return: a list with human-readable scales + """ + new_columns = list(columns) for i in range(len(columns)): if "_01" in new_columns[i]: @@ -211,7 +300,8 @@ def replace_cryptic_names(columns): elif "_08" in 
new_columns[i]: new_columns[i] = 'Originalität' # sixth scale in handbook - OR in Paper elif "_09" in new_columns[i]: - new_columns[i] = 'visuelle Ästhetik' # seventh scale in handbook - SC (Schönheit) in Paper + # seventh scale in handbook - SC (Schönheit) in Paper + new_columns[i] = 'visuelle Ästhetik' elif "_10" in new_columns[i]: new_columns[i] = 'Identität' # first left out scale in handbook - ID in Paper elif "_11" in new_columns[i]: @@ -239,6 +329,13 @@ def replace_cryptic_names(columns): def translate_scales(scales, short=False): + """ + translate German scale names into English + + :param scales: German scale names + :param short: whether the short form of each scale is needed + :return: translated scales + """ scales = list(scales) result = scales translations = { @@ -286,72 +383,116 @@ def translate_scales(scales, short=False): 'Akustik': "Aco", 'Inhaltsseriosität': "ToC" } - for i in range(len(scales)): - result[i] = translations[scales[i]] + for i, scala in enumerate(scales): + result[i] = translations[scala] return result -def filter_columns_by_value(df_category, value): - filtered_columns = df_category.columns[value] - df_category_area = df_category[filtered_columns] +def filter_columns_by_value(data, value): + """ + filters a given dataframe by a value + + :param data: dataframe to be filtered + :param value: filter condition or value to be filtered for + :return: filtered dataframe + """ + filtered_columns = data.columns[value] + df_category_area = data[filtered_columns] return df_category_area class Plotter: + """ + Object that bundles all plotting functions + """ - def __init__(self, df, pc) -> None: + def __init__(self, data, cat) -> None: + """ + init Plotter object with used dataframe and product categories + :param data: interview data + :param cat: used product categories + """ super().__init__() - self.df = df - self.pc = pc + self.data = data + self.cat = cat self.plotted = 0 def filter_by_gender(self, gender): - if gender == 'male': - return
self.df[self.df['DE02'] == 1] - elif gender == 'female': - return self.df[self.df['DE02'] == 2] - else: - return self.df + """ + filters the data frame by a specific gender - def plot_groups(self, variable_labels): - df_groups = self.df[['DE01_01', 'DE01_02', 'DE01_03', 'DE01_04', 'DE01_05']] - 1 + :param gender: specific gender to be filtered for + :return: filtered dataframe + """ + if gender == 'male': + return self.data[self.data['DE02'] == 1] + if gender == 'female': + return self.data[self.data['DE02'] == 2] + return self.data + + def plot_groups(self): + """ + plots the data for the Academic Community + + :return: the plot + """ + df_groups = self.data[['DE01_01', 'DE01_02', 'DE01_03', 'DE01_04', 'DE01_05']] - 1 df_groups.columns = ['Stu', 'Teach', 'Res', 'Tech', 'Out'] - sns.set_theme(style="whitegrid") - df_groups_sum = pd.DataFrame(df_groups.sum(), columns=['Sum']).sort_values(by='Sum', ascending=False) - ax = sns.barplot(data=df_groups_sum, palette="Blues_d", errorbar=None, y="Sum", x=df_groups_sum.index) - ax.set(title='Academic Community (Multiple Choice)') - ax.yaxis.set_major_locator(ticker.MultipleLocator(5)) - for tick in ax.xaxis.get_major_ticks(): + df_groups_sum = pd.DataFrame(df_groups.sum(), columns=['Sum']).sort_values(by='Sum', + ascending=False) + + axes = sns.barplot(data=df_groups_sum, palette="Blues_d", errorbar=None, y="Sum", + x=df_groups_sum.index) + axes.set(title='Academic Community (Multiple Choice)') + axes.yaxis.set_major_locator(ticker.MultipleLocator(5)) + + # Increase the font size for the x and y tick labels + for tick in axes.xaxis.get_major_ticks(): tick.label.set_fontsize(15) - for tick in ax.yaxis.get_major_ticks(): + for tick in axes.yaxis.get_major_ticks(): tick.label.set_fontsize(15) + + sns.set_theme(style="whitegrid") sns.despine(left=True) - self.plotted = self.plotted + 1 save_plot(plt, 'demographics_communities') + self.plotted = self.plotted + 1 return plt def plot_gender(self): - df_gender = self.df['DE02'] \ + 
""" + plots the gender data + + :return: the plot + """ + df_gender = self.data['DE02'] \ .replace(1, 'male') \ .replace(2, 'female') \ .replace(3, 'diverse') \ .replace(-1, 'not mentioned') - # print(df_gender.sum()) df_gender_count = pd.DataFrame(df_gender.value_counts(normalize=False)) - sns.set_theme(style="whitegrid") - ax = sns.barplot(data=df_gender_count, palette="Blues_d", errorbar=None, y="count", - x=df_gender_count.index.values) - ax.set(title='Gender') - for tick in ax.xaxis.get_major_ticks(): + + axes = sns.barplot(data=df_gender_count, palette="Blues_d", errorbar=None, y="count", + x=df_gender_count.index.values) + axes.set(title='Gender') + + # Increase the font size for the x and y tick labels + for tick in axes.xaxis.get_major_ticks(): tick.label.set_fontsize(15) - for tick in ax.yaxis.get_major_ticks(): + for tick in axes.yaxis.get_major_ticks(): tick.label.set_fontsize(15) - self.plotted = self.plotted + 1 + + sns.set_theme(style="whitegrid") save_plot(plt, 'demographics_gender') + self.plotted = self.plotted + 1 return plt def plot_faculty(self): - df_faculty = self.df['DE03'] \ + """ + plots the data to which UP faculty every participant belongs to + + :return: the plot + """ + df_faculty = self.data['DE03'] \ .replace(1, 'Jur') \ .replace(2, 'Phil') \ .replace(3, 'HuWi') \ @@ -363,33 +504,54 @@ class Plotter: .replace(9, 'Other') \ .replace(-1, 'kA') \ .replace(-9, 'nB') - ax = sns.countplot(x=df_faculty, palette="Blues_d") - ax.set(title='Fakultät') - self.plotted = self.plotted + 1 + + axes = sns.countplot(x=df_faculty, palette="Blues_d") + axes.set(title='Fakultät') + save_plot(plt, 'demographics_disciplines') + self.plotted = self.plotted + 1 return plt def plot_disciplines(self): - df_faculty = self.df['DE04'] \ + """ + plots to which discipline every participant belongs to based on the DFG disciplines + + :return: the plot + """ + df_faculty = self.data['DE04'] \ .replace(1, 'Geistes- und Sozialwissenschaften') \ .replace(2, 
'Lebenswissenschaften') \ .replace(3, 'Naturwissenschaften') \ .replace(4, 'Ingenieurwissenschaften') \ .replace(-1, 'kA') \ .replace(-9, 'nB') - ax = sns.countplot(x=df_faculty, palette="pastel") - ax.set(title='Disciplines') - self.plotted = self.plotted + 1 + + axes = sns.countplot(x=df_faculty, palette="pastel") + axes.set(title='Disciplines') + save_plot(plt, 'demographics_disciplines') + self.plotted = self.plotted + 1 return plt def plot_item(self, category_title, area, max_count=20, gender='all'): - category_items = self.pc.get_items(category_title) + """ + plot the scales by its importance + + :param category_title: title for the plot + :param area: the area of the scales which needs to be plotted (top, mid, low, all) + :param max_count: Limit the number of columns to plot + :param gender: filter by gender + :return: the plot + """ + + category_items = self.cat.get_items(category_title) + # Filter the DataFrame based on gender, if applicable if gender == 'all': - df_category = self.df[category_items] + df_category = self.data[category_items] else: df_category = self.filter_by_gender(gender)[category_items] + # Filter the DataFrame into top, mid, low area if area == 'top': df_category_area = filter_columns_by_value(df_category, df_category.mean() >= 4) elif area == 'mid': @@ -401,22 +563,36 @@ class Plotter: df_category_area = filter_columns_by_value(df_category, df_category.mean() >= 0) else: df_category_area = df_category + df_category_area.columns = replace_cryptic_names(df_category_area.columns) - df_category_area = df_category_area.reindex(df_category_area.mean().sort_values(ascending=False).index, axis=1) + # Reorder the columns based on the mean of their values, sorted in descending order + df_category_area = df_category_area.reindex( + df_category_area.mean().sort_values(ascending=False).index, axis=1) df_category_area = df_category_area.iloc[:, : max_count] - ax = sns.barplot(data=df_category_area, palette="Blues_d", orient='h', errorbar="sd") - 
ax.set(title=category_title.value + ' - ' + area) + + axes = sns.barplot(data=df_category_area, palette="Blues_d", orient='h', errorbar="sd") + axes.set(title=category_title.value + ' - ' + area) + sns.despine(left=True) - self.plotted = self.plotted + 1 save_plot(plt, 'items_' + category_title.value + '_' + area + '_' + gender) + self.plotted = self.plotted + 1 return plt def plot_boxes(self, category_title, max_count=5): - df_category_area = self.df[self.pc.get_items(category_title)] + """ + creates a boxplot + + :param category_title: title for the plot + :param max_count: Limit the number of columns to plot + :return: the plot + """ + + df_category_area = self.data[self.cat.get_items(category_title)] df_category_area.columns = replace_cryptic_names(df_category_area.columns) # choose 5 most relevant scales (highest mean) - df_category_area = df_category_area.reindex(df_category_area.mean().sort_values(ascending=False).index, axis=1) + df_category_area = df_category_area.reindex( + df_category_area.mean().sort_values(ascending=False).index, axis=1) df_category_area = df_category_area.iloc[:, : max_count] # make a nice plot surrounding @@ -426,71 +602,86 @@ class Plotter: fig.set_figheight(6.0) # plot the plot - boxprops = dict(linestyle='-', facecolor='k', linewidth=3, color='k') - medianprops = dict(linestyle='-', linewidth=3, color='b') - meanprops = dict(color='b') - ax = fig.add_subplot(111) - ax.axhspan(4, 7, facecolor='green', alpha=0.2) - ax.axhspan(1, 4, facecolor='red', alpha=0.2) - ax.grid(axis='y') - ax.boxplot(df_category_area, showmeans=True, showfliers=True, patch_artist=True, - boxprops=boxprops, medianprops=medianprops, meanprops=meanprops) - ax.set_ylim(0.5, 7.5) - - ax.set_xticks([tick + 1 for tick in range(len(df_category_area.columns))], - labels=translate_scales(df_category_area.columns, True), rotation=0) - ax.xaxis.set_label_position('top') - ax.xaxis.tick_top() - self.plotted = self.plotted + 1 + boxprops = {'linestyle': '-', 'facecolor': 
'k', 'linewidth': 3, 'color': 'k'} + medianprops = {'linestyle': '-', 'linewidth': 3, 'color': 'b'} + meanprops = {'color': 'b'} + axes = fig.add_subplot(111) + axes.axhspan(4, 7, facecolor='green', alpha=0.2) + axes.axhspan(1, 4, facecolor='red', alpha=0.2) + axes.grid(axis='y') + axes.boxplot(df_category_area, showmeans=True, showfliers=True, patch_artist=True, + boxprops=boxprops, medianprops=medianprops, meanprops=meanprops) + axes.set_ylim(0.5, 7.5) + + axes.set_xticks([tick + 1 for tick in range(len(df_category_area.columns))], + labels=translate_scales(df_category_area.columns, True), rotation=0) + axes.xaxis.set_label_position('top') + axes.xaxis.tick_top() save_plot(plt, 'box_' + category_title.value) + self.plotted = self.plotted + 1 return plt - def m_scatter(self, x, y, ax=None, m=None, **kw): - import matplotlib.markers as mmarkers - if not ax: ax = plt.gca() - sc = ax.scatter(x, y, **kw) - if (m is not None) and (len(m) == len(x)): - paths = [] - for marker in m: - if isinstance(marker, mmarkers.MarkerStyle): - marker_obj = marker - else: - marker_obj = mmarkers.MarkerStyle(marker) - path = marker_obj.get_path().transformed( - marker_obj.get_transform()) - paths.append(path) - sc.set_paths(paths) - return sc - class Statistics: - def __init__(self, df, pc) -> None: + """ + Object in which all statistical calculations happen + """ + + def __init__(self, data, cat) -> None: + """ + init for statistics + + :param data: interview data + :param cat: product categories + """ super().__init__() - self.df = df - self.pc = pc + self.data = data + self.cat = cat def ranking_of_scales(self, ranked_list): - all_scales = replace_cryptic_names(self.pc.get_items(Categories.VR)) # does not matter which category + """ + This method takes a list of ranked scales as input and + returns a list containing the rank of each scale in the VR category.
+ + :param ranked_list: + :return: ordered vector + """ + all_scales = replace_cryptic_names(self.cat.get_items(Categories.VR)) # not VR specific order_vector = list(all_scales) j = 0 + # For each scale determine its rank in the ranked_list and store it in order_vector for scale in all_scales: rank = float("nan") - for i in range(len(ranked_list)): - if scale == ranked_list[i]: + for i, ranked_element in enumerate(ranked_list): + if scale == ranked_element: rank = i order_vector[j] = rank j += 1 return order_vector def difference_of_distance_of_ranked_scales(self, list_1, list_2): + """ + distance vector of the lists; creates ranking and measures the distance + + :param list_1: + :param list_2: + :return: distance vector between both lists + """ order_vector_1 = self.ranking_of_scales(list_1) order_vector_2 = self.ranking_of_scales(list_2) distance_vector = abs(np.subtract(order_vector_1, order_vector_2)) return distance_vector def similarity_by_ranked_scales(self, all_rankings): + """ + compute the similarities of all scales + + :param all_rankings: ranked scales + :return: comparison matrix + """ result_vector = pd.DataFrame([], - columns=['rate', 'distance', 'similarity', 'comparison'], index=[0]) + columns=['rate', 'distance', 'similarity', 'comparison'], + index=[0]) result_vector.set_index('comparison') for title_first, vector_first in all_rankings.items(): for title_second, vector_second in all_rankings.items(): @@ -513,7 +704,14 @@ class Statistics: return result_vector.sort_values(by=['rate']) def kpod_clustering(self, all_rankings): - col_names = ['source'] + (replace_cryptic_names(self.pc.get_items(Categories.VR))) + """ + cluster rankings based on the kpod algorithm + + :param all_rankings: ranked scales + :return: each scale with a value between 0 and 1 + """ + + col_names = ['source'] + (replace_cryptic_names(self.cat.get_items(Categories.VR))) df_result_vector = pd.DataFrame([], columns=col_names, index=[0]) for title, ranking in 
all_rankings.items(): order_vector = self.ranking_of_scales(ranking) @@ -522,8 +720,10 @@ class Statistics: df_result_vector.set_index('source', drop=True, inplace=True) # cluster data with very sparse data using kPOD by Chi JT, Chi EC, Baraniuk RG (2016). - # “-POD: A Method for -Means Clustering of Missing Data.†The American Statistician, 70, 91–99. - # doi: 10.1080/00031305.2015.1086685, http://www.tandfonline.com/doi/abs/10.1080/00031305.2015.1086685. + # "k-POD: A Method for k-Means Clustering of Missing Data." + # The American Statistician, 70, 91-99. + # doi: 10.1080/00031305.2015.1086685, + # http://www.tandfonline.com/doi/abs/10.1080/00031305.2015.1086685. # # 50 suggested clusters are gathered and the mean gets displayed @@ -531,58 +731,66 @@ class Statistics: if result_assignment[0] == 0: result_assignment = abs(np.subtract(result_assignment, 1)) iteration_count = 5 - for i in range(iteration_count): + for _ in range(iteration_count): iterate_kpod = k_pod(df_result_vector[1:], 2)[0] if iterate_kpod[0] == 0: iterate_kpod = abs(np.subtract(iterate_kpod, 1)) result_assignment = result_assignment + iterate_kpod result_assignment = result_assignment / (iteration_count + 1) - df_result = pd.DataFrame(np.transpose([list(df_result_vector.index[1:]), list(result_assignment)]), - columns=['source', 'group']) + df_result = pd.DataFrame( + np.transpose([list(df_result_vector.index[1:]), list(result_assignment)]), + columns=['source', 'group']) df_result.set_index('source', inplace=True) return df_result def calc_corr(self): + """ + calculates correlation between different product categories + + :return: correlation matrix + """ df_means_dashboard = pd.DataFrame( - {"dashboard": self.df[self.pc.get_items(Categories.Dashboard)].mean()}) - df_means_lms = pd.DataFrame({"lms": self.df[self.pc.get_items(Categories.LMS)].mean()}) - df_means_games = pd.DataFrame({"games": self.df[self.pc.get_items(Categories.Games)].mean()}) - df_means_vr = pd.DataFrame({"vr":
self.df[self.pc.get_items(Categories.VR)].mean()}) + {"dashboard": self.data[self.cat.get_items(Categories.DASHBOARD)].mean()}) + df_means_lms = pd.DataFrame({"lms": self.data[self.cat.get_items(Categories.LMS)].mean()}) + df_means_games = pd.DataFrame( + {"games": self.data[self.cat.get_items(Categories.GAMES)].mean()}) + df_means_vr = pd.DataFrame({"vr": self.data[self.cat.get_items(Categories.VR)].mean()}) df_means_dashboard.index = replace_cryptic_names(df_means_dashboard.index) df_means_lms.index = replace_cryptic_names(df_means_lms.index) df_means_games.index = replace_cryptic_names(df_means_games.index) df_means_vr.index = replace_cryptic_names(df_means_vr.index) - df_all_means = pd.concat([df_means_lms, df_means_dashboard, df_means_games, df_means_vr], axis=1) + df_all_means = pd.concat([df_means_lms, df_means_dashboard, df_means_games, df_means_vr], + axis=1) result = df_all_means.corr() pval = df_all_means.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*result.shape) - p = pval.applymap(lambda x: ''.join(['*' for t in [.05, .01, .001] if x <= t])) - return result.round(2).astype(str) + p - - def welch(self): - - return + pval_with_corr_stars = pval.applymap( + lambda x: ''.join(['*' for t in [.05, .01, .001] if x <= t])) + return result.round(2).astype(str) + pval_with_corr_stars def mean_time(self): - return self.df["TIME_SUM"].mean() - + """ + determines the mean time spent with the questionnaire -def setup_dataframes(): - return Interviews(), Variables(), Values(), ProductCategories(), Rankings() + :return: mean time + """ + return self.data["TIME_SUM"].mean() if __name__ == '__main__': print('---------- Creating plots --------------') - interview, variables, values, product_categories, rankings = setup_dataframes() + interview, values, product_categories, rankings = Interviews(), Values(), \ + ProductCategories(), Rankings() print(interview) df_filtered = interview.get_filtered(product_categories) statistics = Statistics(df_filtered, 
product_categories) plotter = Plotter(df_filtered, product_categories) - print('average time to complete the questionnaire: ' + time.strftime('%M:%S', time.gmtime(statistics.mean_time()))) + print('average time to complete the questionnaire: ' + time.strftime('%M:%S', time.gmtime( + statistics.mean_time()))) - plotter.plot_groups(variables.get_groups()) + plotter.plot_groups() plotter.plot_gender() plotter.plot_disciplines()