Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
R
RSE 23 Group Assignment Shervud Pitawanik Franz
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
This is an archived project. Repository and other project resources are read-only.
Show more breadcrumbs
Niklas Franz
RSE 23 Group Assignment Shervud Pitawanik Franz
Commits
030b099c
Commit
030b099c
authored
1 year ago
by
Sortofamudkip
Browse files
Options
Downloads
Plain Diff
Merge branch '18-write-tests'
parents
d632d964
cda0fa31
No related branches found
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
src/Dataset.py
+72
-1
72 additions, 1 deletion
src/Dataset.py
src/Plotter.py
+129
-1
129 additions, 1 deletion
src/Plotter.py
src/test_dataset.py
+124
-0
124 additions, 0 deletions
src/test_dataset.py
src/test_plotter.py
+15
-0
15 additions, 0 deletions
src/test_plotter.py
with
340 additions
and
2 deletions
src/Dataset.py
+
72
−
1
View file @
030b099c
import
numpy
as
np
import
pandas
as
pd
import
logging
class
Dataset
:
...
...
@@ -11,8 +12,23 @@ class Dataset:
dataset_filename (str): the path of the dataset,
relative to the location of the file calling this function.
"""
raw_dataframe
=
pd
.
read_csv
(
dataset_filename
,
encoding
=
"
windows-1254
"
)
if
type
(
dataset_filename
)
!=
str
:
logging
.
error
(
"
parameter `dataset_filename` is not a string
"
)
raise
ValueError
(
f
"
{
dataset_filename
}
is not a string
"
)
if
not
dataset_filename
.
endswith
(
"
.csv
"
):
logging
.
error
(
"
dataset filename should be CSV
"
)
raise
OSError
(
f
"
{
dataset_filename
}
is not a CSV file.
"
)
try
:
raw_dataframe
=
pd
.
read_csv
(
dataset_filename
,
encoding
=
"
windows-1254
"
)
logging
.
info
(
"
Dataframe successfully loaded
"
)
except
FileNotFoundError
as
e
:
logging
.
error
(
"
CSV file not found
"
)
raise
e
self
.
dataframe
=
self
.
preprocess_dataset
(
raw_dataframe
)
logging
.
info
(
"
Dataset class successfully initialised
"
)
def
preprocess_dataset
(
self
,
raw_dataframe
:
pd
.
DataFrame
)
->
pd
.
DataFrame
:
"""
preprocess dataframe immediately after loading it.
...
...
@@ -25,6 +41,12 @@ class Dataset:
Returns:
pd.DataFrame: resulting preprocessed dataframe.
"""
if
type
(
raw_dataframe
)
!=
pd
.
DataFrame
:
logging
.
error
(
"
parameter `raw_dataframe` is not a pandas DataFrame
"
)
raise
ValueError
(
f
"
{
raw_dataframe
}
is not a pandas DataFrame
"
)
dataframe
=
self
.
_drop_unnecessary_columns
(
raw_dataframe
)
# for conveneince
...
...
@@ -39,6 +61,10 @@ class Dataset:
return
dataframe
def
get_is_competitive_col
(
self
,
dataframe
:
pd
.
DataFrame
):
if
type
(
dataframe
)
!=
pd
.
DataFrame
:
logging
.
error
(
"
parameter `dataframe` is not a pandas DataFrame
"
)
raise
ValueError
(
f
"
{
dataframe
}
is not a pandas DataFrame
"
)
is_competitive_col
=
np
.
zeros
(
shape
=
len
(
dataframe
))
is_competitive_col
[
(
dataframe
[
"
whyplay
"
]
==
"
improving
"
)
...
...
@@ -64,6 +90,10 @@ class Dataset:
Returns:
pd.DataFrame: the dataframe.
"""
if
type
(
dataframe
)
!=
pd
.
DataFrame
:
logging
.
error
(
"
parameter `dataframe` is not a pandas DataFrame
"
)
raise
ValueError
(
f
"
{
dataframe
}
is not a pandas DataFrame
"
)
rows_to_drop
=
(
[
"
League
"
,
...
...
@@ -92,6 +122,10 @@ class Dataset:
Returns:
pd.DataFrame: the dataframe.
"""
if
type
(
dataframe
)
!=
pd
.
DataFrame
:
logging
.
error
(
"
parameter `dataframe` is not a pandas DataFrame
"
)
raise
ValueError
(
f
"
{
dataframe
}
is not a pandas DataFrame
"
)
# drop rows where users did not accept to having their data used
dataframe
=
dataframe
.
drop
(
dataframe
[
dataframe
[
"
accept
"
]
!=
"
Accept
"
].
index
,
...
...
@@ -108,6 +142,10 @@ class Dataset:
Returns:
pd.Series: the Is_competitive column.
"""
if
type
(
dataframe
)
!=
pd
.
DataFrame
:
logging
.
error
(
"
parameter `dataframe` is not a pandas DataFrame
"
)
raise
ValueError
(
f
"
{
dataframe
}
is not a pandas DataFrame
"
)
dataframe
[
"
whyplay
"
]
=
dataframe
[
"
whyplay
"
].
str
.
lower
()
most_common_whyplay_reasons
=
list
(
dataframe
.
groupby
(
"
whyplay
"
)
...
...
@@ -138,6 +176,10 @@ class Dataset:
Returns:
pd.Series: the anxiety score column.
"""
if
type
(
dataframe
)
!=
pd
.
DataFrame
:
logging
.
error
(
"
parameter `dataframe` is not a pandas DataFrame
"
)
raise
ValueError
(
f
"
{
dataframe
}
is not a pandas DataFrame
"
)
gad_max
=
21
gad_min
=
0
gad_normalised
=
(
dataframe
[
"
GAD_T
"
]
-
gad_min
)
/
gad_max
...
...
@@ -161,6 +203,10 @@ class Dataset:
Returns:
pd.Series: the boolean narcissist column.
"""
if
type
(
dataframe
)
!=
pd
.
DataFrame
:
logging
.
error
(
"
parameter `dataframe` is not a pandas DataFrame
"
)
raise
ValueError
(
f
"
{
dataframe
}
is not a pandas DataFrame
"
)
return
np
.
where
(
dataframe
[
"
Narcissism
"
]
<=
1.0
,
True
,
False
)
def
get_dataframe
(
self
)
->
pd
.
DataFrame
:
...
...
@@ -183,6 +229,16 @@ class Dataset:
Returns:
pd.Series: The sorted column.
"""
if
type
(
colname
)
!=
str
:
logging
.
error
(
"
parameter `colname` is not a string
"
)
raise
ValueError
(
f
"
{
colname
}
is not a string
"
)
if
colname
not
in
self
.
dataframe
.
columns
:
logging
.
error
(
"
column requested not in dataframe
"
)
raise
KeyError
(
f
"
{
colname
}
is not a column in dataframe
"
)
if
not
(
ascending
is
None
or
type
(
ascending
)
is
bool
):
logging
.
error
(
"
parameter `ascending` is not a bool or None
"
)
raise
ValueError
(
f
"
{
ascending
}
is not a bool or None
"
)
return
self
.
dataframe
[
colname
].
sort_values
(
ascending
=
ascending
)
def
get_unique_column_values
(
self
,
colname
:
str
):
...
...
@@ -195,6 +251,10 @@ class Dataset:
string array: an array of strings containing the unique values
present in the column
"""
if
type
(
colname
)
!=
str
:
logging
.
error
(
"
parameter `colname` is not a string
"
)
raise
ValueError
(
f
"
{
colname
}
is not a string
"
)
return
self
.
dataframe
[
colname
].
explode
().
unique
()
def
get_category_counts
(
...
...
@@ -210,6 +270,17 @@ class Dataset:
Returns:
pd.Series: the counted categories.
"""
if
type
(
colname
)
!=
str
:
logging
.
error
(
"
parameter `colname` is not a string
"
)
raise
ValueError
(
f
"
{
colname
}
is not a string
"
)
if
colname
not
in
self
.
dataframe
.
columns
:
logging
.
error
(
"
column requested not in dataframe
"
)
raise
KeyError
(
f
"
{
colname
}
is not a column in dataframe
"
)
if
not
(
ascending
is
None
or
type
(
ascending
)
is
bool
):
logging
.
error
(
"
parameter `ascending` is not a bool or None
"
)
raise
ValueError
(
f
"
{
ascending
}
is not a bool or None
"
)
grouped_size
=
self
.
dataframe
.
groupby
(
colname
).
size
()
return
(
grouped_size
...
...
This diff is collapsed.
Click to expand it.
src/Plotter.py
+
129
−
1
View file @
030b099c
...
...
@@ -3,10 +3,15 @@ import matplotlib.pyplot as plt
import
pandas
as
pd
import
seaborn
as
sns
from
.Dataset
import
Dataset
import
logging
class
Plotter
:
def
__init__
(
self
,
dataset
:
Dataset
):
if
type
(
dataset
)
!=
Dataset
:
logging
.
error
(
"
dataset parameter is not of type Dataset
"
)
raise
ValueError
(
f
"
{
dataset
}
is not of type Dataset
"
)
self
.
ds
=
dataset
self
.
df
=
dataset
.
get_dataframe
()
...
...
@@ -25,6 +30,46 @@ class Plotter:
if
styling_params
.
get
(
"
title
"
):
ax
.
set_title
(
styling_params
[
"
title
"
])
def
distribution_plot
(
self
,
target
,
styling_params
=
{})
->
None
:
"""
plot a distribution plot.
Args:
target (str, must be present as a column in the dataset),
styling_params (dict)
Returns:
None
"""
# implementing sensible logging and error catching
if
type
(
target
)
!=
str
:
logging
.
error
(
"
parameter target should be a string.
"
)
raise
ValueError
(
"
parameter target should be a string.
"
)
if
not
(
target
in
self
.
df
.
columns
):
logging
.
error
(
"
parameter target cannot be found in the dataset.
"
)
raise
ValueError
(
"
parameter target cannot be found in the dataset.
"
)
if
type
(
styling_params
)
!=
dict
:
logging
.
error
(
"
parameter styling params should be a dict.
"
)
raise
ValueError
(
"
parameter styling params should be a dict.
"
)
# plotting the plot
grouped_data
=
self
.
df
.
groupby
(
target
).
size
()
plt
.
barh
(
grouped_data
.
index
,
grouped_data
.
values
)
print
(
str
(
grouped_data
),
str
(
grouped_data
.
index
),
str
(
grouped_data
.
values
),
)
plt
.
xlabel
(
"
Size
"
)
plt
.
ylabel
(
target
)
plt
.
title
(
f
"
Distribution of
{
target
}
"
)
def
plot_categorical_bar_chart
(
self
,
category1
,
category2
,
styling_params
=
{}
)
->
None
:
...
...
@@ -39,6 +84,36 @@ class Plotter:
Returns:
None
"""
# implementing sensible logging and error catching
if
type
(
category1
)
!=
str
:
logging
.
error
(
"
parameter category1 should be a string.
"
)
raise
ValueError
(
"
parameter category1 should be a string.
"
)
if
not
(
category1
in
self
.
df
.
columns
):
logging
.
error
(
"
parameter category1 cannot be found in the dataset.
"
)
raise
ValueError
(
"
parameter category1 cannot be found in the dataset.
"
)
if
type
(
category2
)
!=
str
:
logging
.
error
(
"
parameter category2 should be a string.
"
)
raise
ValueError
(
"
parameter category2 should be a string.
"
)
if
not
(
category2
in
self
.
df
.
columns
):
logging
.
error
(
"
parameter category2 cannot be found in the dataset.
"
)
raise
ValueError
(
"
parameter category2 cannot be found in the dataset.
"
)
if
type
(
styling_params
)
!=
dict
:
logging
.
error
(
"
parameter styling params should be a dict.
"
)
raise
ValueError
(
"
parameter styling params should be a dict.
"
)
# plotting the plot
ct
=
pd
.
crosstab
(
self
.
df
[
category1
],
self
.
df
[
category2
])
# Calculate percentages by row
ct_percent
=
ct
.
apply
(
lambda
r
:
r
/
r
.
sum
()
*
100
,
axis
=
0
)
...
...
@@ -79,6 +154,33 @@ class Plotter:
Returns:
None
"""
# implementing sensible logging and error catching
if
type
(
target
)
!=
str
:
logging
.
error
(
"
parameter target should be a string.
"
)
raise
ValueError
(
"
parameter target should be a string.
"
)
if
not
(
target
in
self
.
df
.
columns
):
logging
.
error
(
"
parameter target cannot be found in the dataset.
"
)
raise
ValueError
(
"
parameter target cannot be found in the dataset.
"
)
if
type
(
category
)
!=
str
:
logging
.
error
(
"
parameter category should be a string.
"
)
raise
ValueError
(
"
parameter category should be a string.
"
)
if
not
(
category
in
self
.
df
.
columns
):
logging
.
error
(
"
parameter category cannot be found in the dataset.
"
)
raise
ValueError
(
"
parameter category cannot be found in the dataset.
"
)
if
type
(
styling_params
)
!=
dict
:
logging
.
error
(
"
parameter styling params should be a dict.
"
)
raise
ValueError
(
"
parameter styling params should be a dict.
"
)
# plotting the plot
uniques
=
self
.
ds
.
get_unique_column_values
(
category
)
fig
,
ax
=
plt
.
subplots
()
self
.
customize_plot
(
fig
,
ax
,
styling_params
)
...
...
@@ -104,6 +206,33 @@ class Plotter:
Returns:
None
"""
# implementing sensible logging and error catching
if
type
(
target1
)
!=
str
:
logging
.
error
(
"
parameter target1 should be a string.
"
)
raise
ValueError
(
"
parameter target1 should be a string.
"
)
if
not
(
target1
in
self
.
df
.
columns
):
logging
.
error
(
"
parameter target1 cannot be found in the dataset.
"
)
raise
ValueError
(
"
parameter target1 cannot be found in the dataset.
"
)
if
type
(
target2
)
!=
str
:
logging
.
error
(
"
parameter target2 should be a string.
"
)
raise
ValueError
(
"
parameter target2 should be a string.
"
)
if
not
(
target2
in
self
.
df
.
columns
):
logging
.
error
(
"
parameter target2 cannot be found in the dataset.
"
)
raise
ValueError
(
"
parameter target2 cannot be found in the dataset.
"
)
if
type
(
styling_params
)
!=
dict
:
logging
.
error
(
"
parameter styling params should be a dict.
"
)
raise
ValueError
(
"
parameter styling params should be a dict.
"
)
# plotting the plot
fig
,
ax
=
plt
.
subplots
()
self
.
customize_plot
(
fig
,
ax
,
styling_params
)
ax
.
scatter
(
self
.
df
[
target1
],
self
.
df
[
target2
])
...
...
@@ -125,4 +254,3 @@ class Plotter:
plt
.
xlabel
(
"
Size
"
)
plt
.
ylabel
(
target
)
plt
.
title
(
f
"
Distribution of
{
target
}
"
)
This diff is collapsed.
Click to expand it.
src/test_dataset.py
0 → 100644
+
124
−
0
View file @
030b099c
"""
This test file tests the Dataset class in Dataset.py.
"""
from
Dataset
import
Dataset
import
pandas
as
pd
from
pathlib
import
Path
import
pytest
import
numpy
as
np
this_file_dir
=
Path
(
__file__
).
parent
@pytest.fixture
def
the_dataset
()
->
Dataset
:
dataset
=
Dataset
(
str
(
this_file_dir
/
"
../data/GamingStudy_data.csv
"
))
return
dataset
def
test_load_Dataset_class
():
"""
Tests if the dataset is successfully loaded.
"""
dataset
=
Dataset
(
str
(
this_file_dir
/
"
../data/GamingStudy_data.csv
"
))
assert
type
(
dataset
)
==
Dataset
assert
type
(
dataset
.
dataframe
)
==
pd
.
DataFrame
def
test_incorrectly_load_Dataset_class
():
with
pytest
.
raises
(
ValueError
):
dataset
=
Dataset
(
1234
)
# not a string
with
pytest
.
raises
(
OSError
):
dataset
=
Dataset
(
"
aaa.bcd
"
)
# doesn't end with .csv
with
pytest
.
raises
(
FileNotFoundError
):
dataset
=
Dataset
(
str
(
this_file_dir
/
"
./data/GamingStudy_data.csv
"
)
)
# wrong file location
def
test_get_dataframe
(
the_dataset
:
Dataset
):
"""
Tests Dataset.get_dataframe().
"""
assert
type
(
the_dataset
.
get_dataframe
())
==
pd
.
DataFrame
def
test_combined_anxiety_score
(
the_dataset
:
Dataset
):
"""
Tests Dataset.get_combined_anxiety_score().
"""
dataframe
=
the_dataset
.
get_dataframe
()
anxiety_scores
=
the_dataset
.
get_combined_anxiety_score
(
dataframe
)
assert
anxiety_scores
.
dtype
==
float
assert
anxiety_scores
.
min
()
>=
0
assert
anxiety_scores
.
max
()
<=
1
def
test_get_is_narcissist_col
(
the_dataset
:
Dataset
):
"""
Tests Dataset.get_is_narcissist_col().
"""
dataframe
=
the_dataset
.
get_dataframe
()
is_narcissist_row
=
the_dataset
.
get_is_narcissist_col
(
dataframe
)
assert
is_narcissist_row
.
dtype
==
bool
def
test_preprocessed_dataframe
(
the_dataset
:
Dataset
):
"""
Tests that the dataframe is preprocessed correctly.
"""
dataframe
=
the_dataset
.
get_dataframe
()
columns_set
=
set
(
dataframe
.
columns
)
assert
"
League
"
not
in
columns_set
assert
"
Anxiety_score
"
in
columns_set
assert
"
Is_narcissist
"
in
columns_set
def
test_get_sorted_columns
(
the_dataset
:
Dataset
):
"""
Tests Dataset.get_sorted_column().
"""
sorted_GAD1
=
the_dataset
.
get_sorted_column
(
"
GAD_T
"
)
assert
sorted_GAD1
.
iloc
[
0
]
<=
sorted_GAD1
.
iloc
[
-
1
]
@pytest.mark.parametrize
(
"
param
"
,
[
np
.
array
([
1
,
2
,
3
]),
"
123
"
,
3
,
0.1
,
[],
np
,
pd
,
True
,
None
],
)
def
test_catch_non_dataframe
(
the_dataset
:
Dataset
,
param
):
"""
Tests that functions that take pd.DataFrame correctly
catch incorrect input data types.
"""
with
pytest
.
raises
(
ValueError
):
the_dataset
.
preprocess_dataset
(
param
)
the_dataset
.
get_is_competitive_col
(
param
)
the_dataset
.
_drop_unnecessary_columns
(
param
)
the_dataset
.
remove_nonaccepting_rows
(
param
)
the_dataset
.
preprocess_whyplay
(
param
)
the_dataset
.
get_combined_anxiety_score
(
param
)
the_dataset
.
get_is_narcissist_col
(
param
)
@pytest.mark.parametrize
(
"
param
"
,
[
"
true
"
,
"
false
"
,
"
True
"
,
"
False
"
,
1
,
0
,
-
1
],
)
def
test_catch_non_bool
(
the_dataset
:
Dataset
,
param
):
"""
Tests that functions that take bool or None correctly
catch incorrect input data types.
"""
dataframe
=
the_dataset
.
get_dataframe
()
columns_set
=
set
(
dataframe
.
columns
)
with
pytest
.
raises
(
ValueError
):
the_dataset
.
get_category_counts
(
"
GAD_T
"
,
param
)
the_dataset
.
get_sorted_column
(
"
GAD_T
"
,
param
)
@pytest.mark.parametrize
(
"
param
"
,
[
True
,
False
,
None
],
)
def
test_bool_or_none_params
(
the_dataset
:
Dataset
,
param
):
"""
Tests that functions that take bool or None correctly
work as intended.
"""
dataframe
=
the_dataset
.
get_dataframe
()
columns_set
=
set
(
dataframe
.
columns
)
the_dataset
.
get_category_counts
(
"
GAD_T
"
,
param
)
def
test_catch_colname_not_in_df
(
the_dataset
:
Dataset
):
"""
Tests that functions that take colname correctly
catch colnames not in dataset.
"""
with
pytest
.
raises
(
KeyError
):
the_dataset
.
get_category_counts
(
"
GAAAD_T
"
)
the_dataset
.
get_sorted_column
(
"
GAAAD_T
"
)
This diff is collapsed.
Click to expand it.
src/test_plotter.py
0 → 100644
+
15
−
0
View file @
030b099c
from
pathlib
import
Path
from
Dataset
import
Dataset
from
Plotter
import
Plotter
import
pandas
as
pd
this_file_dir
=
Path
(
__file__
).
parent
def
test_load_plotter
():
"""
Tests that the Plotter class can be loaded.
"""
dataset
=
Dataset
(
str
(
this_file_dir
/
"
../data/GamingStudy_data.csv
"
))
plotter
=
Plotter
(
dataset
)
assert
type
(
plotter
.
df
)
==
pd
.
DataFrame
assert
type
(
plotter
.
ds
)
==
Dataset
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment