Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
SMART
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
UNI-KLU
SMART
Commits
b5ab2b36
Commit
b5ab2b36
authored
Jul 29, 2021
by
Alexander Lercher
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Regression instead of classification training
parent
a31e702c
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
57 additions
and
42 deletions
+57
-42
cluster_metrics_calc.py
...oservice/app/processing/data_prep/cluster_metrics_calc.py
+1
-1
layer_metrics_calc.py
...croservice/app/processing/data_prep/layer_metrics_calc.py
+3
-2
train_base.py
...ty-detection-microservice/app/processing/ml/train_base.py
+21
-9
train_cross_context.py
...ion-microservice/app/processing/ml/train_cross_context.py
+16
-15
train_single_context.py
...on-microservice/app/processing/ml/train_single_context.py
+16
-15
No files found.
src/data-hub/proactive-community-detection-microservice/app/processing/data_prep/cluster_metrics_calc.py
View file @
b5ab2b36
...
...
@@ -94,7 +94,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) ->
tuples
.
append
(
cur_metrics
)
if
len
(
tuples
)
==
N
:
label
=
get_evolution_label
(
cur_cluster
.
size
,
data
[
i
+
1
]
.
size
)
label
=
data
[
i
+
1
]
.
size
#
get_evolution_label(cur_cluster.size, data[i+1].size)
yield
list
(
tuples
)
+
[
label
]
############################
def
flatten_metrics_datapoint
(
datapoint
:
list
)
->
Tuple
[
'X'
,
np
.
array
]:
...
...
src/data-hub/proactive-community-detection-microservice/app/processing/data_prep/layer_metrics_calc.py
View file @
b5ab2b36
...
...
@@ -98,7 +98,7 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
# go through all time windows once...
prev_time_key
=
ordered_time_keys
[
0
]
for
current_time_key
in
ordered_time_keys
[
1
:]
:
for
idx
,
current_time_key
in
enumerate
(
ordered_time_keys
[
1
:
-
1
])
:
# ...and load the current and previous layer metrics in the reference_layer
current_layer_metric
=
layer_metrics
[
current_time_key
]
prev_layer_metric
=
layer_metrics
[
prev_time_key
]
...
...
@@ -110,7 +110,8 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
for
cluster_id
in
cluster_ids
:
current_cluster_metric
=
cluster_metrics
[(
current_time_key
,
cluster_id
)]
prev_cluster_metric
=
cluster_metrics
[(
prev_time_key
,
cluster_id
)]
evolution_label
=
get_evolution_label
(
prev_cluster_metric
.
size
,
current_cluster_metric
.
size
)
next_time_key
=
ordered_time_keys
[
idx
+
2
]
evolution_label
=
cluster_metrics
[(
next_time_key
,
cluster_id
)]
.
size
# get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size)
# yield each combination of reference layer metrics to clusters
yield
[
prev_layer_metric_tuple
,
current_layer_metric_tuple
,
int
(
cluster_id
),
evolution_label
]
...
...
src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_base.py
View file @
b5ab2b36
...
...
@@ -29,15 +29,27 @@ def remove_empty_community_class(df):
########################
import
sklearn.metrics
def
print_
report
(
clfs
:
list
,
test_Xs
:
list
,
test_Y
:
'y'
,
titles
:
list
):
def
print_
classification_report
(
clf
,
test_X
,
test_Y
,
title
):
"""
Prints a
ll reports
.
:param clf
s: list of classifiers
to evaluate
:param test_X
s: list of test_X for the corresponding classifier at idx
:param test_Y: true classes
:param title
s: list of titles for the classifiers at idx
Prints a
classification report
.
:param clf
: classifier
to evaluate
:param test_X
: input features X
:param test_Y: true classes
Y
:param title
: title for the report
"""
for
clf
,
test_X
,
title
in
zip
(
clfs
,
test_Xs
,
titles
):
pred_Y
=
clf
.
predict
(
test_X
)
print
(
f
"### {title} ###
\n
"
,
sklearn
.
metrics
.
classification_report
(
y_true
=
test_Y
,
y_pred
=
pred_Y
))
def print_regression_report(clf, test_X, test_Y, title):
    """
    Print a short regression summary for a model on test data.

    The predictions of ``clf`` for ``test_X`` are scored against ``test_Y``
    and written to stdout as one message: the title line, followed by the
    R2-score, the mean squared error (MSE) and a "sanity" value, which is
    the MSE obtained when 0 is predicted for every sample.

    :param clf: regressor to evaluate (its ``predict`` method is called)
    :param test_X: input features X
    :param test_Y: true prediction values
    :param title: title for the report
    """
    pred_Y = clf.predict(test_X)

    # Adjacent f-strings are joined implicitly inside the parentheses.
    # The all-zero prediction in the last part gives a baseline MSE to
    # compare the model's MSE against.
    report = (
        f"### {title} ###\nR2-score={sklearn.metrics.r2_score(y_true=test_Y, y_pred=pred_Y)}, "
        f"MSE={sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=pred_Y)}, "
        f"sanity={sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=[0]*len(pred_Y))}"
    )
    print(report)
########################
\ No newline at end of file
src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_cross_context.py
View file @
b5ab2b36
import
pandas
as
pd
from
pandas
import
DataFrame
from
processing.ml.train_base
import
split_data
,
remove_empty_community_class
,
print_report
from
processing.ml.train_base
import
split_data
,
remove_empty_community_class
,
print_re
gression_re
port
approach
=
'cross_context'
max_sampling_size
=
20000
#######################
import
pickle
from
pathlib
import
Path
...
...
@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, reference_layer_name, scaler=False
with
open
(
f
'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model'
,
'wb'
)
as
f
:
pickle
.
dump
(
model
,
f
)
###################
from
sklearn.ensemble
import
RandomForest
Classifie
r
from
sklearn.ensemble
import
RandomForest
Regresso
r
n_estimators
=
50
criterion
=
'
gini
'
criterion
=
'
mse
'
max_depth
=
None
min_samples_leaf
=
2
min_impurity_decrease
=
1E-5
bootstrap
=
True
####################
from
sklearn.svm
import
LinearSVR
tol
=
1E-4
c
=
1
loss
=
'squared_epsilon_insensitive'
dual
=
False
###############
...
...
@@ -40,8 +47,7 @@ def run_training(use_case):
#######################
training
,
testing
=
split_data
(
df
,
shuffle
=
False
)
#####################
training
=
remove_empty_community_class
(
training
)
testing
=
remove_empty_community_class
(
testing
)
training
.
sample
(
frac
=
min
(
1
,
max_sampling_size
/
len
(
training
)))
.
reset_index
(
drop
=
True
)
#####################
from
sklearn.preprocessing
import
StandardScaler
scaler
=
StandardScaler
()
...
...
@@ -54,20 +60,15 @@ def run_training(use_case):
export_model
(
scaler
,
use_case
,
layer_name
,
reference_layer_name
,
scaler
=
True
)
########################
from
processing
import
DataSampler
sampler
=
DataSampler
()
try
:
train_X
,
train_Y
=
sampler
.
sample_median_size
(
train_X
,
train_Y
,
max_size
=
100000
)
except
ValueError
as
e
:
# not enough points for oversampling
print
(
f
"Could not sample training data, using original distribution: {e}"
)
####################
rfc
=
RandomForestClassifier
(
n_estimators
=
n_estimators
,
criterion
=
criterion
,
max_depth
=
max_depth
,
# RF is a lot better than SVM, but I did not tune hyperparameters for regression
rfc
=
RandomForestRegressor
(
n_estimators
=
n_estimators
,
criterion
=
criterion
,
max_depth
=
max_depth
,
min_samples_leaf
=
min_samples_leaf
,
min_impurity_decrease
=
min_impurity_decrease
,
bootstrap
=
bootstrap
)
# rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
rfc
.
fit
(
train_X
,
train_Y
)
print_re
port
([
rfc
],
[
test_X
],
test_Y
,
[
"X"
]
)
print_re
gression_report
(
rfc
,
test_X
,
test_Y
,
f
"{layer_name} based on {reference_layer_name}"
)
export_model
(
rfc
,
use_case
,
layer_name
,
reference_layer_name
)
\ No newline at end of file
src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_single_context.py
View file @
b5ab2b36
import
pandas
as
pd
from
pandas
import
DataFrame
from
processing.ml.train_base
import
split_data
,
remove_empty_community_class
,
print_report
from
processing.ml.train_base
import
split_data
,
remove_empty_community_class
,
print_re
gression_re
port
approach
=
'single_context'
max_sampling_size
=
20000
#######################
import
pickle
from
pathlib
import
Path
...
...
@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, scaler=False):
with
open
(
f
'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model'
,
'wb'
)
as
f
:
pickle
.
dump
(
model
,
f
)
#####################
from
sklearn.ensemble
import
RandomForest
Classifie
r
from
sklearn.ensemble
import
RandomForest
Regresso
r
n_estimators
=
100
criterion
=
'
gini
'
criterion
=
'
mse
'
max_depth
=
None
min_samples_leaf
=
2
min_impurity_decrease
=
1E-5
bootstrap
=
True
####################
from
sklearn.svm
import
LinearSVR
tol
=
1E-4
c
=
1
loss
=
'squared_epsilon_insensitive'
dual
=
False
###############
...
...
@@ -39,8 +46,7 @@ def run_training(use_case):
#######################
training
,
testing
=
split_data
(
df
,
shuffle
=
False
)
#####################
training
=
remove_empty_community_class
(
training
)
testing
=
remove_empty_community_class
(
testing
)
training
.
sample
(
frac
=
min
(
1
,
max_sampling_size
/
len
(
training
)))
.
reset_index
(
drop
=
True
)
#####################
from
sklearn.preprocessing
import
StandardScaler
scaler
=
StandardScaler
()
...
...
@@ -53,20 +59,15 @@ def run_training(use_case):
export_model
(
scaler
,
use_case
,
layer_name
,
scaler
=
True
)
########################
from
processing
import
DataSampler
sampler
=
DataSampler
()
try
:
train_X
,
train_Y
=
sampler
.
sample_median_size
(
train_X
,
train_Y
,
max_size
=
100000
)
except
ValueError
as
e
:
# not enough points for oversampling
print
(
f
"Could not sample training data, using original distribution: {e}"
)
####################
rfc
=
RandomForestClassifier
(
n_estimators
=
n_estimators
,
criterion
=
criterion
,
max_depth
=
max_depth
,
# RF is 10-20% better compared to SVM, but I did not tune hyperparameters for regression
rfc
=
RandomForestRegressor
(
n_estimators
=
n_estimators
,
criterion
=
criterion
,
max_depth
=
max_depth
,
min_samples_leaf
=
min_samples_leaf
,
min_impurity_decrease
=
min_impurity_decrease
,
bootstrap
=
bootstrap
)
# rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
rfc
.
fit
(
train_X
,
train_Y
)
####################
print_re
port
([
rfc
],
[
test_X
],
test_Y
,
[
"X"
]
)
print_re
gression_report
(
rfc
,
test_X
,
test_Y
,
layer_name
)
####################
export_model
(
rfc
,
use_case
,
layer_name
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment