Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
SMART
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
UNI-KLU
SMART
Commits
c1dc19d0
Commit
c1dc19d0
authored
Jul 22, 2021
by
Alexander Lercher
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added ml training
parent
678477f0
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
199 additions
and
0 deletions
+199
-0
train_base.py
...ty-detection-microservice/app/processing/ml/train_base.py
+43
-0
train_cross_context.py
...ion-microservice/app/processing/ml/train_cross_context.py
+71
-0
train_single_context.py
...on-microservice/app/processing/ml/train_single_context.py
+70
-0
run_training.py
...tive-community-detection-microservice/app/run_training.py
+15
-0
No files found.
src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_base.py
0 → 100644
View file @
c1dc19d0
import
numpy
as
np
import
collections
from
typing
import
Tuple
def split_data(dataframe, test_dataset_frac=.2, shuffle=False) -> Tuple['training_data', 'test_data']:
    '''
    Splits a dataframe row-wise into a training and a test partition.

    :param dataframe: full dataset; the class label is expected in the last column
    :param test_dataset_frac: fraction of rows assigned to the test partition
    :param shuffle: if True, rows are shuffled (with a reset index) before splitting
    :returns: (training_data, test_data) as two dataframes with reset indices
    '''
    if shuffle:
        dataframe = dataframe.sample(frac=1).reset_index(drop=True)

    # Everything before this index is training data, the rest is test data.
    training_size = int(len(dataframe) * (1 - test_dataset_frac))

    train = dataframe[:training_size].reset_index(drop=True)
    test = dataframe[training_size:].reset_index(drop=True)

    return train, test
#######################
import
pandas
as
pd
from
pandas
import
DataFrame
def remove_empty_community_class(df):
    '''
    Relabels evolution_label -1 (community stays empty) to class 0.

    Note: despite the name, no rows are removed; the -1.0 labels are merely
    replaced with 0 so downstream classifiers see a valid class value.

    :param df: dataframe with an 'evolution_label' column (modified in place)
    :returns: the same dataframe with -1.0 labels replaced by 0
    '''
    df['evolution_label'] = df['evolution_label'].replace(-1.0, 0)
    return df
########################
import
sklearn.metrics
def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
    """
    Prints a classification report for every classifier.

    :param clfs: list of classifiers to evaluate
    :param test_Xs: list of test_X for the corresponding classifier at idx
    :param test_Y: true classes
    :param titles: list of titles for the classifiers at idx
    """
    for model, features, title in zip(clfs, test_Xs, titles):
        predicted = model.predict(features)
        report = sklearn.metrics.classification_report(y_true=test_Y, y_pred=predicted)
        print(f"### {title} ###\n", report)
########################
\ No newline at end of file
src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_cross_context.py
0 → 100644
View file @
c1dc19d0
import
pandas
as
pd
from
pandas
import
DataFrame
from
processing.ml.train_base
import
split_data
,
remove_empty_community_class
,
print_report
approach = 'cross_context'
#######################
import pickle
from pathlib import Path


def export_model(model, use_case, layer_name, reference_layer_name):
    '''
    Pickles a trained model below data/{use_case}/ml_output/{approach}/.

    :param model: the fitted model to serialize
    :param use_case: use case the model belongs to
    :param layer_name: layer the model predicts
    :param reference_layer_name: reference layer used as input context
    '''
    fpath = f'data/{use_case}/ml_output/{approach}'
    Path(fpath).mkdir(parents=True, exist_ok=True)

    target = f'{fpath}/{layer_name}_{reference_layer_name}.model'
    with open(target, 'wb') as file_handle:
        pickle.dump(model, file_handle)
###################
from sklearn.ensemble import RandomForestClassifier

# Random-forest hyperparameters for the cross-context models.
n_estimators = 50
criterion = 'gini'
max_depth = None  # grow trees fully (limited only by min_samples_leaf)
min_samples_leaf = 2
min_impurity_decrease = 1E-5
bootstrap = True

###############
from db.repository import Repository

# Repository used to look up the layer pairs of a use case.
repo = Repository()
def run_training(use_case):
    '''
    Trains and exports one cross-context random forest per layer pair.

    For every (layer, reference_layer) pair of the use case, the prepared
    ml_input CSV is loaded, split into train/test, relabeled (-1 -> 0),
    standardized, optionally resampled, then a RandomForestClassifier is
    fitted, evaluated on the test partition, and pickled to disk.

    :param use_case: name of the use case whose layer pairs are trained
    '''
    # Hoisted out of the loop: the imports and the sampler are loop-invariant.
    from sklearn.preprocessing import StandardScaler
    from processing import DataSampler

    sampler = DataSampler()

    for layerpair in repo.get_layer_pairs(use_case):
        layer_name = layerpair.layer
        reference_layer_name = layerpair.reference_layer

        df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)

        #######################
        training, testing = split_data(df, shuffle=False)

        #####################
        training = remove_empty_community_class(training)
        testing = remove_empty_community_class(testing)

        #####################
        # The scaler is fitted on the training partition only. The label
        # column is scaled too but sliced off immediately afterwards
        # (StandardScaler works column-wise, so the features are unaffected).
        scaler = StandardScaler()
        train_X = scaler.fit_transform(training)[:, :-1]  # all except y
        train_Y = training[training.columns[-1]]
        test_X = scaler.transform(testing)[:, :-1]  # all except y
        test_Y = testing[testing.columns[-1]]

        ########################
        try:
            train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
        except ValueError as e:
            # not enough points for oversampling
            print(f"Could not sample training data, using original distribution: {e}")

        ####################
        rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_leaf=min_samples_leaf,
                                     min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap)
        rfc.fit(train_X, train_Y)

        print_report([rfc], [test_X], test_Y, ["X"])
        export_model(rfc, use_case, layer_name, reference_layer_name)
\ No newline at end of file
src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_single_context.py
0 → 100644
View file @
c1dc19d0
import
pandas
as
pd
from
pandas
import
DataFrame
from
processing.ml.train_base
import
split_data
,
remove_empty_community_class
,
print_report
approach = 'single_context'
#######################
import pickle
from pathlib import Path


def export_model(model, use_case, layer_name):
    '''
    Pickles a trained model below data/{use_case}/ml_output/{approach}/.

    :param model: the fitted model to serialize
    :param use_case: use case the model belongs to
    :param layer_name: layer the model predicts
    '''
    fpath = f'data/{use_case}/ml_output/{approach}'
    Path(fpath).mkdir(parents=True, exist_ok=True)

    target = f'{fpath}/{layer_name}.model'
    with open(target, 'wb') as file_handle:
        pickle.dump(model, file_handle)
#####################
from sklearn.ensemble import RandomForestClassifier

# Random-forest hyperparameters for the single-context models.
n_estimators = 100
criterion = 'gini'
max_depth = None  # grow trees fully (limited only by min_samples_leaf)
min_samples_leaf = 2
min_impurity_decrease = 1E-5
bootstrap = True

###############
from db.repository import Repository

# Repository used to look up the layers of a use case.
repo = Repository()
def run_training(use_case):
    '''
    Trains and exports one single-context random forest per layer.

    For every layer of the use case, the prepared ml_input CSV is loaded,
    split into train/test, relabeled (-1 -> 0), standardized, optionally
    resampled, then a RandomForestClassifier is fitted, evaluated on the
    test partition, and pickled to disk.

    :param use_case: name of the use case whose layers are trained
    '''
    # Hoisted out of the loop: the imports and the sampler are loop-invariant.
    from sklearn.preprocessing import StandardScaler
    from processing import DataSampler

    sampler = DataSampler()

    for layer in repo.get_layers_for_use_case(use_case):
        layer_name = layer.layer_name

        df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)

        #######################
        training, testing = split_data(df, shuffle=False)

        #####################
        training = remove_empty_community_class(training)
        testing = remove_empty_community_class(testing)

        #####################
        # The scaler is fitted on the training partition only. The label
        # column is scaled too but sliced off immediately afterwards
        # (StandardScaler works column-wise, so the features are unaffected).
        scaler = StandardScaler()
        train_X = scaler.fit_transform(training)[:, :-1]  # all except y
        train_Y = training[training.columns[-1]]
        test_X = scaler.transform(testing)[:, :-1]  # all except y
        test_Y = testing[testing.columns[-1]]

        ########################
        try:
            train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
        except ValueError as e:
            # not enough points for oversampling
            print(f"Could not sample training data, using original distribution: {e}")

        ####################
        rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_leaf=min_samples_leaf,
                                     min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap)
        rfc.fit(train_X, train_Y)

        ####################
        print_report([rfc], [test_X], test_Y, ["X"])

        ####################
        export_model(rfc, use_case, layer_name)
\ No newline at end of file
src/data-hub/proactive-community-detection-microservice/app/run_training.py
0 → 100644
View file @
c1dc19d0
import sys
import os

# Make the shared modules folder importable when the app runs from its own dir.
modules_path = '../../../modules/'
if os.path.exists(modules_path):
    sys.path.insert(1, modules_path)

# Both training modules expose a function named run_training; alias them so
# they can coexist here. (The previous unaliased imports referenced names
# run_single_training / run_cross_training that neither module defines,
# which raised ImportError on startup.)
from processing.ml.train_single_context import run_training as run_single_training
from processing.ml.train_cross_context import run_training as run_cross_training


if __name__ == '__main__':
    use_case = 'community-prediction-youtube-n'

    run_single_training(use_case)
    run_cross_training(use_case)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment