Commit b5ab2b36 authored by Alexander Lercher

Regression instead of classification training

parent a31e702c
...@@ -94,7 +94,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) -> ...@@ -94,7 +94,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) ->
tuples.append(cur_metrics) tuples.append(cur_metrics)
if len(tuples) == N: if len(tuples) == N:
label = get_evolution_label(cur_cluster.size, data[i+1].size) label = data[i+1].size # get_evolution_label(cur_cluster.size, data[i+1].size)
yield list(tuples) + [label] yield list(tuples) + [label]
############################ ############################
def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]: def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
......
...@@ -98,7 +98,7 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference ...@@ -98,7 +98,7 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
# go through all time windows once... # go through all time windows once...
prev_time_key = ordered_time_keys[0] prev_time_key = ordered_time_keys[0]
for current_time_key in ordered_time_keys[1:]: for idx, current_time_key in enumerate(ordered_time_keys[1:-1]):
# ...and load the current and previous layer metrics in the reference_layer # ...and load the current and previous layer metrics in the reference_layer
current_layer_metric = layer_metrics[current_time_key] current_layer_metric = layer_metrics[current_time_key]
prev_layer_metric = layer_metrics[prev_time_key] prev_layer_metric = layer_metrics[prev_time_key]
...@@ -110,7 +110,8 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference ...@@ -110,7 +110,8 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
for cluster_id in cluster_ids: for cluster_id in cluster_ids:
current_cluster_metric = cluster_metrics[(current_time_key, cluster_id)] current_cluster_metric = cluster_metrics[(current_time_key, cluster_id)]
prev_cluster_metric = cluster_metrics[(prev_time_key, cluster_id)] prev_cluster_metric = cluster_metrics[(prev_time_key, cluster_id)]
evolution_label = get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size) next_time_key = ordered_time_keys[idx+2]
evolution_label = cluster_metrics[(next_time_key, cluster_id)].size # get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size)
# yield each combination of reference layer metrics to clusters # yield each combination of reference layer metrics to clusters
yield [prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id), evolution_label] yield [prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id), evolution_label]
......
...@@ -29,15 +29,27 @@ def remove_empty_community_class(df): ...@@ -29,15 +29,27 @@ def remove_empty_community_class(df):
######################## ########################
import sklearn.metrics import sklearn.metrics
def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list): def print_classification_report(clf, test_X, test_Y, title):
""" """
Prints all reports. Prints a classification report.
:param clfs: list of classifiers to evaluate :param clf: classifier to evaluate
:param test_Xs: list of test_X for the corresponding classifier at idx :param test_X: input features X
:param test_Y: true classes :param test_Y: true classes Y
:param titles: list of titles for the classifiers at idx :param title: title for the report
""" """
for clf, test_X, title in zip(clfs, test_Xs, titles): pred_Y = clf.predict(test_X)
pred_Y = clf.predict(test_X) print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
def print_regression_report(clf, test_X, test_Y, title):
    """
    Evaluates a regressor on test data and prints its scores in one report.

    The report contains the R2-score and the mean squared error (MSE) of the
    predictions, plus a 'sanity' value, which is the MSE obtained when 0 is
    predicted for every test sample.
    :param clf: regressor to evaluate (must provide predict())
    :param test_X: input features X
    :param test_Y: true prediction values
    :param title: title for the report
    """
    predictions = clf.predict(test_X)
    # trivial baseline to compare the model's MSE against: always predict 0
    zero_baseline = [0] * len(predictions)

    r2 = sklearn.metrics.r2_score(y_true=test_Y, y_pred=predictions)
    mse = sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=predictions)
    sanity = sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=zero_baseline)

    print(f"### {title} ###\nR2-score={r2}, MSE={mse}, sanity={sanity}")
######################## ########################
\ No newline at end of file
import pandas as pd import pandas as pd
from pandas import DataFrame from pandas import DataFrame
from processing.ml.train_base import split_data, remove_empty_community_class, print_report from processing.ml.train_base import split_data, remove_empty_community_class, print_regression_report
approach = 'cross_context' approach = 'cross_context'
max_sampling_size = 20000
####################### #######################
import pickle import pickle
from pathlib import Path from pathlib import Path
...@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, reference_layer_name, scaler=False ...@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, reference_layer_name, scaler=False
with open(f'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f: with open(f'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
pickle.dump(model, f) pickle.dump(model, f)
################### ###################
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestRegressor
n_estimators = 50 n_estimators = 50
criterion = 'gini' criterion = 'mse'
max_depth = None max_depth = None
min_samples_leaf = 2 min_samples_leaf = 2
min_impurity_decrease= 1E-5 min_impurity_decrease= 1E-5
bootstrap=True bootstrap=True
####################
from sklearn.svm import LinearSVR
tol = 1E-4
c = 1
loss = 'squared_epsilon_insensitive'
dual = False
############### ###############
...@@ -40,8 +47,7 @@ def run_training(use_case): ...@@ -40,8 +47,7 @@ def run_training(use_case):
####################### #######################
training, testing = split_data(df, shuffle=False) training, testing = split_data(df, shuffle=False)
##################### #####################
training = remove_empty_community_class(training) training.sample(frac=min(1, max_sampling_size/len(training))).reset_index(drop=True)
testing = remove_empty_community_class(testing)
##################### #####################
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() scaler = StandardScaler()
...@@ -54,20 +60,15 @@ def run_training(use_case): ...@@ -54,20 +60,15 @@ def run_training(use_case):
export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True) export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True)
######################## ########################
from processing import DataSampler # RF is a lot better than SVM, but I did not tune hyperparameters for regression
rfc = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
sampler = DataSampler()
try:
train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
except ValueError as e: # not enough points for oversampling
print(f"Could not sample training data, using original distribution: {e}")
####################
rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
bootstrap=bootstrap) bootstrap=bootstrap)
# rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
rfc.fit(train_X, train_Y) rfc.fit(train_X, train_Y)
print_report([rfc], [test_X], test_Y, ["X"]) print_regression_report(rfc, test_X, test_Y, f"{layer_name} based on {reference_layer_name}")
export_model(rfc, use_case, layer_name, reference_layer_name) export_model(rfc, use_case, layer_name, reference_layer_name)
\ No newline at end of file
import pandas as pd import pandas as pd
from pandas import DataFrame from pandas import DataFrame
from processing.ml.train_base import split_data, remove_empty_community_class, print_report from processing.ml.train_base import split_data, remove_empty_community_class, print_regression_report
approach = 'single_context' approach = 'single_context'
max_sampling_size = 20000
####################### #######################
import pickle import pickle
from pathlib import Path from pathlib import Path
...@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, scaler=False): ...@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, scaler=False):
with open(f'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f: with open(f'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
pickle.dump(model, f) pickle.dump(model, f)
##################### #####################
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestRegressor
n_estimators = 100 n_estimators = 100
criterion = 'gini' criterion = 'mse'
max_depth = None max_depth = None
min_samples_leaf = 2 min_samples_leaf = 2
min_impurity_decrease = 1E-5 min_impurity_decrease = 1E-5
bootstrap=True bootstrap=True
####################
from sklearn.svm import LinearSVR
tol = 1E-4
c = 1
loss = 'squared_epsilon_insensitive'
dual = False
############### ###############
...@@ -39,8 +46,7 @@ def run_training(use_case): ...@@ -39,8 +46,7 @@ def run_training(use_case):
####################### #######################
training, testing = split_data(df, shuffle=False) training, testing = split_data(df, shuffle=False)
##################### #####################
training = remove_empty_community_class(training) training.sample(frac=min(1, max_sampling_size/len(training))).reset_index(drop=True)
testing = remove_empty_community_class(testing)
##################### #####################
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() scaler = StandardScaler()
...@@ -53,20 +59,15 @@ def run_training(use_case): ...@@ -53,20 +59,15 @@ def run_training(use_case):
export_model(scaler, use_case, layer_name, scaler=True) export_model(scaler, use_case, layer_name, scaler=True)
######################## ########################
from processing import DataSampler # RF is 10-20% better compared to SVM, but I did not tune hyperparameters for regression
rfc = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
sampler = DataSampler()
try:
train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
except ValueError as e: # not enough points for oversampling
print(f"Could not sample training data, using original distribution: {e}")
####################
rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
bootstrap=bootstrap) bootstrap=bootstrap)
# rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
rfc.fit(train_X, train_Y) rfc.fit(train_X, train_Y)
#################### ####################
print_report([rfc], [test_X], test_Y, ["X"]) print_regression_report(rfc, test_X, test_Y, layer_name)
#################### ####################
export_model(rfc, use_case, layer_name) export_model(rfc, use_case, layer_name)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment