Commit b5ab2b36 authored by Alexander Lercher

Regression instead of classification training

parent a31e702c
@@ -94,7 +94,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) ->
         tuples.append(cur_metrics)
         if len(tuples) == N:
-            label = get_evolution_label(cur_cluster.size, data[i+1].size)
+            label = data[i+1].size  # get_evolution_label(cur_cluster.size, data[i+1].size)
             yield list(tuples) + [label]
 ############################
 def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
...
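The change above swaps the categorical evolution label for the raw size of the cluster in the next time window, turning training into a regression problem (as the commit message says). The diff does not show how get_evolution_label bins its inputs, so the three-way binning in this sketch is an assumption for illustration only; it just contrasts the two targets.

# Minimal sketch contrasting the old classification target with the new
# regression target. The real get_evolution_label is not shown in this diff;
# the binning below is an assumed placeholder.
def get_evolution_label_sketch(old_size: int, new_size: int) -> int:
    if new_size > old_size:
        return 1    # cluster grows
    if new_size < old_size:
        return -1   # cluster shrinks
    return 0        # cluster is stable

def regression_label(next_size: int) -> float:
    # new target: the raw cluster size in the next time window
    return float(next_size)

print(get_evolution_label_sketch(10, 12))  # 1 (grew)
print(regression_label(12))                # 12.0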
@@ -98,7 +98,7 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
     # go through all time windows once...
     prev_time_key = ordered_time_keys[0]
-    for current_time_key in ordered_time_keys[1:]:
+    for idx, current_time_key in enumerate(ordered_time_keys[1:-1]):
         # ...and load the current and previous layer metrics in the reference_layer
         current_layer_metric = layer_metrics[current_time_key]
         prev_layer_metric = layer_metrics[prev_time_key]
@@ -110,7 +110,8 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
         for cluster_id in cluster_ids:
             current_cluster_metric = cluster_metrics[(current_time_key, cluster_id)]
             prev_cluster_metric = cluster_metrics[(prev_time_key, cluster_id)]
-            evolution_label = get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size)
+            next_time_key = ordered_time_keys[idx+2]
+            evolution_label = cluster_metrics[(next_time_key, cluster_id)].size  # get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size)
             # yield each combination of reference layer metrics to clusters
             yield [prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id), evolution_label]
...
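The reworked loop guarantees that a following time window exists for the new regression target: enumerating ordered_time_keys[1:-1] starts idx at 0 for the key at position 1, so ordered_time_keys[idx+2] is exactly the window after current_time_key, and iteration stops one window early. A toy run:

# Toy demonstration of the new loop bounds: every current window has a
# successor, because iteration stops at the second-to-last time key.
ordered_time_keys = ['w0', 'w1', 'w2', 'w3']

for idx, current_time_key in enumerate(ordered_time_keys[1:-1]):
    next_time_key = ordered_time_keys[idx + 2]  # idx=0 -> current 'w1', next 'w2'
    print(current_time_key, '->', next_time_key)
# prints: w1 -> w2
#         w2 -> w3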
@@ -29,15 +29,27 @@ def remove_empty_community_class(df):
 ########################
 import sklearn.metrics

-def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
+def print_classification_report(clf, test_X, test_Y, title):
     """
-    Prints all reports.
-    :param clfs: list of classifiers to evaluate
-    :param test_Xs: list of test_X for the corresponding classifier at idx
-    :param test_Y: true classes
-    :param titles: list of titles for the classifiers at idx
+    Prints a classification report.
+    :param clf: classifier to evaluate
+    :param test_X: input features X
+    :param test_Y: true classes Y
+    :param title: title for the report
     """
-    for clf, test_X, title in zip(clfs, test_Xs, titles):
-        pred_Y = clf.predict(test_X)
-        print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
+    pred_Y = clf.predict(test_X)
+    print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
+
+def print_regression_report(clf, test_X, test_Y, title):
+    """
+    Prints a regression report.
+    :param clf: regressor to evaluate
+    :param test_X: input features X
+    :param test_Y: true prediction values
+    :param title: title for the report
+    """
+    pred_Y = clf.predict(test_X)
+    print(f"### {title} ###\nR2-score={sklearn.metrics.r2_score(y_true=test_Y, y_pred=pred_Y)}, " \
+          f"MSE={sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=pred_Y)}, " \
+          f"sanity={sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=[0]*len(pred_Y))}")
 ########################
\ No newline at end of file
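For reference, a usage sketch of the new helper; it assumes the repository's processing package is importable, and the data and model are toy choices, not anything prescribed by the commit. The "sanity" figure printed by print_regression_report is the MSE of always predicting 0, a naive baseline against which the model's MSE can be compared.

# Usage sketch for print_regression_report; assumes processing is on the
# PYTHONPATH. Toy data: y = 3x + 1, train on the first 15 points.
import numpy as np
from sklearn.linear_model import LinearRegression
from processing.ml.train_base import print_regression_report

X = np.arange(20, dtype=float).reshape(-1, 1)
y = 3.0 * X.ravel() + 1.0

reg = LinearRegression().fit(X[:15], y[:15])
print_regression_report(reg, X[15:], y[15:], "toy linear data")
# ### toy linear data ###
# R2-score=1.0, MSE=~0.0, sanity=<MSE of predicting all zeros>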
 import pandas as pd
 from pandas import DataFrame
-from processing.ml.train_base import split_data, remove_empty_community_class, print_report
+from processing.ml.train_base import split_data, remove_empty_community_class, print_regression_report

 approach = 'cross_context'
+max_sampling_size = 20000
 #######################
 import pickle
 from pathlib import Path
@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, reference_layer_name, scaler=False
     with open(f'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
         pickle.dump(model, f)
 ###################
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestRegressor
 n_estimators = 50
-criterion = 'gini'
+criterion = 'mse'
 max_depth = None
 min_samples_leaf = 2
 min_impurity_decrease = 1E-5
 bootstrap = True
 ####################
+from sklearn.svm import LinearSVR
+tol = 1E-4
+c = 1
+loss = 'squared_epsilon_insensitive'
+dual = False
 ###############
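The hyperparameters above configure the two candidate models. A standalone sketch of how they would be instantiated follows; one caveat not visible in the diff: scikit-learn renamed the regression criterion 'mse' to 'squared_error' in version 1.0 and removed the old name in 1.2, so the string used here only works on older versions.

# Standalone sketch of the two configurations introduced above.
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR

rf = RandomForestRegressor(
    n_estimators=50,
    criterion='squared_error',   # 'mse' on scikit-learn < 1.0, as in the diff
    max_depth=None,
    min_samples_leaf=2,
    min_impurity_decrease=1e-5,
    bootstrap=True,
)

svr = LinearSVR(
    loss='squared_epsilon_insensitive',
    C=1,
    dual=False,  # dual=False is only valid with the squared loss, as here
    tol=1e-4,
)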
@@ -40,8 +47,7 @@ def run_training(use_case):
 #######################
     training, testing = split_data(df, shuffle=False)
 #####################
-    training = remove_empty_community_class(training)
-    testing = remove_empty_community_class(testing)
+    training = training.sample(frac=min(1, max_sampling_size/len(training))).reset_index(drop=True)
 #####################
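The replacement line subsamples the training frame to at most max_sampling_size rows instead of dropping the empty-community class, which only made sense for classification. Note that DataFrame.sample returns a copy, so the result must be reassigned for the subsampling to take effect (the assignment is written out above; the bare call had no effect). A minimal demonstration:

# DataFrame.sample returns a new frame; without reassignment the subsample
# is discarded. max_sampling_size caps the number of training rows.
import pandas as pd

max_sampling_size = 5
training = pd.DataFrame({'x': range(10)})

training.sample(frac=min(1, max_sampling_size/len(training)))  # no effect: result discarded
training = training.sample(frac=min(1, max_sampling_size/len(training))).reset_index(drop=True)
print(len(training))  # 5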
     from sklearn.preprocessing import StandardScaler
     scaler = StandardScaler()
@@ -54,20 +60,15 @@ def run_training(use_case):
     export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True)
 ########################
     from processing import DataSampler
     sampler = DataSampler()
-    try:
-        train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
-    except ValueError as e:  # not enough points for oversampling
-        print(f"Could not sample training data, using original distribution: {e}")
 ####################
-    rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
+    # RF is a lot better than SVM, but I did not tune hyperparameters for regression
+    rfc = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
                                 bootstrap=bootstrap)
+    # rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
     rfc.fit(train_X, train_Y)
-    print_report([rfc], [test_X], test_Y, ["X"])
+    print_regression_report(rfc, test_X, test_Y, f"{layer_name} based on {reference_layer_name}")
     export_model(rfc, use_case, layer_name, reference_layer_name)
\ No newline at end of file
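With the class-balancing DataSampler oversampling dropped (median-size oversampling has no meaning for a continuous target), the only remaining reference point is the all-zeros "sanity" MSE. A possible refinement, not part of this commit, is a mean-predicting baseline via scikit-learn's DummyRegressor, which is the reference implicit in the R2 score:

# Possible refinement (not in this commit): compare against a mean-predicting
# baseline instead of the all-zeros 'sanity' baseline.
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

y_true = np.array([5.0, 7.0, 9.0, 11.0])
X = np.zeros((len(y_true), 1))              # features are ignored by the dummy

zero_mse = mean_squared_error(y_true, np.zeros_like(y_true))
dummy = DummyRegressor(strategy='mean').fit(X, y_true)
mean_mse = mean_squared_error(y_true, dummy.predict(X))
print(f"zero baseline MSE={zero_mse}, mean baseline MSE={mean_mse}")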
 import pandas as pd
 from pandas import DataFrame
-from processing.ml.train_base import split_data, remove_empty_community_class, print_report
+from processing.ml.train_base import split_data, remove_empty_community_class, print_regression_report

 approach = 'single_context'
+max_sampling_size = 20000
 #######################
 import pickle
 from pathlib import Path
@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, scaler=False):
     with open(f'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
         pickle.dump(model, f)
 #####################
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestRegressor
 n_estimators = 100
-criterion = 'gini'
+criterion = 'mse'
 max_depth = None
 min_samples_leaf = 2
 min_impurity_decrease = 1E-5
 bootstrap = True
 ####################
+from sklearn.svm import LinearSVR
+tol = 1E-4
+c = 1
+loss = 'squared_epsilon_insensitive'
+dual = False
 ###############
@@ -39,8 +46,7 @@ def run_training(use_case):
 #######################
     training, testing = split_data(df, shuffle=False)
 #####################
-    training = remove_empty_community_class(training)
-    testing = remove_empty_community_class(testing)
+    training = training.sample(frac=min(1, max_sampling_size/len(training))).reset_index(drop=True)
 #####################
     from sklearn.preprocessing import StandardScaler
     scaler = StandardScaler()
@@ -53,20 +59,15 @@ def run_training(use_case):
     export_model(scaler, use_case, layer_name, scaler=True)
 ########################
     from processing import DataSampler
     sampler = DataSampler()
-    try:
-        train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
-    except ValueError as e:  # not enough points for oversampling
-        print(f"Could not sample training data, using original distribution: {e}")
 ####################
-    rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
+    # RF is 10-20% better compared to SVM, but I did not tune hyperparameters for regression
+    rfc = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
                                 bootstrap=bootstrap)
+    # rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
     rfc.fit(train_X, train_Y)
 ####################
-    print_report([rfc], [test_X], test_Y, ["X"])
+    print_regression_report(rfc, test_X, test_Y, layer_name)
 ####################
     export_model(rfc, use_case, layer_name)
\ No newline at end of file