Commit b5ab2b36 authored by Alexander Lercher

Regression instead of classification training

parent a31e702c
@@ -94,7 +94,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) ->
         tuples.append(cur_metrics)
         if len(tuples) == N:
-            label = get_evolution_label(cur_cluster.size, data[i+1].size)
+            label = data[i+1].size  # get_evolution_label(cur_cluster.size, data[i+1].size)
             yield list(tuples) + [label]
 ############################
 def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
...
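The change above swaps the categorical evolution label for the raw size of the cluster in the next time window, turning training into a regression problem (as the commit message says). The diff does not show how get_evolution_label bins its inputs, so the three-way binning in this sketch is an assumption for illustration only; it just contrasts the two targets.

# Minimal sketch contrasting the old classification target with the new
# regression target. The real get_evolution_label is not shown in this diff;
# the binning below is an assumed placeholder.
def get_evolution_label_sketch(old_size: int, new_size: int) -> int:
    if new_size > old_size:
        return 1    # cluster grows
    if new_size < old_size:
        return -1   # cluster shrinks
    return 0        # cluster is stable

def regression_label(next_size: int) -> float:
    # new target: the raw cluster size in the next time window
    return float(next_size)

print(get_evolution_label_sketch(10, 12))  # 1 (grew)
print(regression_label(12))                # 12.0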
@@ -98,7 +98,7 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
     # go through all time windows once...
     prev_time_key = ordered_time_keys[0]
-    for current_time_key in ordered_time_keys[1:]:
+    for idx, current_time_key in enumerate(ordered_time_keys[1:-1]):
         # ...and load the current and previous layer metrics in the reference_layer
         current_layer_metric = layer_metrics[current_time_key]
         prev_layer_metric = layer_metrics[prev_time_key]
@@ -110,7 +110,8 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
         for cluster_id in cluster_ids:
             current_cluster_metric = cluster_metrics[(current_time_key, cluster_id)]
             prev_cluster_metric = cluster_metrics[(prev_time_key, cluster_id)]
-            evolution_label = get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size)
+            next_time_key = ordered_time_keys[idx+2]
+            evolution_label = cluster_metrics[(next_time_key, cluster_id)].size  # get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size)
             # yield each combination of reference layer metrics to clusters
             yield [prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id), evolution_label]
...
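The reworked loop guarantees that a following time window exists for the new regression target: enumerating ordered_time_keys[1:-1] starts idx at 0 for the key at position 1, so ordered_time_keys[idx+2] is exactly the window after current_time_key, and iteration stops one window early. A toy run:

# Toy demonstration of the new loop bounds: every current window has a
# successor, because iteration stops at the second-to-last time key.
ordered_time_keys = ['w0', 'w1', 'w2', 'w3']

for idx, current_time_key in enumerate(ordered_time_keys[1:-1]):
    next_time_key = ordered_time_keys[idx + 2]  # idx=0 -> current 'w1', next 'w2'
    print(current_time_key, '->', next_time_key)
# prints: w1 -> w2
#         w2 -> w3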
@@ -29,15 +29,27 @@ def remove_empty_community_class(df):
 ########################
 import sklearn.metrics

-def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
+def print_classification_report(clf, test_X, test_Y, title):
     """
-    Prints all reports.
-    :param clfs: list of classifiers to evaluate
-    :param test_Xs: list of test_X for the corresponding classifier at idx
-    :param test_Y: true classes
-    :param titles: list of titles for the classifiers at idx
+    Prints a classification report.
+    :param clf: classifier to evaluate
+    :param test_X: input features X
+    :param test_Y: true classes Y
+    :param title: title for the report
     """
-    for clf, test_X, title in zip(clfs, test_Xs, titles):
-        pred_Y = clf.predict(test_X)
-        print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
+    pred_Y = clf.predict(test_X)
+    print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
+
+def print_regression_report(clf, test_X, test_Y, title):
+    """
+    Prints a regression report.
+    :param clf: regressor to evaluate
+    :param test_X: input features X
+    :param test_Y: true prediction values
+    :param title: title for the report
+    """
+    pred_Y = clf.predict(test_X)
+    print(f"### {title} ###\nR2-score={sklearn.metrics.r2_score(y_true=test_Y, y_pred=pred_Y)}, " \
+          f"MSE={sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=pred_Y)}, " \
+          f"sanity={sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=[0]*len(pred_Y))}")
 ########################
\ No newline at end of file
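For reference, a usage sketch of the new helper; it assumes the repository's processing package is importable, and the data and model are toy choices, not anything prescribed by the commit. The "sanity" figure printed by print_regression_report is the MSE of always predicting 0, a naive baseline against which the model's MSE can be compared.

# Usage sketch for print_regression_report; assumes processing is on the
# PYTHONPATH. Toy data: y = 3x + 1, train on the first 15 points.
import numpy as np
from sklearn.linear_model import LinearRegression
from processing.ml.train_base import print_regression_report

X = np.arange(20, dtype=float).reshape(-1, 1)
y = 3.0 * X.ravel() + 1.0

reg = LinearRegression().fit(X[:15], y[:15])
print_regression_report(reg, X[15:], y[15:], "toy linear data")
# ### toy linear data ###
# R2-score=1.0, MSE=~0.0, sanity=<MSE of predicting all zeros>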
 import pandas as pd
 from pandas import DataFrame
-from processing.ml.train_base import split_data, remove_empty_community_class, print_report
+from processing.ml.train_base import split_data, remove_empty_community_class, print_regression_report

 approach = 'cross_context'
+max_sampling_size = 20000
 #######################
 import pickle
 from pathlib import Path
@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, reference_layer_name, scaler=False
     with open(f'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
         pickle.dump(model, f)
 ###################
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestRegressor
 n_estimators = 50
-criterion = 'gini'
+criterion = 'mse'
 max_depth = None
 min_samples_leaf = 2
 min_impurity_decrease = 1E-5
 bootstrap = True
 ####################
+from sklearn.svm import LinearSVR
+tol = 1E-4
+c = 1
+loss = 'squared_epsilon_insensitive'
+dual = False
 ###############
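The hyperparameters above configure the two candidate models. A standalone sketch of how they would be instantiated follows; one caveat not visible in the diff: scikit-learn renamed the regression criterion 'mse' to 'squared_error' in version 1.0 and removed the old name in 1.2, so the string used here only works on older versions.

# Standalone sketch of the two configurations introduced above.
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR

rf = RandomForestRegressor(
    n_estimators=50,
    criterion='squared_error',   # 'mse' on scikit-learn < 1.0, as in the diff
    max_depth=None,
    min_samples_leaf=2,
    min_impurity_decrease=1e-5,
    bootstrap=True,
)

svr = LinearSVR(
    loss='squared_epsilon_insensitive',
    C=1,
    dual=False,  # dual=False is only valid with the squared loss, as here
    tol=1e-4,
)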
@@ -40,8 +47,7 @@ def run_training(use_case):
 #######################
     training, testing = split_data(df, shuffle=False)
 #####################
-    training = remove_empty_community_class(training)
-    testing = remove_empty_community_class(testing)
+    training = training.sample(frac=min(1, max_sampling_size/len(training))).reset_index(drop=True)
 #####################
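The replacement line subsamples the training frame to at most max_sampling_size rows instead of dropping the empty-community class, which only made sense for classification. Note that DataFrame.sample returns a copy, so the result must be reassigned for the subsampling to take effect (the assignment is written out above; the bare call had no effect). A minimal demonstration:

# DataFrame.sample returns a new frame; without reassignment the subsample
# is discarded. max_sampling_size caps the number of training rows.
import pandas as pd

max_sampling_size = 5
training = pd.DataFrame({'x': range(10)})

training.sample(frac=min(1, max_sampling_size/len(training)))  # no effect: result discarded
training = training.sample(frac=min(1, max_sampling_size/len(training))).reset_index(drop=True)
print(len(training))  # 5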
     from sklearn.preprocessing import StandardScaler
     scaler = StandardScaler()
@@ -54,20 +60,15 @@ def run_training(use_case):
     export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True)
 ########################
     from processing import DataSampler
     sampler = DataSampler()
-    try:
-        train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
-    except ValueError as e:  # not enough points for oversampling
-        print(f"Could not sample training data, using original distribution: {e}")
 ####################
-    rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
+    # RF is a lot better than SVM, but I did not tune hyperparameters for regression
+    rfc = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
                                 bootstrap=bootstrap)
+    # rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
     rfc.fit(train_X, train_Y)
-    print_report([rfc], [test_X], test_Y, ["X"])
+    print_regression_report(rfc, test_X, test_Y, f"{layer_name} based on {reference_layer_name}")
     export_model(rfc, use_case, layer_name, reference_layer_name)
\ No newline at end of file
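With the class-balancing DataSampler oversampling dropped (median-size oversampling has no meaning for a continuous target), the only remaining reference point is the all-zeros "sanity" MSE. A possible refinement, not part of this commit, is a mean-predicting baseline via scikit-learn's DummyRegressor, which is the reference implicit in the R2 score:

# Possible refinement (not in this commit): compare against a mean-predicting
# baseline instead of the all-zeros 'sanity' baseline.
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

y_true = np.array([5.0, 7.0, 9.0, 11.0])
X = np.zeros((len(y_true), 1))              # features are ignored by the dummy

zero_mse = mean_squared_error(y_true, np.zeros_like(y_true))
dummy = DummyRegressor(strategy='mean').fit(X, y_true)
mean_mse = mean_squared_error(y_true, dummy.predict(X))
print(f"zero baseline MSE={zero_mse}, mean baseline MSE={mean_mse}")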
 import pandas as pd
 from pandas import DataFrame
-from processing.ml.train_base import split_data, remove_empty_community_class, print_report
+from processing.ml.train_base import split_data, remove_empty_community_class, print_regression_report

 approach = 'single_context'
+max_sampling_size = 20000
 #######################
 import pickle
 from pathlib import Path
@@ -14,13 +15,19 @@ def export_model(model, use_case, layer_name, scaler=False):
     with open(f'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
         pickle.dump(model, f)
 #####################
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestRegressor
 n_estimators = 100
-criterion = 'gini'
+criterion = 'mse'
 max_depth = None
 min_samples_leaf = 2
 min_impurity_decrease = 1E-5
 bootstrap = True
 ####################
+from sklearn.svm import LinearSVR
+tol = 1E-4
+c = 1
+loss = 'squared_epsilon_insensitive'
+dual = False
 ###############
@@ -39,8 +46,7 @@ def run_training(use_case):
 #######################
     training, testing = split_data(df, shuffle=False)
 #####################
-    training = remove_empty_community_class(training)
-    testing = remove_empty_community_class(testing)
+    training = training.sample(frac=min(1, max_sampling_size/len(training))).reset_index(drop=True)
 #####################
     from sklearn.preprocessing import StandardScaler
     scaler = StandardScaler()
@@ -53,20 +59,15 @@ def run_training(use_case):
     export_model(scaler, use_case, layer_name, scaler=True)
 ########################
     from processing import DataSampler
     sampler = DataSampler()
-    try:
-        train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
-    except ValueError as e:  # not enough points for oversampling
-        print(f"Could not sample training data, using original distribution: {e}")
 ####################
-    rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
+    # RF is 10-20% better compared to SVM, but I did not tune hyperparameters for regression
+    rfc = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
                                 bootstrap=bootstrap)
+    # rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
     rfc.fit(train_X, train_Y)
 ####################
-    print_report([rfc], [test_X], test_Y, ["X"])
+    print_regression_report(rfc, test_X, test_Y, layer_name)
 ####################
     export_model(rfc, use_case, layer_name)
\ No newline at end of file