added ml training

c1dc19d0 · Alexander Lercher · 678477f0 · c1dc19d0 · c1dc19d0 · c1dc19d0
Commit c1dc19d0 authored Jul 22, 2021 by Alexander Lercher
4 changed files
--- a/src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_base.py
+++ b/src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_base.py
+import numpy as np
+import collections
+from typing import Tuple
+
+def split_data(dataframe, test_dataset_frac=.2, shuffle=False, random_state=None) -> Tuple['DataFrame', 'DataFrame']:
+    """
+    Splits a dataframe row-wise into a training and a test dataframe.
+
+    The first rows form the training data and the remaining rows the test data.
+    Both results get a fresh 0-based index.
+
+    :param dataframe: the data to split
+    :param test_dataset_frac: fraction of rows placed in the test data
+    :param shuffle: if True, rows are shuffled before splitting
+    :param random_state: optional seed forwarded to DataFrame.sample for a reproducible shuffle
+    :return: tuple (training_data, test_data)
+    """
+    if shuffle:
+        dataframe = dataframe.sample(frac=1, random_state=random_state).reset_index(drop=True)
+
+    training_size = int(len(dataframe) * (1 - test_dataset_frac))
+
+    train = dataframe[:training_size].reset_index(drop=True)
+    test = dataframe[training_size:].reset_index(drop=True)
+
+    # The label column is intentionally not extracted here; callers select it themselves.
+    return train, test
+#######################
+import pandas as pd
+from pandas import DataFrame
+
+def remove_empty_community_class(df):
+    '''
+    Merges the evolution_label -1 (community stays empty) into label 0.
+
+    No rows are dropped: every -1.0 in column 'evolution_label' is replaced by 0.
+    The input dataframe is left untouched; a new dataframe with the same column order is returned.
+
+    :param df: dataframe containing an 'evolution_label' column
+    :return: new dataframe with the relabeled 'evolution_label' column
+    '''
+    return df.assign(evolution_label=df['evolution_label'].replace(-1.0, 0))
+########################
+import sklearn.metrics
+
+def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
+    """
+    Prints one classification report per classifier.
+
+    The three lists are consumed in lockstep; the entry at index i of each list belongs together.
+
+    :param clfs: list of classifiers to evaluate
+    :param test_Xs: list of test_X for the corresponding classifier at idx
+    :param test_Y: true classes, shared by all classifiers
+    :param titles: list of titles for the classifiers at idx
+    """
+    evaluations = zip(clfs, test_Xs, titles)
+    for classifier, features, heading in evaluations:
+        predictions = classifier.predict(features)
+        report = sklearn.metrics.classification_report(y_true=test_Y, y_pred=predictions)
+        print(f"### {heading} ###\n", report)
+########################
\ No newline at end of file
--- a/src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_cross_context.py
+++ b/src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_cross_context.py
+import pandas as pd
+from pandas import DataFrame
+from processing.ml.train_base import split_data, remove_empty_community_class, print_report
+
+
+# Name of the training approach; export_model uses it as the output subdirectory below ml_output/.
+approach = 'cross_context'
+#######################
+import pickle
+from pathlib import Path
+
+def export_model(model, use_case, layer_name, reference_layer_name):
+    """
+    Pickles the model to data/{use_case}/ml_output/{approach}/{layer_name}_{reference_layer_name}.model.
+
+    The output directory is created if it does not exist yet.
+
+    :param model: the object to serialize
+    :param use_case: use case name, part of the output path
+    :param layer_name: first part of the file name
+    :param reference_layer_name: second part of the file name
+    """
+    fpath = f'data/{use_case}/ml_output/{approach}'
+    Path(fpath).mkdir(parents=True, exist_ok=True)
+
+    model_file = Path(f'{fpath}/{layer_name}_{reference_layer_name}.model')
+    with model_file.open('wb') as out:
+        pickle.dump(model, out)
+###################
+from sklearn.ensemble import RandomForestClassifier
+# Hyperparameters handed to RandomForestClassifier in run_training below.
+n_estimators = 50
+criterion = 'gini'
+# None is passed through unchanged as max_depth
+max_depth = None
+min_samples_leaf = 2
+min_impurity_decrease= 1E-5
+bootstrap=True
+
+
+###############
+
+
+from db.repository import Repository
+
+# Module-level repository instance, created at import time; run_training queries it for the layer pairs.
+repo = Repository()
+
+def run_training(use_case):
+    """
+    Trains, evaluates and exports one random forest classifier per layer pair of the use case.
+
+    For every layer pair returned by the repository, the CSV
+    data/{use_case}/ml_input/cross_context/{layer}_{reference_layer}.csv is loaded, split into
+    training and test data without shuffling, relabeled, standardized and (if possible) resampled.
+    A RandomForestClassifier is fit on the result, its classification report is printed and the
+    classifier is pickled via export_model.
+
+    :param use_case: name of the use case; selects the layer pairs and the data directory
+    """
+    # Imported once per call instead of once per layer pair; kept function-local as before.
+    from sklearn.preprocessing import StandardScaler
+    from processing import DataSampler
+
+    for layerpair in repo.get_layer_pairs(use_case):
+        layer_name = layerpair.layer
+        reference_layer_name = layerpair.reference_layer
+
+        df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)
+
+        training, testing = split_data(df, shuffle=False)
+
+        training = remove_empty_community_class(training)
+        testing = remove_empty_community_class(testing)
+
+        # The last column holds the label. The scaler is fit on the feature columns only, so it
+        # never sees the label and could also be applied to unlabeled data. StandardScaler works
+        # per column, hence the scaled features equal those of fitting on all columns and slicing.
+        scaler = StandardScaler()
+
+        train_X = scaler.fit_transform(training.iloc[:, :-1])
+        train_Y = training[training.columns[-1]]
+
+        test_X = scaler.transform(testing.iloc[:, :-1])
+        test_Y = testing[testing.columns[-1]]
+
+        sampler = DataSampler()
+        try:
+            train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
+        except ValueError as e: # not enough points for oversampling
+            print(f"Could not sample training data, using original distribution: {e}")
+
+        rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
+                                     min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
+                                     bootstrap=bootstrap)
+        rfc.fit(train_X, train_Y)
+
+        print_report([rfc], [test_X], test_Y, ["X"])
+
+        # NOTE(review): only the classifier is exported; the fitted scaler is not persisted, so a
+        # consumer of the model would have to reproduce the scaling - confirm this is intended.
+        export_model(rfc, use_case, layer_name, reference_layer_name)
+        
\ No newline at end of file
--- a/src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_single_context.py
+++ b/src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_single_context.py
+import pandas as pd
+from pandas import DataFrame
+from processing.ml.train_base import split_data, remove_empty_community_class, print_report
+
+
+# Name of the training approach; export_model uses it as the output subdirectory below ml_output/.
+approach = 'single_context'
+#######################
+import pickle 
+from pathlib import Path
+
+def export_model(model, use_case, layer_name):
+    """
+    Pickles the model to data/{use_case}/ml_output/{approach}/{layer_name}.model.
+
+    The output directory is created if it does not exist yet.
+
+    :param model: the object to serialize
+    :param use_case: use case name, part of the output path
+    :param layer_name: name of the layer, used as file name
+    """
+    fpath = f'data/{use_case}/ml_output/{approach}'
+    Path(fpath).mkdir(parents=True, exist_ok=True)
+
+    model_file = Path(f'{fpath}/{layer_name}.model')
+    with model_file.open('wb') as out:
+        pickle.dump(model, out)
+#####################
+from sklearn.ensemble import RandomForestClassifier
+# Hyperparameters handed to RandomForestClassifier in run_training below.
+n_estimators = 100
+criterion = 'gini'
+# None is passed through unchanged as max_depth
+max_depth = None
+min_samples_leaf = 2
+min_impurity_decrease = 1E-5
+bootstrap=True
+
+
+###############
+
+
+from db.repository import Repository
+
+# Module-level repository instance, created at import time; run_training queries it for the layers.
+repo = Repository()
+
+def run_training(use_case):
+    """
+    Trains, evaluates and exports one random forest classifier per layer of the use case.
+
+    For every layer returned by the repository, the CSV
+    data/{use_case}/ml_input/single_context/{layer_name}.csv is loaded, split into training and
+    test data without shuffling, relabeled, standardized and (if possible) resampled.
+    A RandomForestClassifier is fit on the result, its classification report is printed and the
+    classifier is pickled via export_model.
+
+    :param use_case: name of the use case; selects the layers and the data directory
+    """
+    # Imported once per call instead of once per layer; kept function-local as before.
+    from sklearn.preprocessing import StandardScaler
+    from processing import DataSampler
+
+    for layer in repo.get_layers_for_use_case(use_case):
+        layer_name = layer.layer_name
+
+        df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)
+
+        training, testing = split_data(df, shuffle=False)
+
+        training = remove_empty_community_class(training)
+        testing = remove_empty_community_class(testing)
+
+        # The last column holds the label. The scaler is fit on the feature columns only, so it
+        # never sees the label and could also be applied to unlabeled data. StandardScaler works
+        # per column, hence the scaled features equal those of fitting on all columns and slicing.
+        scaler = StandardScaler()
+
+        train_X = scaler.fit_transform(training.iloc[:, :-1])
+        train_Y = training[training.columns[-1]]
+
+        test_X = scaler.transform(testing.iloc[:, :-1])
+        test_Y = testing[testing.columns[-1]]
+
+        sampler = DataSampler()
+        try:
+            train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
+        except ValueError as e: # not enough points for oversampling
+            print(f"Could not sample training data, using original distribution: {e}")
+
+        rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
+                                     min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
+                                     bootstrap=bootstrap)
+        rfc.fit(train_X, train_Y)
+
+        print_report([rfc], [test_X], test_Y, ["X"])
+
+        # NOTE(review): only the classifier is exported; the fitted scaler is not persisted, so a
+        # consumer of the model would have to reproduce the scaling - confirm this is intended.
+        export_model(rfc, use_case, layer_name)
+        
\ No newline at end of file
--- a/src/data-hub/proactive-community-detection-microservice/app/run_training.py
+++ b/src/data-hub/proactive-community-detection-microservice/app/run_training.py
+"""Script entry point: runs single-context and cross-context training for one use case."""
+import sys
+import os
+
+# Put the modules directory on the import path when it exists relative to the working directory.
+modules_path = '../../../modules/'
+if os.path.exists(modules_path):
+    sys.path.insert(1, modules_path)
+
+
+# Both training modules name their entry point `run_training`; alias them on import so the
+# two can be told apart here (importing run_single_training/run_cross_training raises ImportError).
+from processing.ml.train_single_context import run_training as run_single_training
+from processing.ml.train_cross_context import run_training as run_cross_training
+
+if __name__ == '__main__':
+
+    use_case = 'community-prediction-youtube-n'
+    run_single_training(use_case)
+    run_cross_training(use_case)
\ No newline at end of file