Commit c1dc19d0 authored by Alexander Lercher's avatar Alexander Lercher

added ml training

parent 678477f0
import numpy as np
import collections
from typing import Tuple
def split_data(dataframe, test_dataset_frac=.2, shuffle=False) -> Tuple['training_data', 'test_data']:
if shuffle:
dataframe = dataframe.sample(frac=1).reset_index(drop=True)
training_size = int(len(dataframe) * (1-test_dataset_frac))
train = dataframe[:training_size].reset_index(drop=True)
test = dataframe[training_size:].reset_index(drop=True)
y_train = train[train.columns[-1]]
y_test = test[test.columns[-1]]
return train, test
#######################
import pandas as pd
from pandas import DataFrame
def remove_empty_community_class(df):
'''Removes evolution_label -1 from dataset indicating the community stays empty.'''
# res = df.loc[df['evolution_label'] != -1.0]
# res = res.reset_index(drop=True)
# return res
df['evolution_label'] = df['evolution_label'].replace(-1.0, 0)
return df
########################
import sklearn.metrics
def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
"""
Prints all reports.
:param clfs: list of classifiers to evaluate
:param test_Xs: list of test_X for the corresponding classifier at idx
:param test_Y: true classes
:param titles: list of titles for the classifiers at idx
"""
for clf, test_X, title in zip(clfs, test_Xs, titles):
pred_Y = clf.predict(test_X)
print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
########################
\ No newline at end of file
import pandas as pd
from pandas import DataFrame
from processing.ml.train_base import split_data, remove_empty_community_class, print_report
approach = 'cross_context'
#######################
import pickle
from pathlib import Path
def export_model(model, use_case, layer_name, reference_layer_name):
fpath = f'data/{use_case}/ml_output/{approach}'
Path(fpath).mkdir(parents=True, exist_ok=True)
with open(f'{fpath}/{layer_name}_{reference_layer_name}.model', 'wb') as f:
pickle.dump(model, f)
###################
from sklearn.ensemble import RandomForestClassifier
n_estimators = 50
criterion = 'gini'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease= 1E-5
bootstrap=True
###############
from db.repository import Repository
repo = Repository()
def run_training(use_case):
for layerpair in repo.get_layer_pairs(use_case):
layer_name = layerpair.layer
reference_layer_name = layerpair.reference_layer
df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)
#######################
training, testing = split_data(df, shuffle=False)
#####################
training = remove_empty_community_class(training)
testing = remove_empty_community_class(testing)
#####################
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(training)[:,:-1] # all except y
train_Y = training[training.columns[-1]]
test_X = scaler.transform(testing)[:,:-1] # all except y
test_Y = testing[testing.columns[-1]]
########################
from processing import DataSampler
sampler = DataSampler()
try:
train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
except ValueError as e: # not enough points for oversampling
print(f"Could not sample training data, using original distribution: {e}")
####################
rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
bootstrap=bootstrap)
rfc.fit(train_X, train_Y)
print_report([rfc], [test_X], test_Y, ["X"])
export_model(rfc, use_case, layer_name, reference_layer_name)
\ No newline at end of file
import pandas as pd
from pandas import DataFrame
from processing.ml.train_base import split_data, remove_empty_community_class, print_report
approach = 'single_context'
#######################
import pickle
from pathlib import Path
def export_model(model, use_case, layer_name):
fpath = f'data/{use_case}/ml_output/{approach}'
Path(fpath).mkdir(parents=True, exist_ok=True)
with open(f'{fpath}/{layer_name}.model', 'wb') as f:
pickle.dump(model, f)
#####################
from sklearn.ensemble import RandomForestClassifier
n_estimators = 100
criterion = 'gini'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 1E-5
bootstrap=True
###############
from db.repository import Repository
repo = Repository()
def run_training(use_case):
for layer in repo.get_layers_for_use_case(use_case):
layer_name = layer.layer_name
df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)
#######################
training, testing = split_data(df, shuffle=False)
#####################
training = remove_empty_community_class(training)
testing = remove_empty_community_class(testing)
#####################
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(training)[:,:-1] # all except y
train_Y = training[training.columns[-1]]
test_X = scaler.transform(testing)[:,:-1] # all except y
test_Y = testing[testing.columns[-1]]
########################
from processing import DataSampler
sampler = DataSampler()
try:
train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
except ValueError as e: # not enough points for oversampling
print(f"Could not sample training data, using original distribution: {e}")
####################
rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
bootstrap=bootstrap)
rfc.fit(train_X, train_Y)
####################
print_report([rfc], [test_X], test_Y, ["X"])
####################
export_model(rfc, use_case, layer_name)
\ No newline at end of file
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from processing.ml.train_single_context import run_single_training
from processing.ml.train_cross_context import run_cross_training
if __name__ == '__main__':
use_case='community-prediction-youtube-n'
run_single_training(use_case)
run_cross_training(use_case)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment