Commit 13dd065e authored by Alexander Lercher's avatar Alexander Lercher

Merge branch 'feature/community-prediction' into 'develop'

Feature/community prediction

See merge request !48
parents 13427b9e 784399e0
......@@ -77,7 +77,7 @@ Contains the clustering results. Clustering is performed on all nodes inside one
```GET https://articonf1.itec.aau.at:30103/api/use-cases/{use_case}/tables/{table}/layers/{layer_name}/clusters``` returns the identified clusters.
```GET https://articonf1.itec.aau.at:30103/api/use-cases/{use_case}/tables/{table}/layers/{layer_name}/timeslices``` returns the identified clusters partitioned based on their nodes' timestamps.
```GET https://articonf1.itec.aau.at:30103/api/use-cases/{use_case}/tables/{table}/layers/{layer_name}/timeslices``` returns the identified clusters partitioned based on their nodes' [timestamps](schema_information.md#time-slices).
## RunId
When a similarity computation is executed, it has an associated RunId which is used to uniquely identify that execution.
......@@ -93,3 +93,14 @@ Returns the computed similarity. Two clusters belonging to the SAME layer will b
Intermediary data-structure used only by the function which computes the similarity. Clusters are connected only to other clusters belonging to a DIFFERENT layer.
```GET https://articonf1.itec.aau.at:30103/api/use_cases/{use_case}/tables/{table}/connectedClusters``` returns all connected clusters for the given use-case and table.
# Proactive Community Detection Microservice
https://articonf1.itec.aau.at:30105/api/ui/
This microservice contains predictions of the cluster sizes from the clusters in [role stage discovery microservice](https://articonf1.itec.aau.at:30103/api/ui/#!/Clusters/routes_clustersets_get_by_name) for the week following the latest data in SMART. The timestamps used for temporal division are documented [here](schema_information.md#time-slices).
Example: Layer $L$ contains 3 clusters with sizes 3, 0, 7 in the most recent week $t$. SMART predicts the sizes in the following week $t+1$ as 5, 0, 6 based on each cluster's structural changes over the last $N=3$ weeks, i.e. $t,\ t-1,\ t-2$.
```GET https://articonf1.itec.aau.at:30105/api/use-cases/{use_case}/tables/{table}/layers/{layer_name}/predictions```
contains the size predictions for all clusters of a layer derived as described above.
\ No newline at end of file
......@@ -119,4 +119,41 @@ __Example:__ The node for the initial trace of the pizzashop and the layer ```na
"fullName": "name+description",
"firstTopping": "toppings[0]//name"
}
}
\ No newline at end of file
}
# Clusters
## Time Slices
Time slices are clusters split into time windows. Currently SMART creates one time slice per week for each cluster.
Currently, the timestamp information is _not_ integrated into the table schema mapping and _not_ converted into a UNIX timestamp during upload.
The following fields are considered timestamps during the partitioning:
```yaml
vialog-enum:
video: created
change: /
car-sharing-official:
car: /
hash: /
media: /
offer: available
publication: date
travel: startDate
travelCancelledBy: moment
travelFinishedBy: moment
travelStartedBy: moment
travelSuggestedEndPlaces: /
travelUsers: /
user: /
offerEndPlaces: /
smart-energy:
smart-energy: Timestamp
crowd-journalism-enum:
video: creationTimestamp
tag: /
classification: lastUpdate
event: /
purchase: timestamp
```
# contains raw data for machine learning
data/
# backup data for machine learning debugging
data_bak/
\ No newline at end of file
......@@ -34,14 +34,13 @@ class Repository(MongoRepositoryBase):
super().drop_collection(collection_)
#region LayerPair
#region Use Case
def add_use_case(self, use_case: str):
    '''Stores a new use-case entry with the given name in the use-case collection.'''
    entry = {'name': use_case}
    super().insert_entry(self._use_case_collection, entry)
def get_use_cases(self) -> List[str]:
    '''Returns the names of all stored use cases.'''
    return [entry['name'] for entry in super().get_entries(self._use_case_collection)]
#endregion
#region Layers
......@@ -53,7 +52,7 @@ class Repository(MongoRepositoryBase):
entries = super().get_entries(self._layer_collection, projection={'_id': 0})
return [LayerDao(e) for e in entries]
def get_layers_for_use_case(self, use_case: str) -> LayerDao:
def get_layers_for_use_case(self, use_case: str) -> List[LayerDao]:
    '''Fetches all layer entries that belong to the given use case.'''
    matches = super().get_entries(self._layer_collection, selection={'use_case': use_case})
    return list(map(LayerDao, matches))
......@@ -132,6 +131,13 @@ class Repository(MongoRepositoryBase):
entries = super().get_entries(self._prediction_result_collection, selection={'use_case': use_case}, projection={'_id': 0})
return [PredictionResult.create_from_dict(e) for e in entries]
def get_prediction_results_for_layer(self, use_case: str, use_case_table: str, layer_name: str) -> List[PredictionResult]:
    '''Fetches all prediction results stored for one layer of a use-case table (Mongo _id excluded).'''
    selection = {'use_case': use_case, 'table': use_case_table, 'layer': layer_name}
    entries = super().get_entries(self._prediction_result_collection, selection=selection, projection={'_id': 0})
    return list(map(PredictionResult.create_from_dict, entries))
def delete_all_prediction_results(self):
    '''Drops the whole prediction-result collection, removing results for every use case.'''
    super().drop_collection(self._prediction_result_collection)
def delete_prediction_results(self, use_case: str):
    '''Removes only the prediction results belonging to the given use case.'''
    super().delete_many(self._prediction_result_collection, selection={'use_case': use_case})
#endregion
......@@ -7,7 +7,7 @@ import json
import os
from entities import TimeWindow, Cluster
def store_metrics_for_clusters(use_case: str, layer_name: str, feature_names: List[str]):
def store_metrics_for_clusters(use_case: str, table: str, layer_name: str, feature_names: List[str]):
'''
:param layer_name: Name of the layer for which multiple time windows exist
:param feature_names: Features of the layer
......@@ -15,7 +15,7 @@ def store_metrics_for_clusters(use_case: str, layer_name: str, feature_names: Li
print(f"Working on {layer_name} cluster metrics")
# load global cluster centers
path_in = f'data/{use_case}/raw/clusters/{layer_name}.json'
path_in = f'data/{use_case}/{table}/raw/clusters/{layer_name}.json'
with open(path_in, 'r') as file:
clusters = json.loads(file.read())
cluster_centers: Dict[str, Tuple[float]] = {
......@@ -24,9 +24,9 @@ def store_metrics_for_clusters(use_case: str, layer_name: str, feature_names: Li
if cluster['label'] != 'noise'
}
path_in = f'data/{use_case}/raw/timeslices/{layer_name}'
Path(f'data/{use_case}/cluster_metrics/').mkdir(parents=True, exist_ok=True)
path_out = f'data/{use_case}/cluster_metrics/{layer_name}.json'
path_in = f'data/{use_case}/{table}/raw/timeslices/{layer_name}'
Path(f'data/{use_case}/{table}/cluster_metrics/').mkdir(parents=True, exist_ok=True)
path_out = f'data/{use_case}/{table}/cluster_metrics/{layer_name}.json'
complete_clusters: List[Cluster] = []
......@@ -54,7 +54,7 @@ import collections
import numpy as np
from typing import Iterable, Tuple
def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) -> Iterable[list]:
def create_metrics_training_data(use_case: str, table: str, layer_name: str, N: int = 3) -> Iterable[list]:
"""
Loads the metrics training data for an individual layer from disk.
A single metrics training data point should look like this:
......@@ -70,7 +70,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) ->
:param layer_name: the name of the layer metrics json file
"""
path_in = f"data/{use_case}/cluster_metrics/{layer_name}.json"
path_in = f"data/{use_case}/{table}/cluster_metrics/{layer_name}.json"
with open(path_in, 'r') as file:
data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]
......@@ -94,7 +94,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) ->
tuples.append(cur_metrics)
if len(tuples) == N:
label = get_evolution_label(cur_cluster.size, data[i+1].size)
label = data[i+1].size # get_evolution_label(cur_cluster.size, data[i+1].size)
yield list(tuples) + [label]
############################
def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
......@@ -122,9 +122,9 @@ def balance_dataset(df: DataFrame) -> DataFrame:
# nothing happening here, balance only on real training, not during prep
return df
def store_training_data(use_case: str, layer_name: str):
def store_training_data(use_case: str, table: str, layer_name: str):
# load metrics data from disk
data: Iterable = create_metrics_training_data(use_case=use_case, layer_name=layer_name)
data: Iterable = create_metrics_training_data(use_case=use_case, table=table, layer_name=layer_name)
# flatten and convert to df
df = convert_metrics_data_to_dataframe(data, columns=COLUMNS, flattening_method=flatten_metrics_datapoint)
......@@ -135,8 +135,8 @@ def store_training_data(use_case: str, layer_name: str):
# shuffle
df = df.sample(frac=1).reset_index(drop=True)
Path(f'data/{use_case}/ml_input/single_context/').mkdir(parents=True, exist_ok=True)
df.to_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv')
Path(f'data/{use_case}/{table}/ml_input/single_context/').mkdir(parents=True, exist_ok=True)
df.to_csv(f'data/{use_case}/{table}/ml_input/single_context/{layer_name}.csv')
#######################
......@@ -159,13 +159,13 @@ def run(use_case=None):
for use_case in use_cases:
print(f"Executing cluster metrics calc for use case {use_case}")
layers = [[l.layer_name, l.properties] for l in repo.get_layers_for_use_case(use_case)]
layers = repo.get_layers_for_use_case(use_case)
##################
for layer in layers:
store_metrics_for_clusters(use_case, layer[0], layer[1])
store_metrics_for_clusters(layer.use_case, layer.use_case_table, layer.layer_name, layer.properties)
###################
for name, _ in layers:
print(f"Storing training data for {name}")
store_training_data(use_case, layer_name=name)
for layer in layers:
print(f"Storing training data for {layer.layer_name}")
store_training_data(layer.use_case, layer.use_case_table, layer.layer_name)
\ No newline at end of file
......@@ -9,11 +9,11 @@ import os
from entities import TimeWindow, Layer
from processing import ClusterMetricsCalculatorFactory
def store_metrics_for_layers(use_case: str, layer_name: str, feature_names: List[str]):
def store_metrics_for_layers(use_case: str, table: str, layer_name: str, feature_names: List[str]):
print(f"Working on {layer_name} layer metrics")
# load global cluster centers
path_in = f'data/{use_case}/raw/clusters/{layer_name}.json'
path_in = f'data/{use_case}/{table}/raw/clusters/{layer_name}.json'
with open(path_in, 'r') as file:
clusters = json.loads(file.read())
cluster_centers: Dict[str, Tuple[float]] = {
......@@ -24,7 +24,7 @@ def store_metrics_for_layers(use_case: str, layer_name: str, feature_names: List
# load time windows
all_layers: List[Layer] = []
path_in = f'data/{use_case}/raw/timeslices/{layer_name}'
path_in = f'data/{use_case}/{table}/raw/timeslices/{layer_name}'
for root, _, files in os.walk(path_in):
for f in files:
with open(os.path.join(root, f), 'r') as file:
......@@ -35,8 +35,8 @@ def store_metrics_for_layers(use_case: str, layer_name: str, feature_names: List
all_layers.append(layer)
# store the layer metrics
Path(f'data/{use_case}/layer_metrics/').mkdir(parents=True, exist_ok=True)
path_out = f'data/{use_case}/layer_metrics/{layer_name}.json'
Path(f'data/{use_case}/{table}/layer_metrics/').mkdir(parents=True, exist_ok=True)
path_out = f'data/{use_case}/{table}/layer_metrics/{layer_name}.json'
with open(path_out, 'w') as file:
file.write(json.dumps([l.__dict__ for l in all_layers]))
#########################
......@@ -63,7 +63,7 @@ from typing import Iterable, List, Dict, Any
import json
from entities import Layer, Cluster
def create_layer_metrics_training_data(use_case: str, layer_name: str, reference_layer: str, N: int = 2) -> Iterable:
def create_layer_metrics_training_data(use_case: str, table: str, layer_name: str, reference_layer: str, N: int = 2) -> Iterable:
"""
Loads the metrics training data for an individual layer from disk.
......@@ -83,12 +83,12 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
if N != 2:
raise NotImplementedError("N is not implemented and fixed to 2!")
with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
with open(f'data/{use_case}/{table}/cluster_metrics/{layer_name}.json') as file:
cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
cluster_ids = {c.cluster_id for c in cluster_metrics}
cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}
with open(f'data/{use_case}/layer_metrics/{reference_layer}.json') as file:
with open(f'data/{use_case}/{table}/layer_metrics/{reference_layer}.json') as file:
layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}
......@@ -98,7 +98,7 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
# go through all time windows once...
prev_time_key = ordered_time_keys[0]
for current_time_key in ordered_time_keys[1:]:
for idx, current_time_key in enumerate(ordered_time_keys[1:-1]):
# ...and load the current and previous layer metrics in the reference_layer
current_layer_metric = layer_metrics[current_time_key]
prev_layer_metric = layer_metrics[prev_time_key]
......@@ -110,7 +110,8 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference
for cluster_id in cluster_ids:
current_cluster_metric = cluster_metrics[(current_time_key, cluster_id)]
prev_cluster_metric = cluster_metrics[(prev_time_key, cluster_id)]
evolution_label = get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size)
next_time_key = ordered_time_keys[idx+2]
evolution_label = cluster_metrics[(next_time_key, cluster_id)].size # get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size)
# yield each combination of reference layer metrics to clusters
yield [prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id), evolution_label]
......@@ -149,9 +150,9 @@ def balance_dataset(df: DataFrame) -> DataFrame:
# nothing happening here, balance only on real training, not during prep
return df
def store_training_data(use_case: str, layer_name: str, reference_layer_name: str):
def store_training_data(use_case: str, table: str, layer_name: str, reference_layer_name: str):
# load metrics data from disk
data: Iterable = create_layer_metrics_training_data(use_case=use_case, layer_name=layer_name, reference_layer=reference_layer_name)
data: Iterable = create_layer_metrics_training_data(use_case=use_case, table=table, layer_name=layer_name, reference_layer=reference_layer_name)
# convert to X and Y
df = convert_metrics_data_to_dataframe(data, columns=get_columns(N=2), flattening_method=flatten_layer_metrics_datapoint)
......@@ -162,8 +163,8 @@ def store_training_data(use_case: str, layer_name: str, reference_layer_name: st
# shuffle
df = df.sample(frac=1).reset_index(drop=True)
Path(f'data/{use_case}/ml_input/cross_context/').mkdir(parents=True, exist_ok=True)
df.to_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv')
Path(f'data/{use_case}/{table}/ml_input/cross_context/').mkdir(parents=True, exist_ok=True)
df.to_csv(f'data/{use_case}/{table}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv')
#########################
......@@ -186,16 +187,16 @@ def run(use_case=None):
for use_case in use_cases:
print(f"Executing layer metrics calc for use case {use_case}")
layers = [[l.layer_name, l.properties] for l in repo.get_layers_for_use_case(use_case)]
layers = repo.get_layers_for_use_case(use_case)
layer_pairs = repo.get_layer_pairs(use_case)
################
for layer in layers:
try:
store_metrics_for_layers(use_case, layer[0], layer[1])
store_metrics_for_layers(layer.use_case, layer.use_case_table, layer.layer_name, layer.properties)
except FileNotFoundError:
pass
###############
for ld in layer_pairs:
print(f"Storing training data for {ld.layer} with L_R={ld.reference_layer}")
store_training_data(use_case, layer_name=ld.layer, reference_layer_name=ld.reference_layer)
\ No newline at end of file
store_training_data(ld.use_case, table=ld.table, layer_name=ld.layer, reference_layer_name=ld.reference_layer)
\ No newline at end of file
......@@ -4,6 +4,7 @@ from processing.data_prep.layer_metrics_calc import run as lrun
from pathlib import Path
import json
import os
from typing import List
from db.repository import Repository
......@@ -12,11 +13,11 @@ repo = Repository()
def store_clusters_as_files(use_case):
path_ = f'data/{use_case}/raw/clusters/'
Path(path_).mkdir(parents=True, exist_ok=True)
layers = repo.get_layers_for_use_case(use_case)
for l in layers:
path_ = f'data/{l.use_case}/{l.use_case_table}/raw/clusters/'
Path(path_).mkdir(parents=True, exist_ok=True)
clusters = repo.get_clusters_for_layer(use_case, l.use_case_table, l.layer_name)
with open(os.path.join(path_, f'{l.layer_name}.json'), 'w') as file_:
......@@ -24,30 +25,25 @@ def store_clusters_as_files(use_case):
def store_time_slices_as_files(use_case):
path_ = f'data/{use_case}/raw/timeslices/'
layers = repo.get_layers_for_use_case(use_case)
for l in layers:
Path(os.path.join(path_, l.layer_name)).mkdir(parents=True, exist_ok=True)
path_ = f'data/{l.use_case}/{l.use_case_table}/raw/timeslices/{l.layer_name}/'
Path(path_).mkdir(parents=True, exist_ok=True)
time_slices = repo.get_time_slices_for_layer(use_case, l.use_case_table, l.layer_name)
for ts in time_slices:
with open(os.path.join(path_, l.layer_name, f'{ts.time}.json'), 'w') as file_:
with open(os.path.join(path_, f'{ts.time}.json'), 'w') as file_:
file_.write(json.dumps(ts.to_serializable_dict()))
def run(use_case=None):
def run(use_cases: List[str] = None):
'''Prepares training data for single and cross-context using the file system (data/)'''
if use_case is not None:
use_cases = [use_case]
else:
if use_cases is None:
use_cases = repo.get_use_cases()
for use_case in use_cases:
store_clusters_as_files(use_case)
store_time_slices_as_files(use_case)
crun(use_case)
lrun(use_case)
......@@ -11,8 +11,8 @@ def increase_time_window(time_window_id: str) -> str:
from typing import Tuple
import pickle
def load_ml_models(use_case, method, layer_name, reference_layer_name=None) -> Tuple['scaler', 'clf']:
path_ = f'data/{use_case}/ml_output/{method}/{layer_name}'
def load_ml_models(use_case, table, method, layer_name, reference_layer_name=None) -> Tuple['scaler', 'clf']:
path_ = f'data/{use_case}/{table}/ml_output/{method}/{layer_name}'
if method == 'single_context':
with open(f'{path_}.model', 'rb') as file:
......
......@@ -44,18 +44,19 @@ repo = Repository()
def run_prediction(use_case: str):
for layerpair in repo.get_layer_pairs(use_case):
table = layerpair.table
layer_name = layerpair.layer
reference_layer_name = layerpair.reference_layer
print(f"Predicting {method} for {use_case}//{layer_name} based on {reference_layer_name}")
print(f"Predicting {method} for {use_case}//{table}//{layer_name} based on {reference_layer_name}")
##########################
with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
with open(f'data/{use_case}/{table}/cluster_metrics/{layer_name}.json') as file:
cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
cluster_ids = {c.cluster_id for c in cluster_metrics}
cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}
with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:
with open(f'data/{use_case}/{table}/layer_metrics/{reference_layer_name}.json') as file:
layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}
######################
......@@ -77,7 +78,7 @@ def run_prediction(use_case: str):
# yield each combination of reference layer metrics to clusters
prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])
#######################
scaler, svc = load_ml_models(use_case, method, layer_name, reference_layer_name)
scaler, svc = load_ml_models(use_case, table, method, layer_name, reference_layer_name)
################
prediction_cluster_ids = []
prediction_time_window = increase_time_window(ordered_time_keys[1])
......@@ -91,8 +92,9 @@ def run_prediction(use_case: str):
prediction_metrics.append(flat_)
prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))
print(np.unique(prediction_results, return_counts=True))
# print(np.unique(prediction_results, return_counts=True))
prediction_results = np.rint(prediction_results) # round to full numbers
for i in range(len(prediction_cluster_ids)):
res = PredictionResult(use_case, use_case, method, layer_name, reference_layer_name, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])
res = PredictionResult(use_case, table, method, layer_name, reference_layer_name, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])
repo.add_prediction_result(res)
......@@ -37,14 +37,19 @@ repo = Repository()
def run_prediction(use_case: str):
for layer in repo.get_layers_for_use_case(use_case):
table = layer.use_case_table
layer_name = layer.layer_name
print(f"Predicting {method} for {use_case}//{layer_name}")
print(f"Predicting {method} for {use_case}//{table}//{layer_name}")
#################
path_in = f"data/{use_case}/cluster_metrics/{layer_name}.json"
path_in = f"data/{use_case}/{table}/cluster_metrics/{layer_name}.json"
with open(path_in, 'r') as file:
data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]
if len(data) == 0:
print(f"No data for predicting {use_case}//{table}//{layer_name}.")
continue
data.sort(key=lambda cl: (eval(cl.cluster_id), eval(cl.time_window_id)))
#####################
cluster_map: Dict['cluster_id', 'time_windows'] = {}
......@@ -57,7 +62,7 @@ def run_prediction(use_case: str):
cluster_map[id_].append(cluster)
####################
scaler, svc = load_ml_models(use_case, method, layer_name)
scaler, svc = load_ml_models(use_case, table, method, layer_name)
#####################
# store id, future time window, and flattened metrics to combine the latter during prediction
prediction_cluster_ids = []
......@@ -74,8 +79,9 @@ def run_prediction(use_case: str):
# predict all at once for speedup
prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))
print(np.unique(prediction_results, return_counts=True))
# print(np.unique(prediction_results, return_counts=True))
prediction_results = np.rint(prediction_results) # round to full numbers
for i in range(len(prediction_cluster_ids)):
res = PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])
res = PredictionResult(use_case, table, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])
repo.add_prediction_result(res)
......@@ -29,15 +29,29 @@ def remove_empty_community_class(df):
########################
import sklearn.metrics
def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
def print_classification_report(clf, test_X, test_Y, title):
"""
Prints all reports.
:param clfs: list of classifiers to evaluate
:param test_Xs: list of test_X for the corresponding classifier at idx
:param test_Y: true classes
:param titles: list of titles for the classifiers at idx
Prints a classification report.
:param clf: classifier to evaluate
:param test_X: input features X
:param test_Y: true classes Y
:param title: title for the report
"""
for clf, test_X, title in zip(clfs, test_Xs, titles):
pred_Y = clf.predict(test_X)
print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
pred_Y = clf.predict(test_X)
print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
def print_regression_report(clf, test_X, test_Y, title):
    """
    Prints a regression report.

    :param clf: regressor to evaluate
    :param test_X: input features X
    :param test_Y: true prediction values
    :param title: title for the report
    """
    # round to full numbers, matching how predictions are stored downstream
    predictions = np.rint(clf.predict(test_X))
    r2 = sklearn.metrics.r2_score(y_true=test_Y, y_pred=predictions)
    mse = sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=predictions)
    # "sanity" baseline: MSE of an all-zero prediction, for comparison against the model's MSE
    zero_baseline = sklearn.metrics.mean_squared_error(y_true=test_Y, y_pred=[0] * len(predictions))
    print(f"### {title} ###\nR2-score={r2}, MSE={mse}, sanity={zero_baseline}")
########################
\ No newline at end of file
import pandas as pd
from pandas import DataFrame
from processing.ml.train_base import split_data, remove_empty_community_class, print_report
from processing.ml.train_base import split_data, remove_empty_community_class, print_regression_report
approach = 'cross_context'
max_sampling_size = 20000
#######################
import pickle
from pathlib import Path
def export_model(model, use_case, layer_name, reference_layer_name, scaler=False):
fpath = f'data/{use_case}/ml_output/{approach}'
def export_model(model, use_case, table, layer_name, reference_layer_name, scaler=False):
    '''Pickles a trained cross-context model (or its scaler when scaler=True) into the use-case/table ml_output folder.'''
    fpath = f'data/{use_case}/{table}/ml_output/{approach}'
    Path(fpath).mkdir(parents=True, exist_ok=True)
    suffix = "_scaler" if scaler else ""
    with open(f'{fpath}/{layer_name}_{reference_layer_name}{suffix}.model', 'wb') as f:
        pickle.dump(model, f)
###################
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
n_estimators = 50
criterion = 'gini'
criterion = 'mse'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease= 1E-5
bootstrap=True
####################
from sklearn.svm import LinearSVR
tol = 1E-4
c = 1
loss = 'squared_epsilon_insensitive'
dual = False
###############
......@@ -32,16 +39,19 @@ repo = Repository()
def run_training(use_case):
for layerpair in repo.get_layer_pairs(use_case):
table = layerpair.table
layer_name = layerpair.layer
reference_layer_name = layerpair.reference_layer
df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)
df: DataFrame = pd.read_csv(f'data/{use_case}/{table}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)
if df.empty:
print(f"No data for training {use_case}//{table}//{layer_name} on {reference_layer_name}.")
continue
#######################
training, testing = split_data(df, shuffle=False)
#####################
training = remove_empty_community_class(training)
testing = remove_empty_community_class(testing)
training.sample(frac=min(1, max_sampling_size/len(training))).reset_index(drop=True)
#####################
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
......@@ -52,22 +62,17 @@ def run_training(use_case):
test_X = scaler.transform(testing[testing.columns[:-1]]) # all except y
test_Y = testing[testing.columns[-1]]
export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True)
export_model(scaler, use_case, table, layer_name, reference_layer_name, scaler=True)
########################
from processing import DataSampler
sampler = DataSampler()
try:
train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
except ValueError as e: # not enough points for oversampling
print(f"Could not sample training data, using original distribution: {e}")
####################
rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
# RF is a lot better than SVM, but I did not tune hyperparameters for regression
rfc = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
bootstrap=bootstrap)
# rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
rfc.fit(train_X, train_Y)
print_report([rfc], [test_X], test_Y, ["X"])
export_model(rfc, use_case, layer_name, reference_layer_name)
####################
print_regression_report(rfc, test_X, test_Y, f"{layer_name} based on {reference_layer_name}")
####################
export_model(rfc, use_case, table, layer_name, reference_layer_name)
\ No newline at end of file
import pandas as pd
from pandas import DataFrame
from processing.ml.train_base import split_data, remove_empty_community_class, print_report
from processing.ml.train_base import split_data, remove_empty_community_class, print_regression_report
approach = 'single_context'
max_sampling_size = 20000
#######################
import pickle
from pathlib import Path
def export_model(model, use_case, layer_name, scaler=False):
fpath = f'data/{use_case}/ml_output/{approach}'
def export_model(model, use_case, table, layer_name, scaler=False):
    '''Pickles a trained single-context model (or its scaler when scaler=True) into the use-case/table ml_output folder.'''
    fpath = f'data/{use_case}/{table}/ml_output/{approach}'
    Path(fpath).mkdir(parents=True, exist_ok=True)
    suffix = "_scaler" if scaler else ""
    with open(f'{fpath}/{layer_name}{suffix}.model', 'wb') as f:
        pickle.dump(model, f)
#####################
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor