Commit dcb2b221 authored by Alexander Lercher

Added support for multiple use-case tables

parent ab96c298
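Every file-system artifact (raw clusters, time slices, cluster and layer metrics, ML input and output) is now namespaced by the use-case table in addition to the use case. A minimal sketch of the new path scheme, with placeholder names; the helper below is illustrative and not part of the commit:

```python
# Illustrative sketch only: shows the path-layout change this commit introduces.
# `use_case`, `table`, and `layer_name` values are placeholders.
def cluster_metrics_path(use_case: str, table: str, layer_name: str) -> str:
    # before: f'data/{use_case}/cluster_metrics/{layer_name}.json'
    # after:  one extra path segment for the use-case table
    return f'data/{use_case}/{table}/cluster_metrics/{layer_name}.json'

print(cluster_metrics_path('community-prediction-youtube-n', 'some_table', 'SomeLayer'))
# -> data/community-prediction-youtube-n/some_table/cluster_metrics/SomeLayer.json
```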
@@ -34,14 +34,13 @@ class Repository(MongoRepositoryBase):
         super().drop_collection(collection_)
-    #region LayerPair
+    #region Use Case
     def add_use_case(self, use_case: str):
         super().insert_entry(self._use_case_collection, {'name':use_case})
     def get_use_cases(self) -> List[str]:
         entries = super().get_entries(self._use_case_collection)
         return [e['name'] for e in entries]
     #endregion
     #region Layers
@@ -53,7 +52,7 @@ class Repository(MongoRepositoryBase):
         entries = super().get_entries(self._layer_collection, projection={'_id': 0})
         return [LayerDao(e) for e in entries]
-    def get_layers_for_use_case(self, use_case: str) -> LayerDao:
+    def get_layers_for_use_case(self, use_case: str) -> List[LayerDao]:
         entries = super().get_entries(self._layer_collection, selection={'use_case': use_case})
         return [LayerDao(e) for e in entries]
@@ -138,4 +137,7 @@ class Repository(MongoRepositoryBase):
     def delete_all_prediction_results(self):
         super().drop_collection(self._prediction_result_collection)
+    def delete_prediction_results(self, use_case: str):
+        super().delete_many(self._prediction_result_collection, selection={'use_case': use_case})
     #endregion
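For reference, a hedged usage sketch of the changed repository API: `get_layers_for_use_case` is now annotated to return a list of `LayerDao`, and the new `delete_prediction_results` removes results for a single use case instead of dropping the whole collection. The use-case name below is a placeholder and a reachable MongoDB behind `Repository` is assumed:

```python
# Sketch only; assumes a configured MongoDB and an existing use case.
from db.repository import Repository

repo = Repository()
layers = repo.get_layers_for_use_case('community-prediction-youtube-n')  # -> List[LayerDao]
for layer in layers:
    print(layer.layer_name, layer.use_case_table)

# drop predictions for one use case only, instead of the whole collection
repo.delete_prediction_results('community-prediction-youtube-n')
```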
@@ -7,7 +7,7 @@ import json
 import os
 from entities import TimeWindow, Cluster
-def store_metrics_for_clusters(use_case: str, layer_name: str, feature_names: List[str]):
+def store_metrics_for_clusters(use_case: str, table: str, layer_name: str, feature_names: List[str]):
     '''
     :param layer_name: Name of the layer for which multiple time windows exist
     :param feature_names: Features of the layer
@@ -15,7 +15,7 @@ def store_metrics_for_clusters(use_case: str, layer_name: str, feature_names: List[str]):
     print(f"Working on {layer_name} cluster metrics")
     # load global cluster centers
-    path_in = f'data/{use_case}/raw/clusters/{layer_name}.json'
+    path_in = f'data/{use_case}/{table}/raw/clusters/{layer_name}.json'
     with open(path_in, 'r') as file:
         clusters = json.loads(file.read())
     cluster_centers: Dict[str, Tuple[float]] = {
@@ -24,9 +24,9 @@ def store_metrics_for_clusters(use_case: str, layer_name: str, feature_names: List[str]):
         if cluster['label'] != 'noise'
     }
-    path_in = f'data/{use_case}/raw/timeslices/{layer_name}'
-    Path(f'data/{use_case}/cluster_metrics/').mkdir(parents=True, exist_ok=True)
-    path_out = f'data/{use_case}/cluster_metrics/{layer_name}.json'
+    path_in = f'data/{use_case}/{table}/raw/timeslices/{layer_name}'
+    Path(f'data/{use_case}/{table}/cluster_metrics/').mkdir(parents=True, exist_ok=True)
+    path_out = f'data/{use_case}/{table}/cluster_metrics/{layer_name}.json'
     complete_clusters: List[Cluster] = []
@@ -54,7 +54,7 @@ import collections
 import numpy as np
 from typing import Iterable, Tuple
-def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) -> Iterable[list]:
+def create_metrics_training_data(use_case: str, table: str, layer_name: str, N: int = 3) -> Iterable[list]:
     """
     Loads the metrics training data for an individual layer from disk.
     A single metrics training data point should look like this:
@@ -70,7 +70,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) -> Iterable[list]:
     :param layer_name: the name of the layer metrics json file
     """
-    path_in = f"data/{use_case}/cluster_metrics/{layer_name}.json"
+    path_in = f"data/{use_case}/{table}/cluster_metrics/{layer_name}.json"
     with open(path_in, 'r') as file:
         data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]
@@ -122,9 +122,9 @@ def balance_dataset(df: DataFrame) -> DataFrame:
     # nothing happening here, balance only on real training, not during prep
     return df
-def store_training_data(use_case: str, layer_name: str):
+def store_training_data(use_case: str, table: str, layer_name: str):
     # load metrics data from disk
-    data: Iterable = create_metrics_training_data(use_case=use_case, layer_name=layer_name)
+    data: Iterable = create_metrics_training_data(use_case=use_case, table=table, layer_name=layer_name)
     # flatten and convert to df
     df = convert_metrics_data_to_dataframe(data, columns=COLUMNS, flattening_method=flatten_metrics_datapoint)
@@ -135,8 +135,8 @@ def store_training_data(use_case: str, layer_name: str):
     # shuffle
     df = df.sample(frac=1).reset_index(drop=True)
-    Path(f'data/{use_case}/ml_input/single_context/').mkdir(parents=True, exist_ok=True)
-    df.to_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv')
+    Path(f'data/{use_case}/{table}/ml_input/single_context/').mkdir(parents=True, exist_ok=True)
+    df.to_csv(f'data/{use_case}/{table}/ml_input/single_context/{layer_name}.csv')
 #######################
@@ -159,13 +159,13 @@ def run(use_case=None):
     for use_case in use_cases:
         print(f"Executing cluster metrics calc for use case {use_case}")
-        layers = [[l.layer_name, l.properties] for l in repo.get_layers_for_use_case(use_case)]
+        layers = repo.get_layers_for_use_case(use_case)
         ##################
         for layer in layers:
-            store_metrics_for_clusters(use_case, layer[0], layer[1])
+            store_metrics_for_clusters(layer.use_case, layer.use_case_table, layer.layer_name, layer.properties)
         ###################
-        for name, _ in layers:
-            print(f"Storing training data for {name}")
-            store_training_data(use_case, layer_name=name)
+        for layer in layers:
+            print(f"Storing training data for {layer.layer_name}")
+            store_training_data(layer.use_case, layer.use_case_table, layer.layer_name)
\ No newline at end of file
@@ -9,11 +9,11 @@ import os
 from entities import TimeWindow, Layer
 from processing import ClusterMetricsCalculatorFactory
-def store_metrics_for_layers(use_case: str, layer_name: str, feature_names: List[str]):
+def store_metrics_for_layers(use_case: str, table: str, layer_name: str, feature_names: List[str]):
     print(f"Working on {layer_name} layer metrics")
     # load global cluster centers
-    path_in = f'data/{use_case}/raw/clusters/{layer_name}.json'
+    path_in = f'data/{use_case}/{table}/raw/clusters/{layer_name}.json'
     with open(path_in, 'r') as file:
         clusters = json.loads(file.read())
     cluster_centers: Dict[str, Tuple[float]] = {
@@ -24,7 +24,7 @@ def store_metrics_for_layers(use_case: str, layer_name: str, feature_names: List[str]):
     # load time windows
     all_layers: List[Layer] = []
-    path_in = f'data/{use_case}/raw/timeslices/{layer_name}'
+    path_in = f'data/{use_case}/{table}/raw/timeslices/{layer_name}'
     for root, _, files in os.walk(path_in):
         for f in files:
             with open(os.path.join(root, f), 'r') as file:
@@ -35,8 +35,8 @@ def store_metrics_for_layers(use_case: str, layer_name: str, feature_names: List[str]):
         all_layers.append(layer)
     # store the layer metrics
-    Path(f'data/{use_case}/layer_metrics/').mkdir(parents=True, exist_ok=True)
-    path_out = f'data/{use_case}/layer_metrics/{layer_name}.json'
+    Path(f'data/{use_case}/{table}/layer_metrics/').mkdir(parents=True, exist_ok=True)
+    path_out = f'data/{use_case}/{table}/layer_metrics/{layer_name}.json'
     with open(path_out, 'w') as file:
         file.write(json.dumps([l.__dict__ for l in all_layers]))
 #########################
@@ -63,7 +63,7 @@ from typing import Iterable, List, Dict, Any
 import json
 from entities import Layer, Cluster
-def create_layer_metrics_training_data(use_case: str, layer_name: str, reference_layer: str, N: int = 2) -> Iterable:
+def create_layer_metrics_training_data(use_case: str, table: str, layer_name: str, reference_layer: str, N: int = 2) -> Iterable:
     """
     Loads the metrics training data for an individual layer from disk.
@@ -83,12 +83,12 @@ def create_layer_metrics_training_data(use_case: str, layer_name: str, reference_layer: str, N: int = 2) -> Iterable:
     if N != 2:
         raise NotImplementedError("N is not implemented and fixed to 2!")
-    with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
+    with open(f'data/{use_case}/{table}/cluster_metrics/{layer_name}.json') as file:
         cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
     cluster_ids = {c.cluster_id for c in cluster_metrics}
     cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}
-    with open(f'data/{use_case}/layer_metrics/{reference_layer}.json') as file:
+    with open(f'data/{use_case}/{table}/layer_metrics/{reference_layer}.json') as file:
         layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
     layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}
@@ -150,9 +150,9 @@ def balance_dataset(df: DataFrame) -> DataFrame:
     # nothing happening here, balance only on real training, not during prep
     return df
-def store_training_data(use_case: str, layer_name: str, reference_layer_name: str):
+def store_training_data(use_case: str, table: str, layer_name: str, reference_layer_name: str):
     # load metrics data from disk
-    data: Iterable = create_layer_metrics_training_data(use_case=use_case, layer_name=layer_name, reference_layer=reference_layer_name)
+    data: Iterable = create_layer_metrics_training_data(use_case=use_case, table=table, layer_name=layer_name, reference_layer=reference_layer_name)
     # convert to X and Y
     df = convert_metrics_data_to_dataframe(data, columns=get_columns(N=2), flattening_method=flatten_layer_metrics_datapoint)
@@ -163,8 +163,8 @@ def store_training_data(use_case: str, layer_name: str, reference_layer_name: str):
     # shuffle
     df = df.sample(frac=1).reset_index(drop=True)
-    Path(f'data/{use_case}/ml_input/cross_context/').mkdir(parents=True, exist_ok=True)
-    df.to_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv')
+    Path(f'data/{use_case}/{table}/ml_input/cross_context/').mkdir(parents=True, exist_ok=True)
+    df.to_csv(f'data/{use_case}/{table}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv')
 #########################
@@ -187,16 +187,16 @@ def run(use_case=None):
     for use_case in use_cases:
         print(f"Executing layer metrics calc for use case {use_case}")
-        layers = [[l.layer_name, l.properties] for l in repo.get_layers_for_use_case(use_case)]
+        layers = repo.get_layers_for_use_case(use_case)
         layer_pairs = repo.get_layer_pairs(use_case)
         ################
         for layer in layers:
             try:
-                store_metrics_for_layers(use_case, layer[0], layer[1])
+                store_metrics_for_layers(layer.use_case, layer.use_case_table, layer.layer_name, layer.properties)
             except FileNotFoundError:
                 pass
         ###############
         for ld in layer_pairs:
             print(f"Storing training data for {ld.layer} with L_R={ld.reference_layer}")
-            store_training_data(use_case, layer_name=ld.layer, reference_layer_name=ld.reference_layer)
+            store_training_data(ld.use_case, table=ld.table, layer_name=ld.layer, reference_layer_name=ld.reference_layer)
\ No newline at end of file
@@ -4,6 +4,7 @@ from processing.data_prep.layer_metrics_calc import run as lrun
 from pathlib import Path
 import json
 import os
+from typing import List
 from db.repository import Repository
@@ -12,11 +13,11 @@ repo = Repository()
 def store_clusters_as_files(use_case):
-    path_ = f'data/{use_case}/raw/clusters/'
-    Path(path_).mkdir(parents=True, exist_ok=True)
     layers = repo.get_layers_for_use_case(use_case)
     for l in layers:
+        path_ = f'data/{l.use_case}/{l.use_case_table}/raw/clusters/'
+        Path(path_).mkdir(parents=True, exist_ok=True)
         clusters = repo.get_clusters_for_layer(use_case, l.use_case_table, l.layer_name)
         with open(os.path.join(path_, f'{l.layer_name}.json'), 'w') as file_:
@@ -24,30 +25,25 @@ def store_clusters_as_files(use_case):
 def store_time_slices_as_files(use_case):
-    path_ = f'data/{use_case}/raw/timeslices/'
     layers = repo.get_layers_for_use_case(use_case)
     for l in layers:
-        Path(os.path.join(path_, l.layer_name)).mkdir(parents=True, exist_ok=True)
+        path_ = f'data/{l.use_case}/{l.use_case_table}/raw/timeslices/{l.layer_name}/'
+        Path(path_).mkdir(parents=True, exist_ok=True)
         time_slices = repo.get_time_slices_for_layer(use_case, l.use_case_table, l.layer_name)
         for ts in time_slices:
-            with open(os.path.join(path_, l.layer_name, f'{ts.time}.json'), 'w') as file_:
+            with open(os.path.join(path_, f'{ts.time}.json'), 'w') as file_:
                 file_.write(json.dumps(ts.to_serializable_dict()))
-def run(use_case=None):
+def run(use_cases: List[str] = None):
     '''Prepares training data for single and cross-context using the file system (data/)'''
-    if use_case is not None:
-        use_cases = [use_case]
-    else:
-        use_cases = repo.get_use_cases()
+    if use_cases is None:
+        use_cases = repo.get_use_cases()
     for use_case in use_cases:
         store_clusters_as_files(use_case)
         store_time_slices_as_files(use_case)
         crun(use_case)
         lrun(use_case)
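With the signature change from `run(use_case=None)` to `run(use_cases: List[str] = None)`, the data-prep entry point now takes an explicit list of use cases; passing `None` still falls back to every use case registered in the repository. A short, hedged usage sketch:

```python
# Sketch only; assumes the modules path is on sys.path and the DB is populated.
from processing.data_prep.main import run

# prepare a specific subset of use cases ...
run(['community-prediction-youtube-n', 'community-prediction-taxi'])

# ... or fall back to every use case known to the repository
run(None)
```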
@@ -11,8 +11,8 @@ def increase_time_window(time_window_id: str) -> str:
 from typing import Tuple
 import pickle
-def load_ml_models(use_case, method, layer_name, reference_layer_name=None) -> Tuple['scaler', 'clf']:
-    path_ = f'data/{use_case}/ml_output/{method}/{layer_name}'
+def load_ml_models(use_case, table, method, layer_name, reference_layer_name=None) -> Tuple['scaler', 'clf']:
+    path_ = f'data/{use_case}/{table}/ml_output/{method}/{layer_name}'
     if method == 'single_context':
         with open(f'{path_}.model', 'rb') as file:
 ...
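`load_ml_models` gains the `table` parameter, so trained models are looked up under the table-specific `ml_output` folder. A hedged sketch of the resulting paths, mirroring the `export_model` changes later in this commit; all concrete names below are placeholders:

```python
# Sketch only; placeholder names, layout as introduced by this commit.
use_case, table, method = 'community-prediction-youtube-n', 'some_table', 'single_context'
layer_name = 'SomeLayer'

path_ = f'data/{use_case}/{table}/ml_output/{method}/{layer_name}'
# single_context models: f'{path_}.model' and f'{path_}_scaler.model'
# cross_context models:  f'{path_}_<reference_layer_name>.model' plus the matching _scaler file
print(path_)
```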
@@ -44,18 +44,19 @@ repo = Repository()
 def run_prediction(use_case: str):
     for layerpair in repo.get_layer_pairs(use_case):
+        table = layerpair.table
         layer_name = layerpair.layer
         reference_layer_name = layerpair.reference_layer
-        print(f"Predicting {method} for {use_case}//{layer_name} based on {reference_layer_name}")
+        print(f"Predicting {method} for {use_case}//{table}//{layer_name} based on {reference_layer_name}")
         ##########################
-        with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
+        with open(f'data/{use_case}/{table}/cluster_metrics/{layer_name}.json') as file:
             cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
         cluster_ids = {c.cluster_id for c in cluster_metrics}
         cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}
-        with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:
+        with open(f'data/{use_case}/{table}/layer_metrics/{reference_layer_name}.json') as file:
             layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
         layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}
         ######################
@@ -77,7 +78,7 @@ def run_prediction(use_case: str):
             # yield each combination of reference layer metrics to clusters
             prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])
         #######################
-        scaler, svc = load_ml_models(use_case, method, layer_name, reference_layer_name)
+        scaler, svc = load_ml_models(use_case, table, method, layer_name, reference_layer_name)
         ################
         prediction_cluster_ids = []
         prediction_time_window = increase_time_window(ordered_time_keys[1])
@@ -95,5 +96,5 @@ def run_prediction(use_case: str):
         prediction_results = np.rint(prediction_results) # round to full numbers
         for i in range(len(prediction_cluster_ids)):
-            res = PredictionResult(use_case, use_case, method, layer_name, reference_layer_name, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])
+            res = PredictionResult(use_case, table, method, layer_name, reference_layer_name, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])
             repo.add_prediction_result(res)
@@ -37,11 +37,12 @@ repo = Repository()
 def run_prediction(use_case: str):
     for layer in repo.get_layers_for_use_case(use_case):
+        table = layer.use_case_table
         layer_name = layer.layer_name
-        print(f"Predicting {method} for {use_case}//{layer_name}")
+        print(f"Predicting {method} for {use_case}//{table}//{layer_name}")
         #################
-        path_in = f"data/{use_case}/cluster_metrics/{layer_name}.json"
+        path_in = f"data/{use_case}/{table}/cluster_metrics/{layer_name}.json"
         with open(path_in, 'r') as file:
             data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]
@@ -57,7 +58,7 @@ def run_prediction(use_case: str):
             cluster_map[id_].append(cluster)
         ####################
-        scaler, svc = load_ml_models(use_case, method, layer_name)
+        scaler, svc = load_ml_models(use_case, table, method, layer_name)
         #####################
         # store id, future time window, and flattened metrics to combine the latter during prediction
         prediction_cluster_ids = []
@@ -78,5 +79,5 @@ def run_prediction(use_case: str):
         prediction_results = np.rint(prediction_results) # round to full numbers
         for i in range(len(prediction_cluster_ids)):
-            res = PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])
+            res = PredictionResult(use_case, table, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])
             repo.add_prediction_result(res)
@@ -9,8 +9,8 @@ max_sampling_size = 20000
 import pickle
 from pathlib import Path
-def export_model(model, use_case, layer_name, reference_layer_name, scaler=False):
-    fpath = f'data/{use_case}/ml_output/{approach}'
+def export_model(model, use_case, table, layer_name, reference_layer_name, scaler=False):
+    fpath = f'data/{use_case}/{table}/ml_output/{approach}'
     Path(fpath).mkdir(parents=True, exist_ok=True)
     with open(f'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
         pickle.dump(model, f)
@@ -39,11 +39,11 @@ repo = Repository()
 def run_training(use_case):
     for layerpair in repo.get_layer_pairs(use_case):
+        table = layerpair.table
         layer_name = layerpair.layer
         reference_layer_name = layerpair.reference_layer
-        df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)
+        df: DataFrame = pd.read_csv(f'data/{use_case}/{table}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)
         #######################
         training, testing = split_data(df, shuffle=False)
         #####################
@@ -58,7 +58,7 @@ def run_training(use_case):
         test_X = scaler.transform(testing[testing.columns[:-1]]) # all except y
         test_Y = testing[testing.columns[-1]]
-        export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True)
+        export_model(scaler, use_case, table, layer_name, reference_layer_name, scaler=True)
         ########################
         # RF is a lot better than SVM, but I did not tune hyperparameters for regression
         rfc = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
@@ -67,8 +67,8 @@ def run_training(use_case):
         # rfc = LinearSVR(loss=loss, C=c, dual=dual, tol=tol)
         rfc.fit(train_X, train_Y)
         ####################
         print_regression_report(rfc, test_X, test_Y, f"{layer_name} based on {reference_layer_name}")
         ####################
-        export_model(rfc, use_case, layer_name, reference_layer_name)
+        export_model(rfc, use_case, table, layer_name, reference_layer_name)
\ No newline at end of file
@@ -9,8 +9,8 @@ max_sampling_size = 20000
 import pickle
 from pathlib import Path
-def export_model(model, use_case, layer_name, scaler=False):
-    fpath = f'data/{use_case}/ml_output/{approach}'
+def export_model(model, use_case, table, layer_name, scaler=False):
+    fpath = f'data/{use_case}/{table}/ml_output/{approach}'
     Path(fpath).mkdir(parents=True, exist_ok=True)
     with open(f'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
         pickle.dump(model, f)
@@ -39,10 +39,10 @@ repo = Repository()
 def run_training(use_case):
     for layer in repo.get_layers_for_use_case(use_case):
+        table = layer.use_case_table
         layer_name = layer.layer_name
-        df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)
+        df: DataFrame = pd.read_csv(f'data/{use_case}/{table}/ml_input/single_context/{layer_name}.csv', index_col=0)
         #######################
         training, testing = split_data(df, shuffle=False)
         #####################
@@ -57,7 +57,7 @@ def run_training(use_case):
        test_X = scaler.transform(testing[testing.columns[:-1]]) # all except y
         test_Y = testing[testing.columns[-1]]
-        export_model(scaler, use_case, layer_name, scaler=True)
+        export_model(scaler, use_case, table, layer_name, scaler=True)
         ########################
         # RF is 10-20% better compared to SVM, but I did not tune hyperparameters for regression
         rfc = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
@@ -69,5 +69,5 @@ def run_training(use_case):
         ####################
         print_regression_report(rfc, test_X, test_Y, layer_name)
         ####################
-        export_model(rfc, use_case, layer_name)
+        export_model(rfc, use_case, table, layer_name)
\ No newline at end of file
-import sys
-import os
-modules_path = '../../../modules/'
-if os.path.exists(modules_path):
-    sys.path.insert(1, modules_path)
-from processing.data_prep.main import run
-if __name__ == '__main__':
-    '''Creates data/raw files'''
-    run(use_case='community-prediction-youtube-n')
\ No newline at end of file
@@ -9,8 +9,10 @@ import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 from processing.fetching import fetching
+from db.repository import Repository
 if __name__ == "__main__":
     '''Fetches all required data from business-logic and role-stage-discovery.'''
-    fetching.fetch(selected_use_cases=['community-prediction-youtube-n'], selected_use_case_tables=None)
\ No newline at end of file
+    Repository().DROP(confirm=True)
+    use_cases = ['vialog-enum', 'car-sharing-official', 'smart-energy', 'crowd-journalism-enum']+['community-prediction-youtube-n', 'community-prediction-taxi']
+    fetching.fetch(selected_use_cases=use_cases, selected_use_case_tables=None)
\ No newline at end of file
-import sys
-import os
-modules_path = '../../../modules/'
-if os.path.exists(modules_path):
-    sys.path.insert(1, modules_path)
-from db.repository import Repository
-from processing.ml.predict_single_context import run_prediction as run_single_prediction
-from processing.ml.predict_cross_context import run_prediction as run_cross_prediction
-if __name__ == '__main__':
-    '''Executes the predictions.'''
-    use_case='community-prediction-youtube-n'
-    repo = Repository()
-    repo.delete_all_prediction_results()
-    run_single_prediction(use_case)
-    run_cross_prediction(use_case)
\ No newline at end of file
@@ -4,13 +4,42 @@ modules_path = '../../../modules/'
 if os.path.exists(modules_path):
     sys.path.insert(1, modules_path)
+from typing import List
+from db.repository import Repository
+repo = Repository()
+from processing.data_prep.main import run as run_data_prep
+def _run_data_preparation(use_cases: List[str] = None):
+    '''Creates data/raw, data/cluster_metrics, data/layer_metrics, and data/ml_input files.'''
+    run_data_prep(use_cases)
 from processing.ml.train_single_context import run_training as run_single_training
 from processing.ml.train_cross_context import run_training as run_cross_training
+def _run_training(use_cases: List[str] = None):
+    '''Executes the training and creates data/ml_output files.'''
+    for use_case in use_cases:
+        run_single_training(use_case)
+        run_cross_training(use_case)
+from processing.ml.predict_single_context import run_prediction as run_single_prediction
+from processing.ml.predict_cross_context import run_prediction as run_cross_prediction
+def _run_prediction(use_cases: List[str] = None):
+    '''Executes the predictions and stores them in the DB.'''
+    for use_case in use_cases:
+        repo.delete_prediction_results(use_case)
+        run_single_prediction(use_case)
+        run_cross_prediction(use_case)
 if __name__ == '__main__':
-    '''Executes the training.'''
-    use_case='community-prediction-youtube-n'
-    run_single_training(use_case)
-    run_cross_training(use_case)
\ No newline at end of file
+    use_cases = ['vialog-enum', 'car-sharing-official', 'smart-energy', 'crowd-journalism-enum']
+    use_cases = ['community-prediction-youtube-n', 'community-prediction-taxi']
+    _run_data_preparation(use_cases)
+    _run_training(use_cases)
+    _run_prediction(use_cases)
+    # TODO file cleanup
\ No newline at end of file
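Taken together, the consolidated entry point chains data preparation, training, and prediction per use case. A hedged sketch of driving the same pipeline for a single use case directly through the underlying imports, assuming the modules path is importable and raw data has already been fetched:

```python
# Sketch only; placeholder use-case subset, assumes data was fetched beforehand.
from processing.data_prep.main import run as run_data_prep
from processing.ml.train_single_context import run_training as run_single_training
from processing.ml.train_cross_context import run_training as run_cross_training
from processing.ml.predict_single_context import run_prediction as run_single_prediction
from processing.ml.predict_cross_context import run_prediction as run_cross_prediction
from db.repository import Repository

use_cases = ['community-prediction-youtube-n']

run_data_prep(use_cases)                  # data/<use_case>/<table>/{raw,cluster_metrics,layer_metrics,ml_input}
for use_case in use_cases:
    run_single_training(use_case)         # data/<use_case>/<table>/ml_output/single_context
    run_cross_training(use_case)          # data/<use_case>/<table>/ml_output/cross_context
    Repository().delete_prediction_results(use_case)
    run_single_prediction(use_case)       # results stored in the DB
    run_cross_prediction(use_case)
```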