Commit 495390ac authored by Alexander Lercher

Cross context prediction

parent a878064e
@@ -131,4 +131,7 @@ class Repository(MongoRepositoryBase):
def get_prediction_results(self, use_case: str) -> List[PredictionResult]:
entries = super().get_entries(self._prediction_result_collection, selection={'use_case': use_case}, projection={'_id': 0})
return [PredictionResult.create_from_dict(e) for e in entries]
def delete_all_prediction_results(self):
super().drop_collection(self._prediction_result_collection)
#endregion
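# Hedged usage sketch of the two repository methods above; the use case string
# mirrors run.py at the bottom of this commit, everything else is assumed from
# the shown signatures and is not part of the diff itself.
from db.repository import Repository
repo = Repository()
repo.delete_all_prediction_results()  # wipe results from a previous prediction run
results = repo.get_prediction_results('community-prediction-youtube-n')
print(f'{len(results)} stored predictions')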
@@ -33,8 +33,7 @@ class Cluster:
def get_time_info(self) -> int:
'''Returns the week of the time tuple str, e.g. 25 for "(2014, 25)".'''
str_tuple = self.time_window_id
return int(str_tuple.split(',')[1].strip()[:-1])
return eval(self.time_window_id)[1]
def __repr__(self):
return str(self.__dict__)
......
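# The eval() above trades the brittle string slicing for real tuple parsing.
# A safer equivalent (a standalone sketch only; parse_week is hypothetical and
# not part of this commit) uses ast.literal_eval, which parses Python literals
# but never executes code:
import ast

def parse_week(time_window_id: str) -> int:
    year, week = ast.literal_eval(time_window_id)  # "(2014, 25)" -> (2014, 25)
    return week

assert parse_week("(2014, 25)") == 25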
@@ -53,6 +53,10 @@ class Layer:
self.distances_from_global_centers = self.get_distances_from_global_center(active_clusters)
self.cluster_center_distance_agg_metrics = self.get_center_distance_min_max_avg_sum(active_clusters)
def get_time_info(self) -> int:
'''Returns the week of the time tuple str, e.g. 25 for "(2014, 25)".'''
return eval(self.time_window_id)[1]
def get_size_min_max_avg_sum(self, clusters: List[InternalCluster]) -> dict:
'''Returns min, max, avg, and sum of the cluster's absolute sizes.'''
if len(clusters) == 0:
......
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe, get_cluster_metrics
from pathlib import Path
#############################
@@ -86,7 +86,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) ->
tuples = []
continue
cur_metrics = (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
cur_metrics = get_cluster_metrics(cur_cluster)
# deque function: adding N+1st element will remove oldest one
if len(tuples) == N:
......
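# Minimal sketch of the sliding-window behavior described by the deque comment
# above; dummy integers stand in for the per-week metric tuples.
from collections import deque

window = deque(maxlen=3)          # keeps only the last N entries
for week_metrics in range(5):
    window.append(week_metrics)   # appending beyond maxlen drops the oldest entry
print(list(window))               # [2, 3, 4]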
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe, get_layer_metrics
from pathlib import Path
#################
@@ -59,21 +59,10 @@ def get_columns(N) -> List[str]:
cols = cols * N
return cols + ['cluster_id'] + ['evolution_label']
######################
def get_cyclic_time_feature_from_time_window(time: str) -> Tuple[float, float]:
return get_cyclic_time_feature(int(time.replace('(', '').replace(')', '').split(',')[1]))
#######################
from typing import Iterable, List, Dict, Any
import json
from entities import Layer, Cluster
def get_layer_metrics(layer: Layer) -> Iterable:
res = [layer.n_nodes, layer.n_clusters, layer.entropy]
res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res.append(get_cyclic_time_feature_from_time_window(layer.time_window_id))
return res
def create_layer_metrics_training_data(use_case: str, layer_name: str, reference_layer: str, N: int = 2) -> Iterable:
"""
Loads the metrics training data for an individual layer from disk.
......
@@ -45,4 +45,22 @@ def convert_metrics_data_to_dataframe(data: Iterable, columns: list, flattening_
training_data.append(xy)
return pd.DataFrame(data=training_data, columns=columns)
\ No newline at end of file
return pd.DataFrame(data=training_data, columns=columns)
####################
from entities import Cluster, Layer
from typing import Dict, Tuple
def get_cluster_metrics(cur_cluster: Cluster) -> Tuple:
return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2,
cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
####################
def get_layer_metrics(layer: Layer) -> Iterable:
res = [layer.n_nodes, layer.n_clusters, layer.entropy]
res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res.append(get_cyclic_time_feature(layer.get_time_info()))
return res
###################
\ No newline at end of file
def increase_time_window(time_window_id: str) -> str:
    tuple_ = eval(time_window_id)
    if tuple_[1] == 52:
        # 1st week of next year; wrap in str() so both branches return a string
        return str((tuple_[0]+1, 1))
    else:
        # next week
        return str((tuple_[0], tuple_[1]+1))
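# Quick sanity check of the rollover; note the hard-coded 52 assumes no
# 53-week ISO years.
assert increase_time_window("(2014, 25)") == "(2014, 26)"  # next week
assert increase_time_window("(2014, 52)") == "(2015, 1)"   # first week of next year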
######################
from typing import Tuple
import pickle
def load_ml_models(use_case, method, layer_name, reference_layer_name=None) -> Tuple['scaler', 'clf']:
path_ = f'data/{use_case}/ml_output/{method}/{layer_name}'
if method == 'single_context':
with open(f'{path_}.model', 'rb') as file:
svc = pickle.load(file)
with open(f'{path_}_scaler.model', 'rb') as file:
scaler = pickle.load(file)
elif method == 'cross_context':
with open(f'{path_}_{reference_layer_name}.model', 'rb') as file:
svc = pickle.load(file)
with open(f'{path_}_{reference_layer_name}_scaler.model', 'rb') as file:
scaler = pickle.load(file)
else:
raise NotImplementedError('Prediction method is not implemented')
return scaler, svc
\ No newline at end of file
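# Hedged usage sketch for load_ml_models() above; the layer names are
# placeholders, only the use case string appears elsewhere in this commit.
use_case = 'community-prediction-youtube-n'
scaler, svc = load_ml_models(use_case, 'single_context', 'SomeLayer')
scaler, svc = load_ml_models(use_case, 'cross_context', 'SomeLayer',
                             reference_layer_name='ReferenceLayer')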
from processing.data_prep.metrics_base import get_cyclic_time_feature, get_layer_metrics
from processing.ml.predict_base import increase_time_window, load_ml_models
method = 'cross_context'
N = 2 # Currently N is fixed to 2
####################
import pandas as pd
from pandas import DataFrame
#####################
import json
from entities import Layer, Cluster
import collections
import numpy as np
from typing import Iterable, Tuple, List, Dict, Any
####################
import pickle
#####################
import numpy as np
def flatten_layer_metrics_datapoint(datapoint: list) -> np.array:
'''
Flattens a single layer metrics data point in the form:
[(n_nodes, n_clusters, entropy,
(relative_cluster_size)^M, (distance_from_global_centers)^M,
(time1, time2))^N,
cluster_number]
to a flat 1d feature vector X (np.array).
'''
flat_list = []
for layer_metric_tuple in datapoint[:-1]: # for all x
flat_list.extend(layer_metric_tuple[0:-1]) # everything before time
flat_list.extend(layer_metric_tuple[-1]) # time1/2
flat_list.append(datapoint[-1]) # cluster num
return np.asarray(flat_list)
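# Illustration with shortened dummy tuples (not real layer metrics): the scalar
# metrics come first, the trailing (time1, time2) pair is unpacked into the flat
# list, and the cluster id closes the vector.
prev_ = [10, 3, 0.5, (0.2, 0.9)]   # [n_nodes, n_clusters, entropy, (time1, time2)]
cur_  = [12, 4, 0.6, (0.3, 0.8)]
print(flatten_layer_metrics_datapoint([prev_, cur_, 7]))
# -> [10.   3.   0.5  0.2  0.9 12.   4.   0.6  0.3  0.8  7. ]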
#########################
from db.repository import Repository
from db.dao import PredictionResult
repo = Repository()
def run_prediction(use_case: str):
for layerpair in repo.get_layer_pairs(use_case):
layer_name = layerpair.layer
reference_layer_name = layerpair.reference_layer
print(f"Predicting {method} for {use_case}//{layer_name} based on {reference_layer_name}")
##########################
with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
cluster_ids = {c.cluster_id for c in cluster_metrics}
cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}
with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:
layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}
######################
# load the time keys chronologically
ordered_time_keys = list(layer_metrics.keys())
ordered_time_keys.sort(key=lambda x: eval(x))
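# Why the eval-based key matters: a plain string sort would misorder single-
# vs. double-digit weeks, as this standalone sketch shows.
keys = ["(2015, 1)", "(2014, 52)", "(2014, 9)"]
print(sorted(keys))                         # ['(2014, 52)', '(2014, 9)', '(2015, 1)'] -- wrong
print(sorted(keys, key=lambda x: eval(x)))  # ['(2014, 9)', '(2014, 52)', '(2015, 1)'] -- chronological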
######################
ordered_time_keys = ordered_time_keys[-N:]
#################
prediction_metrics_raw = []
current_layer_metric = layer_metrics[ordered_time_keys[1]]
prev_layer_metric = layer_metrics[ordered_time_keys[0]]
current_layer_metric_tuple = get_layer_metrics(current_layer_metric)
prev_layer_metric_tuple = get_layer_metrics(prev_layer_metric)
for cluster_id in cluster_ids:
# yield each combination of reference layer metrics to clusters
prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])
#######################
scaler, svc = load_ml_models(use_case, method, layer_name, reference_layer_name)
################
prediction_cluster_ids = []
prediction_time_window = increase_time_window(ordered_time_keys[1])
prediction_metrics = []
for pred in prediction_metrics_raw:
cluster_id = pred[-1]
prediction_cluster_ids.append(cluster_id)
flat_ = flatten_layer_metrics_datapoint(pred)
prediction_metrics.append(flat_)
prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))
print(np.unique(prediction_results, return_counts=True))
for i in range(len(prediction_cluster_ids)):
res = PredictionResult(use_case, use_case, method, layer_name, reference_layer_name, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])
repo.add_prediction_result(res)
from processing.data_prep.metrics_base import get_cyclic_time_feature
from processing.data_prep.metrics_base import get_cyclic_time_feature, get_cluster_metrics
from processing.ml.predict_base import increase_time_window, load_ml_models
N = 3 # Currently N is fixed to 3
method = 'single_context'
@@ -11,18 +12,11 @@ import json
from entities import Cluster
import collections
import numpy as np
from typing import Iterable, Tuple
from typing import Iterable, Tuple, Dict, List
######################
from typing import Dict
from typing import Tuple
def get_metrics(cur_cluster: Cluster) -> Tuple:
return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2,
cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
####################
import pickle
#####################
def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
def flatten_metrics_datapoint(datapoint: list) -> np.array:
'''
Flattens a single metrics data point in the form:
[(cluster_size, cluster_variance, cluster_density, cluster_import1, cluster_import2, cluster_range, cluster_center, (time_f1, time_f2))^N]
@@ -35,16 +29,6 @@ def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
flat_list.extend(entry[-1]) # add time tuple
return np.asarray(flat_list)
######################
def increase_time_window(time_window_id: str):
tuple_ = eval(time_window_id)
if tuple_[1] == 52:
# 1st week next year
return (tuple_[0]+1 , 1)
else:
# next week
return str((tuple_[0], tuple_[1]+1))
#########################
from db.repository import Repository
from db.dao import PredictionResult
@@ -72,12 +56,8 @@ def run_prediction(use_case: str):
cluster_map[id_] = []
cluster_map[id_].append(cluster)
####################
with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as file:
svc = pickle.load(file)
####################
with open(f'data/{use_case}/ml_output/{method}/{layer_name}_scaler.model', 'rb') as file:
scaler = pickle.load(file)
####################
scaler, svc = load_ml_models(use_case, method, layer_name)
#####################
# store id, future time window, and flattened metrics to combine the latter during prediction
prediction_cluster_ids = []
@@ -85,7 +65,7 @@ def run_prediction(use_case: str):
prediction_metrics = []
for cluster_id, time_windows in cluster_map.items():
v = [get_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows
v = [get_cluster_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows
v_flattened = flatten_metrics_datapoint(v)
prediction_cluster_ids.append(cluster_id)
......
@@ -8,10 +8,10 @@ approach = 'cross_context'
import pickle
from pathlib import Path
def export_model(model, use_case, layer_name, reference_layer_name):
def export_model(model, use_case, layer_name, reference_layer_name, scaler=False):
fpath = f'data/{use_case}/ml_output/{approach}'
Path(fpath).mkdir(parents=True, exist_ok=True)
with open(f'{fpath}/{layer_name}_{reference_layer_name}.model', 'wb') as f:
with open(f'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
pickle.dump(model, f)
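# Usage sketch pairing the classifier export with the new scaler flag; layer
# names are placeholders and a fitted classifier/scaler plus use_case are
# assumed to be in scope. load_ml_models() in predict_base.py expects exactly
# this file-name pair.
export_model(classifier, use_case, 'SomeLayer', 'ReferenceLayer')           # -> SomeLayer_ReferenceLayer.model
export_model(scaler, use_case, 'SomeLayer', 'ReferenceLayer', scaler=True)  # -> SomeLayer_ReferenceLayer_scaler.model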
###################
from sklearn.ensemble import RandomForestClassifier
@@ -46,11 +46,13 @@ def run_training(use_case):
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(training)[:,:-1] # all except y
train_X = scaler.fit_transform(training[training.columns[:-1]]) # all except y
train_Y = training[training.columns[-1]]
test_X = scaler.transform(testing)[:,:-1] # all except y
test_X = scaler.transform(testing[testing.columns[:-1]]) # all except y
test_Y = testing[testing.columns[-1]]
export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True)
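# The two changed lines above fix a real mismatch: the scaler used to be fit on
# the full frame including the label column, so the persisted scaler expected
# one column too many when transform() later ran on label-free prediction data.
# Minimal standalone sketch of the corrected pattern:
import pandas as pd
from sklearn.preprocessing import StandardScaler
df = pd.DataFrame({'f1': [1., 2., 3.], 'f2': [10., 20., 30.], 'y': [0, 1, 0]})
X = StandardScaler().fit_transform(df[df.columns[:-1]])  # fit on features only
y = df[df.columns[-1]]                                   # the label is never scaled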
########################
from processing import DataSampler
......
@@ -5,12 +5,18 @@ if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from db.repository import Repository
from processing.ml.predict_single_context import run_prediction as run_single_prediction
# from processing.ml.predict_cross_context import run_prediction as run_cross_prediction
from processing.ml.predict_cross_context import run_prediction as run_cross_prediction
if __name__ == '__main__':
'''Executes the predictions.'''
use_case='community-prediction-youtube-n'
repo = Repository()
repo.delete_all_prediction_results()
run_single_prediction(use_case)
# run_cross_prediction(use_case)
\ No newline at end of file
run_cross_prediction(use_case)
\ No newline at end of file