pre deploy

parent 0dcf5321
FROM python:3
LABEL maintainer="Alexander Lercher"
ENV http_proxy http://proxy.uni-klu.ac.at:3128/
ENV https_proxy http://proxy.uni-klu.ac.at:3128/
RUN apt-get update
EXPOSE 5000
WORKDIR /app
COPY src/data-hub/federated-training-microservice/app/requirements.txt /app/
RUN pip install -r requirements.txt
COPY src/modules/ /app/
COPY src/data-hub/federated-training-microservice/app/ /app/
RUN chmod a+x main.py
CMD ["python", "./main.py"]
@@ -84,4 +84,31 @@ else:
BUSINESS_LOGIC_REST_PORT = 30420
BUSINESS_LOGIC_DB_PORT = 30421
## Federated Learning
if server:
BUSINESS_LOGIC_HOSTNAME = 'federated-learning'
BUSINESS_LOGIC_DB_HOSTNAME = f'{BUSINESS_LOGIC_HOSTNAME}-db'
BUSINESS_LOGIC_REST_PORT = 80
BUSINESS_LOGIC_DB_PORT = 27017
else:
BUSINESS_LOGIC_HOSTNAME = 'articonf1.itec.aau.at'
BUSINESS_LOGIC_DB_HOSTNAME = 'articonf1.itec.aau.at'
BUSINESS_LOGIC_REST_PORT = 30422
BUSINESS_LOGIC_DB_PORT = 30423
#endregion Participation Hub
#region Federated Training
## Federated Training
if server:
BUSINESS_LOGIC_HOSTNAME = 'federated-training'
BUSINESS_LOGIC_DB_HOSTNAME = f'{BUSINESS_LOGIC_HOSTNAME}-db'
BUSINESS_LOGIC_REST_PORT = 80
BUSINESS_LOGIC_DB_PORT = 27017
else:
BUSINESS_LOGIC_HOSTNAME = 'articonf1.itec.aau.at'
BUSINESS_LOGIC_DB_HOSTNAME = 'articonf1.itec.aau.at'
BUSINESS_LOGIC_REST_PORT = 30424
BUSINESS_LOGIC_DB_PORT = 30425
#endregion Federated Training
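#region Example (illustrative only)
# A minimal sketch, not part of the original file: these constants are usually
# combined into service URLs elsewhere in the code base. The '/api' path and the
# http/mongodb schemes below are assumptions made purely for illustration.
EXAMPLE_FEDERATED_TRAINING_URL = f'http://{BUSINESS_LOGIC_HOSTNAME}:{BUSINESS_LOGIC_REST_PORT}/api'
EXAMPLE_FEDERATED_TRAINING_DB_URI = f'mongodb://{BUSINESS_LOGIC_DB_HOSTNAME}:{BUSINESS_LOGIC_DB_PORT}/'
#endregion Example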
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
#from models import MnistModel
from models2 import model_fn, create_keras_model
def evaluate(server_state, central_emnist_test):
    # Build a plain Keras model (model_fn wraps it for TFF and cannot be
    # compiled or evaluated directly), load the trained server weights and
    # evaluate on the centralized test set passed in by the caller.
    keras_model = create_keras_model()
    keras_model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
    )
    keras_model.set_weights(server_state)
    keras_model.evaluate(central_emnist_test)
def evaluate2(server_state, tff_learning_model):
    #TODO: assign the trained server weights to the model before evaluating
    #(one idea: reuse the logic of the server_update function)
    evaluation = tff.learning.build_federated_evaluation(tff_learning_model)
    return evaluation
# keras_model = MnistModel()
# keras_model.compile(
# loss=tf.keras.losses.SparseCategoricalCrossentropy(),
# metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
# )
# keras_model.set_weights(server_state)
# keras_model.evaluate(central_emnist_test)
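# A minimal sketch (an assumption, not the original implementation) of how the
# TODO in evaluate2 could be completed: wrap the trainable weights kept by the
# custom training loop into the ModelWeights structure that the federated
# evaluation computation expects, then invoke it on federated test data.
def evaluate_federated(server_state, tff_model_fn, federated_test_data):
    # Build the evaluation computation from the no-argument model constructor.
    evaluation = tff.learning.build_federated_evaluation(tff_model_fn)
    # server_state holds only the trainable variables, so non_trainable is empty.
    model_weights = tff.learning.ModelWeights(
        trainable=server_state, non_trainable=[])
    return evaluation(model_weights, federated_test_data)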
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
from models import MnistModel
from models2 import model_fn
#tf_dataset_type = None
#model_weights_type = None
@tf.function
def client_update(model, dataset, server_weights, client_optimizer):
"""Performs training (using the server model weights) on the client's dataset."""
# Initialize the client model with the current server weights.
client_weights = model.trainable_variables
# Assign the server weights to the client model.
tf.nest.map_structure(lambda x, y: x.assign(y),
client_weights, server_weights)
# Use the client_optimizer to update the local model.
for batch in dataset:
with tf.GradientTape() as tape:
# Compute a forward pass on the batch of data
outputs = model.forward_pass(batch)
# Compute the corresponding gradient
grads = tape.gradient(outputs.loss, client_weights)
grads_and_vars = zip(grads, client_weights)
# Apply the gradient using a client optimizer.
client_optimizer.apply_gradients(grads_and_vars)
return client_weights
@tf.function
def server_update(model, mean_client_weights):
"""Updates the server model weights as the average of the client model weights."""
model_weights = model.trainable_variables
# Assign the mean client weights to the server model.
tf.nest.map_structure(lambda x, y: x.assign(y),
model_weights, mean_client_weights)
return model_weights
#Creating the initialization computation
@tff.tf_computation
def server_init():
model = MnistModel() #model_fn()
return model.trainable_variables
@tff.federated_computation
def initialize_fn():
return tff.federated_value(server_init(), tff.SERVER)
whimsy_model = MnistModel()
tf_dataset_type = tff.SequenceType(whimsy_model.input_spec)
model_weights_type = server_init.type_signature.result
@tff.tf_computation(tf_dataset_type, model_weights_type)
def client_update_fn(tf_dataset, server_weights):
model = MnistModel()#model_fn()
client_optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
return client_update(model, tf_dataset, server_weights, client_optimizer)
@tff.tf_computation(model_weights_type)
def server_update_fn(mean_client_weights):
model = MnistModel()#model_fn()
return server_update(model, mean_client_weights)
federated_server_type = tff.FederatedType(model_weights_type, tff.SERVER)
federated_dataset_type = tff.FederatedType(tf_dataset_type, tff.CLIENTS)
@tff.federated_computation(federated_server_type, federated_dataset_type)
def next_fn(server_weights, federated_dataset):
# Broadcast the server weights to the clients.
print("server_weights")
print(str(server_weights.type_signature))
server_weights_at_client = tff.federated_broadcast(server_weights)
# Each client computes their updated weights.
client_weights = tff.federated_map(
client_update_fn, (federated_dataset, server_weights_at_client))
# The server averages these updates.
mean_client_weights = tff.federated_mean(client_weights)
print("mean_client_wieghts")
print(str(mean_client_weights.type_signature))
# The server updates its model.
server_weights = tff.federated_map(server_update_fn, mean_client_weights)
return server_weights
def get_federated_algorithm():
#Creating the next_fn
#Getting the data type, needed explicitly in the model functions
whimsy_model = MnistModel() #model_fn()
global tf_dataset_type
tf_dataset_type = tff.SequenceType(whimsy_model.input_spec)
print("tf_dataset_type")
print(str(tf_dataset_type))
global model_weights_type
model_weights_type = server_init.type_signature.result
print("model_weights_type")
print(str(model_weights_type))
# finished printing types
federated_server_type = tff.FederatedType(model_weights_type, tff.SERVER)
federated_dataset_type = tff.FederatedType(tf_dataset_type, tff.CLIENTS)
federated_algorithm = tff.templates.IterativeProcess(
initialize_fn=initialize_fn,
next_fn=next_fn
)
return federated_algorithm
def merge_2_states(server_state1, server_state2):
    # Element-wise mean of the two server states (lists of model weights).
    return np.mean(np.array([server_state1, server_state2]), axis=0)
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
#from models import MnistModel
from models2 import model_fn
#tf_dataset_type = None
#model_weights_type = None
@tf.function
def client_update(model, dataset, server_weights, client_optimizer):
"""Performs training (using the server model weights) on the client's dataset."""
# Initialize the client model with the current server weights.
client_weights = model.trainable_variables
# Assign the server weights to the client model.
tf.nest.map_structure(lambda x, y: x.assign(y),
client_weights, server_weights)
# Use the client_optimizer to update the local model.
for batch in dataset:
with tf.GradientTape() as tape:
# Compute a forward pass on the batch of data
outputs = model.forward_pass(batch)
# Compute the corresponding gradient
grads = tape.gradient(outputs.loss, client_weights)
grads_and_vars = zip(grads, client_weights)
# Apply the gradient using a client optimizer.
client_optimizer.apply_gradients(grads_and_vars)
return client_weights
@tf.function
def server_update(model, mean_client_weights):
"""Updates the server model weights as the average of the client model weights."""
model_weights = model.trainable_variables
# Assign the mean client weights to the server model.
tf.nest.map_structure(lambda x, y: x.assign(y),
model_weights, mean_client_weights)
return model_weights
#Creating the initialization computation
@tff.tf_computation
def server_init():
model = model_fn()
return model.trainable_variables
@tff.federated_computation
def initialize_fn():
return tff.federated_value(server_init(), tff.SERVER)
whimsy_model = model_fn()
tf_dataset_type = tff.SequenceType(whimsy_model.input_spec)
model_weights_type = server_init.type_signature.result
@tff.tf_computation(tf_dataset_type, model_weights_type)
def client_update_fn(tf_dataset, server_weights):
model = model_fn()
client_optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
return client_update(model, tf_dataset, server_weights, client_optimizer)
@tff.tf_computation(model_weights_type)
def server_update_fn(mean_client_weights):
model = model_fn()
return server_update(model, mean_client_weights)
federated_server_type = tff.FederatedType(model_weights_type, tff.SERVER)
federated_dataset_type = tff.FederatedType(tf_dataset_type, tff.CLIENTS)
@tff.federated_computation(federated_server_type, federated_dataset_type)
def next_fn(server_weights, federated_dataset):
# Broadcast the server weights to the clients.
print("server_weights")
print(str(server_weights.type_signature))
server_weights_at_client = tff.federated_broadcast(server_weights)
# Each client computes their updated weights.
client_weights = tff.federated_map(
client_update_fn, (federated_dataset, server_weights_at_client))
# The server averages these updates.
mean_client_weights = tff.federated_mean(client_weights)
print("mean_client_wieghts")
print(str(mean_client_weights.type_signature))
# The server updates its model.
server_weights = tff.federated_map(server_update_fn, mean_client_weights)
return server_weights
def get_federated_algorithm():
#Creating the next_fn
#Getting the data type, needed explicitly in the model functions
whimsy_model = model_fn()
global tf_dataset_type
tf_dataset_type = tff.SequenceType(whimsy_model.input_spec)
print("tf_dataset_type")
print(str(tf_dataset_type))
global model_weights_type
model_weights_type = server_init.type_signature.result
print("model_weights_type")
print(str(model_weights_type))
# finished printing types
federated_server_type = tff.FederatedType(model_weights_type, tff.SERVER)
federated_dataset_type = tff.FederatedType(tf_dataset_type, tff.CLIENTS)
federated_algorithm = tff.templates.IterativeProcess(
initialize_fn=initialize_fn,
next_fn=next_fn
)
return federated_algorithm
def merge_2_states(server1_state, server2_state):
return np.mean( np.array([ server1_state, server2_state ]), axis=0 )
import collections
import attr
import functools
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
from preprocessing import get_client_ids
from preprocessing import get_federated_train_data
from preprocessing import preprocess
#from models import MnistModel
from federated_training_algorithm import get_federated_algorithm
from federated_training_algorithm import merge_2_states
from evaluation import evaluate
np.random.seed(0)
print("## Starting...")
#GET THE DATA (for now it's the default dataset)
emnist_train, emnist_test = tff.simulation.datasets.emnist.load_data()
######client_ids = get_client_ids(emnist_train) not used
print("## Preprocessing the federated_train_data")
federated_train_data = get_federated_train_data(emnist_train)
print("## Declaring the model")
#it is done in models.py
print("## Declaring the federated algorithm")
federated_algorithm = get_federated_algorithm()
server_state = federated_algorithm.initialize()
for round in range(20):
server_state = federated_algorithm.next(server_state, federated_train_data)
print("server_state type")
print(str(type(server_state)))
print(str(type(server_state[0])))
print("FINISHEEED")
server2_state = federated_algorithm.initialize()
for round in range(2):
server2_state = federated_algorithm.next(server2_state, federated_train_data)
merged_state = merge_2_states(server_state,server2_state)
print("server_state[1]")
print(server_state[1])
print("server2_state[1]")
print(server2_state[1])
print("merged_state[1]")
print(merged_state[1])
# print("federated_algorithm.initialize.type_signature")
# print(str(federated_algorithm.initialize.type_signature)
# print("federated_algorithm.next.type_signature")
# print(str(federated_algorithm.next.type_signature))
# print("## Training the model")
# iterative_process = tff.learning.build_federated_averaging_process(
# MnistModel,
# client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02))
# state = iterative_process.initialize()
# for round_num in range(1, 11):
# state, metrics = iterative_process.next(state, federated_train_data)
# print('round {:2d}, metrics={}'.format(round_num, metrics))
#evaluation = tff.learning.build_federated_evaluation(MnistModel)
#TODO integration
print("## Evaluation of the model")
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import collections
input_spec_data_global = None
MnistVariables = collections.namedtuple(
'MnistVariables', 'weights bias num_examples loss_sum accuracy_sum')
def create_mnist_variables():
return MnistVariables(
weights=tf.Variable(
lambda: tf.zeros(dtype=tf.float32, shape=(784, 10)),
name='weights',
trainable=True),
bias=tf.Variable(
lambda: tf.zeros(dtype=tf.float32, shape=(10)),
name='bias',
trainable=True),
num_examples=tf.Variable(0.0, name='num_examples', trainable=False),
loss_sum=tf.Variable(0.0, name='loss_sum', trainable=False),
accuracy_sum=tf.Variable(0.0, name='accuracy_sum', trainable=False))
def mnist_forward_pass(variables, batch):
y = tf.nn.softmax(tf.matmul(batch['x'], variables.weights) + variables.bias)
predictions = tf.cast(tf.argmax(y, 1), tf.int32)
flat_labels = tf.reshape(batch['y'], [-1])
loss = -tf.reduce_mean(
tf.reduce_sum(tf.one_hot(flat_labels, 10) * tf.math.log(y), axis=[1]))
accuracy = tf.reduce_mean(
tf.cast(tf.equal(predictions, flat_labels), tf.float32))
num_examples = tf.cast(tf.size(batch['y']), tf.float32)
variables.num_examples.assign_add(num_examples)
variables.loss_sum.assign_add(loss * num_examples)
variables.accuracy_sum.assign_add(accuracy * num_examples)
return loss, predictions
def get_local_mnist_metrics(variables):
return collections.OrderedDict(
num_examples=variables.num_examples,
loss=variables.loss_sum / variables.num_examples,
accuracy=variables.accuracy_sum / variables.num_examples)
@tff.federated_computation
def aggregate_mnist_metrics_across_clients(metrics):
return collections.OrderedDict(
num_examples=tff.federated_sum(metrics.num_examples),
loss=tff.federated_mean(metrics.loss, metrics.num_examples),
accuracy=tff.federated_mean(metrics.accuracy, metrics.num_examples))
class MnistModel(tff.learning.Model):
def __init__(self):
self._variables = create_mnist_variables()
@property
def trainable_variables(self):
return [self._variables.weights, self._variables.bias]
@property
def non_trainable_variables(self):
return []
@property
def local_variables(self):
return [
self._variables.num_examples, self._variables.loss_sum,
self._variables.accuracy_sum
]
@property
def input_spec(self):
return collections.OrderedDict(
x=tf.TensorSpec([None, 784], tf.float32),
y=tf.TensorSpec([None, 1], tf.int32))
@tf.function
def forward_pass(self, batch, training=True):
del training
loss, predictions = mnist_forward_pass(self._variables, batch)
num_examples = tf.shape(batch['x'])[0]
return tff.learning.BatchOutput(
loss=loss, predictions=predictions, num_examples=num_examples)
@tf.function
def report_local_outputs(self):
return get_local_mnist_metrics(self._variables)
@property
def federated_output_computation(self):
return aggregate_mnist_metrics_across_clients
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import collections
def create_keras_model(): #### DEFAULT TEST MODEL
return tf.keras.models.Sequential([
tf.keras.layers.InputLayer(input_shape=(784,)),
tf.keras.layers.Dense(10, kernel_initializer='zeros'),
tf.keras.layers.Softmax(),
])
def model_fn():
# We _must_ create a new model here, and _not_ capture it from an external
# scope. TFF will call this within different graph contexts.
keras_model = create_keras_model() #TODO: could be extended to build other architectures, though model_fn takes no argument to select one
return tff.learning.from_keras_model(
keras_model,
input_spec=get_input_spec(),
loss=tf.keras.losses.SparseCategoricalCrossentropy(),
metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
def get_input_spec():
return collections.OrderedDict(
x=tf.TensorSpec([None, 784], tf.float32),
y=tf.TensorSpec([None, 1], tf.int32))
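# Usage sketch (illustrative, not part of the original module): model_fn is
# passed to TFF as a constructor and TFF calls it inside its own graph
# contexts, e.g. with the built-in federated averaging process:
#
# iterative_process = tff.learning.build_federated_averaging_process(
#     model_fn,
#     client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02))
# state = iterative_process.initialize()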
import collections
import attr
import functools
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
np.random.seed(0)
emnist_train, emnist_test = tff.simulation.datasets.emnist.load_data()
# NUM_CLIENTS = 10
# BATCH_SIZE = 20
def preprocess(dataset, BATCH_SIZE = 20):
def batch_format_fn(element):
"""Flatten a batch of EMNIST data and return a (features, label) tuple."""
return (tf.reshape(element['pixels'], [-1, 784]),
tf.reshape(element['label'], [-1, 1]))
return dataset.batch(BATCH_SIZE).map(batch_format_fn)
def get_client_ids(emnist_train, NUM_CLIENTS = 10):
return np.random.choice(emnist_train.client_ids, size=NUM_CLIENTS, replace=False)
#client_ids = np.random.choice(emnist_train.client_ids, size=NUM_CLIENTS, replace=False)
def get_federated_train_data(emnist_train, NUM_CLIENTS = 10):
client_ids = get_client_ids(emnist_train, NUM_CLIENTS)
return [preprocess(emnist_train.create_tf_dataset_for_client(x))
for x in client_ids
]
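# Quick illustration (not in the original file) of what the helpers above
# produce: each list entry is a tf.data.Dataset of (features, label) batches
# with shapes [batch, 784] and [batch, 1].
#
# federated_train_data = get_federated_train_data(emnist_train, NUM_CLIENTS=2)
# print(federated_train_data[0].element_spec)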
# federated_train_data = [preprocess(emnist_train.create_tf_dataset_for_client(x))
# for x in client_ids
# ]
##################################################################
## Second Dataset
############################################
def preprocess_and_shuffle(dataset):
"""Applies `preprocess_dataset` above and shuffles the result."""
preprocessed = preprocess(dataset)
return preprocessed.shuffle(buffer_size=5)
import collections
import attr
import functools
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
np.random.seed(0)
emnist_train, emnist_test = tff.simulation.datasets.emnist.load_data()
# NUM_CLIENTS = 10
# BATCH_SIZE = 20
def preprocess(dataset, BATCH_SIZE = 20):
def batch_format_fn(element):
"""Flatten a batch of EMNIST data and return a (features, label) tuple."""
return (tf.reshape(element['pixels'], [-1, 784]),
tf.reshape(element['label'], [-1, 1]))
return dataset.batch(BATCH_SIZE).map(batch_format_fn)
def get_client_ids(emnist_train, NUM_CLIENTS = 10):
return np.random.choice(emnist_train.client_ids, size=NUM_CLIENTS, replace=False)
#client_ids = np.random.choice(emnist_train.client_ids, size=NUM_CLIENTS, replace=False)
def get_federated_train_data(emnist_train, NUM_CLIENTS = 10):
client_ids = get_client_ids(emnist_train, NUM_CLIENTS)
return [preprocess(emnist_train.create_tf_dataset_for_client(x))
for x in client_ids
]
######
def preprocess_and_shuffle(dataset):
"""Applies `preprocess_dataset` above and shuffles the result."""
preprocessed = preprocess(dataset)
return preprocessed.shuffle(buffer_size=5)
#########################################################################################
#########################################################################################
#########################################################################################
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
real = pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv")
fake = pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv")
# dropping rows that have urls as text and date, real's dates look fine, also dropping ones that have no text
fake_drop = fake.drop(index=[9358,15507,15508,18933])
fake_drop = fake_drop.drop(fake_drop.loc[fake_drop.text == ' '].index)
real_drop = real.drop(real.loc[real.text == ' '].index)
# Give labels to data before combining
fake['fake'] = 1
real['fake'] = 0
combined = pd.concat([fake, real])
no_reuters = combined.copy()
no_reuters.text = no_reuters.text.str.replace('Reuters', '')
combined = no_reuters.copy()
## train/test split the text data and labels
features = combined['text']
labels = combined['fake']
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)
# the model will remember only the top 2000 most common words
max_words = 2000
max_len = 400
token = Tokenizer(num_words=max_words, lower=True, split=' ')
token.fit_on_texts(X_train.values)
sequences = token.texts_to_sequences(X_train.values)
train_sequences_padded = pad_sequences(sequences, maxlen=max_len)
embed_dim = 50
lstm_out = 64
batch_size = 32
input_shape_var = (max_len, ) #input sequences are padded to max_len, so the input shape must match
model = Sequential()
model.add(Embedding(max_words, embed_dim, input_length = max_len, input_shape=input_shape_var))
model.add(LSTM(lstm_out))
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1, name='out_layer'))
model.add(Activation('sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',\
metrics = ['accuracy'])
print(model.summary())
print()
history = model.fit(train_sequences_padded, y_train, batch_size=batch_size, epochs = 5, validation_split=0.2)
test_sequences = token.texts_to_sequences(X_test)
test_sequences_padded = pad_sequences(test_sequences, maxlen=max_len)
model.evaluate(test_sequences_padded, y_test)
###################################################################
experiment_name = "mnist"
method = "tff_training"
client_lr = 1e-2
server_lr = 1e-2
split = 4
NUM_ROUNDS = 5
NUM_EPOCHS = 5
BATCH_SIZE = 20
PREFETCH_BUFFER = 10
this_dir = Path.cwd()
model_dir = this_dir / "saved_models" / experiment_name / method
output_dir = this_dir / "results" / experiment_name / method
if not model_dir.exists():
model_dir.mkdir(parents=True)
if not output_dir.exists():
output_dir.mkdir(parents=True)
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.astype(np.float32)
y_train = y_train.astype(np.int32)
x_test = x_test.astype(np.float32).reshape(10000, 28, 28, 1)
y_test = y_test.astype(np.int32).reshape(10000, 1)
total_image_count = len(x_train)
image_per_set = int(np.floor(total_image_count/split))
client_train_dataset = collections.OrderedDict()
for i in range(1, split+1):
client_name = "client_" + str(i)
start = image_per_set * (i-1)
end = image_per_set * i
print(f"Adding data from {start} to {end} for client : {client_name}")
data = collections.OrderedDict((('label', y_train[start:end]), ('pixels', x_train[start:end])))
client_train_dataset[client_name] = data
train_dataset = tff.simulation.FromTensorSlicesClientData(client_train_dataset)
sample_dataset = train_dataset.create_tf_dataset_for_client(train_dataset.client_ids[0])
sample_element = next(iter(sample_dataset))
SHUFFLE_BUFFER = image_per_set
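# Sketch only (the original snippet stops above): NUM_EPOCHS, SHUFFLE_BUFFER,
# BATCH_SIZE and PREFETCH_BUFFER are typically consumed by a per-client
# preprocessing function such as the one below (it relies on the tf and
# collections imports at the top of this file).
def preprocess_split_dataset(dataset):
    def batch_format_fn(element):
        # Flatten the 28x28 images and keep the labels as a single column.
        return collections.OrderedDict(
            x=tf.reshape(element['pixels'], [-1, 784]),
            y=tf.reshape(element['label'], [-1, 1]))
    return dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER).batch(
        BATCH_SIZE).map(batch_format_fn).prefetch(PREFETCH_BUFFER)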
# import numpy as np ##
# import pandas as pd ##
# import seaborn as sns
# import matplotlib.pyplot as plt
# import nltk ##
# from sklearn.preprocessing import LabelBinarizer
# from nltk.corpus import stopwords ##
# from nltk.stem.porter import PorterStemmer #
# from wordcloud import WordCloud,STOPWORDS ##
# from nltk.stem import WordNetLemmatizer #
# from nltk.tokenize import word_tokenize,sent_tokenize #
# from bs4 import BeautifulSoup ##
# import re,string,unicodedata ##
# from keras.preprocessing import text, sequence
# from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# from sklearn.model_selection import train_test_split
# from string import punctuation ##
# from nltk import pos_tag #
# from nltk.corpus import wordnet #
# import keras
# from keras.models import Sequential
# from keras.layers import Dense,Embedding,LSTM,Dropout
# from keras.callbacks import ReduceLROnPlateau
# import tensorflow as tf
# import tensorflow_federated as tff
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
# def get_dataframe_OLD():
# true = pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv")
# false = pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv")
# true['category'] = 1
# false['category'] = 0
# df = pd.concat([true,false]) #Merging the 2 datasets
# df['text'] = df['text'] + " " + df['title']
# del df['title']
# del df['subject']
# del df['date']
# return df
# def strip_html(text):
# soup = BeautifulSoup(text, "html.parser")
# return soup.get_text()
# #Removing the square brackets
# def remove_between_square_brackets(text):
# return re.sub('\[[^]]*\]', '', text)
# # Removing URL's
# def remove_between_square_brackets(text):
# return re.sub(r'http\S+', '', text)
# #Removing the stopwords from text
# def remove_stopwords(text):
# stop = set(nltk.corpus.stopwords.words('english'))
# punctuation = list(string.punctuation)
# stop.update(punctuation)
# final_text = []
# for i in text.split():
# if i.strip().lower() not in stop:
# final_text.append(i.strip())
# return " ".join(final_text)
# #Removing the noisy text
# def denoise_text(text):
# text = strip_html(text)
# text = remove_between_square_brackets(text)
# text = remove_stopwords(text)
# return text
# #Apply function on review column
# def data_cleaning():
# df = get_dataframe()
# df['text']=df['text'].apply(denoise_text)
# print("#####")
#data_cleaning()
# x_train,x_test,y_train,y_test = train_test_split(df.text,df.category,random_state = 0)
# max_features = 10000
# maxlen = 300
# tokenizer = text.Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(x_train)
# tokenized_train = tokenizer.texts_to_sequences(x_train)
# x_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)
# #Defining Neural Network
# model = Sequential()
# #Non-trainable embeddidng layer
# model.add(Embedding(max_features, output_dim=embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False))
# #LSTM
# model.add(LSTM(units=128 , return_sequences = True , recurrent_dropout = 0.25 , dropout = 0.25))
# model.add(LSTM(units=64 , recurrent_dropout = 0.1 , dropout = 0.1))
# model.add(Dense(units = 32 , activation = 'relu'))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(optimizer=keras.optimizers.Adam(lr = 0.01), loss='binary_crossentropy', metrics=['accuracy'])
real = pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv")
fake = pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv")
# dropping rows that have urls as text and date, real's dates look fine, also dropping ones that have no text
fake_drop = fake.drop(index=[9358,15507,15508,18933])
fake_drop = fake_drop.drop(fake_drop.loc[fake_drop.text == ' '].index)
real_drop = real.drop(real.loc[real.text == ' '].index)
# Give labels to data before combining
fake['fake'] = 1
real['fake'] = 0
combined = pd.concat([fake, real])
no_reuters = combined.copy()
no_reuters.text = no_reuters.text.str.replace('Reuters', '')
combined = no_reuters.copy()
## train/test split the text data and labels
features = combined['text']
labels = combined['fake']
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)
# the model will remember only the top 2000 most common words
max_words = 2000
max_len = 400
token = Tokenizer(num_words=max_words, lower=True, split=' ')
token.fit_on_texts(X_train.values)
sequences = token.texts_to_sequences(X_train.values)
train_sequences_padded = pad_sequences(sequences, maxlen=max_len)
embed_dim = 50
lstm_out = 64
batch_size = 32
input_shape_var = (max_len, ) #input sequences are padded to max_len, so the input shape must match
model = Sequential()
model.add(Embedding(max_words, embed_dim, input_length = max_len, input_shape=input_shape_var))
model.add(LSTM(lstm_out))
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1, name='out_layer'))
model.add(Activation('sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',\
metrics = ['accuracy'])
print(model.summary())
print()
history = model.fit(train_sequences_padded, y_train, batch_size=batch_size, epochs = 5, validation_split=0.2)
test_sequences = token.texts_to_sequences(X_test)
test_sequences_padded = pad_sequences(test_sequences, maxlen=max_len)
model.evaluate(test_sequences_padded, y_test)
#import processing.text_processing.global_hyperparams as globals
import global_hyperparams as globals
from model import get_simple_LSTM_model
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import collections
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
def model_fn():
keras_model = get_simple_LSTM_model()
#return tff.learning.from_compiled_keras_model(keras_model, sample_batch) original
return tff.learning.from_keras_model(
keras_model,
input_spec=globals.INPUT_SPEC,
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
def federated_computation_new(train_dataset,test_dataset):
if(globals.INPUT_SPEC == None):
#Should never be reached because INPUT_SPEC is set inside get_preprocessed_train_test_data;
#however, if preprocessed data is ever supplied without that function, INPUT_SPEC would be None, so assign it here.
globals.INPUT_SPEC = train_dataset[0].element_spec
# Training and evaluating the model
iterative_process = tff.learning.build_federated_averaging_process(model_fn,client_optimizer_fn=lambda: tf.keras.optimizers.SGD(lr=0.5))
state = iterative_process.initialize()
print(type(state))
for n in range(globals.EPOCHS):
state, metrics = iterative_process.next(state, train_dataset)
print('round {}, training metrics={}'.format(n+1, metrics))
evaluation = tff.learning.build_federated_evaluation(model_fn)
eval_metrics = evaluation(state.model, train_dataset)
print('Training evaluation metrics={}'.format(eval_metrics))
test_metrics = evaluation(state.model, test_dataset)
print('Test evaluation metrics={}'.format(test_metrics))
return state,metrics
###############################################################################################
def federated_computation_continue(train_dataset,test_dataset,restored_state):
if(globals.INPUT_SPEC == None):
#Should never be reached because INPUT_SPEC is set inside get_preprocessed_train_test_data;
#however, if preprocessed data is ever supplied without that function, INPUT_SPEC would be None, so assign it here.
globals.INPUT_SPEC = train_dataset[0].element_spec
# Training and evaluating the model
iterative_process = tff.learning.build_federated_averaging_process(model_fn,client_optimizer_fn=lambda: tf.keras.optimizers.SGD(lr=0.5))
state = iterative_process.initialize()
state = restored_state
print(type(state))
for n in range(globals.EPOCHS):
state, metrics = iterative_process.next(state, train_dataset)
print('round {}, training metrics={}'.format(n+1, metrics))
evaluation = tff.learning.build_federated_evaluation(model_fn)
eval_metrics = evaluation(state.model, train_dataset)
print('Training evaluation metrics={}'.format(eval_metrics))
test_metrics = evaluation(state.model, test_dataset)
print('Test evaluation metrics={}'.format(test_metrics))
return state,metrics
###############################################################################################
import os
#from processing.text_processing.federated_algorithm import federated_computation_continue
#from processing.text_processing.version_handler import save_state_to_file
print(os.getcwd())
#import processing.text_processing.global_hyperparams as globals
#from processing.text_processing.preprocessing import get_preprocessed_train_test_data
import global_hyperparams as globals
from preprocessing import get_preprocessed_train_test_data
from federated_algorithm import federated_computation_new, federated_computation_continue, save_state_to_file, load_state_from_file
from checkpoint_manager import save_to_file_CSV#,save_state_to_file, load_state_from_file
# globals.initialize()
# train_dataset, test_dataset= get_preprocessed_train_test_data()
# state,metrics = federated_computation_new(train_dataset,test_dataset)
# last_model_id = save_state_to_file(state)
# #model_filename = "ckpt_1622721644"
# #restored_state = load_state_from_file(model_filename)
# #state,metrics = federated_computation_continue(train_dataset, test_dataset, restored_state)
# #last_model_id = save_state_to_file(state)
# trained_metrics= metrics['train']
# save_to_file_CSV(globals.TRAINER_ID,last_model_id,globals.DATASET_ID,trained_metrics['sparse_categorical_accuracy'],trained_metrics['loss'])
# print("DONE")
# print(type(state))
# print(type(metrics))
# print("DONE2")
def start_processing(developer_id:int = 0):
globals.initialize()
train_dataset, test_dataset= get_preprocessed_train_test_data()
state,metrics = federated_computation_new(train_dataset,test_dataset)
trained_metrics= metrics['train']
timestamp = save_state_to_file(state)
globals.TRAINER_ID = developer_id
globals.DATASET_ID = timestamp
written_row = save_to_file_CSV(globals.TRAINER_ID,timestamp,globals.DATASET_ID,trained_metrics['sparse_categorical_accuracy'],trained_metrics['loss'])
return written_row
import global_hyperparams as globals
from model import get_simple_LSTM_model
import pickle
import tensorflow as tf
import tensorflow_federated as tff
def model_fn():
keras_model = get_simple_LSTM_model()
return tff.learning.from_keras_model(
keras_model,
input_spec=globals.INPUT_SPEC,
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
def federated_computation_new(train_dataset,test_dataset):
if(globals.INPUT_SPEC == None):
#Should never be reached because INPUT_SPEC is set inside get_preprocessed_train_test_data;
#however, if preprocessed data is ever supplied without that function, INPUT_SPEC would be None, so assign it here.
globals.INPUT_SPEC = train_dataset[0].element_spec
# Training and evaluating the model
iterative_process = tff.learning.build_federated_averaging_process(model_fn,client_optimizer_fn=lambda: tf.keras.optimizers.SGD(lr=0.5))
state = iterative_process.initialize()
print(type(state))
for n in range(globals.EPOCHS):
state, metrics = iterative_process.next(state, train_dataset)
print('round {}, training metrics={}'.format(n+1, metrics))
evaluation = tff.learning.build_federated_evaluation(model_fn)
eval_metrics = evaluation(state.model, train_dataset)
print('Training evaluation metrics={}'.format(eval_metrics))
test_metrics = evaluation(state.model, test_dataset)
print('Test evaluation metrics={}'.format(test_metrics))
####################################################################################################################################
#Save Last Trained Model
import pickle
with open("processing/"+globals.USE_CASE+"/last_model",'wb') as f:
pickle.dump(state, f)
return state,metrics
###############################################################################################
def federated_computation_continue(train_dataset,test_dataset,restored_state):
if(globals.INPUT_SPEC == None):
#Should never be reached because INPUT_SPEC is set inside get_preprocessed_train_test_data;
#however, if preprocessed data is ever supplied without that function, INPUT_SPEC would be None, so assign it here.
globals.INPUT_SPEC = train_dataset[0].element_spec
# Training and evaluating the model
iterative_process = tff.learning.build_federated_averaging_process(model_fn,client_optimizer_fn=lambda: tf.keras.optimizers.SGD(lr=0.5))
state = iterative_process.initialize()
state = restored_state
print(type(state))
for n in range(globals.EPOCHS):
state, metrics = iterative_process.next(state, train_dataset)
print('round {}, training metrics={}'.format(n+1, metrics))
evaluation = tff.learning.build_federated_evaluation(model_fn)
eval_metrics = evaluation(state.model, train_dataset)
print('Training evaluation metrics={}'.format(eval_metrics))
test_metrics = evaluation(state.model, test_dataset)
print('Test evaluation metrics={}'.format(test_metrics))
return state,metrics
###############################################################################################
def make_prediction(input_prediction_data):
try:
with open("processing/"+globals.USE_CASE+"/last_model",'rb') as f:
state = pickle.load(f)
except Exception as e:
print(e)
return None
model_for_inference = get_simple_LSTM_model()
state.model.assign_weights_to(model_for_inference)
predictions = model_for_inference.predict_on_batch(input_prediction_data)
return predictions
def initialize(use_case,trainer_id = 0,dataset_id = 0):
global MAX_LENGTH #Length of the sentences fed into the NN. Similar to an image size (e.g. 100x100 pixels), but 1D.
MAX_LENGTH = 40
global VOCAB_SIZE #tells how many words it memorises. each word is stored as an int.
VOCAB_SIZE = 6000
global NUM_CLIENTS #number of clients in the federated dataset
NUM_CLIENTS = 4
global SHUFFLE_BUFFER
SHUFFLE_BUFFER = 5000 #used in preprocessing
global BATCH_SIZE
BATCH_SIZE = 512 #size of the dataset batches that are later fed into the NN.
global INPUT_SPEC #tells the model what the input data will look like.
INPUT_SPEC = None #must & will be initialized after data preprocessing; currently set from train_dataset[0].element_spec
global EMBED_DIM #dimensionality of the embedding layer in the model.
EMBED_DIM = 10
global LSTM_OUT # output size of the LSTM layer
LSTM_OUT = 100
global EPOCHS #number of epochs the model will be trained
EPOCHS = 15
global TRAINER_ID # ID of the trainer entity.
TRAINER_ID = trainer_id #0 = Owner of the use_case
global DATASET_ID # ID of the dataset used
DATASET_ID = dataset_id #0 = "Main"/Original dataset
global USE_CASE #Use_case name
USE_CASE = use_case
Trainer_id,Model_id,Dataset_id,Accuracy,Loss
0,1623766462,1623766462,0.5,nan
0,1623768325,1623768325,0.25,nan
1,1623768510,1623768510,0.75,nan
0,1623857759,1623857759,0.25,nan
0,1623932014,1623932014,0.0,nan
0,1623934119,1623934119,0.25,nan
0,1623935435,1623935435,0.25,nan
1,1624017183,1624017183,1.0,0.0
1,1624018429,1624018429,0.25,nan
1,1624019715,1624019715,1.0,0.0
1,1624021190,1624021190,0.75,nan
import csv
import os
import json
import time
import global_hyperparams as globals
from preprocessing import get_preprocessed_train_test_data, preprocess_single_train_data
from federated_algorithm import federated_computation_new, make_prediction
def start_processing(use_case, developer_id:int = 0):
globals.initialize(use_case,developer_id)
globals.TRAINER_ID = developer_id
train_dataset, test_dataset= get_preprocessed_train_test_data()
state,metrics = federated_computation_new(train_dataset,test_dataset)
trained_metrics= metrics['train']
timestamp = int(time.time())
globals.DATASET_ID = timestamp
written_row = save_to_file_CSV(use_case,globals.TRAINER_ID,timestamp,globals.DATASET_ID,trained_metrics['sparse_categorical_accuracy'],trained_metrics['loss'])
return written_row
def start_prediction(use_case, developer_id:int = -1):
globals.initialize(use_case,developer_id)
raw_input_prediction_data = "Test sentence. And another sentence which is going to be used as a mean for checking if this article is true or not. Also Santa is real"
raw_input_prediction_data = "Donald Trump Sends Out Embarrassing New Year’s Eve Message. This is Disturbing,'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and the very dishonest fake news media. The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year, President Angry Pants tweeted. 2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America! Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t even allow him to rise above the gutter long enough to wish the American citizens a happy new year! Bishop Talbert Swan (@TalbertSwan) December 31, 2017no one likes you Calvin (@calvinstowell) December 31, 2017Your impeachment would make 2018 a great year for America, but I ll also accept regaining control of Congress. Miranda Yaver (@mirandayaver) December 31, 2017Do you hear yourself talk? When you have to include that many people that hate you you have to wonder? Why do the they all hate me? Alan Sandoval (@AlanSandoval13) December 31, 2017Who uses the word Haters in a New Years wish?? Marlene (@marlene399) December 31, 2017You can t just say happy new year? Koren pollitt (@Korencarpenter) December 31, 2017Here s Trump s New Year s Eve tweet from 2016.Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don t know what to do. Love! Donald J. Trump (@realDonaldTrump) December 31, 2016This is nothing new for Trump. He s been doing this for years.Trump has directed messages to his enemies and haters for New Year s, Easter, Thanksgiving, and the anniversary of 9/11. pic.twitter.com/4FPAe2KypA Daniel Dale (@ddale8) December 31, 2017Trump s holiday tweets are clearly not presidential.How long did he work at Hallmark before becoming President? Steven Goodine (@SGoodine) December 31, 2017He s always been like this . . . the only difference is that in the last few years, his filter has been breaking down. Roy Schulze (@thbthttt) December 31, 2017Who, apart from a teenager uses the term haters? Wendy (@WendyWhistles) December 31, 2017he s a fucking 5 year old Who Knows (@rainyday80) December 31, 2017So, to all the people who voted for this a hole thinking he would change once he got into power, you were wrong! 70-year-old men don t change and now he s a year older.Photo by Andrew Burton/Getty Images."
input_prediction_data= preprocess_single_train_data(raw_input_prediction_data)
prediction_result = make_prediction(input_prediction_data)
res = prediction_result[0]
if (res[0]>=0.50):
return json.dumps({ "result" : "True" })
else:
return json.dumps({ "result" : "False" })
def save_to_file_CSV(use_case,trainer_id,model_id,dataset_id,accuracy,loss):
filename = "processing/"+use_case+"/ledger.csv"
row = [str(trainer_id),str(model_id),str(dataset_id),str(accuracy),str(loss)]
if not (os.path.exists(filename)):
fields = ['Trainer_id','Model_id','Dataset_id','Accuracy','Loss']
with open(filename, 'w') as csvfile:
csvwriter =csv.writer(csvfile)
csvwriter.writerow(fields)
csvwriter.writerow(row)
else:
with open(filename, 'a') as csvfile:
csvwriter =csv.writer(csvfile)
csvwriter.writerow(row)
return row
#start_processing("text_processing")
#start_prediction("text_processing")
import global_hyperparams as globals
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
def get_simple_LSTM_model():
model = Sequential()
model.add(Embedding(globals.VOCAB_SIZE, globals.EMBED_DIM, input_length=globals.MAX_LENGTH))
model.add(Dropout(0.3))
model.add(LSTM(globals.LSTM_OUT))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
return model
import global_hyperparams as globals
from model import get_simple_LSTM_model
import pickle
import tensorflow as tf
import tensorflow_federated as tff
def model_fn():
keras_model = get_simple_LSTM_model()
return tff.learning.from_keras_model(
keras_model,
input_spec=globals.INPUT_SPEC,
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
def federated_computation_new(train_dataset,test_dataset):
if(globals.INPUT_SPEC == None):
#Should never be reached because INPUT_SPEC is set inside get_preprocessed_train_test_data;
#however, if preprocessed data is ever supplied without that function, INPUT_SPEC would be None, so assign it here.
globals.INPUT_SPEC = train_dataset[0].element_spec
# Training and evaluating the model
iterative_process = tff.learning.build_federated_averaging_process(model_fn,client_optimizer_fn=lambda: tf.keras.optimizers.SGD(lr=0.5))
state = iterative_process.initialize()
print(type(state))
for n in range(globals.EPOCHS):
state, metrics = iterative_process.next(state, train_dataset)
print('round {}, training metrics={}'.format(n+1, metrics))
evaluation = tff.learning.build_federated_evaluation(model_fn)
eval_metrics = evaluation(state.model, train_dataset)
print('Training evaluation metrics={}'.format(eval_metrics))
test_metrics = evaluation(state.model, test_dataset)
print('Test evaluation metrics={}'.format(test_metrics))
####################################################################################################################################
#Save Last Trained Model
import pickle
with open("processing/"+globals.USE_CASE+"/last_model",'wb') as f:
pickle.dump(state, f)
return state,metrics
###############################################################################################
def federated_computation_continue(train_dataset,test_dataset,restored_state):
if(globals.INPUT_SPEC == None):
#Should never be reached because INPUT_SPEC is set inside get_preprocessed_train_test_data;
#however, if preprocessed data is ever supplied without that function, INPUT_SPEC would be None, so assign it here.
globals.INPUT_SPEC = train_dataset[0].element_spec
# Training and evaluating the model
iterative_process = tff.learning.build_federated_averaging_process(model_fn,client_optimizer_fn=lambda: tf.keras.optimizers.SGD(lr=0.5))
state = iterative_process.initialize()
state = restored_state
print(type(state))
for n in range(globals.EPOCHS):
state, metrics = iterative_process.next(state, train_dataset)
print('round {}, training metrics={}'.format(n+1, metrics))
evaluation = tff.learning.build_federated_evaluation(model_fn)
eval_metrics = evaluation(state.model, train_dataset)
print('Training evaluation metrics={}'.format(eval_metrics))
test_metrics = evaluation(state.model, test_dataset)
print('Test evaluation metrics={}'.format(test_metrics))
return state,metrics
###############################################################################################
def make_prediction(input_prediction_data):
try:
with open("processing/"+globals.USE_CASE+"/last_model",'rb') as f:
state = pickle.load(f)
except Exception as e:
print(e)
return None
model_for_inference = get_simple_LSTM_model()
state.model.assign_weights_to(model_for_inference)
predictions = model_for_inference.predict_on_batch(input_prediction_data)
return predictions
def initialize(use_case,trainer_id = 0,dataset_id = 0):
global MAX_LENGTH #Length of the sentences fed into the NN. Similar to an image size (e.g. 100x100 pixels), but 1D.
MAX_LENGTH = 40
global VOCAB_SIZE #tells how many words it memorises. each word is stored as an int.
VOCAB_SIZE = 6000
global NUM_CLIENTS #number of clients in the federated dataset
NUM_CLIENTS = 4
global SHUFFLE_BUFFER
SHUFFLE_BUFFER = 5000 #used in preprocessing
global BATCH_SIZE
BATCH_SIZE = 512 #size of the dataset batches that are later fed into the NN.
global INPUT_SPEC #tells the model what the input data will look like.
INPUT_SPEC = None #must & will be initialized after data preprocessing; currently set from train_dataset[0].element_spec
global EMBED_DIM #dimensionality of the embedding layer in the model.
EMBED_DIM = 10
global LSTM_OUT # output size of the LSTM layer
LSTM_OUT = 100
global EPOCHS #number of epochs the model will be trained
EPOCHS = 15
global TRAINER_ID # ID of the trainer entity.
TRAINER_ID = trainer_id #0 = Owner of the use_case
global DATASET_ID # ID of the dataset used
DATASET_ID = dataset_id #0 = "Main"/Original dataset
global USE_CASE #Use_case name
USE_CASE = use_case
Trainer_id,Model_id,Dataset_id,Accuracy,Loss
0,1623766462,1623766462,0.5,nan
0,1623768325,1623768325,0.25,nan
1,1623768510,1623768510,0.75,nan
0,1623857759,1623857759,0.25,nan
0,1623932014,1623932014,0.0,nan
0,1623934119,1623934119,0.25,nan
0,1623935435,1623935435,0.25,nan
1,1624017183,1624017183,1.0,0.0
1,1624018429,1624018429,0.25,nan
1,1624019715,1624019715,1.0,0.0
1,1624021190,1624021190,0.75,nan
import csv
import os
import json
import time
import global_hyperparams as globals
from preprocessing import get_preprocessed_train_test_data, preprocess_single_train_data
from federated_algorithm import federated_computation_new, make_prediction
def start_processing(use_case, developer_id:int = 0):
globals.initialize(use_case,developer_id)
globals.TRAINER_ID = developer_id
train_dataset, test_dataset= get_preprocessed_train_test_data()
state,metrics = federated_computation_new(train_dataset,test_dataset)
trained_metrics= metrics['train']
timestamp = int(time.time())
globals.DATASET_ID = timestamp
written_row = save_to_file_CSV(use_case,globals.TRAINER_ID,timestamp,globals.DATASET_ID,trained_metrics['sparse_categorical_accuracy'],trained_metrics['loss'])
return written_row
def start_prediction(use_case, developer_id:int = -1):
globals.initialize(use_case,developer_id)
raw_input_prediction_data = "Test sentence. And another sentence which is going to be used as a mean for checking if this article is true or not. Also Santa is real"
raw_input_prediction_data = "Donald Trump Sends Out Embarrassing New Year’s Eve Message. This is Disturbing,'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and the very dishonest fake news media. The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year, President Angry Pants tweeted. 2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America! Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t even allow him to rise above the gutter long enough to wish the American citizens a happy new year! Bishop Talbert Swan (@TalbertSwan) December 31, 2017no one likes you Calvin (@calvinstowell) December 31, 2017Your impeachment would make 2018 a great year for America, but I ll also accept regaining control of Congress. Miranda Yaver (@mirandayaver) December 31, 2017Do you hear yourself talk? When you have to include that many people that hate you you have to wonder? Why do the they all hate me? Alan Sandoval (@AlanSandoval13) December 31, 2017Who uses the word Haters in a New Years wish?? Marlene (@marlene399) December 31, 2017You can t just say happy new year? Koren pollitt (@Korencarpenter) December 31, 2017Here s Trump s New Year s Eve tweet from 2016.Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don t know what to do. Love! Donald J. Trump (@realDonaldTrump) December 31, 2016This is nothing new for Trump. He s been doing this for years.Trump has directed messages to his enemies and haters for New Year s, Easter, Thanksgiving, and the anniversary of 9/11. pic.twitter.com/4FPAe2KypA Daniel Dale (@ddale8) December 31, 2017Trump s holiday tweets are clearly not presidential.How long did he work at Hallmark before becoming President? Steven Goodine (@SGoodine) December 31, 2017He s always been like this . . . the only difference is that in the last few years, his filter has been breaking down. Roy Schulze (@thbthttt) December 31, 2017Who, apart from a teenager uses the term haters? Wendy (@WendyWhistles) December 31, 2017he s a fucking 5 year old Who Knows (@rainyday80) December 31, 2017So, to all the people who voted for this a hole thinking he would change once he got into power, you were wrong! 70-year-old men don t change and now he s a year older.Photo by Andrew Burton/Getty Images."
input_prediction_data= preprocess_single_train_data(raw_input_prediction_data)
prediction_result = make_prediction(input_prediction_data)
res = prediction_result[0]
if (res[0]>=0.50):
return json.dumps({ "result" : "True" })
else:
return json.dumps({ "result" : "False" })
def save_to_file_CSV(use_case,trainer_id,model_id,dataset_id,accuracy,loss):
filename = "processing/"+use_case+"/ledger.csv"
row = [str(trainer_id),str(model_id),str(dataset_id),str(accuracy),str(loss)]
if not (os.path.exists(filename)):
fields = ['Trainer_id','Model_id','Dataset_id','Accuracy','Loss']
with open(filename, 'w') as csvfile:
csvwriter =csv.writer(csvfile)
csvwriter.writerow(fields)
csvwriter.writerow(row)
else:
with open(filename, 'a') as csvfile:
csvwriter =csv.writer(csvfile)
csvwriter.writerow(row)
return row
#start_processing("text_processing")
#start_prediction("text_processing")
import global_hyperparams as globals
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
def get_simple_LSTM_model():
model = Sequential()
model.add(Embedding(globals.VOCAB_SIZE, globals.EMBED_DIM, input_length=globals.MAX_LENGTH))
model.add(Dropout(0.3))
model.add(LSTM(globals.LSTM_OUT))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
return model
import global_hyperparams as globals
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import collections
import tensorflow as tf
def get_raw_data()-> tuple:
real = pd.read_csv("processing/"+globals.USE_CASE+"/db/True.csv")
fake = pd.read_csv("processing/"+globals.USE_CASE+"/db/Fake.csv")
return real,fake
def preprocess_raw_data(real: pd.DataFrame, fake: pd.DataFrame) -> tuple:
# dropping rows that have urls as text and date, real's dates look fine, also dropping ones that have no text
fake_drop = fake.drop(index=[9358,15507,15508,18933])
fake_drop = fake_drop.drop(fake_drop.loc[fake_drop.text == ' '].index)
real_drop = real.drop(real.loc[real.text == ' '].index)
# Give labels to data before combining
fake['label'] = 1
real['label'] = 0
combined = pd.concat([fake, real])
no_reuters = combined.copy()
no_reuters.text = no_reuters.text.str.replace('Reuters', '')
combined = no_reuters.copy()
## train/test split the text data and labels
df_text = combined['text'] #feature column (the article text)
labels = combined['label'] #kept for reference; `target` below holds the label values actually used
target = combined['label'].values
tokenizer = Tokenizer(oov_token = "<OOV>", num_words=6000)
tokenizer.fit_on_texts(df_text)
# MAX_LENGTH = 40
# VOCAB_SIZE = 6000
sequences_train = tokenizer.texts_to_sequences(df_text)
padded_train = pad_sequences(sequences_train, padding = 'post', maxlen=globals.MAX_LENGTH)
#Data_train, data_text, label_train, label_test
X_train, X_test, y_train, y_test = train_test_split(padded_train, target, test_size=0.2)
X_train = tf.convert_to_tensor(X_train)
X_test = tf.convert_to_tensor(X_test)
y_train = tf.convert_to_tensor(y_train)
y_test = tf.convert_to_tensor(y_test)
return X_train,X_test,y_train,y_test
#FED PREPROCESSING
# NUM_CLIENTS = 4
# SHUFFLE_BUFFER = 5000
# BATCH_SIZE = 512
def preprocess(dataset):
def element_fn(x, y):
return collections.OrderedDict([
('x', x),
('y', y)#tf.cast(tf.reshape(y, [1]), tf.float32))
])
return dataset.map(element_fn).shuffle(
globals.SHUFFLE_BUFFER).batch(globals.BATCH_SIZE)
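# NOTE: each simulated client produced by generate_clients_datasets below receives a
# single (x, y) example, since source_x[i] / source_y[i] select one row per client index.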
def generate_clients_datasets(n, source_x, source_y):
clients_dataset=[]
for i in range(n):
dataset=tf.data.Dataset.from_tensor_slices(([source_x[i]], [source_y[i]]))
dataset=preprocess(dataset)
clients_dataset.append(dataset)
return clients_dataset
def get_preprocessed_train_test_data() -> tuple:
    """
    Preprocesses the raw data and builds the per-client train and test datasets.
    Returns the tuple (train_dataset, test_dataset); each element is a list with one
    tf.data.Dataset per simulated client.
    """
real,fake = get_raw_data()
X_train, X_test, y_train, y_test = preprocess_raw_data(real,fake)
train_dataset=generate_clients_datasets(globals.NUM_CLIENTS, X_train, y_train)
test_dataset=generate_clients_datasets(globals.NUM_CLIENTS, X_test, y_test)
globals.INPUT_SPEC = train_dataset[0].element_spec
print("DONE PREPROCESSING")
return train_dataset,test_dataset
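# Example wiring (hypothetical): the hyperparameters in `globals` must be initialised
# before preprocessing, e.g.
# globals.initialize("text_processing")
# train_dataset, test_dataset = get_preprocessed_train_test_data()
# print(len(train_dataset), train_dataset[0].element_spec)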
def preprocess_single_train_data(input_data):
    # Debug override kept from testing; commented out so the caller's text is actually used:
    # input_data = "Test sentence. And another sentence which is going to be used as a mean for checking if this article is true or not. Also Santa is real"
    input_data_list = []
    input_data_list.append(input_data)
    df_text = input_data_list
    # NOTE: fitting a fresh Tokenizer on a single article cannot reproduce the vocabulary
    # learned during training; ideally the tokenizer fitted in preprocess_raw_data should
    # be persisted and reused here.
    tokenizer = Tokenizer(oov_token="<OOV>", num_words=6000)
    tokenizer.fit_on_texts(df_text)
sequences_train = tokenizer.texts_to_sequences(df_text)
padded_train = pad_sequences(sequences_train, padding = 'post', maxlen=globals.MAX_LENGTH)
padded_train = tf.convert_to_tensor(padded_train)
return padded_train
#globals.initialize("text_processing")
#preprocess_single_train_data("test")
\ No newline at end of file
import json
import os
from flask import Response, request
import requests
import pandas as pd
import sys
import network_constants
modules_path = './'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
def last(use_case: str):
csv_path = './processing/'+use_case+'/ledger.csv'
try:
df = pd.read_csv(csv_path)
bottom = df.tail(1)
bottom = bottom.to_json(orient ="records")
        print(bottom[1:-1])
        metricsJson = bottom[1:-1]  # strip the outer list brackets: [ {...} ] -> {...}
return Response(status=200, response=metricsJson)
except Exception as e:
print(e)
return Response(status=400, response="Trained model data doesn't exist")
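# The success response body of last() is the final ledger row as one JSON object,
# e.g. (values hypothetical): {"Trainer_id":1,"Model_id":1624284673,"Dataset_id":1624284673,"Accuracy":0.5,"Loss":null}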
#FORWARD TO GPU SERVER WITH IP AND PORT
# NOTE: this forwarding variant is shadowed by the local-processing definition of
# upload_and_train further below (Python keeps the last definition it executes).
def upload_and_train(use_case: str, developer_id: int):
url = f'https://{network_constants.FEDERATED_TRAINING_HOSTNAME}:{network_constants.FEDERATED_TRAINING_REST_PORT}/api/Developers/use_case/{use_case}/last_train:'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None }
)
return response
use_case_path = 'processing/'+use_case+'/'
sys.path.append(use_case_path)
import main_proc
def upload_and_train(use_case: str, developer_id: int):
#COPY THE NEW DB TO THE FOLDER
#TODO IMPLEMENT HERE
#FORWARD TO GPU SERVER WITH IP AND PORT
#THEN start processing
last_train_metrics = main_proc.start_processing(use_case,developer_id)
print("## Last train metrics")
print (last_train_metrics)
#Trainer_id,Model_id,Dataset_id,Accuracy,Loss
#0,1623160388,0,0.25,nan
metricsJson = trainMetricsToJSON(last_train_metrics)
url = f'https://{network_constants.FEDERATED_TRAINING_HOSTNAME}:{network_constants.FEDERATED_TRAINING_REST_PORT}/api/Developers/use_cases/{use_case}/developer_id/{developer_id}/upload_and_train:'
response = requests.post(
url,
verify = False,
proxies = { "http":None, "https":None },
files= request.files
)
    return Response(status=200, response=metricsJson)
    # return response  # unreachable after the return above; left over from the GPU-forwarding variant
def trainMetricsToJSON(last_train_metrics : list):
metricsDict = dict()
metricsDict["Trainer_id"] = last_train_metrics[0]
metricsDict["Model_id"] = last_train_metrics[1]
metricsDict["Dataset_id"] = last_train_metrics[2]
metricsDict["Accuracy"] = last_train_metrics[3]
metricsDict["Loss"] = last_train_metrics[4]
return json.dumps(metricsDict)
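# Example output of trainMetricsToJSON (metric values hypothetical):
# trainMetricsToJSON([1, 1624284673, 1624284673, 0.5, 0.69])
# -> '{"Trainer_id": 1, "Model_id": 1624284673, "Dataset_id": 1624284673, "Accuracy": 0.5, "Loss": 0.69}'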
upload_and_train("text_processing",1)
last("text_processing")
#upload_and_train("text_processing",1)
#last("text_processing")
@@ -3,76 +3,34 @@ import os
import sys
import shutil
from flask import Response, request
import requests
import pandas as pd
import network_constants
def last(use_case: str):
csv_path = './processing/'+use_case+'/ledger.csv'
try:
df = pd.read_csv(csv_path)
bottom = df.tail(1)
bottom = bottom.to_json(orient ="records")
        print(bottom[1:-1])
        metricsJson = bottom[1:-1]  # strip the outer list brackets: [ {...} ] -> {...}
return Response(status=200, response=metricsJson)
except Exception as e:
print(e)
return Response(status=400, response="Trained model data doesn't exist")
#FORWARD TO GPU SERVER WITH IP AND PORT
url = f'https://{network_constants.FEDERATED_TRAINING_HOSTNAME}:{network_constants.FEDERATED_TRAINING_REST_PORT}/api/Owners/use_case/{use_case}/last_train:'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None }
)
return response
def upload_and_train(use_case: str):
use_case_path = './processing/'+use_case+'/'
#Remove old files
try:
if os.path.exists(use_case_path):
print("Use_case path")
print(use_case_path)
shutil.rmtree(use_case_path)#Deletes old folder with all the files
except OSError as error:
print(error)
        return Response(status=400, response="Error occurred when deleting the old use_case directory")
#Start a new implementation of the model.
#TODO: get the python files and create them locally in the {use_case} folder
try:
os.mkdir(use_case_path)
#COPY DEFAULT FILES
default_path = 'processing/default/'
use_case_path +='/'
#FORWARD TO GPU SERVER WITH IP AND PORT
url = f'https://{network_constants.FEDERATED_TRAINING_HOSTNAME}:{network_constants.FEDERATED_TRAINING_REST_PORT}/api/Owners/use_cases/{use_case}/upload_and_train:'
response = requests.post(
url,
verify = False,
proxies = { "http":None, "https":None },
files= request.files
)
shutil.copyfile(default_path+'main_proc.py',use_case_path+'main_proc.py')
shutil.copyfile(default_path+'__init__.py',use_case_path+'__init__.py')
shutil.copyfile(default_path+'checkpoint_manager.py',use_case_path+'checkpoint_manager.py')
shutil.copyfile(default_path+'federated_algorithm.py',use_case_path+'federated_algorithm.py')
return response
#COPY flask files
#use flask? request.files.getlist('filename')[0] ???
except OSError as error:
print(error)
        return Response(status=400, response="Error occurred when creating/copying the use_case files")
#TODO: after the files are copied, start training
use_case_path = 'processing/'+use_case+'/'
sys.path.append(use_case_path)
import main_proc
last_train_metrics = main_proc.start_processing(use_case,0)
print (last_train_metrics)
metricsJson = trainMetricsToJSON(last_train_metrics)
return Response(status=200, response=metricsJson)
def trainMetricsToJSON(last_train_metrics : list):
metricsDict = dict()
metricsDict["Trainer_id"] = last_train_metrics[0]
metricsDict["Model_id"] = last_train_metrics[1]
metricsDict["Dataset_id"] = last_train_metrics[2]
metricsDict["Accuracy"] = last_train_metrics[3]
metricsDict["Loss"] = last_train_metrics[4]
return json.dumps(metricsDict)
last("text_processing")
upload_and_train("text_processing") #warning it deletes the files
#last("text_processing")
#upload_and_train("text_processing") #warning it deletes the files
@@ -2,17 +2,21 @@
from flask import Response, request
import sys
import requests
import network_constants
# (previous single-argument signature, superseded by the two-argument definition below)
# def check_article(use_case: str):
#     #body = request.STRING
use_case_path = 'processing/'+use_case+'/'
sys.path.append(use_case_path)
import main_proc
def check_article(use_case: str,data_entry: str):
#FORWARD TO GPU SERVER WITH IP AND PORT
url = f'https://{network_constants.FEDERATED_TRAINING_HOSTNAME}:{network_constants.FEDERATED_TRAINING_REST_PORT}/api/Users/use_case/{use_case}/data_entry/{data_entry}/check_article:'
    # url = "google.com"  # placeholder endpoint left from testing; would override the URL above if active
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None }
)
return response
result = main_proc.start_prediction(use_case)
    if result is None:
return Response(status = 404, response="Server doesn't have a trained model. Training of the model should be finished before attempting a prediction.")
return Response(status=200, response=result)
check_article("text_processing")
\ No newline at end of file
#check_article("text_processing")
\ No newline at end of file
@@ -40,48 +40,48 @@ spec:
hostPath:
path: /srv/articonf
type: Directory
---
apiVersion: v1
kind: Service
metadata:
name: federated-learning-db
spec:
type: LoadBalancer
selector:
app: federated-learning-db
ports:
- name: http
port: 27017
targetPort: 27017
nodePort: 30423
protocol: TCP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: federated-learning-db
spec:
replicas: 1
selector:
matchLabels:
app: federated-learning-db
template:
metadata:
labels:
app: federated-learning-db
spec:
containers:
- name: federated-learning-db
image: mongo
env:
- name: MONGO_INITDB_ROOT_USERNAME
value: root
- name: MONGO_INITDB_ROOT_PASSWORD
value: root
ports:
- containerPort: 27017
volumeMounts:
- mountPath: /data/db
name: dbdata
volumes:
- name: dbdata
\ No newline at end of file
# ---
# apiVersion: v1
# kind: Service
# metadata:
# name: federated-learning-db
# spec:
# type: LoadBalancer
# selector:
# app: federated-learning-db
# ports:
# - name: http
# port: 27017
# targetPort: 27017
# nodePort: 30423
# protocol: TCP
# ---
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: federated-learning-db
# spec:
# replicas: 1
# selector:
# matchLabels:
# app: federated-learning-db
# template:
# metadata:
# labels:
# app: federated-learning-db
# spec:
# containers:
# - name: federated-learning-db
# image: mongo
# env:
# - name: MONGO_INITDB_ROOT_USERNAME
# value: root
# - name: MONGO_INITDB_ROOT_PASSWORD
# value: root
# ports:
# - containerPort: 27017
# volumeMounts:
# - mountPath: /data/db
# name: dbdata
# volumes:
# - name: dbdata
\ No newline at end of file
swagger: "2.0"
info:
title: Federated Learning microservice
description: This is the documentation for the federated learning microservice.
version: "1.0.0"
consumes:
- "application/json"
produces:
- "application/json"
basePath: "/api"
# Import security definitions from separate file
securityDefinitions:
$ref: '../../../../modules/security/security_local.yml#securityDefinitions'
paths:
$ref: 'routes.yml#paths'
\ No newline at end of file
@@ -10,3 +10,4 @@ Trainer_id,Model_id,Dataset_id,Accuracy,Loss
1,1624018429,1624018429,0.25,nan
1,1624019715,1624019715,1.0,0.0
1,1624021190,1624021190,0.75,nan
1,1624284673,1624284673,0.5,nan
@@ -30,7 +30,10 @@ def upload_and_train(use_case: str, developer_id: int):
import main_proc
#COPY THE NEW DB TO THE FOLDER
#TODO IMPLEMENT HERE
#file_dict = request.files
#db_File_True = file_dict["dataset_file1"]
#db_File_False = file_dict["dataset_file2"]
#TODO IMPLEMENT HERE
#THEN start processing
@@ -54,5 +57,5 @@ def trainMetricsToJSON(last_train_metrics : list):
return json.dumps(metricsDict)
upload_and_train("text_processing",1)
last("text_processing")
#last("text_processing")
@@ -74,5 +74,5 @@ def trainMetricsToJSON(last_train_metrics : list):
metricsDict["Loss"] = last_train_metrics[4]
return json.dumps(metricsDict)
last("text_processing")
upload_and_train("text_processing") #warning it deletes the files
#last("text_processing")
#upload_and_train("text_processing") #warning it deletes the files
@@ -15,4 +15,4 @@ def check_article(use_case: str):
return Response(status = 404, response="Server doesn't have a trained model. Training of the model should be finished before attempting a prediction.")
return Response(status=200, response=result)
check_article("text_processing")
\ No newline at end of file
#check_article("text_processing")
\ No newline at end of file
@@ -40,48 +40,48 @@ spec:
hostPath:
path: /srv/articonf
type: Directory
---
apiVersion: v1
kind: Service
metadata:
name: federated-training-db
spec:
type: LoadBalancer
selector:
app: federated-training-db
ports:
- name: http
port: 27017
targetPort: 27017
nodePort: 30425
protocol: TCP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: federated-training-db
spec:
replicas: 1
selector:
matchLabels:
app: federated-training-db
template:
metadata:
labels:
app: federated-training-db
spec:
containers:
- name: federated-training-db
image: mongo
env:
- name: MONGO_INITDB_ROOT_USERNAME
value: root
- name: MONGO_INITDB_ROOT_PASSWORD
value: root
ports:
- containerPort: 27017
volumeMounts:
- mountPath: /data/db
name: dbdata
volumes:
- name: dbdata
\ No newline at end of file
# ---
# apiVersion: v1
# kind: Service
# metadata:
# name: federated-training-db
# spec:
# type: LoadBalancer
# selector:
# app: federated-training-db
# ports:
# - name: http
# port: 27017
# targetPort: 27017
# nodePort: 30425
# protocol: TCP
# ---
# apiVersion: apps/v1
# kind: Deployment
# metadata:
# name: federated-training-db
# spec:
# replicas: 1
# selector:
# matchLabels:
# app: federated-training-db
# template:
# metadata:
# labels:
# app: federated-training-db
# spec:
# containers:
# - name: federated-training-db
# image: mongo
# env:
# - name: MONGO_INITDB_ROOT_USERNAME
# value: root
# - name: MONGO_INITDB_ROOT_PASSWORD
# value: root
# ports:
# - containerPort: 27017
# volumeMounts:
# - mountPath: /data/db
# name: dbdata
# volumes:
# - name: dbdata
\ No newline at end of file