Modularization of Fed Algo

1c2b9a52 · Bogdan Mihai (ARTICONF student) · c53b87b9 · 1c2b9a52 · 1c2b9a52 · 1c2b9a52
Commit 1c2b9a52 authored May 26, 2021 by Bogdan Mihai (ARTICONF student)
7 changed files
--- a/src/participation-hub/federated-learning-microservice/app/processing/__init__.py
+++ b/src/participation-hub/federated-learning-microservice/app/processing/__init__.py
--- a/src/participation-hub/federated-learning-microservice/app/processing/text_processing/__init__.py
+++ b/src/participation-hub/federated-learning-microservice/app/processing/text_processing/__init__.py
--- a/src/participation-hub/federated-learning-microservice/app/processing/text_processing/federated_algorithm.py
+++ b/src/participation-hub/federated-learning-microservice/app/processing/text_processing/federated_algorithm.py
+#import processing.text_processing.global_hyperparams as globals
+import global_hyperparams as globals
+from model import get_simple_LSTM_model
+
+import pandas as pd
+
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+import collections
+import numpy as np
+import tensorflow as tf
+import tensorflow_federated as tff
+
+
+
+def model_fn():
+  """Build a new TFF learning model around the simple LSTM Keras model.
+
+  Passed as a factory to the federated averaging and federated evaluation
+  builders, so a fresh Keras model is constructed on every call.
+
+  The Keras model ends in a single sigmoid unit, i.e. it outputs one
+  probability per example. The matching loss/metric pair is therefore
+  binary cross-entropy on probabilities and binary accuracy; a sparse
+  categorical loss over logits would treat the single output as a one-class
+  logit vector and reject label 1 as out of range.
+
+  Requires globals.INPUT_SPEC to be set before it is called.
+  """
+  keras_model = get_simple_LSTM_model()
+
+  return tff.learning.from_keras_model(
+      keras_model,
+      input_spec=globals.INPUT_SPEC,
+      loss=tf.keras.losses.BinaryCrossentropy(),
+      metrics=[tf.keras.metrics.BinaryAccuracy()])
+
+def federated_computation(train_dataset,test_dataset):
+    """Train with federated averaging and print training and test metrics.
+
+    Args:
+        train_dataset: indexable collection of per-client datasets. It is fed
+            to every training round and to the training-set evaluation; its
+            first element must expose ``element_spec``.
+        test_dataset: per-client datasets used for the final evaluation.
+
+    Returns:
+        None. All metrics are printed to stdout.
+    """
+    if globals.INPUT_SPEC is None:
+        # Normally unreachable: get_preprocessed_train_test_data sets
+        # INPUT_SPEC. If already-processed data is ever supplied without going
+        # through that function, derive the spec from the first client here.
+        globals.INPUT_SPEC = train_dataset[0].element_spec
+
+    # Training and evaluating the model
+    iterative_process = tff.learning.build_federated_averaging_process(
+        model_fn,
+        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
+        client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.5))
+    state = iterative_process.initialize()
+
+    # Each iteration is one federated round over all client datasets.
+    for round_num in range(1, globals.EPOCHS + 1):
+        state, metrics = iterative_process.next(state, train_dataset)
+        print('round  {}, training metrics={}'.format(round_num, metrics))
+
+    evaluation = tff.learning.build_federated_evaluation(model_fn)
+    eval_metrics = evaluation(state.model, train_dataset)
+    print('Training evaluation metrics={}'.format(eval_metrics))
+
+    test_metrics = evaluation(state.model, test_dataset)
+    print('Test evaluation metrics={}'.format(test_metrics))
--- a/src/participation-hub/federated-learning-microservice/app/processing/text_processing/global_hyperparams.py
+++ b/src/participation-hub/federated-learning-microservice/app/processing/text_processing/global_hyperparams.py
@@ -15,5 +15,7 @@ def initialize():
    INPUT_SPEC = None #must & will be initialized after data preprocessing. Currently it's being initialised with: train_dataset[0].element_spec
    global EMBED_DIM # number of dimension of the embedding of the layer in the model.
    EMBED_DIM = 10 
+    global LSTM_OUT # output size of the LSTM layer
+    LSTM_OUT = 100
    global EPOCHS #number of epochs the model will be trained
    EPOCHS = 5 
\ No newline at end of file
--- a/src/participation-hub/federated-learning-microservice/app/processing/text_processing/main_proc.py
+++ b/src/participation-hub/federated-learning-microservice/app/processing/text_processing/main_proc.py
+import os

-import processing.text_processing.global_hyperparams as globals
+# Prints the current working directory at import time.
+# NOTE(review): presumably a debugging aid for the cwd-relative CSV paths and
+# the flat (non-package) imports below - confirm whether it should stay.
+print(os.getcwd())
+
+
+#import processing.text_processing.global_hyperparams as globals
+#from processing.text_processing.preprocessing import get_preprocessed_train_test_data
+
+import global_hyperparams as globals
+from preprocessing import get_preprocessed_train_test_data
+from federated_algorithm import federated_computation

 if __name__ == "__main__":

-    #globals.initialize()
+    # Define the module-level hyperparameters first; the preprocessing and
+    # training steps below read them from the globals module.
+    globals.initialize()
+    # Both values are collections of per-client datasets.
+    train_dataset, test_dataset= get_preprocessed_train_test_data()
+
+    # Runs the federated training rounds and prints the metrics.
+    federated_computation(train_dataset,test_dataset)    
+    print("DONE")
--- a/src/participation-hub/federated-learning-microservice/app/processing/text_processing/model.py
+++ b/src/participation-hub/federated-learning-microservice/app/processing/text_processing/model.py
+
+#import processing.text_processing.global_hyperparams as globals
+import global_hyperparams as globals
+
+import pandas as pd
+
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+import collections
+import numpy as np
+import tensorflow as tf
+import tensorflow_federated as tff
+
+
+def get_simple_LSTM_model():
+    """Return an uncompiled Sequential text classifier.
+
+    Layer stack, in order: Embedding (globals.VOCAB_SIZE x globals.EMBED_DIM,
+    input length globals.MAX_LENGTH), Dropout(0.3), LSTM(globals.LSTM_OUT),
+    Dropout(0.3), Dense(64, relu), Dropout(0.3), Dense(1, sigmoid).
+    """
+    layer_stack = [
+        Embedding(globals.VOCAB_SIZE, globals.EMBED_DIM, input_length=globals.MAX_LENGTH),
+        Dropout(0.3),
+        LSTM(globals.LSTM_OUT),
+        Dropout(0.3),
+        Dense(64, activation='relu'),
+        Dropout(0.3),
+        Dense(1, activation='sigmoid'),
+    ]
+    return Sequential(layer_stack)
\ No newline at end of file
--- a/src/participation-hub/federated-learning-microservice/app/processing/text_processing/preprocessing.py
+++ b/src/participation-hub/federated-learning-microservice/app/processing/text_processing/preprocessing.py

-
+#import processing.text_processing.global_hyperparams as globals
+import global_hyperparams as globals
 import pandas as pd

 from sklearn.model_selection import train_test_split
@@ -12,42 +13,99 @@ import numpy as np
 import tensorflow as tf
 import tensorflow_federated as tff

-real = pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv")
-fake = pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv")
+# real = pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv")
+# fake = pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv")
+
+
+
+def get_raw_data()-> tuple:
+    """Read the prototype real/fake news CSV files.
+
+    Both paths are relative to the current working directory.
+
+    Returns:
+        The tuple (real, fake) of DataFrames, loaded in that order.
+    """
+    return (
+        pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv"),
+        pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv"),
+    )
+
+def preprocess_raw_data(real: pd.DataFrame, fake: pd.DataFrame) -> tuple:
+    """Clean, label, tokenize, pad and split the raw news articles.
+
+    The input frames are left unmodified; labels are added to copies.
+
+    Args:
+        real: genuine articles, with a ``text`` column.
+        fake: fabricated articles, with a ``text`` column. Must contain the
+            row labels 9358, 15507, 15508 and 18933 (known-bad rows of the
+            prototype Fake.csv), otherwise ``drop`` raises KeyError.
+
+    Returns:
+        The tuple (X_train, X_test, y_train, y_test) of tensors. X_* hold
+        token-id sequences post-padded to globals.MAX_LENGTH; y_* hold the
+        labels (1 = fake, 0 = real). 20% of the rows go to the test split.
+    """
+    # Drop the rows that have urls as text and date (real's dates look fine),
+    # then the rows whose text is a single blank.
+    fake_clean = fake.drop(index=[9358,15507,15508,18933])
+    fake_clean = fake_clean[fake_clean.text != ' ']
+    real_clean = real[real.text != ' ']
+
+    # Label the *cleaned* frames before combining. assign() returns new
+    # frames, so the caller's DataFrames are not mutated.
+    combined = pd.concat([fake_clean.assign(label=1), real_clean.assign(label=0)])
+
+    # Strip the literal agency name from the text.
+    # NOTE(review): presumably because it would give away the real articles - confirm.
+    combined['text'] = combined['text'].str.replace('Reuters', '', regex=False)
+
+    df_text = combined['text']
+    target = combined['label'].values
+
+    # The vocabulary cap must match the Embedding input size used by the
+    # model, which is globals.VOCAB_SIZE.
+    # NOTE(review): the tokenizer is fitted on the full corpus, before the
+    # train/test split.
+    tokenizer = Tokenizer(oov_token = "<OOV>", num_words=globals.VOCAB_SIZE)
+    tokenizer.fit_on_texts(df_text)
+
+    sequences = tokenizer.texts_to_sequences(df_text)
+    padded = pad_sequences(sequences, padding = 'post', maxlen=globals.MAX_LENGTH)
+
+    ## train/test split the padded sequences and labels
+    X_train, X_test, y_train, y_test = train_test_split(padded, target, test_size=0.2)
+
+    X_train = tf.convert_to_tensor(X_train)
+    X_test = tf.convert_to_tensor(X_test)
+    y_train = tf.convert_to_tensor(y_train)
+    y_test = tf.convert_to_tensor(y_test)
+
+    return X_train,X_test,y_train,y_test 
+
+#FED PREPROCESSING
+
+# NUM_CLIENTS = 4
+# SHUFFLE_BUFFER = 5000
+# BATCH_SIZE = 512
+
+def preprocess(dataset):
+  """Turn a dataset of (x, y) pairs into shuffled batches of ordered dicts.
+
+  Every element is mapped to an OrderedDict with the keys 'x' and 'y'; the
+  result is shuffled with a buffer of globals.SHUFFLE_BUFFER elements and
+  then batched into groups of globals.BATCH_SIZE.
+  """
+  def element_fn(x, y):
+    # Key order is fixed: 'x' (features) first, then 'y' (label, passed
+    # through unchanged).
+    return collections.OrderedDict([
+        ('x', x),
+        ('y', y)#tf.cast(tf.reshape(y, [1]), tf.float32))
+    ])

-# dropping rows that have urls as text and date, real's dates look fine, also dropping ones that have no text
-fake_drop = fake.drop(index=[9358,15507,15508,18933])
-fake_drop = fake_drop.drop(fake_drop.loc[fake_drop.text == ' '].index)
-real_drop = real.drop(real.loc[real.text == ' '].index)
+  # Order matters: map, then shuffle individual elements, then batch.
+  return dataset.map(element_fn).shuffle(
+      globals.SHUFFLE_BUFFER).batch(globals.BATCH_SIZE)

-# Give labels to data before combining
-fake['label'] = 1
-real['label'] = 0
-combined = pd.concat([fake, real])
+def generate_clients_datasets(n, source_x, source_y):
+    """Partition the samples into ``n`` preprocessed per-client datasets.
+
+    Client ``i`` receives every ``n``-th sample starting at offset ``i``
+    (``source_x[i::n]`` with the matching labels), so all samples are used
+    and the shards differ in size by at most one. Indexing a single row per
+    client instead would hand each client exactly one sample and discard
+    the rest of the data.
+
+    Args:
+        n: number of simulated clients.
+        source_x: features, sliceable along the first axis (one row per sample).
+        source_y: labels aligned with ``source_x``.
+
+    Returns:
+        A list of ``n`` datasets, each already passed through ``preprocess``.
+    """
+    return [
+        preprocess(tf.data.Dataset.from_tensor_slices(
+            (source_x[i::n], source_y[i::n])))
+        for i in range(n)
+    ]

-no_reuters = combined.copy()
-no_reuters.text = no_reuters.text.str.replace('Reuters', '')
-combined = no_reuters.copy()
-## train/test split the text data and labels

-df_text = combined['text'] #features is now
-labels = combined['label'] #or maybe use target?
-target = combined['label'].values
+#^ to be put into db

-tokenizer = Tokenizer(oov_token = "<OOV>", num_words=6000)
-tokenizer.fit_on_texts(df_text)
+def get_preprocessed_train_test_data() -> tuple:
+    """
+    Loads the raw CSV data, preprocesses it and builds the per-client datasets.
+
+    Side effect: sets globals.INPUT_SPEC to the element spec of the first
+    training client dataset.
+
+    returns the tuple: (train_dataset,test_dataset); each one is the list
+    produced by generate_clients_datasets for globals.NUM_CLIENTS clients.
+    """
+    real,fake = get_raw_data()
+    X_train, X_test, y_train, y_test = preprocess_raw_data(real,fake)    

-MAX_LENGTH = 40
-VOCAB_SIZE = 6000
+    train_dataset=generate_clients_datasets(globals.NUM_CLIENTS, X_train, y_train)
+    test_dataset=generate_clients_datasets(globals.NUM_CLIENTS, X_test, y_test)

-sequences_train = tokenizer.texts_to_sequences(df_text)
+    # Publish the element spec through the globals module for later use.
+    globals.INPUT_SPEC = train_dataset[0].element_spec
+    print("DONE PREPROCESSING")
+    return train_dataset,test_dataset

-padded_train = pad_sequences(sequences_train, padding = 'post', maxlen=MAX_LENGTH)

-#Data_train, data_text, label_train, label_test
-X_train, X_test, y_train, y_test = train_test_split(padded_train, target, test_size=0.2)

-X_train = tf.convert_to_tensor(X_train)
-X_test = tf.convert_to_tensor(X_test)
-y_train = tf.convert_to_tensor(y_train)
-y_test = tf.convert_to_tensor(y_test)