Modularization of Fed Algo

parent c53b87b9
# federated_algorithm.py (module name taken from the imports in the main script below)
#import processing.text_processing.global_hyperparams as globals
import global_hyperparams as globals
from model import get_simple_LSTM_model
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import collections
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff

def model_fn():
    keras_model = get_simple_LSTM_model()
    # original: return tff.learning.from_compiled_keras_model(keras_model, sample_batch)
    # NOTE: the model ends in a single sigmoid unit, so a binary cross-entropy loss would be the
    # usual match; the sparse categorical loss below is kept exactly as committed.
    return tff.learning.from_keras_model(
        keras_model,
        input_spec=globals.INPUT_SPEC,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

def federated_computation(train_dataset, test_dataset):
    if globals.INPUT_SPEC is None:
        # Should never be reached, because INPUT_SPEC is set inside get_preprocessed_train_test_data.
        # However, if preprocessed data is ever supplied without that function, INPUT_SPEC would still
        # be None, so assign it here as a fallback.
        globals.INPUT_SPEC = train_dataset[0].element_spec

    # Training and evaluating the model
    iterative_process = tff.learning.build_federated_averaging_process(
        model_fn,
        client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.5))
    state = iterative_process.initialize()
    for n in range(globals.EPOCHS):
        state, metrics = iterative_process.next(state, train_dataset)
        print('round {}, training metrics={}'.format(n + 1, metrics))

    evaluation = tff.learning.build_federated_evaluation(model_fn)
    eval_metrics = evaluation(state.model, train_dataset)
    print('Training evaluation metrics={}'.format(eval_metrics))

    test_metrics = evaluation(state.model, test_dataset)
    print('Test evaluation metrics={}'.format(test_metrics))
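
Since iterative_process.next simply takes a Python list of per-client datasets, the training loop above can be varied. Below is a minimal sketch (not part of the commit) of sampling only a few clients per round; run_rounds_with_client_sampling and clients_per_round are hypothetical names, and it assumes train_dataset is the list returned by generate_clients_datasets:

import random

def run_rounds_with_client_sampling(iterative_process, state, client_datasets,
                                    rounds, clients_per_round=2):
    # Each round, hand only a random subset of the per-client datasets to FedAvg;
    # iterative_process.next accepts any list of datasets matching INPUT_SPEC.
    for n in range(rounds):
        sampled = random.sample(client_datasets, min(clients_per_round, len(client_datasets)))
        state, metrics = iterative_process.next(state, sampled)
        print('round {}, training metrics={}'.format(n + 1, metrics))
    return state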
# global_hyperparams.py
@@ -15,5 +15,7 @@ def initialize():
    INPUT_SPEC = None  # must & will be initialized after data preprocessing; currently set with train_dataset[0].element_spec
    global EMBED_DIM   # number of dimensions of the embedding layer in the model
    EMBED_DIM = 10
    global LSTM_OUT    # output size of the LSTM layer
    LSTM_OUT = 100
    global EPOCHS      # number of epochs (federated rounds) the model will be trained
    EPOCHS = 5
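
The hunk above shows only a fragment of initialize(). Below is a hedged reconstruction of what global_hyperparams.py plausibly provides, assembled from values that appear elsewhere in this commit (MAX_LENGTH/VOCAB_SIZE from the old preprocessing code, NUM_CLIENTS/SHUFFLE_BUFFER/BATCH_SIZE from the commented FED PREPROCESSING block); the actual file may differ:

# Hedged reconstruction for reference only; values taken from constants used elsewhere in this commit.
def initialize():
    global VOCAB_SIZE       # tokenizer vocabulary size (num_words / Embedding input dim)
    VOCAB_SIZE = 6000
    global MAX_LENGTH       # padded sequence length fed to the Embedding layer
    MAX_LENGTH = 40
    global NUM_CLIENTS      # number of simulated federated clients
    NUM_CLIENTS = 4
    global SHUFFLE_BUFFER   # tf.data shuffle buffer size
    SHUFFLE_BUFFER = 5000
    global BATCH_SIZE       # per-client batch size
    BATCH_SIZE = 512
    global INPUT_SPEC       # must & will be initialized after data preprocessing
    INPUT_SPEC = None
    global EMBED_DIM        # embedding dimension
    EMBED_DIM = 10
    global LSTM_OUT         # output size of the LSTM layer
    LSTM_OUT = 100
    global EPOCHS           # number of federated training rounds
    EPOCHS = 5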
# Main script (entry point for the federated training run)
import os
print(os.getcwd())
#import processing.text_processing.global_hyperparams as globals
#from processing.text_processing.preprocessing import get_preprocessed_train_test_data
import global_hyperparams as globals
from preprocessing import get_preprocessed_train_test_data
from federated_algorithm import federated_computation

if __name__ == "__main__":
    globals.initialize()
    train_dataset, test_dataset = get_preprocessed_train_test_data()
    federated_computation(train_dataset, test_dataset)
    print("DONE")
# model.py (module name taken from "from model import get_simple_LSTM_model" above)
#import processing.text_processing.global_hyperparams as globals
import global_hyperparams as globals
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import collections
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff

def get_simple_LSTM_model():
    model = Sequential()
    model.add(Embedding(globals.VOCAB_SIZE, globals.EMBED_DIM, input_length=globals.MAX_LENGTH))
    model.add(Dropout(0.3))
    model.add(LSTM(globals.LSTM_OUT))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    return model
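
A quick, hedged sanity check (not part of the commit) for the model in isolation before it is wrapped by TFF; binary cross-entropy is assumed here because the final layer is a single sigmoid unit, whereas the federated code above pairs it with SparseCategoricalCrossentropy(from_logits=True):

if __name__ == "__main__":
    # Assumes globals.initialize() populates VOCAB_SIZE, EMBED_DIM, MAX_LENGTH and LSTM_OUT.
    globals.initialize()
    model = get_simple_LSTM_model()
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=['accuracy'])
    model.summary()  # prints layer shapes: Embedding -> LSTM -> Dense(64) -> Dense(1)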
# preprocessing.py (module name taken from "from preprocessing import get_preprocessed_train_test_data" above)
#import processing.text_processing.global_hyperparams as globals
import global_hyperparams as globals
import pandas as pd
from sklearn.model_selection import train_test_split
@@ -12,42 +13,99 @@ import numpy as np
import tensorflow as tf
import tensorflow_federated as tff

# real = pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv")
# fake = pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv")

def get_raw_data() -> tuple:
    real = pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv")
    fake = pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv")
    return real, fake

def preprocess_raw_data(real: pd.DataFrame, fake: pd.DataFrame) -> tuple:
    # Dropping rows that have urls as text and date (real's dates look fine), also dropping ones that have no text.
    fake_drop = fake.drop(index=[9358, 15507, 15508, 18933])
    fake_drop = fake_drop.drop(fake_drop.loc[fake_drop.text == ' '].index)
    real_drop = real.drop(real.loc[real.text == ' '].index)
    # NOTE: fake_drop/real_drop are not used below; the undropped frames are labelled and combined as committed.

    # Give labels to data before combining
    fake['label'] = 1
    real['label'] = 0
    combined = pd.concat([fake, real])

    no_reuters = combined.copy()
    no_reuters.text = no_reuters.text.str.replace('Reuters', '')
    combined = no_reuters.copy()

    ## train/test split the text data and labels
    df_text = combined['text']   # features
    labels = combined['label']   # currently unused; target below is used instead
    target = combined['label'].values

    tokenizer = Tokenizer(oov_token="<OOV>", num_words=6000)
    tokenizer.fit_on_texts(df_text)
    # MAX_LENGTH = 40
    # VOCAB_SIZE = 6000
    sequences_train = tokenizer.texts_to_sequences(df_text)
    padded_train = pad_sequences(sequences_train, padding='post', maxlen=globals.MAX_LENGTH)

    # data_train, data_test, label_train, label_test
    X_train, X_test, y_train, y_test = train_test_split(padded_train, target, test_size=0.2)
    X_train = tf.convert_to_tensor(X_train)
    X_test = tf.convert_to_tensor(X_test)
    y_train = tf.convert_to_tensor(y_train)
    y_test = tf.convert_to_tensor(y_test)
    return X_train, X_test, y_train, y_test

# FED PREPROCESSING
# NUM_CLIENTS = 4
# SHUFFLE_BUFFER = 5000
# BATCH_SIZE = 512

def preprocess(dataset):
    def element_fn(x, y):
        return collections.OrderedDict([
            ('x', x),
            ('y', y)  # tf.cast(tf.reshape(y, [1]), tf.float32)
        ])
    return dataset.map(element_fn).shuffle(
        globals.SHUFFLE_BUFFER).batch(globals.BATCH_SIZE)

def generate_clients_datasets(n, source_x, source_y):
    clients_dataset = []
    for i in range(n):
        dataset = tf.data.Dataset.from_tensor_slices(([source_x[i]], [source_y[i]]))
        dataset = preprocess(dataset)
        clients_dataset.append(dataset)
    return clients_dataset
# ^ to be put into db

def get_preprocessed_train_test_data() -> tuple:
    """
    Preprocesses and returns the train and test datasets.
    Returns the tuple: (train_dataset, test_dataset)
    """
    real, fake = get_raw_data()
    X_train, X_test, y_train, y_test = preprocess_raw_data(real, fake)
    train_dataset = generate_clients_datasets(globals.NUM_CLIENTS, X_train, y_train)
    test_dataset = generate_clients_datasets(globals.NUM_CLIENTS, X_test, y_test)
    globals.INPUT_SPEC = train_dataset[0].element_spec
    print("DONE PREPROCESSING")
    return train_dataset, test_dataset
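
A small, hedged usage sketch (not part of the commit) showing the structure that ends up in globals.INPUT_SPEC, which model_fn in federated_algorithm relies on; shapes depend on MAX_LENGTH padding and the tensors produced by preprocess_raw_data:

if __name__ == "__main__":
    globals.initialize()
    train_dataset, test_dataset = get_preprocessed_train_test_data()
    print(len(train_dataset))             # NUM_CLIENTS tf.data.Dataset objects, one per simulated client
    print(train_dataset[0].element_spec)  # OrderedDict with 'x' and 'y' TensorSpecs, used as INPUT_SPEC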