UNI-KLU / SMART · Commits

Commit 1c2b9a52, authored May 26, 2021 by Bogdan Mihai (ARTICONF student)
Modularization of Fed Algo

Parent: c53b87b9
Showing 7 changed files with 182 additions and 32 deletions:
- __init__.py (...ederated-learning-microservice/app/processing/__init__.py): +0 -0
- __init__.py (...g-microservice/app/processing/text_processing/__init__.py): +0 -0
- federated_algorithm.py (...ice/app/processing/text_processing/federated_algorithm.py): +49 -0
- global_hyperparams.py (...vice/app/processing/text_processing/global_hyperparams.py): +2 -0
- main_proc.py (...-microservice/app/processing/text_processing/main_proc.py): +15 -2
- model.py (...ning-microservice/app/processing/text_processing/model.py): +28 -0
- preprocessing.py (...roservice/app/processing/text_processing/preprocessing.py): +88 -30
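
Taken together, the commit splits the previously monolithic text-processing script into separate modules. How they relate can be inferred from the imports in the files below; a rough, illustrative sketch (not stated explicitly in the commit):

```python
# Module layout after this commit (inferred from the imports below; illustrative only):
#
# main_proc.py
#   global_hyperparams.initialize()                   # shared hyperparameters kept as module-level globals
#   preprocessing.get_preprocessed_train_test_data()  # builds per-client tf.data.Datasets, sets globals.INPUT_SPEC
#   federated_algorithm.federated_computation(...)    # FedAvg training + evaluation via TensorFlow Federated
#     model.get_simple_LSTM_model()                   # Keras LSTM wrapped by tff.learning.from_keras_model
```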
src/participation-hub/federated-learning-microservice/app/processing/__init__.py (new empty file)
src/participation-hub/federated-learning-microservice/app/processing/text_processing/__init__.py (new empty file)
src/participation-hub/federated-learning-microservice/app/processing/text_processing/federated_algorithm.py (new file, +49 -0)

```python
#import processing.text_processing.global_hyperparams as globals
import global_hyperparams as globals
from model import get_simple_LSTM_model

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import collections
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff


def model_fn():
    keras_model = get_simple_LSTM_model()
    #return tff.learning.from_compiled_keras_model(keras_model, sample_batch) original
    return tff.learning.from_keras_model(
        keras_model,
        input_spec=globals.INPUT_SPEC,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])


def federated_computation(train_dataset, test_dataset):
    if globals.INPUT_SPEC is None:
        # Should never reach this place because INPUT_SPEC is instantiated inside
        # get_preprocessed_train_test_data. However, if in the future processed data is
        # provided without the preprocessing function, it will be None -> therefore assign it here.
        globals.INPUT_SPEC = train_dataset[0].element_spec

    # Training and evaluating the model
    iterative_process = tff.learning.build_federated_averaging_process(
        model_fn,
        client_optimizer_fn=lambda: tf.keras.optimizers.SGD(lr=0.5))
    state = iterative_process.initialize()
    for n in range(globals.EPOCHS):
        state, metrics = iterative_process.next(state, train_dataset)
        print('round {}, training metrics={}'.format(n + 1, metrics))

    evaluation = tff.learning.build_federated_evaluation(model_fn)
    eval_metrics = evaluation(state.model, train_dataset)
    print('Training evaluation metrics={}'.format(eval_metrics))

    test_metrics = evaluation(state.model, test_dataset)
    print('Test evaluation metrics={}'.format(test_metrics))
```
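
For orientation, the input_spec handed to tff.learning.from_keras_model above is simply the element_spec of the preprocessed client datasets. Given the OrderedDict built by element_fn in preprocessing.py and padding to MAX_LENGTH, it plausibly has the following shape; the concrete dtypes and the value 40 are assumptions, not printed anywhere in this commit:

```python
# Sketch of what globals.INPUT_SPEC ends up looking like (illustrative only;
# the real spec comes from train_dataset[0].element_spec in preprocessing.py).
import collections
import tensorflow as tf

INPUT_SPEC = collections.OrderedDict(
    x=tf.TensorSpec(shape=(None, 40), dtype=tf.int32),  # batch of padded token ids, MAX_LENGTH assumed to be 40
    y=tf.TensorSpec(shape=(None,), dtype=tf.int64),     # batch of labels (1 = fake, 0 = real)
)
```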
src/participation-hub/federated-learning-microservice/app/processing/text_processing/global_hyperparams.py (+2 -0)

Changed hunk (@@ -15,5 +15,7 @@, inside initialize()):

```python
    INPUT_SPEC = None  # must & will be initialized after data preprocessing; currently it is initialised with train_dataset[0].element_spec

    global EMBED_DIM   # number of dimensions of the embedding layer in the model
    EMBED_DIM = 10
    global LSTM_OUT    # output size of the LSTM layer
    LSTM_OUT = 100
    global EPOCHS      # number of epochs the model will be trained for
    EPOCHS = 5
```
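
The hunk shows only the tail of initialize(). The other modules also read NUM_CLIENTS, SHUFFLE_BUFFER, BATCH_SIZE, VOCAB_SIZE, MAX_LENGTH and INPUT_SPEC from this module, so the full file presumably looks roughly like the sketch below; the values not visible in the diff are assumptions taken from the commented-out constants in preprocessing.py and are not part of this commit:

```python
# Hypothetical reconstruction of global_hyperparams.py; only the last few lines of
# initialize() appear in the diff above, the rest is inferred from how the other
# modules use this module.

def initialize():
    global NUM_CLIENTS     # number of simulated federated clients (assumed: 4, from preprocessing.py)
    NUM_CLIENTS = 4
    global SHUFFLE_BUFFER  # tf.data shuffle buffer size (assumed: 5000)
    SHUFFLE_BUFFER = 5000
    global BATCH_SIZE      # per-client batch size (assumed: 512)
    BATCH_SIZE = 512
    global MAX_LENGTH      # padded sequence length (assumed: 40)
    MAX_LENGTH = 40
    global VOCAB_SIZE      # tokenizer vocabulary size (assumed: 6000)
    VOCAB_SIZE = 6000

    global INPUT_SPEC
    INPUT_SPEC = None      # set after preprocessing from train_dataset[0].element_spec

    global EMBED_DIM       # number of dimensions of the embedding layer in the model
    EMBED_DIM = 10
    global LSTM_OUT        # output size of the LSTM layer
    LSTM_OUT = 100
    global EPOCHS          # number of epochs the model will be trained for
    EPOCHS = 5
```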
src/participation-hub/federated-learning-microservice/app/processing/text_processing/main_proc.py (+15 -2)

```python
import os
import processing.text_processing.global_hyperparams as globals
print(os.getcwd())
#import processing.text_processing.global_hyperparams as globals
#from processing.text_processing.preprocessing import get_preprocessed_train_test_data
import global_hyperparams as globals
from preprocessing import get_preprocessed_train_test_data
from federated_algorithm import federated_computation


if __name__ == "__main__":
    #globals.initialize()
    globals.initialize()
    train_dataset, test_dataset = get_preprocessed_train_test_data()
    federated_computation(train_dataset, test_dataset)
    print("DONE")
```
src/participation-hub/federated-learning-microservice/app/processing/text_processing/model.py (new file, +28 -0)

```python
#import processing.text_processing.global_hyperparams as globals
import global_hyperparams as globals

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import collections
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff


def get_simple_LSTM_model():
    model = Sequential()
    model.add(Embedding(globals.VOCAB_SIZE, globals.EMBED_DIM, input_length=globals.MAX_LENGTH))
    model.add(Dropout(0.3))
    model.add(LSTM(globals.LSTM_OUT))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    return model
```
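
A minimal sketch (not part of the commit) of exercising the new module on its own; VOCAB_SIZE and MAX_LENGTH are set explicitly in case initialize() does not define them, using the commented-out values from preprocessing.py:

```python
# Hypothetical standalone check: build the LSTM with the hyperparameter values
# used elsewhere in this commit and inspect its shape. Run from the
# text_processing directory so the flat imports resolve.
import global_hyperparams as globals
from model import get_simple_LSTM_model

globals.initialize()
globals.VOCAB_SIZE = 6000  # assumed value, mirrors "# VOCAB_SIZE = 6000" in preprocessing.py
globals.MAX_LENGTH = 40    # assumed value, mirrors "# MAX_LENGTH = 40" in preprocessing.py

model = get_simple_LSTM_model()
model.summary()  # Embedding -> Dropout -> LSTM -> Dropout -> Dense(64) -> Dropout -> Dense(1)
```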
src/participation-hub/federated-learning-microservice/app/processing/text_processing/preprocessing.py (+88 -30; the former module-level script code is moved into the functions below)

```python
#import processing.text_processing.global_hyperparams as globals
import global_hyperparams as globals
import pandas as pd
from sklearn.model_selection import train_test_split
...
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff

# real = pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv")
# fake = pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv")


def get_raw_data() -> tuple:
    real = pd.read_csv("processing/fake_news/prototype_db_fake_real/True.csv")
    fake = pd.read_csv("processing/fake_news/prototype_db_fake_real/Fake.csv")
    return real, fake


def preprocess_raw_data(real: pd.DataFrame, fake: pd.DataFrame) -> tuple:
    # dropping rows that have urls as text and date, real's dates look fine, also dropping ones that have no text
    fake_drop = fake.drop(index=[9358, 15507, 15508, 18933])
    fake_drop = fake_drop.drop(fake_drop.loc[fake_drop.text == ' '].index)
    real_drop = real.drop(real.loc[real.text == ' '].index)

    # Give labels to data before combining
    fake['label'] = 1
    real['label'] = 0
    combined = pd.concat([fake, real])

    no_reuters = combined.copy()
    no_reuters.text = no_reuters.text.str.replace('Reuters', '')
    combined = no_reuters.copy()

    ## train/test split the text data and labels
    df_text = combined['text']   #features is now
    labels = combined['label']   #or maybe use target? #currently useless???
    target = combined['label'].values

    tokenizer = Tokenizer(oov_token="<OOV>", num_words=6000)
    tokenizer.fit_on_texts(df_text)
    # MAX_LENGTH = 40
    # VOCAB_SIZE = 6000
    sequences_train = tokenizer.texts_to_sequences(df_text)
    padded_train = pad_sequences(sequences_train, padding='post', maxlen=globals.MAX_LENGTH)

    #Data_train, data_text, label_train, label_test
    X_train, X_test, y_train, y_test = train_test_split(padded_train, target, test_size=0.2)
    X_train = tf.convert_to_tensor(X_train)
    X_test = tf.convert_to_tensor(X_test)
    y_train = tf.convert_to_tensor(y_train)
    y_test = tf.convert_to_tensor(y_test)
    return X_train, X_test, y_train, y_test


#FED PREPROCESSING
# NUM_CLIENTS = 4
# SHUFFLE_BUFFER = 5000
# BATCH_SIZE = 512
def preprocess(dataset):
    def element_fn(x, y):
        return collections.OrderedDict([
            ('x', x),
            ('y', y)  #tf.cast(tf.reshape(y, [1]), tf.float32))
        ])
    return dataset.map(element_fn).shuffle(globals.SHUFFLE_BUFFER).batch(globals.BATCH_SIZE)


def generate_clients_datasets(n, source_x, source_y):
    clients_dataset = []
    for i in range(n):
        dataset = tf.data.Dataset.from_tensor_slices(([source_x[i]], [source_y[i]]))
        dataset = preprocess(dataset)
        clients_dataset.append(dataset)
    return clients_dataset


def get_preprocessed_train_test_data() -> tuple:
    """
    Preprocesses and returns the train and test datasets
    returns the tuple: (train_dataset, test_dataset)
    """
    real, fake = get_raw_data()
    X_train, X_test, y_train, y_test = preprocess_raw_data(real, fake)
    train_dataset = generate_clients_datasets(globals.NUM_CLIENTS, X_train, y_train)
    test_dataset = generate_clients_datasets(globals.NUM_CLIENTS, X_test, y_test)
    globals.INPUT_SPEC = train_dataset[0].element_spec
    print("DONE PREPROCESSING")
    return train_dataset, test_dataset
```
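
To make the shape of the per-client datasets concrete, here is a small toy run of the federated helpers above with random tensors instead of the CSVs. It is not part of the commit: NUM_CLIENTS, SHUFFLE_BUFFER and BATCH_SIZE are set explicitly in case initialize() does not define them, using the commented-out values from this file, and it assumes it is run from the text_processing directory:

```python
# Hypothetical illustration of generate_clients_datasets on random data.
import numpy as np
import tensorflow as tf

import global_hyperparams as globals
from preprocessing import generate_clients_datasets

globals.initialize()
globals.NUM_CLIENTS = 4        # assumed, mirrors "# NUM_CLIENTS = 4" above
globals.SHUFFLE_BUFFER = 5000  # assumed, mirrors "# SHUFFLE_BUFFER = 5000" above
globals.BATCH_SIZE = 512       # assumed, mirrors "# BATCH_SIZE = 512" above

# Four padded token sequences of length 40 with binary labels, shaped like the
# tensors preprocess_raw_data() returns.
X = tf.convert_to_tensor(np.random.randint(0, 6000, size=(4, 40)))
y = tf.convert_to_tensor(np.array([0, 1, 0, 1]))

clients = generate_clients_datasets(globals.NUM_CLIENTS, X, y)
print(len(clients))             # one tf.data.Dataset per client
print(clients[0].element_spec)  # OrderedDict with 'x' and 'y' batches -> used as globals.INPUT_SPEC
```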