Commit d4ce41ed authored by Bogdan

Updated the schema

parent 17f6cf8b
...@@ -23,7 +23,8 @@ def add_table(use_case: str, table_name: str): ...@@ -23,7 +23,8 @@ def add_table(use_case: str, table_name: str):
"upvotes", "upvotes",
"percentage_upvoted", "percentage_upvoted",
"n_comments", "n_comments",
"subreddit" "subreddit",
"created_at"
] ]
columns = { c : c for c in columns } columns = { c : c for c in columns }
...@@ -50,10 +51,29 @@ def add_table(use_case: str, table_name: str): ...@@ -50,10 +51,29 @@ def add_table(use_case: str, table_name: str):
def add_layers(use_case:str, table_name: str): def add_layers(use_case:str, table_name: str):
layers = [ layers = [
# { #subreddit is string cannot cluster
# "use_case": use_case,
# "table": table_name,
# "name": "Subreddit_Layer",
# "properties": [
# "UniqueID",
# "subreddit",
# "user_id",
# "title",
# "content",
# "permalink",
# "upvotes",
# "percentage_upvoted",
# "n_comments"
# ],
# "cluster_properties": [
# "subreddit"
# ]
# },
{ {
"use_case": use_case, "use_case": use_case,
"table": table_name, "table": table_name,
"name": "Subreddit_Layer", "name": "Upvotes_Layer", #TODO Probably do something like Total Votes? so we can get a popularity?
"properties": [ "properties": [
"UniqueID", "UniqueID",
"subreddit", "subreddit",
...@@ -63,35 +83,18 @@ def add_layers(use_case:str, table_name: str): ...@@ -63,35 +83,18 @@ def add_layers(use_case:str, table_name: str):
"permalink", "permalink",
"upvotes", "upvotes",
"percentage_upvoted", "percentage_upvoted",
"n_comments" "n_comments",
"created_at"
], ],
"cluster_properties": [ "cluster_properties": [
"subreddit"
]
},
{
"use_case": use_case,
"table": table_name,
"name": "User_Layer",
"properties": [
"UniqueID",
"subreddit",
"user_id",
"title",
"content",
"permalink",
"upvotes", "upvotes",
"percentage_upvoted", "percentage_upvoted"
"n_comments"
],
"cluster_properties": [
"user_id"
] ]
}, },
{ {
"use_case": use_case, "use_case": use_case,
"table": table_name, "table": table_name,
"name": "Upvotes_Layer", #TODO Probably do something like Total Votes? so we can get a popularity? "name": "Percentage_Layer",
"properties": [ "properties": [
"UniqueID", "UniqueID",
"subreddit", "subreddit",
...@@ -101,17 +104,17 @@ def add_layers(use_case:str, table_name: str): ...@@ -101,17 +104,17 @@ def add_layers(use_case:str, table_name: str):
"permalink", "permalink",
"upvotes", "upvotes",
"percentage_upvoted", "percentage_upvoted",
"n_comments" "n_comments",
"created_at"
], ],
"cluster_properties": [ "cluster_properties": [
"upvotes",
"percentage_upvoted" "percentage_upvoted"
] ]
}, },
{ {
"use_case": use_case, "use_case": use_case,
"table": table_name, "table": table_name,
"name": "Percentage_Layer", "name": "Engagement_Layer",
"properties": [ "properties": [
"UniqueID", "UniqueID",
"subreddit", "subreddit",
...@@ -121,16 +124,17 @@ def add_layers(use_case:str, table_name: str): ...@@ -121,16 +124,17 @@ def add_layers(use_case:str, table_name: str):
"permalink", "permalink",
"upvotes", "upvotes",
"percentage_upvoted", "percentage_upvoted",
"n_comments" "n_comments",
"created_at"
], ],
"cluster_properties": [ "cluster_properties": [
"percentage_upvoted" "n_comments"
] ]
}, },
{ {
"use_case": use_case, "use_case": use_case,
"table": table_name, "table": table_name,
"name": "Engagement_Layer", "name": "Time_Layer",
"properties": [ "properties": [
"UniqueID", "UniqueID",
"subreddit", "subreddit",
...@@ -140,10 +144,11 @@ def add_layers(use_case:str, table_name: str): ...@@ -140,10 +144,11 @@ def add_layers(use_case:str, table_name: str):
"permalink", "permalink",
"upvotes", "upvotes",
"percentage_upvoted", "percentage_upvoted",
"n_comments" "n_comments",
"created_at"
], ],
"cluster_properties": [ "cluster_properties": [
"n_comments" "created_at"
] ]
} }
] ]
......
This source diff could not be displayed because it is too large. You can view the blob instead.
[{"id": "lat2of", "user_id": "JamesKBoyd", "title": "California man falls to his death while canyoneering at Death Valley", "content": "", "permalink": "/r/news/comments/lat2of/california_man_falls_to_his_death_while/", "upvotes": 0, "percentage_upvoted": 0.5, "n_comments": 3, "subreddit": "news"}, {"id": "lasy7g", "user_id": "watercolornightmares", "title": "How Rich Hospitals Profit From Patients in Car Crashes", "content": "", "permalink": "/r/news/comments/lasy7g/how_rich_hospitals_profit_from_patients_in_car/", "upvotes": 2, "percentage_upvoted": 0.57, "n_comments": 3, "subreddit": "news"}, {"id": "lasrjq", "user_id": "wilmots1", "title": "Moscow court hears case for jailing Putin critic Navalny", "content": "", "permalink": "/r/news/comments/lasrjq/moscow_court_hears_case_for_jailing_putin_critic/", "upvotes": 9, "percentage_upvoted": 0.84, "n_comments": 0, "subreddit": "news"}]
\ No newline at end of file
...@@ -30,7 +30,7 @@ def send_transaction_to_rest_gateway(transaction: dict): ...@@ -30,7 +30,7 @@ def send_transaction_to_rest_gateway(transaction: dict):
if res.status_code >= 400: if res.status_code >= 400:
raise Exception(f"Error while uploading: {str(res.content)}") raise Exception(f"Error while uploading: {str(res.content)}")
print(res) #print(res) Lots of spam
# file to read the data from # file to read the data from
JSON_DATASET = r'reddit_dataset.json' JSON_DATASET = r'reddit_dataset.json'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment