In [4]:
import pandas as pd
df = pd.read_sql_table('resultdb_DailyStormer', DB_PATH)
In [5]:
results = []
import json
for result in df.result:
results.append(json.loads(result))
In [6]:
from collections import defaultdict
results_by_type = defaultdict(list)
for result in results:
results_by_type[next(iter(result))].append(result)
[*zip(results_by_type.keys(), [len(results_by_type[x]) for x in results_by_type.keys()])]
Out[6]:
In [44]:
results_by_type['user'][0]['user'].keys()
Out[44]:
In [78]:
results_by_type.keys()
Out[78]:
In [79]:
len(results_by_type['topic']), len(results_by_type['subpage'])
Out[79]:
In [83]:
list(results_by_type['topic'][0]['topic'].keys())
Out[83]:
In [84]:
topics = []
for topic in results_by_type['topic']:
topics.append(topic['topic'])
In [85]:
df_topics = pd.DataFrame(topics)
In [88]:
print(df_topics.shape)
In [ ]:
subtopics = []
for subtopic in results_by_type['subtopic']
In [102]:
posts = []
for post in results_by_type['subpage']:
posts.extend(post['subpage']['posts'])
In [103]:
df_posts = pd.DataFrame(posts)
df_posts.shape
Out[103]:
In [122]:
df_posts.action_code.value_counts()
Out[122]:
In [13]:
users = []
for row in results_by_type['user']:
users.append(row['user']['user'])
In [15]:
df_users = pd.DataFrame(users)
In [146]:
serializable_users = df_users.drop(['invited_by', 'card_badge', 'groups', 'featured_user_badge_ids', 'custom_fields'], 1)
In [147]:
serializable_users.to_sql('users', 'sqlite:////home/sam/data/stormer/stormer.sql', if_exists='replace', index=False)
In [192]:
serializable_posts = df_posts.drop(['actions_summary', 'link_counts', 'polls'], axis=1)
serializable_posts['reply_to_username'] = serializable_posts.reply_to_user.apply(lambda item: (pd.isnull(item) and np.nan) or item.get('username'))
serializable_posts.drop(['reply_to_user'], inplace=True, axis=1)
In [168]:
import numpy as np
In [ ]:
serializable_posts.to_sql('posts', 'sqlite:////home/sam/data/stormer/stormer.sql', if_exists='replace', index=False)