In [1]:
import pandas as pd
import sqlalchemy
In [2]:
df = pd.read_sql_table('resultdb_DailyStormer', 'sqlite:///resultdb_stormer_17-04-01.sqlite')
In [3]:
results = []
import json
for result in df.result:
results.append(json.loads(result))
In [4]:
from collections import defaultdict
results_by_type = defaultdict(list)
for result in results:
results_by_type[next(iter(result))].append(result)
[*zip(results_by_type.keys(), [len(results_by_type[x]) for x in results_by_type.keys()])]
Out[4]:
In [5]:
results_by_type['user'][0]['user'].keys()
Out[5]:
In [6]:
results_by_type.keys()
Out[6]:
In [7]:
len(results_by_type['topic']), len(results_by_type['subpage'])
Out[7]:
In [8]:
list(results_by_type['topic'][0]['topic'].keys())
Out[8]:
In [9]:
topics = []
for topic in results_by_type['topic']:
topics.append(topic['topic'])
In [10]:
df_topics = pd.DataFrame(topics)
In [11]:
print(df_topics.shape)
In [12]:
# subtopics = []
# for subtopic in results_by_type['subtopic']
In [13]:
posts = []
for post in results_by_type['subpage']:
posts.extend(post['subpage']['posts'])
In [14]:
df_posts = pd.DataFrame(posts)
df_posts.shape
Out[14]:
In [15]:
df_posts.action_code.value_counts()
Out[15]:
In [16]:
users = []
for row in results_by_type['user']:
users.append(row['user']['user'])
In [17]:
df_users = pd.DataFrame(users)
In [18]:
# serializable_users = df_users.drop(['invited_by', 'card_badge', 'groups', 'featured_user_badge_ids', 'custom_fields'], 1)
In [19]:
# serializable_users.to_sql('users', 'postgres://richard:stormer@localhost:5432/stormer', if_exists='replace', index=False)
In [20]:
df_users.to_sql('users', 'postgres://stormer:@localhost:5432/stormer', dtype={
'results': sqlalchemy.types.JSON,
'invited_by': sqlalchemy.types.JSON,
'card_badge': sqlalchemy.types.JSON,
'groups': sqlalchemy.types.JSON,
'featured_user_badge_ids': sqlalchemy.types.JSON,
'custom_fields': sqlalchemy.types.JSON}, if_exists='replace', index=False)
In [21]:
df_posts.dtypes
Out[21]:
In [22]:
df_posts.to_sql('posts', 'postgres://stormer:@localhost:5432/stormer', dtype={
'actions_summary': sqlalchemy.types.JSON,
'action_code': sqlalchemy.types.JSON,
'avatar_template': sqlalchemy.types.JSON,
'cooked': sqlalchemy.types.JSON,
'cooked_hidden': sqlalchemy.types.JSON,
'created_at': sqlalchemy.types.JSON,
'deleted_at': sqlalchemy.types.JSON,
'display_username': sqlalchemy.types.JSON,
'edit_reason': sqlalchemy.types.JSON,
'link_counts': sqlalchemy.types.JSON,
'name': sqlalchemy.types.JSON,
'polls': sqlalchemy.types.JSON,
'primary_group_flair_bg_color': sqlalchemy.types.JSON,
'primary_group_flair_color': sqlalchemy.types.JSON,
'primary_group_flair_url': sqlalchemy.types.JSON,
'primary_group_name': sqlalchemy.types.JSON,
'reply_to_user': sqlalchemy.types.JSON,
'topic_slug': sqlalchemy.types.JSON,
'updated_at': sqlalchemy.types.JSON,
'user_title': sqlalchemy.types.JSON,
'username': sqlalchemy.types.JSON}, if_exists='replace', index=False)
In [23]:
df_topics.dtypes
Out[23]:
In [24]:
df_topics.to_sql('topics', 'postgres://stormer:@localhost:5432/stormer', dtype={
'actions_summary': sqlalchemy.types.JSON,
'archetype': sqlalchemy.types.JSON,
'bookmarked': sqlalchemy.types.JSON,
'created_at': sqlalchemy.types.JSON,
'deleted_at': sqlalchemy.types.JSON,
'deleted_by': sqlalchemy.types.JSON,
'details': sqlalchemy.types.JSON,
'draft': sqlalchemy.types.JSON,
'draft_key': sqlalchemy.types.JSON,
'draft_sequence': sqlalchemy.types.JSON,
'expandable_first_post': sqlalchemy.types.JSON,
'fancy_title': sqlalchemy.types.JSON,
'last_posted_at': sqlalchemy.types.JSON,
'pinned_at': sqlalchemy.types.JSON,
'pinned_until': sqlalchemy.types.JSON,
'post_stream': sqlalchemy.types.JSON,
'slug': sqlalchemy.types.JSON,
'title': sqlalchemy.types.JSON,
'unpinned': sqlalchemy.types.JSON}, if_exists='replace', index=False)
In [25]:
# serializable_posts = df_posts.drop(['actions_summary', 'link_counts', 'polls'], axis=1)
# serializable_posts['reply_to_username'] = serializable_posts.reply_to_user.apply(lambda item: (pd.isnull(item) and np.nan) or item.get('username'))
# serializable_posts.drop(['reply_to_user'], inplace=True, axis=1)
In [26]:
# import numpy as np
In [27]:
# serializable_posts.to_sql('posts', 'sqlite:////home/sam/data/stormer/stormer.sql', if_exists='replace', index=False)
In [28]:
In [28]:
In [ ]: