In [4]:
import pandas as pd

df = pd.read_sql_table('resultdb_DailyStormer', DB_PATH)

In [5]:
results = []
import json
for result in df.result:
    results.append(json.loads(result))

In [6]:
from collections import defaultdict
results_by_type = defaultdict(list)
for result in results:
    results_by_type[next(iter(result))].append(result)

[*zip(results_by_type.keys(), [len(results_by_type[x]) for x in results_by_type.keys()])]


Out[6]:
[('categories', 2), ('user', 4905), ('subpage', 43711), ('topic', 51574)]

In [44]:
results_by_type['user'][0]['user'].keys()


Out[44]:
dict_keys(['users', 'badge_types', 'badges', 'user_badges', 'user'])

In [78]:
results_by_type.keys()


Out[78]:
dict_keys(['subpage', 'user', 'categories', 'topic'])

In [79]:
len(results_by_type['topic']), len(results_by_type['subpage'])


Out[79]:
(41194, 35160)

In [83]:
list(results_by_type['topic'][0]['topic'].keys())


Out[83]:
['posts_count',
 'details',
 'views',
 'fancy_title',
 'highest_post_number',
 'closed',
 'title',
 'word_count',
 'bookmarked',
 'archived',
 'pinned_at',
 'last_posted_at',
 'deleted_by',
 'deleted_at',
 'draft_key',
 'pinned_until',
 'visible',
 'post_stream',
 'category_id',
 'unpinned',
 'draft',
 'id',
 'slug',
 'pinned',
 'draft_sequence',
 'reply_count',
 'has_summary',
 'participant_count',
 'chunk_size',
 'created_at',
 'like_count',
 'user_id',
 'pinned_globally',
 'archetype',
 'actions_summary']

In [84]:
topics = []

for topic in results_by_type['topic']:
    topics.append(topic['topic'])

In [85]:
df_topics = pd.DataFrame(topics)

In [88]:
print(df_topics.shape)


(41194, 36)

In [ ]:
subtopics = []

for subtopic in results_by_type['subtopic']

In [102]:
posts = []

for post in results_by_type['subpage']:
    posts.extend(post['subpage']['posts'])

In [103]:
df_posts = pd.DataFrame(posts)

df_posts.shape


Out[103]:
(703200, 49)

In [122]:
df_posts.action_code.value_counts()


Out[122]:
closed.enabled             46
pinned_globally.enabled    11
closed.disabled             7
pinned.disabled             4
pinned.enabled              2
visible.disabled            1
Name: action_code, dtype: int64

In [13]:
users = []

for row in results_by_type['user']:
    users.append(row['user']['user'])

In [15]:
df_users = pd.DataFrame(users)

In [146]:
serializable_users = df_users.drop(['invited_by', 'card_badge', 'groups', 'featured_user_badge_ids', 'custom_fields'], 1)

In [147]:
serializable_users.to_sql('users', 'sqlite:////home/sam/data/stormer/stormer.sql', if_exists='replace', index=False)

In [192]:
serializable_posts = df_posts.drop(['actions_summary', 'link_counts', 'polls'], axis=1)
serializable_posts['reply_to_username'] = serializable_posts.reply_to_user.apply(lambda item: (pd.isnull(item) and np.nan) or item.get('username'))
serializable_posts.drop(['reply_to_user'], inplace=True, axis=1)

In [168]:
import numpy as np

In [ ]:
serializable_posts.to_sql('posts', 'sqlite:////home/sam/data/stormer/stormer.sql', if_exists='replace', index=False)