In [1]:
import pandas as pd
import sqlalchemy

In [2]:
df = pd.read_sql_table('resultdb_DailyStormer', 'sqlite:///resultdb_stormer_17-04-01.sqlite')

In [3]:
results = []
import json
for result in df.result:
    results.append(json.loads(result))

In [4]:
from collections import defaultdict
results_by_type = defaultdict(list)
for result in results:
    results_by_type[next(iter(result))].append(result)

[*zip(results_by_type.keys(), [len(results_by_type[x]) for x in results_by_type.keys()])]


Out[4]:
[('categories', 2), ('user', 4905), ('subpage', 43711), ('topic', 51574)]

In [5]:
results_by_type['user'][0]['user'].keys()


Out[5]:
dict_keys(['user_badges', 'badge_types', 'user', 'badges', 'users'])

In [6]:
results_by_type.keys()


Out[6]:
dict_keys(['categories', 'user', 'subpage', 'topic'])

In [7]:
len(results_by_type['topic']), len(results_by_type['subpage'])


Out[7]:
(51574, 43711)

In [8]:
list(results_by_type['topic'][0]['topic'].keys())


Out[8]:
['word_count',
 'visible',
 'like_count',
 'highest_post_number',
 'deleted_at',
 'last_posted_at',
 'id',
 'bookmarked',
 'archived',
 'user_id',
 'title',
 'draft_key',
 'archetype',
 'has_summary',
 'views',
 'draft_sequence',
 'pinned',
 'actions_summary',
 'draft',
 'details',
 'closed',
 'unpinned',
 'participant_count',
 'pinned_until',
 'post_stream',
 'fancy_title',
 'chunk_size',
 'posts_count',
 'slug',
 'created_at',
 'deleted_by',
 'reply_count',
 'pinned_globally',
 'category_id',
 'pinned_at']

In [9]:
topics = []

for topic in results_by_type['topic']:
    topics.append(topic['topic'])

In [10]:
df_topics = pd.DataFrame(topics)

In [11]:
print(df_topics.shape)


(51574, 36)

In [12]:
# subtopics = []

# for subtopic in results_by_type['subtopic']

In [13]:
posts = []

for post in results_by_type['subpage']:
    posts.extend(post['subpage']['posts'])

In [14]:
df_posts = pd.DataFrame(posts)

df_posts.shape


Out[14]:
(874220, 49)

In [15]:
df_posts.action_code.value_counts()


Out[15]:
closed.enabled             79
pinned_globally.enabled    13
closed.disabled             7
pinned.disabled             4
pinned.enabled              2
visible.disabled            1
Name: action_code, dtype: int64

In [16]:
users = []

for row in results_by_type['user']:
    users.append(row['user']['user'])

In [17]:
df_users = pd.DataFrame(users)

In [18]:
# serializable_users = df_users.drop(['invited_by', 'card_badge', 'groups', 'featured_user_badge_ids', 'custom_fields'], 1)

In [19]:
# serializable_users.to_sql('users', 'postgres://richard:stormer@localhost:5432/stormer', if_exists='replace', index=False)

In [20]:
df_users.to_sql('users', 'postgres://stormer:@localhost:5432/stormer', dtype={
    'results': sqlalchemy.types.JSON,
    'invited_by': sqlalchemy.types.JSON,
    'card_badge': sqlalchemy.types.JSON,
    'groups': sqlalchemy.types.JSON,
    'featured_user_badge_ids': sqlalchemy.types.JSON,
    'custom_fields': sqlalchemy.types.JSON}, if_exists='replace', index=False)

In [21]:
df_posts.dtypes


Out[21]:
action_code                      object
actions_summary                  object
admin                              bool
avatar_template                  object
avg_time                        float64
can_delete                         bool
can_edit                           bool
can_recover                        bool
can_view_edit_history              bool
can_wiki                           bool
cooked                           object
cooked_hidden                    object
created_at                       object
deleted_at                       object
display_username                 object
edit_reason                      object
hidden                             bool
hidden_reason_id                float64
id                                int64
incoming_link_count               int64
link_counts                      object
moderator                          bool
name                             object
polls                            object
post_number                       int64
post_type                         int64
primary_group_flair_bg_color     object
primary_group_flair_color        object
primary_group_flair_url          object
primary_group_name               object
quote_count                       int64
read                               bool
reads                             int64
reply_count                       int64
reply_to_post_number            float64
reply_to_user                    object
score                           float64
staff                              bool
topic_id                          int64
topic_slug                       object
trust_level                       int64
updated_at                       object
user_deleted                       bool
user_id                           int64
user_title                       object
username                         object
version                           int64
wiki                               bool
yours                              bool
dtype: object

In [22]:
df_posts.to_sql('posts', 'postgres://stormer:@localhost:5432/stormer', dtype={
    'actions_summary': sqlalchemy.types.JSON,
    'action_code': sqlalchemy.types.JSON,
    'avatar_template': sqlalchemy.types.JSON,
    'cooked': sqlalchemy.types.JSON, 
    'cooked_hidden': sqlalchemy.types.JSON, 
    'created_at': sqlalchemy.types.JSON, 
    'deleted_at': sqlalchemy.types.JSON, 
    'display_username': sqlalchemy.types.JSON, 
    'edit_reason': sqlalchemy.types.JSON, 
    'link_counts': sqlalchemy.types.JSON, 
    'name': sqlalchemy.types.JSON, 
    'polls': sqlalchemy.types.JSON, 
    'primary_group_flair_bg_color': sqlalchemy.types.JSON, 
    'primary_group_flair_color': sqlalchemy.types.JSON, 
    'primary_group_flair_url': sqlalchemy.types.JSON, 
    'primary_group_name': sqlalchemy.types.JSON, 
    'reply_to_user': sqlalchemy.types.JSON, 
    'topic_slug': sqlalchemy.types.JSON, 
    'updated_at': sqlalchemy.types.JSON, 
    'user_title': sqlalchemy.types.JSON, 
    'username': sqlalchemy.types.JSON}, if_exists='replace', index=False)

In [23]:
df_topics.dtypes


Out[23]:
actions_summary          object
archetype                object
archived                   bool
bookmarked               object
category_id               int64
chunk_size                int64
closed                     bool
created_at               object
deleted_at               object
deleted_by               object
details                  object
draft                    object
draft_key                object
draft_sequence           object
expandable_first_post    object
fancy_title              object
has_summary                bool
highest_post_number       int64
id                        int64
last_posted_at           object
like_count                int64
participant_count         int64
pinned                     bool
pinned_at                object
pinned_globally            bool
pinned_until             object
post_stream              object
posts_count               int64
reply_count               int64
slug                     object
title                    object
unpinned                 object
user_id                   int64
views                     int64
visible                    bool
word_count                int64
dtype: object

In [24]:
df_topics.to_sql('topics', 'postgres://stormer:@localhost:5432/stormer', dtype={
    'actions_summary': sqlalchemy.types.JSON,
    'archetype': sqlalchemy.types.JSON,
    'bookmarked': sqlalchemy.types.JSON,
    'created_at': sqlalchemy.types.JSON,
    'deleted_at': sqlalchemy.types.JSON,
    'deleted_by': sqlalchemy.types.JSON,
    'details': sqlalchemy.types.JSON,
    'draft': sqlalchemy.types.JSON,
    'draft_key': sqlalchemy.types.JSON,
    'draft_sequence': sqlalchemy.types.JSON,
    'expandable_first_post': sqlalchemy.types.JSON,
    'fancy_title': sqlalchemy.types.JSON,
    'last_posted_at': sqlalchemy.types.JSON,
    'pinned_at': sqlalchemy.types.JSON,
    'pinned_until': sqlalchemy.types.JSON,
    'post_stream': sqlalchemy.types.JSON,
    'slug': sqlalchemy.types.JSON,
    'title': sqlalchemy.types.JSON,
    'unpinned': sqlalchemy.types.JSON}, if_exists='replace', index=False)

In [25]:
# serializable_posts = df_posts.drop(['actions_summary', 'link_counts', 'polls'], axis=1)
# serializable_posts['reply_to_username'] = serializable_posts.reply_to_user.apply(lambda item: (pd.isnull(item) and np.nan) or item.get('username'))
# serializable_posts.drop(['reply_to_user'], inplace=True, axis=1)

In [26]:
# import numpy as np

In [27]:
# serializable_posts.to_sql('posts', 'sqlite:////home/sam/data/stormer/stormer.sql', if_exists='replace', index=False)

In [28]:


In [28]:


In [ ]: