In [16]:
import datetime as dt
from steemdata import SteemData

import pandas as pd
import numpy as np

import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf

# helpers
from toolz import keyfilter

def keep(d, whitelist):
    """Return the items of `d` whose key is contained in `whitelist`."""
    def is_allowed(key):
        return key in whitelist

    return keyfilter(is_allowed, d)

def omit(d, blacklist):
    """Return the items of `d` whose key is NOT contained in `blacklist`."""
    def is_allowed(key):
        return key not in blacklist

    return keyfilter(is_allowed, d)

In [17]:
# Handle used by the queries in this notebook (db['AccountOperations'], db['Posts']).
# NOTE(review): presumably a MongoDB database object exposed by SteemData — confirm.
db = SteemData().db

In [ ]:

Daily Active Users


In [18]:
# Activity window: operations from the last 30 days.
# NOTE(review): `dt.datetime.now()` is naive local time — confirm this matches
# how the `timestamp` field is stored.
time_constraints = {'$gte': dt.datetime.now() - dt.timedelta(days=30)}

# Only votes, comments and transfers are treated as user activity.
conditions = {
    'timestamp': time_constraints,
    'type': {'$in': ['vote', 'comment', 'transfer']},
}

# The account and the timestamp are all the daily count needs.
projection = {'_id': 0, 'timestamp': 1, 'account': 1}

ops_cursor = db['AccountOperations'].find(conditions, projection=projection)
ops = list(ops_cursor.hint([('timestamp', -1)]))

In [19]:
# Reduce every operation to the account and the calendar day it happened on.
ops2 = [
    {'account': op['account'], 'date': op['timestamp'].date()}
    for op in ops
]

In [20]:
from toolz import groupby
from toolz.curried import get

from funcy.colls import pluck
from funcy.seqs import distinct, rest

# Bucket the flattened operations by calendar day.
grouped = groupby(get('date'), ops2)

# One (date, unique account count) tuple per day.
# - A plain set is used for the unique count: `len(distinct(...))` only works
#   when funcy's `distinct` returns a list; where it returns an iterator,
#   `len()` raises TypeError.
# - Sorting by date makes the row order explicit instead of inheriting it from
#   the order in which the query returned documents, so trimming the first and
#   last rows downstream reliably removes the oldest and newest day.
daily_users = sorted(
    (day, len({op['account'] for op in day_ops}))
    for day, day_ops in grouped.items()
)

In [21]:
# Per-day user counts keyed by date, built in one expression (no in-place mutation).
df = pd.DataFrame(daily_users, columns=['date', 'users']).set_index('date')

In [22]:
# The first and last rows are left out of the chart.
# NOTE(review): presumably because the edge days of the window are only
# partially covered — confirm.
dau_plot_options = dict(
    title='Daily Active Users',
    colors=['blue'],
    theme='white',
    legend=False,
    filename='steemdata-30d-user-count',
)
df.iloc[1:-1].iplot(**dau_plot_options)


/home/user/anaconda3/lib/python3.6/site-packages/cufflinks/plotlytools.py:156: FutureWarning:

pandas.tslib is deprecated and will be removed in a future version.
You can access Timestamp as pandas.Timestamp

Out[22]:

In [ ]:

Top Posts by Languages


In [23]:
# Posts created in the last 7 days with more than 3 net votes and more than 1 child.
# NOTE(review): `dt.datetime.now()` is naive local time — confirm this matches
# how the `created` field is stored.
time_constraints = {'$gte': dt.datetime.now() - dt.timedelta(days=7)}
conditions = {
    'created': time_constraints,
    'net_votes': {'$gt': 3},
    'children': {'$gt': 1},
}

# Fields requested for each matching post.
projection = {'_id': 0, 'identifier': 1, 'title': 1, 'author': 1, 'body': 1}

posts_cursor = db['Posts'].find(conditions, projection=projection)
lang_posts = list(posts_cursor)

In [24]:
# Number of posts returned by the query.
len(lang_posts)


Out[24]:
34171

In [25]:
from langdetect import detect_langs
from funcy.colls import pluck
from funcy.seqs import first, last
from toolz.functoolz import compose, thread_last
from contextlib import suppress
from collections import Counter

In [26]:
def detect(body):
    """Return the first entry reported by `detect_langs` for `body`.

    Falls back to an empty list (a falsy sentinel) when detection raises any
    exception or reports no candidates, so callers can filter on truthiness.

    NOTE(review): langdetect results are reportedly non-deterministic unless
    `DetectorFactory.seed` is set — confirm whether reproducibility matters here.
    """
    best_guess = []
    with suppress(Exception):
        candidates = detect_langs(body)
        if candidates:
            best_guess = first(candidates)
    return best_guess

In [27]:
# Run detection over the bodies of posts whose body length exceeds 100 and
# drop the posts for which detection produced nothing (falsy result).
languages = filter(
    bool,
    map(
        detect,
        pluck('body', (post for post in lang_posts if len(post['body']) > 100)),
    ),
)

In [28]:
# Keep only confident guesses (probability above 0.8), reduced to their language code.
# NOTE: this rebinds `languages` from detection results to plain codes, so this
# cell cannot be re-run on its own — re-run the detection cell first.
languages = [guess.lang for guess in filter(None, languages) if guess.prob > 0.8]

# Tally posts per language and display the ten most frequent.
c = Counter(languages)
c.most_common(10)


Out[28]:
[('en', 27205),
 ('ko', 2007),
 ('es', 796),
 ('de', 437),
 ('id', 399),
 ('hr', 191),
 ('pl', 128),
 ('tl', 99),
 ('af', 79),
 ('ja', 70)]

In [29]:
# Share of each of the ten most common languages, as a percentage of all
# confidently detected posts (rounded to three decimals).
normalized = [
    {'language': lang, 'pct_share': round(count / len(languages) * 100, 3)}
    for lang, count in c.most_common(10)
]

In [30]:
# Tabulate the shares with a 1-based index so the row label doubles as the rank.
df = pd.DataFrame(normalized, index=range(1, len(normalized) + 1))

In [31]:
# Preview the five highest-ranked languages.
df.head(5)


Out[31]:
language pct_share
1 en 85.475
2 ko 6.306
3 es 2.501
4 de 1.373
5 id 1.254

In [38]:
import plotly.plotly as py
import plotly.graph_objs as go

# Seven most frequent languages, split into parallel label/value lists for the pie trace.
top_languages = c.most_common(7)
labels = [lang for lang, count in top_languages]
values = [count for lang, count in top_languages]

# NOTE(review): only four colours are listed for seven slices — confirm the
# colouring of the remaining slices is as intended.
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']

slice_outline = dict(color='#000000', width=2)
trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='label',
    textfont=dict(size=20),
    marker=dict(colors=colors, line=slice_outline),
)

# No layout options are set; the chart is left untitled.
layout = go.Layout()

fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='styled_pie_chart')


Out[38]:

In [33]:
## TODO: apply a distinct filter on the author field so the percentages count unique persons rather than the number of posts

In [ ]: