In [40]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
This notebook explores the prominence and evolution of the metaphors, frames, and narratives used when discussing Bitcoin.
Frames, Metaphors, Narratives, Memes:
Others that we might consider including:
Words/phrases that would match multiple frames:
Known Issues:
Sanity Checks (TODO):
In [301]:
import pandas as pd
from pandas.io import gbq
import matplotlib.pyplot as plt
import math
In [302]:
# Google Cloud project passed as `project_id` to the BigQuery reads below.
project_id = 'open-synthesis'
In [304]:
def make_rules(body):
    """Build the SELECT-list fragment that flags which frames a comment mentions.

    Each frame becomes one boolean column (``currency``, ``gold``, ``tulip``,
    ``ponzi``, ``gold_rush``, ``invest``, ``internet``, ``gambling``) computed
    with BigQuery ``REGEXP_CONTAINS`` over the text column ``body``.

    Keyword patterns are matched case-insensitively (``(?i)``) so capitalised
    spellings such as "Ponzi" or "Bubble" are counted. The ``IP`` abbreviation
    is the exception: it stays case-sensitive and is anchored on word
    boundaries, so it does not fire on words that merely contain those two
    letters (e.g. "IPO", "BIP").

    Args:
        body: Name of the text column to search (interpolated into the SQL as-is).

    Returns:
        A string of ``(<condition>) as <frame>,`` lines. It starts with a
        newline and ends with a trailing comma so that another column can
        follow it directly in the SELECT list.
    """
    def has(pattern, ignore_case=True):
        # r"..." is a BigQuery raw string literal, so regex escapes such as \b
        # reach the regex engine instead of being read as SQL string escapes.
        flags = "(?i)" if ignore_case else ""
        return f'REGEXP_CONTAINS({body}, r"{flags}{pattern}")'

    def any_of(*patterns):
        return " or ".join(has(pattern) for pattern in patterns)

    ip_abbreviation = has(r"\bIP\b", ignore_case=False)
    gold_rush = has("gold ?rush")
    rules = [
        ("currency", any_of("currency", "medium of exchange")),
        # "gold rush" is its own frame, so keep it out of plain "gold".
        ("gold", f"{has('gold')} and not {gold_rush}"),
        ("tulip", any_of("tulip", "bubble")),
        ("ponzi", any_of("ponzi", "pyramid scheme")),
        ("gold_rush", any_of("gold ?rush", "pick ?axe?")),
        ("invest", any_of("invest", "asset")),
        ("internet", f"{has('internet protocol')} or {ip_abbreviation}"),
        ("gambling", any_of("gambling", "speculation", "lottery")),
    ]
    return "\n" + "".join(f"({condition}) as {frame},\n" for frame, condition in rules)
def make_reddit_comments_query(table_name):
    """Return the BigQuery SQL that loads flagged comments from one reddit table.

    The query selects ``score``, the frame flag columns produced by
    ``make_rules('body')`` and the comment ``body`` from
    ``open-synthesis.frame.<table_name>``, keeping only rows with ``score > 0``.

    Args:
        table_name: Table inside the ``open-synthesis.frame`` dataset
            (interpolated into the SQL as-is).
    """
    frame_columns = make_rules('body')
    source_table = f"`open-synthesis.frame.{table_name}`"
    return (
        "\nSELECT\n"
        "score,\n"
        f"{frame_columns}\n"
        "body\n"
        f"FROM {source_table}\n"
        "WHERE score > 0\n"
    )
# Hacker News counterpart of the reddit query: same frame flags, but the text
# column is `text` and the popularity column is `ranking`.
hn_comments_query = "\n".join([
    "",
    "SELECT",
    "ranking,",
    make_rules('text'),
    "text",
    "FROM `open-synthesis.frame.hn_bitcoin_comments`",
    "WHERE ranking > 0",
    "",
])
In [ ]:
# Pull the flagged reddit comments from BigQuery, one DataFrame per source table.
bitcoin_comments_2017 = gbq.read_gbq(
    make_reddit_comments_query('reddit_bitcoin_comments'),
    project_id=project_id,
    dialect='standard',
)
bitcoin_comments_2016 = gbq.read_gbq(
    make_reddit_comments_query('reddit_bitcoin_comments_2016_09'),
    project_id=project_id,
    dialect='standard',
)
In [157]:
# Pull the flagged Hacker News comments from BigQuery.
hn_comments_all = gbq.read_gbq(
    hn_comments_query,
    project_id=project_id,
    dialect='standard',
)
In [303]:
# Widen text columns so comment bodies are readable in the preview.
pd.set_option('display.max_colwidth', 150)
# First ten 2016 comments flagged with the currency frame.
bitcoin_comments_2016.query('currency==True')[['score', 'body']].head(10)
Out[303]:
In [168]:
# Frame columns to analyse; the names match the boolean columns produced by make_rules.
# The order here is also the stacking/legend order used by score_histogram.
metaphors = ['invest', 'currency', 'gold', 'tulip', 'internet', 'gambling', 'ponzi', 'gold_rush']
def score_histogram(df, score_field='score'):
    """Plot a stacked histogram of log10 scores, one layer per metaphor.

    For every name in the module-level ``metaphors`` list, the rows of ``df``
    flagged with that metaphor and having ``score_field >= 1`` are selected
    and their scores converted to log10. The layers are drawn on a new
    7x7 figure with a logarithmic count axis.

    Args:
        df: DataFrame with one boolean column per metaphor plus ``score_field``.
        score_field: Name of the numeric popularity column.

    Returns:
        The ``matplotlib.pyplot`` module, as in the original interface.
    """
    log_scores = {}
    for metaphor in metaphors:
        flagged = df.query(f'{metaphor}==True and {score_field} >= 1')
        log_scores[metaphor] = [math.log10(score) for score in flagged[score_field].values]

    plt.figure(figsize=(7, 7))
    plt.hist(log_scores.values(), stacked=True, label=metaphors, log=True)
    plt.legend()
    return plt
In [169]:
# Stacked log-score histograms for the two reddit comment tables.
score_histogram(bitcoin_comments_2016);
score_histogram(bitcoin_comments_2017);
In [170]:
# Same histogram for the Hacker News comments, whose popularity column is `ranking`.
score_histogram(hn_comments_all, score_field='ranking');
Findings:
Looking for natural breaks to determine natural bounds for HN comment ranks:
In [295]:
# Raw distribution of HN comment rankings.
hn_comments_all.hist(column='ranking', bins=10);
# Keep only ranked comments that matched at least one frame.
query = f'ranking >= 1 and ({" or ".join([f"{m}==True" for m in metaphors])})'
# Ask for 20 quantile buckets; duplicates='drop' collapses buckets whose edges coincide.
# The edges are stored as `ranking_bin_edges` rather than `bin`, which would
# shadow the builtin bin() for the rest of the kernel session.
out, ranking_bin_edges = pd.qcut(hn_comments_all.query(query).ranking.values, 20, duplicates='drop', retbins=True)
ranking_bin_edges
Out[295]:
There are so many comments at the low end that there aren't any obvious statistical breaks.
In [300]:
labels = ["zero", "low", "med", "high"]
# just kind of picking some arbitrary cutoffs for now
# NOTE(review): these are bin edges for pd.cut, which by default builds
# right-closed intervals: (0, 1], (1, 10], (10, 20], (20, 844]. Because
# make_binned keeps only values >= 1, the "zero" bin effectively holds values
# equal to 1 (assuming integer rankings; confirm). 844 is presumably the
# largest ranking seen in the data (confirm); anything above it falls outside
# every bin and is not counted.
bins = [0, 1, 10, 20, 844]
def make_binned(m, f='ranking'):
    """Count, per bin label, the HN comments flagged with metaphor ``m``.

    Rows of ``hn_comments_all`` where column ``m`` is True and column ``f`` is
    at least 1 are cut into the module-level ``bins`` and named with ``labels``.

    Args:
        m: Name of the boolean metaphor column.
        f: Name of the numeric column to bin.

    Returns:
        Dict mapping every entry of ``labels`` to the number of values in that bin.
    """
    flagged = hn_comments_all.query(f'{m}==True and {f} >= 1')
    categories = pd.cut(flagged[f].values, bins, labels=labels)
    return {label: sum(1 for value in categories if value == label) for label in labels}
# Counts of flagged comments: one row per rank bin, one column per metaphor.
# make_binned returns its dict in `labels` order, so after the transpose the
# index already is the bin labels. No `df.rows = ...` assignment is needed:
# that is not a pandas API and only sets a stray attribute with a UserWarning.
bin_counts = pd.DataFrame([make_binned(m) for m in metaphors], index=metaphors).transpose()
print(bin_counts)
# Share (in percent) of each metaphor within a rank bin: divide every row by
# its own total. Computed without mutating the counts table, so re-running the
# cell is safe.
df = bin_counts.div(bin_counts.sum(axis=1), axis='index') * 100
print(df)
df.plot.bar(stacked=True);
Discussion
In [ ]: