In [20]:
import datetime
import glob
import itertools
import time
from io import StringIO
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import ScalarFormatter, LogFormatter, LogFormatterExponent, LogFormatterMathtext
In [3]:
# Shared small-size font properties, used for the legend of the quick-look plot.
fontP = FontProperties(size='small')
In [4]:
%matplotlib inline
In [5]:
def get_word(word):
    """Download the usage CSV for one word or phrase.

    The term is stripped, lower-cased and its spaces are replaced by
    underscores. The CSV is requested from a URL whose directory part is
    made of the first five characters of the normalized term, one
    character per path segment.

    Parameters
    ----------
    word : str
        Word or phrase to look up.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV, or ``None`` (after printing a message) when the
        server answers 404 for this term.

    Notes
    -----
    Any status other than 200 or 404 is retried every 10 seconds with no
    upper bound on the number of attempts.
    """
    old_word = word
    word = word.strip().lower().replace(" ", "_")
    # Up to five leading characters, e.g. "hello" -> "h/e/l/l/o".
    prefix_path = "/".join(word[:5])
    url = "https://projects.fivethirtyeight.com/reddit-data/{}/{}.csv".format(prefix_path, word)
    while True:
        request = requests.get(url)
        if request.status_code == requests.codes.NOT_FOUND:
            print("Word: '{}' was not found.".format(old_word))
            return None
        if request.status_code == requests.codes.ok:
            return pd.read_csv(StringIO(request.text))
        print("We could not get the word '{}'\nsleeping for 10s ...".format(old_word))
        # The original called a bare, never-imported sleep() here, which
        # raised NameError instead of waiting before the retry.
        time.sleep(10)
def get_words(words, fetch=None):
    """Fetch several words and combine them into one date-indexed frame.

    Parameters
    ----------
    words : sequence of str
        Words or phrases to look up. Terms that cannot be found are skipped.
    fetch : callable, optional
        Function mapping a word to a DataFrame (or ``None`` when the word
        is unavailable). Defaults to ``get_word``; mainly useful for
        supplying an offline stand-in.

    Returns
    -------
    pandas.DataFrame or None
        One column per found word, named after the word with surrounding
        whitespace removed and spaces replaced by underscores, indexed by
        ``date``. ``None`` (after printing a message) when no word was
        found or ``words`` is empty.

    Notes
    -----
    The first found frame is kept whole and is expected to consist of a
    date column followed by a single value column; every later frame
    contributes only its ``ngram_fraction`` column. Frames are joined
    side by side on their existing row index.
    NOTE(review): this presumes every word covers the same dates in the
    same order -- confirm against the data source.
    """
    if fetch is None:
        fetch = get_word

    found = []
    for word in words:
        frame = fetch(word)
        if frame is not None:
            found.append((word, frame))

    if not found:
        print("None of the words requested were found.")
        return None

    # Take the date column from the first word that was actually found,
    # not from words[0], which may be missing.
    parts = [found[0][1]]
    parts.extend(frame.ngram_fraction for _, frame in found[1:])

    # concat also copies in the single-frame case, so the fetched frame
    # itself is never relabelled.
    df = pd.concat(parts, axis=1)
    df.columns = ["date"] + [word.strip().replace(" ", "_") for word, _ in found]
    return df.set_index("date")
In [6]:
# Try get_word on a phrase containing a space (get_word turns the space into
# an underscore before building the URL) and preview the last rows.
data = get_word("African American")
data.tail()
Out[6]:
In [7]:
# Try get_word on a single word and preview the last rows of the returned frame.
data = get_word("meh")
data.tail()
Out[7]:
In [8]:
# Exploratory query: for every (prefix, root) pair request the closed,
# hyphenated and spaced spellings, first in the singular and then in the
# plural form. Spellings that are not found are reported and skipped.
data = get_words([
    prefix + separator + root + plural
    for prefix, root in [("mono", "cultural"),
                         ("bi", "racial"),
                         ("inter", "racial"),
                         ("multi", "racial"),
                         ("multi", "ethnic"),
                         ("multi", "cultural"),
                         ("multi", "culturalism")]
    for plural in ("", "s")
    for separator in ("", "-", " ")
])
In [9]:
# Working data set: a narrower list of spellings. Each hyphenated spelling is
# listed next to its unhyphenated form; this call replaces the previous
# contents of `data`.
data = get_words(["monocultural",
"biracial", "bi-racial",
"interracial",
"multiracial",
"multiethnic", "multi-ethnic",
"multicultural", "multi-cultural",
"multiculturalism", "multi-culturalism"])
In [10]:
# Fold the hyphenated spelling into the unhyphenated column.
# Run once per load: re-running adds "bi-racial" a second time.
data["biracial"] = data["biracial"].add(data["bi-racial"])
In [11]:
# Fold the hyphenated spelling into the unhyphenated column.
# Run once per load: re-running adds "multi-ethnic" a second time.
data["multiethnic"] = data["multiethnic"].add(data["multi-ethnic"])
In [12]:
# Fold the hyphenated spelling into the unhyphenated column.
# Run once per load: re-running adds "multi-cultural" a second time.
data["multicultural"] = data["multicultural"].add(data["multi-cultural"])
In [13]:
# Fold the hyphenated spelling into the unhyphenated column.
# Run once per load: re-running adds "multi-culturalism" a second time.
data["multiculturalism"] = data["multiculturalism"].add(data["multi-culturalism"])
In [14]:
# Convert the index labels to datetimes, parsing them as YYYY-MM-DD.
data.index = pd.to_datetime(data.index, format="%Y-%m-%d")
In [15]:
# Quick look at every column: 90-row rolling mean, drawn with pandas'
# default line plot on a 12x6 inch figure with a small-font legend.
days_to_average_over = 90
smoothed_data = data.rolling(window=days_to_average_over).mean()
axis = smoothed_data.plot()
fig = axis.get_figure()
fig.set_size_inches(12, 6)
_ = axis.legend(loc="best", prop=fontP)
In [52]:
def plot(dataframe, words, days_to_average_over=90, log_scale=True,
         y_limits=(1e-7, 1e-5),
         x_limits=(datetime.date(2009, 1, 1), datetime.date(2015, 7, 1))):
    """Draw smoothed curves for the selected words in shades of grey.

    Each word gets its own grey level (black for the first word, lighter
    for later ones) and one of three line styles, cycled in order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with one column per word; its index is used as the x axis.
        NOTE(review): the default ``x_limits`` are dates, so a date-like
        index is presumably expected -- confirm against the caller.
    words : list of str
        Column names to draw, in legend order.
    days_to_average_over : int, optional
        Window of the rolling mean, counted in rows. Default 90.
    log_scale : bool, optional
        When true (default) the y axis is logarithmic and its ticks are
        written as powers of ten.
    y_limits : tuple or None, optional
        ``(bottom, top)`` limits of the y axis; ``None`` leaves them
        automatic. Default ``(1e-7, 1e-5)``.
    x_limits : tuple or None, optional
        ``(left, right)`` limits of the x axis; ``None`` leaves them
        automatic. Default 2009-01-01 to 2015-07-01.

    Returns
    -------
    None
        The figure is created as the current matplotlib figure.
    """
    line_types = itertools.cycle(['-', '--', '-.'])
    smoothed_data = dataframe[words].rolling(window=days_to_average_over).mean()

    _, axis_1 = plt.subplots(figsize=(16, 6))
    word_count = float(len(words))
    for i, word in enumerate(words):
        # A numeric string is read by matplotlib as a grey level in [0, 1].
        axis_1.plot(smoothed_data.index, smoothed_data[word], next(line_types),
                    linewidth=2, color=str(i / word_count))

    if log_scale:
        axis_1.set_yscale('log')
        axis_1.yaxis.set_major_formatter(LogFormatterMathtext())
    if y_limits is not None:
        axis_1.set_ylim(bottom=y_limits[0], top=y_limits[1])
    if x_limits is not None:
        axis_1.set_xlim(left=x_limits[0], right=x_limits[1])

    font = {'family': "Times New Roman",
            'color': 'black',
            'size': 12}
    axis_1.set_ylabel("Percentage of N-Grams (%)", font)
    axis_1.set_xlabel("Year", font)
    # loc=2 is the upper-left corner; ncol=7 lays the entries out in up to seven columns.
    axis_1.legend(words, loc=2, ncol=7)
In [53]:
# Draw the final figure for the seven unhyphenated terms; the list order
# determines both the legend order and each curve's grey level.
words = ["biracial", "multiethnic", "multicultural", "multiculturalism",
"interracial", "monocultural", "multiracial"]
plot(data,words)
In [ ]: