In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
from os import path
import itertools
import requests
from bs4 import BeautifulSoup
from io import StringIO
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import ScalarFormatter, LogFormatter, LogFormatterExponent, LogFormatterMathtext
import datetime

In [3]:
fontP = FontProperties()
fontP.set_size('small')

In [4]:
%matplotlib inline

In [5]:
def get_word(word):
    old_word = word
    word = word.strip()
    word = word.lower()
    word = word.replace(" ", "_")
    first_four = "/".join(word[:5])
    while True:
        request = requests.get("https://projects.fivethirtyeight.com/reddit-data/{}/{}.csv".format(first_four, word))
        if request.status_code == requests.codes.NOT_FOUND:
            print("Word: '{}' was not found.".format(old_word))
            return None

        if request.status_code == requests.codes.ok:
          return pd.read_csv(StringIO(request.text))
        else:
          print("We could not get the word '{}'\nsleeping for 10s ...".format(old_word))
          sleep(10)

def get_words(words):
    frames = [get_word(words[0])]
    for word in words[1:]:
        df = get_word(word)
        if df is not None:
          frames.append(df.ngram_fraction)
        else:
            frames.append(df)
    valid_words = []
    valid_frames = []
    for i, frame in enumerate(frames):
        if frame is not None:
            valid_words.append(words[i])
            valid_frames.append(frames[i])
    if not valid_frames:
        print("None of the words requested were found.")
        return None
    if len(valid_frames) > 1:
      df = pd.concat(frames, axis=1)
    else:
      df = frames[0]
    df.columns = ["date"] + [x.strip().replace(" ", "_") for x in valid_words]
    df.set_index("date", inplace=True)
    return df

In [6]:
data = get_word("African American")
data.tail()


Out[6]:
date ngram_fraction
2873 2015-08-27 0.000003
2874 2015-08-28 0.000002
2875 2015-08-29 0.000003
2876 2015-08-30 0.000003
2877 2015-08-31 0.000003

In [7]:
data = get_word("meh")
data.tail()


Out[7]:
date ngram_fraction
2873 2015-08-27 0.000030
2874 2015-08-28 0.000031
2875 2015-08-29 0.000031
2876 2015-08-30 0.000030
2877 2015-08-31 0.000031

In [8]:
data = get_words(["monocultural", "mono-cultural", "mono cultural", "monoculturals", "mono-culturals", "mono culturals",
                  "biracial", "bi-racial", "bi racial", "biracials", "bi-racials", "bi racials",
                 "interracial", "inter-racial", "inter racial", "interracials", "inter-racials", "inter racials",
                 "multiracial", "multi-racial", "multi racial", "multiracials", "multi-racials", "multi racials",
                 "multiethnic", "multi-ethnic", "multi ethnic", "multiethnics", "multi-ethnics", "multi ethnics",
                 "multicultural", "multi-cultural", "multi cultural", "multiculturals", "multi-culturals", "multi culturals",
                 "multiculturalism", "multi-culturalism", "multi culturalism", "multiculturalisms",  "multi-culturalisms", "multi culturalisms"])


Word: 'mono-cultural' was not found.
Word: 'mono cultural' was not found.
Word: 'monoculturals' was not found.
Word: 'mono-culturals' was not found.
Word: 'mono culturals' was not found.
Word: 'bi racial' was not found.
Word: 'biracials' was not found.
Word: 'bi-racials' was not found.
Word: 'bi racials' was not found.
Word: 'inter racial' was not found.
Word: 'interracials' was not found.
Word: 'inter-racials' was not found.
Word: 'inter racials' was not found.
Word: 'multi-racial' was not found.
Word: 'multi racial' was not found.
Word: 'multiracials' was not found.
Word: 'multi-racials' was not found.
Word: 'multi racials' was not found.
Word: 'multi ethnic' was not found.
Word: 'multiethnics' was not found.
Word: 'multi-ethnics' was not found.
Word: 'multi ethnics' was not found.
Word: 'multi cultural' was not found.
Word: 'multiculturals' was not found.
Word: 'multi-culturals' was not found.
Word: 'multi culturals' was not found.
Word: 'multi culturalism' was not found.
Word: 'multiculturalisms' was not found.
Word: 'multi-culturalisms' was not found.
Word: 'multi culturalisms' was not found.

In [9]:
data = get_words(["monocultural",
                  "biracial", "bi-racial",
                  "interracial",
                 "multiracial",
                 "multiethnic", "multi-ethnic",
                 "multicultural", "multi-cultural",
                 "multiculturalism", "multi-culturalism"])

In [10]:
data["biracial"] = data["biracial"] + data["bi-racial"]

In [11]:
data["multiethnic"] = data["multiethnic"] + data["multi-ethnic"]

In [12]:
data["multicultural"] = data["multicultural"] + data["multi-cultural"]

In [13]:
data["multiculturalism"] = data["multiculturalism"] + data["multi-culturalism"]

In [14]:
data.index = pd.to_datetime(data.index, format="%Y-%m-%d")

In [15]:
days_to_average_over = 90
smoothed_data = data.rolling(window=days_to_average_over).mean()
axis = smoothed_data.plot()
fig = plt.gcf()
fig.set_size_inches((12,6))
_ = axis.legend(prop=fontP, loc="best")



In [52]:
def plot(dataframe, words):
    #markers = itertools.cycle(['s', 'D', '*', 'p', 'h', 'o', '+', 'd'])
    line_types = itertools.cycle(['-', '--', '-.'])
    df = dataframe[words]
    days_to_average_over = 90
    smoothed_data = df.rolling(window=days_to_average_over).mean()
    plt.figure(figsize=(16,6))

    for i,word in enumerate(words):
        plt.plot(smoothed_data.index, smoothed_data[word], next(line_types), 
                 linewidth=2,color=str(i/float(len(words))))
    axis_1 = plt.gca()

    axis_1.set_yscale('log') # You can disable
    axis_1.set_ylim(ymin=1e-7,ymax=1e-5) # You can disable
    axis_1.set_xlim(xmin=datetime.date(2009,1,1), xmax=datetime.date(2015,7,1))
    axis_1.yaxis.set_major_formatter(LogFormatterMathtext())  # You can disable
    #axis_1.grid(True)

    font = {'family': "Times New Roman",
            'color': 'black',
           'size': 12}
    axis_1.set_ylabel("Percentage of N-Grams (%)", font)
    axis_1.set_xlabel("Year", font)
    #axis_1.yaxis.set_major_formatter(FixedOrderFormatter(-3)) # You can enable
    plt.legend(words, loc=2,ncol=7)

In [53]:
words = ["biracial", "multiethnic", "multicultural", "multiculturalism",
         "interracial", "monocultural", "multiracial"]
plot(data,words)


//anaconda/lib/python3.5/site-packages/matplotlib/scale.py:100: RuntimeWarning: invalid value encountered in less_equal
  a[a <= 0.0] = 1e-300

In [ ]: