In [31]:
class Gngram(object):
"""Searches Google Ngram Viewer for search terms and creates pandas dataframes of results
Arguments: ngrams must be list
years is list of length 2, endpoints (inclusive)\
corpus: 15 is English
If case_insensitive = True, will return df_parents and df_expansion;
otherwise just df_parents
Dataframe objects:
df_parents: a dataframe with years as Index and search terms as columns and
words per million words as value
df_expansion: similar, but with df_parent columns expanded in different cases
where applicable (e.g. the, The, THE)
Dict objects (not necessarily useful, but included for the sake of thoroughness):
ngram_type: dict of column name : Goolge ngram defined type,
e.g. {"the (All): NGRAM", "the": "expansion" ... }
parent_expansion: dict of parent column name : list of expansion column names
e.g. {"the (All)": ["The", "the", "THE"]}
expansion_parent: dict of expansion column name: parent column name
e.g. {'the': 'the (All)', 'The', 'the (All)', ...}
Other:
html: the raw html returned from Google Ngrams Viewer
json: the json object of results extracted from the raw html (it's inline in the html)
url: the Google Ngrams Viewer url that returned the results."""
def __init__(self, ngrams = ['example', 'list'], years=[1800, 2008], corpus=15, smoothing=0, case_insensitive=False):
import pandas as pd
import urllib
self.url = "https://books.google.com/ngrams/graph?content="
for i in range(len(ngrams)):
ngrams[i] = ngrams[i].replace("'", "%27")
ngrams[i] = ngrams[i].replace(" ", "+")
if i > 0:
self.url += '%2C+'
self.url += ngrams[i]
if case_insensitive == True:
self.url += "&case_insensitive=on"
self.url += '&year_start='
self.url += str(years[0])
self.url += '&year_end='
self.url += str(years[1])
self.url += '&corpus='
self.url += str(corpus)
self.url += '&smoothing=0'
self.html = urllib.urlopen(self.url)
self.json = ""
for line in self.html:
if "var data" in line:
self.json = line
break
self.json = self.json.replace('var data =', '')
self.json = self.json.rstrip().lstrip()[:-1]
self.json = eval(self.json)
self.df_parents = pd.DataFrame()
self.df_expansions = pd.DataFrame()
self.parent_expansion = {} # dict of lists
self.expansion_parent = {} # reverse of above, just dict
self.ngram_type = {} # pairs column names and types
for i in range(len(self.json)):
ngram = self.json[i]['ngram']
parent = self.json[i]['parent']
timeseries = self.json[i]['timeseries']
for pos in range(len(timeseries)):
timeseries[pos] *= 1000000 # change from proportion to words per million words
self.ngram_type[ngram] = self.json[i]['type']
df_temp = pd.DataFrame({ngram: timeseries}, index=range(years[0], years[1]+1))
if parent == '':
if len(self.df_parents) == 0:
self.df_parents = df_temp.copy()
else:
self.df_parents = pd.concat([self.df_parents, df_temp], axis=1)
else:
if len(self.df_expansions) == 0:
self.df_expansions = df_temp.copy()
else:
self.df_expansions = pd.concat([self.df_expansions, df_temp], axis=1)
if parent not in self.parent_expansion.keys():
self.parent_expansion[parent] = [ngram]
else:
self.parent_expansion[parent].append(ngram)
self.expansion_parent[ngram] = parent
In [ ]: