In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tools import load_nyt_database, word_freq, GetTemporalNIH, GetTemporalNSF, GetTemporalPsyc, IsValidYear
import itertools
from matplotlib.ticker import ScalarFormatter, LogFormatterMathtext
In [1]:
%matplotlib inline
In [3]:
nyt = load_nyt_database()
psyc = GetTemporalPsyc()
nih = GetTemporalNIH()
nsf = GetTemporalNSF()
all_words = set(nsf.columns.tolist() + nih.columns.tolist() + psyc.columns.tolist() + nyt.columns.tolist())
words = [x for x in all_words if x != "Total"]
books = word_freq(words)
books.rename(columns={k:k.strip() for k in books.columns}, inplace=True)
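A quick, purely illustrative look at what was just loaded (assumes `books` is indexed by year, as the plots below treat it):
In [ ]:
# Illustrative check of the pooled vocabulary and the Google Books year coverage.
print(len(words), "terms pooled across the four sources")
print("Google Books year range:", books.index.min(), "-", books.index.max())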
In [4]:
nyt1 = load_nyt_database()
In [5]:
nyt2 = load_nyt_database(False)
In [10]:
print("NYT Historical Average of # of Articles", nyt2.Total.mean())
In [4]:
cultural_groups = {'bi': ['bicultural', 'biracial', 'biethnic', 'interracial'],
                   'mixed': ['mixed race', 'mixed ethnicity', 'other race', 'other ethnicity'],
                   'mono': ['monoracial', 'monoethnic', 'monocultural'],
                   'multi': ['multiethnic', 'multicultural', 'multiracial', 'polycultural', 'polyethnic', 'polyracial'],
                   'theory': ['multiculturalism', 'cultural pluralism', 'polyculturalism']}
In [5]:
for source, data in {"Books": books, "NYT": nyt, "Psyc": psyc, "NIH": nih, "NSF": nsf}.items():
    for k, v in cultural_groups.items():
        # keep only the member terms that actually appear in this source
        v = [v_ for v_ in v if v_ in data]
        # the group column is the row-wise sum of its member terms
        data[k] = data[v].fillna(0).sum(axis=1)
        words.append(k)
words = set(words)
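An illustrative sanity check (not part of the original analysis) that each group column was added to every source:
In [ ]:
# Confirm that every group key now exists as a column in every source.
for source, data in {"Books": books, "NYT": nyt, "Psyc": psyc, "NIH": nih, "NSF": nsf}.items():
    missing = [k for k in cultural_groups if k not in data.columns]
    print(source, "-> missing group columns:", missing if missing else "none")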
In [13]:
def plot_all(words, norm=True):
    # Google Books Ngrams
    y = books[words]
    if norm:
        y = y / y.mean()
    axis_1 = y.plot(figsize=(24, 6))
    axis_1.set_title("Google Book Ngrams", fontsize=24)
    _ = axis_1.legend(loc='upper left', ncol=2)
    axis_1.set_yscale('log')  # You can disable
    if norm:
        axis_1.set_ylim(1e-2, 1e2)  # You can disable
    axis_1.yaxis.set_major_formatter(LogFormatterMathtext())
    axis_1.grid(True)
    axis_1.set_xlim(xmin=1800)
    axis_1.set_xlabel("Year", fontsize=20)
    axis_1.set_ylabel("Percentage of historical average", fontsize=20)

    # New York Times, smoothed with a 10-year rolling mean
    y = nyt[words].rolling(window=10).mean()
    if norm:
        y = y / y.mean()
    axis_2 = y.plot(figsize=(24, 6))
    axis_2.set_title("New York Times", fontsize=24)
    axis_2.set_yscale('log')  # You can disable
    axis_2.set_ylabel("Percentage of historical average", fontsize=20)
    axis_2.set_xlabel("Year", fontsize=20)
    if norm:
        axis_2.set_ylim(1e-2, 1e2)  # You can disable
    else:
        axis_2.set_ylim(ymin=1e-7)  # You can disable
    axis_2.yaxis.set_major_formatter(LogFormatterMathtext())
    axis_2.grid(True)
    axis_2.set_xlim(xmin=1800)
    _ = axis_2.legend(loc='upper left', ncol=2)

    # PsycInfo, NIH, and NSF, each smoothed with a 3-year rolling mean
    for title, data in {"Psyc Info Metadata": psyc, "NIH Grants": nih, "NSF Grants": nsf}.items():
        y = data[words].rolling(window=3).mean()
        if norm:
            y = y / y.mean()
        axis = y.plot(figsize=(24, 6))
        axis.set_title(title, fontsize=24)
        axis.set_xlabel("Year", fontsize=20)
        axis.set_ylabel("Percentage of historical average", fontsize=20)
        # axis.set_yscale('log')  # You can disable
        axis.set_xlim(xmin=1850)
        axis.grid(True)
        _ = axis.legend(loc='upper left', ncol=2)
In [14]:
sample = ["multicultural", "biracial", "bicultural"]
plot_all(sample)
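The same helper accepts norm=False to plot raw values rather than ratios to each series' historical mean:
In [ ]:
plot_all(sample, norm=False)  # raw values instead of ratios to the historical mean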
In [15]:
def StatsFromSeries(y):
    historical_avg = y.mean()
    # first year in which the term appears at all
    years1 = y[y > 0].index
    # first year in which it exceeds its historical average
    years2 = y[y > historical_avg].index
    years1 = np.nan if not len(years1) else int(years1[0])
    years2 = np.nan if not len(years2) else int(years2[0])
    record = {"First Appearance": years1,
              "Passed Hist Avg": years2,
              "Historical Avg": historical_avg}
    return record
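A toy check (hypothetical values, not real data) of what StatsFromSeries returns:
In [ ]:
# Mean is 3.0; the first nonzero value is in 1992; 1994 is the first year above the mean.
toy = pd.Series([0, 0, 1, 2, 5, 10], index=range(1990, 1996))
StatsFromSeries(toy)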
In [18]:
records = []
for source, data in {"Books": books, "NYT": nyt, "Psyc": psyc, "NIH": nih, "NSF": nsf}.items():
    for word in words:
        # smooth the noisier NYT counts with a wider window
        window = 5 if source not in {"NYT"} else 10
        if word not in data:
            print("{} is not found in {}".format(word, source))
            continue
        if source != "Books":
            y = data[word].rolling(window=window).mean()
        else:
            y = data[word]
        record = StatsFromSeries(y)
        record["Word"] = word
        record["Source"] = source
        records.append(record)
df = pd.DataFrame.from_dict(records)
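As an optional view (not in the original notebook), the long-format results can be pivoted to compare the crossover year across sources:
In [ ]:
# Each (Word, Source) pair appears once, so the pivot simply rearranges the years.
df.pivot_table(index="Word", columns="Source", values="Passed Hist Avg")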
In [19]:
df[df.Word == "multicultural"].sort_values(by="Passed Hist Avg")
Out[19]:
In [20]:
df[df.Word == "biracial"].sort_values(by="Passed Hist Avg")
Out[20]:
In [21]:
df[df.Word == "mixed race"].sort_values(by="Passed Hist Avg")
Out[21]:
In [22]:
df[df.Word == "monocultural"].sort_values(by="Passed Hist Avg")
Out[22]:
In [23]:
df[df.Word == "monoracial"].sort_values(by="Passed Hist Avg")
Out[23]:
In [24]:
df[df.Word == "monoethnic"].sort_values(by="Passed Hist Avg")
Out[24]:
In [25]:
df[df.Word == "mono"].sort_values(by="Passed Hist Avg")
Out[25]:
In [26]:
df[df.Word == "bi"].sort_values(by="Passed Hist Avg")
Out[26]:
In [27]:
df[df.Word == "mixed"].sort_values(by="Passed Hist Avg")
Out[27]:
In [28]:
df[df.Word == "multi"].sort_values(by="Passed Hist Avg")
Out[28]:
In [29]:
df[df.Word == "theory"].sort_values(by="Passed Hist Avg")
Out[29]:
In [30]:
sample = ["mono", "bi", "multi", "mixed", "theory"]
plot_all(sample)
In [33]:
foo = psyc[psyc.index == 1850]
foo.values[foo.notnull().values]
Out[33]:
In [34]:
from tools import get_psycinfo_database
psycinfo = get_psycinfo_database()
total_pub = pd.read_csv("data/PsycInfo/PsycInfo Articles Review.csv", index_col=0, parse_dates=True)
total_pub["Year"] = [int(x) for x in total_pub.index.year]
total_pub.set_index("Year", inplace=True)
total_pub.rename(columns={"Articles": "Publications_Count"}, inplace=True)
clean_psycinfo = psycinfo[[IsValidYear(x) for x in psycinfo.Date.values]]
clean_psycinfo = clean_psycinfo.copy()
clean_psycinfo["Year"] = [int(x) for x in clean_psycinfo.Date]
clean_psycinfo["value"] = 1
temporal_psyc = clean_psycinfo.pivot_table(index="Year", columns=["Term"], values="value", aggfunc=np.sum)
#total_counts = total_pub.loc[temporal_psyc.index]
#temporal_psyc.loc[temporal_psyc.index] = (temporal_psyc.values.T / total_counts.values.flatten()).T
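A minimal sketch of the per-year normalization that the commented-out lines suggest, assuming total_pub has a row for every year in temporal_psyc; normalized_psyc is a hypothetical name:
In [ ]:
# Divide each year's term counts by that year's total publication count.
# Assumes total_pub covers every year present in temporal_psyc (missing years become NaN).
total_counts = total_pub.reindex(temporal_psyc.index)["Publications_Count"]
normalized_psyc = temporal_psyc.div(total_counts, axis=0)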