In [1]:
import cPickle as pickle
from io import open
import pandas as pd
import numpy as np
import pytz
from types import StringType
In [2]:
import datetime
from datetime import date, datetime
import dateutil
from dateutil import parser
In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
In [4]:
from mpltools import style
# Select the "ggplot" plotting style provided by mpltools.
style.use("ggplot")
In [5]:
from IPython.html.widgets import (interact, RadioButtons, IntSliderWidget, TextWidget,
DropdownWidget, Dropdown, Select, CheckboxWidget, ToggleButtons)
from IPython.display import display
In [6]:
# Load the pickled object that the rest of the notebook uses as `df`.
# The handle is scoped by `with` so it is closed as soon as unpickling
# finishes instead of staying open for the life of the kernel.
# NOTE(security): unpickling can execute arbitrary code -- only load this
# file if it comes from a trusted source.
with open("/data/csc/fb_persons/unified_100percentpeople.pkl", "rb") as fh:
    df = pickle.load(fh)
In [7]:
# Number of rows right after loading, before any filtering.
len(df)
Out[7]:
In [8]:
# Show the first rows to inspect what was loaded.
df.head()
Out[8]:
In [9]:
# Keep only rows whose birth date is known and later than 1 January 1800.
victorian_age = date(1800, 1, 1)
df = df[df.date_of_birth.notnull()]
born_after_cutoff = df.date_of_birth > victorian_age
df = df[born_after_cutoff]
In [10]:
# Number of rows left after the birth-date filter.
len(df)
Out[10]:
In [11]:
def replace_nationality(x):
    """Normalise the spelling of the US nationality in one ``nationality`` cell.

    Parameters
    ----------
    x : str, tuple of str, or a falsy value
        A single nationality, or a tuple of nationalities.

    Returns
    -------
    * ``np.nan`` when ``x`` is falsy (``None``, ``""``, ``()``).
    * For a string: ``"USA"`` if it equals "united states of america"
      ignoring case, otherwise the string itself, unchanged.
    * For a tuple: a tuple of the distinct lower-cased entries in which the
      US entry is replaced by ``"USA"``. The result is built from a set, so
      element order is not guaranteed.
    * ``None`` for any other type (unchanged from the previous behaviour).
    """
    if not x:
        return np.nan
    USA = "USA"
    long_name = "united states of america"
    # `str` is the very same type as `types.StringType` on Python 2 and
    # matches the `isinstance(x, str)` checks used later in this notebook.
    if isinstance(x, str):
        # Fix: a non-US string used to fall off the end of the function and
        # silently become None; it is now passed through unchanged.
        return USA if x.lower() == long_name else x
    if isinstance(x, tuple):
        names = set(y.lower() for y in x)
        if long_name in names:
            names.discard(long_name)
            names.add(USA)
        return tuple(names)
    return None
In [12]:
# Rewrite the nationality column cell by cell with replace_nationality().
df["nationality"] = df["nationality"].apply(replace_nationality)
In [13]:
# Boolean mask, one entry per row: True when the nationality cell is exactly
# 'USA' or is a tuple that contains 'USA'. Only those rows are kept.
americans = [
    (isinstance(entry, StringType) and entry == 'USA')
    or (isinstance(entry, tuple) and 'USA' in entry)
    for entry in df.nationality
]
df = df[americans]
In [14]:
#rows = np.random.choice(df.index.values, 1000000)
# The commented-out line above would draw 1,000,000 random index labels; it
# is not used. `sample` is bound to the very same object as `df` (no copy is
# made), so the in-place column assignment to `df` further down is also
# visible through `sample`.
sample = df
In [15]:
def remove_USA(x):
    """Strip the ``"USA"`` entry from one ``nationality`` cell.

    Parameters
    ----------
    x : str, tuple of str, or a falsy value
        A single nationality, or a tuple of nationalities.

    Returns
    -------
    * ``np.nan`` when ``x`` is falsy, when it is the string "USA" (any
      letter case), or when it is a tuple holding nothing but ``"USA"``.
    * For any other string: the string itself, unchanged.
    * For a tuple: the remaining entries after ``"USA"`` is removed -- the
      bare element when exactly one is left, otherwise a tuple of the
      distinct entries (built from a set, so order is not guaranteed).
    * ``None`` for any other type (unchanged from the previous behaviour).
    """
    if not x:
        return np.nan
    USA = "USA"
    # `str` is the very same type as `types.StringType` on Python 2 and
    # matches the `isinstance(x, str)` checks used later in this notebook.
    if isinstance(x, str):
        # Fix: the old test compared x.lower() with the upper-case "USA",
        # which can never be equal, so "USA" fell through and came back as
        # None. Compare in upper case; other strings pass through unchanged.
        return np.nan if x.upper() == USA else x
    if isinstance(x, tuple):
        others = set(x)
        others.discard(USA)
        if not others:
            return np.nan
        if len(others) == 1:
            return next(iter(others))
        return tuple(others)
    return None
In [16]:
# Rewrite the nationality column cell by cell with remove_USA().
df["nationality"] = df["nationality"].apply(remove_USA)
In [17]:
def dist_plot(field):
    """Bar-plot the ten most frequent values of ``sample[field]``.

    Bar heights are normalised frequencies among the non-missing entries.
    The x-axis label reports what percentage of the rows of the module-level
    ``sample`` frame have a non-missing value for ``field``.
    """
    # Open a fresh figure so the bar chart does not land on an earlier one.
    fig = plt.figure()
    column = sample[field]
    present = column.dropna()
    coverage = 100.0 * len(present) / len(column)
    shares = present.value_counts(normalize=True, dropna=True)
    top_ten = shares[:10]
    ax = top_ten.plot(kind="bar", title=field)
    ax.set_xlabel("{:.1f}% of the {} data is available".format(coverage, field))
In [18]:
# Radio buttons that choose which column dist_plot() draws; every option is
# a (label, value) pair in which both parts are the column name.
fields1 = ["religion", "ethnicity", "profession", "gender"]
field_choices = [(name, name) for name in fields1]
_ = interact(dist_plot, field=RadioButtons(options=field_choices))
In [142]:
def temporal(df, fields, resolution=10, nan_included=False, start=None, end=None):
    """Count rows per birth-year bin and per value of ``fields``.

    Parameters
    ----------
    df : DataFrame
        Must have a ``date_of_birth`` column whose non-missing values are
        ``datetime.date`` objects, plus every column named in ``fields``.
    fields : list of str
        Columns whose values become the columns of the result.
    resolution : int
        Width of a bin in years (must be >= 1); a birth year ``y`` is binned
        to ``(y // resolution) * resolution``.
    nan_included : bool
        When True, missing values in ``fields`` are replaced by the string
        ``"nan"`` so that they are counted as a category of their own.
    start, end : datetime.date or None
        Inclusive bounds on ``date_of_birth``; ``None`` disables that bound.

    Returns
    -------
    (final, norm_final) : tuple of DataFrame
        ``final`` holds the counts, indexed by ``datetime(bin_year, 1, 1)``
        with one column per observed value of ``fields`` (absent
        combinations are 0). ``norm_final`` is ``final`` with each row
        divided by its row sum.

    Raises
    ------
    ValueError
        If ``resolution`` is smaller than 1.
    """
    # Fail early with a clear message; resolution == 0 used to surface as an
    # obscure error from the integer division / datetime(0, 1, 1) below.
    if resolution < 1:
        raise ValueError(
            "resolution must be a positive number of years, got {!r}".format(resolution))

    df = df[np.logical_not(df.date_of_birth.isnull().values)]
    # The None defaults used to be compared against the dates directly;
    # a missing bound now simply means "no limit on this side".
    if start is not None:
        df = df[df.date_of_birth >= start]
    if end is not None:
        df = df[df.date_of_birth <= end]

    if nan_included:
        values = df[fields].fillna("nan")
    else:
        values = df[fields]

    # Index the rows by birth date. Births on or after 2015-01-01 are mapped
    # to NaT and therefore drop out of the grouping below.
    cutoff = date(2015, 1, 1)
    values.index = [datetime(d.year, d.month, d.day) if d < cutoff else pd.NaT
                    for d in df.date_of_birth.values]

    criterion = [(values.index.year // resolution) * resolution]
    criterion.extend(fields)
    # size() counts the rows of each group and always returns a Series, so
    # wrapping it in a DataFrame reliably yields the single column `0` that
    # is renamed below. The previous agg(len) depended on how a particular
    # pandas version aggregates a frame that holds only key columns.
    counted = pd.DataFrame(values.groupby(criterion).size())
    counted = counted.reset_index()
    # "level_0" is the name reset_index() gives the unnamed year-bin level.
    counted = counted.rename(columns={0: "count", "level_0": "yob"})
    counted = counted.set_index("yob")
    counted.index = [datetime(int(y), 1, 1) for y in counted.index]

    final = counted.pivot_table(index=counted.index, values="count", columns=fields)
    final = final.fillna(0)
    final = final[np.logical_not(final.index.isin([pd.NaT]))]
    norm_final = final.div(final.sum(axis=1), axis=0)
    return final, norm_final
In [20]:
def temporal_plot(f1, category, f2, plot_type, start_year, resolution, end_year):
    """Plot counts and normalised counts of ``f2`` over birth-year bins.

    When ``f1`` is "Anyone" the whole module-level ``sample`` is used;
    otherwise only the rows where ``sample[f1] == category``. Two figures are
    drawn (raw counts, then row-normalised counts), each restricted to the
    six columns with the largest totals.
    """
    if f1 == "Anyone":
        subset = sample
        category = ""
    else:
        subset = sample.loc[sample[f1] == category]
        category = ":" + category

    fields = [f2]
    start = date(start_year, 1, 1)
    end = date(end_year, 1, 1)
    counts, shares = temporal(subset, fields, nan_included=False,
                              resolution=resolution, start=start, end=end)

    # Rank the columns by their share of the grand total; keep the last six.
    totals = counts.sum().values
    percentages = totals / float(sum(totals))
    top_six = counts.columns[np.argsort(percentages).flatten()[-6:]].values

    panels = (
        (counts, "Counts of {} for {}{}"),
        (shares, "Normalized Counts of {} for {}{}"),
    )
    for frame, title in panels:
        ax = frame[top_six].plot(kind=plot_type, title=title.format(f2, f1, category))
        ax.legend(loc='upper right', ncol=2, bbox_to_anchor=(2.0, 1.0))
In [318]:
# Interactive front-end for temporal_plot(): choose the filter column (f1)
# and its value (category), the column to break down by (f2), the plot kind,
# the range of birth years and the bin width in years.
fields = ["religion", "ethnicity", "profession", "gender"]
fields2 = ["Anyone"] + fields
_ = interact(temporal_plot,
             f1=ToggleButtons(options=zip(fields2, fields2)),
             category=TextWidget(value="christian"),
             f2=ToggleButtons(options=zip(fields, fields)),
             plot_type=ToggleButtons(options={"area": "area", "line": "line"}),
             start_year=IntSliderWidget(min=1800, max=2010, step=10, value=1800),
             end_year=IntSliderWidget(min=1800, max=2010, step=10, value=2000),
             # The slider used to start at 0, but a bin width of 0 years
             # cannot be binned by temporal() (it divides the birth year by
             # the resolution), so the smallest selectable width is 5 years.
             resolution=IntSliderWidget(min=5, max=50, step=5, value=5),
             )
In [373]:
# Figure: for each of the six most frequent ethnicities, the male-minus-female
# difference of the normalised gender counts (in percentage points) per
# birth-year bin, all drawn on a single axes and saved as a PDF.
f1 = "ethnicity"
f2 = "gender"
start_year = 1750
end_year = 2000
plot_type = "line"
resolution = 20
#fft_axes.set_autoscaley_on(False)
fig, axes = plt.subplots(nrows=1, ncols=1, sharex=False)
#axes.set_autoscalex_on(False)

# These do not change between categories, so they are built once instead of
# on every loop iteration.
fields = [f2]
start = date(start_year, 1, 1)
end = date(end_year, 1, 1)

# A list comprehension instead of filter(...)[:6]: same result on Python 2,
# and it does not rely on filter() returning a list.
categories = [x for x in sample[f1].value_counts().index.values if isinstance(x, str)][:6]
for category in categories:
    tmp = sample.loc[sample[f1] == category]
    # Human-readable legend label, e.g. "african_american" -> "African American".
    category = " ".join([x.capitalize() for x in category.replace('_', ' ').split()])
    df1, df2 = temporal(tmp, fields, nan_included=False, resolution=resolution, start=start, end=end)
    df2[category] = (df2["Male"] - df2["Female"]) * 100
    ## Cleaning data (Not kosher)
    # Bins whose gap is below -50 are overwritten with 100. Only the plotted
    # column is touched now; the previous `df2.loc[mask] = 100` overwrote the
    # whole row, including the "Male"/"Female" shares.
    df2.loc[df2[category] < -50, category] = 100
    ax2 = df2[category].plot(kind=plot_type, ax=axes, alpha=0.75, x_compat=True, linewidth=2, marker="o")

lgd = ax2.legend(loc='upper right', ncol=6, bbox_to_anchor = (1.05, 1.15))
ax2.set_ylabel("Male - Female (%)")
ax2.set_ylim((-20, 120))
(s,e) = ax2.get_xlim()
#_ = ax2.set_xlim((s, e+2500))
_ = ax2.set_xlabel("Year")
fig.set_size_inches((12, 4))
#ax2.annotate('arrowstyle', xy=(0, 5), xycoords='data', xytext=(0, 0), textcoords='offset points', arrowprops=dict(arrowstyle="->"))
plt.savefig("/data/csc/compsocial/freebase/figures/gender_ethnicity_gap.pdf", bbox_inches='tight')# bbox_extra_artist=[lgd])
In [365]:
# Figure with two stacked panels sharing the x axis, for the whole sample:
# top -- counts per 10-year birth bin for the two most frequent genders,
# bottom -- the same two columns as percentages. Saved as a PDF.
f1 = "Anyone"
f2 = "gender"
start_year, end_year = 1800, 2020
plot_type = "line"
resolution = 10

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
tmp = sample
fields = list(set([f2]))
start, end = date(start_year, 1, 1), date(end_year, 1, 1)
df1, df2 = temporal(tmp, fields, nan_included=False, resolution=resolution, start=start, end=end)

# Rank the columns by their share of the grand total; keep the last two.
totals = df1.sum().values
percentages = totals / float(sum(totals))
selected_cols = df1.columns[np.argsort(percentages).flatten()[-2:]].values

# Line styling shared by both panels.
line_style = dict(kind=plot_type, x_compat=True, alpha=0.75, marker='.', linewidth=2)

# Top panel: raw counts. The legend is created and removed again right away,
# so this panel ends up without one.
ax1 = df1[selected_cols].plot(ax=axes[0], **line_style)
lgd1 = ax1.legend(loc='upper right', ncol=1, bbox_to_anchor=(1.275, 1.025))
ax1.set_ylabel("Count")
lgd1.remove()
l, h = ax1.get_ylim()
ax1.set_ylim((-2500, h + .25*h))

# Bottom panel: shares scaled from fractions to percentages.
for gender_col in ("Male", "Female"):
    df2[gender_col] = df2[gender_col] * 100
ax2 = df2[selected_cols].plot(ax=axes[1], **line_style)
lgd2 = ax2.legend(loc='center', ncol=2, bbox_to_anchor=(0.5, 1.25))
ax2.set_ylabel("Percentage (%)")
_ = ax2.set_xlabel("Year")
#ax2.set_xticks()
fig.set_size_inches((12, 4))
ax2.set_ylim((-5, 110))
_ = plt.savefig("/data/csc/compsocial/freebase/figures/gender_gap.pdf", bbox_inches='tight')# bbox_extra_artist=[lgd])
In [421]:
# Figures: normalised profession counts per 5-year birth bin for the rows
# where gender == "Female". Professions are ranked by total count and drawn
# in groups, one PDF per group.
f1 = "gender"
f2 = "profession"
category = "Female"
start_year = 1950
end_year = 1990
plot_type = "line"
resolution = 5
fields = [f2]
tmp = sample.loc[sample[f1] == category]
start = date(start_year, 1, 1)
end = date(end_year, 1, 1)
df1, df2 = temporal(tmp, fields, nan_included=False, resolution=resolution, start=start, end=end)
totals = df1.sum().values
percentages = totals / float(sum(totals))
counts = False  # True plots raw counts (df1) instead of normalised counts (df2)
#fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True)
# Column positions ordered from the largest total to the smallest.
cats = np.argsort(percentages).flatten()[::-1]
cols = df1.columns[cats].values
cols = [x for x in cols if isinstance(x, str)]
# NOTE(review): each window is 7 columns wide but the offset advances by 10,
# so ranks 8-10 of every block of ten are never plotted -- confirm this is
# intended.
# The loop variable used to be called `start`, which overwrote the `start`
# date defined above once the loop had run.
for i, offset in enumerate(range(0, 100, 10)):
    selected_cols = cols[offset: offset+7]
    if not selected_cols: continue
    if counts:
        # This branch used to pass ax=axes[0], but `axes` is not created in
        # this cell (the subplots line above is commented out), so it would
        # have drawn on a figure left over from an earlier cell. It now opens
        # its own figure, like the else branch.
        ax = df1[selected_cols].plot(kind=plot_type, marker='o', alpha=0.75, linewidth=2)
    else:
        ax = df2[selected_cols].plot(kind=plot_type, marker='o', alpha=0.75, linewidth=2,
                                     )#ax=axes[i/2, i%2])
    ax.set_yscale('log')# yaxis('log')
    _ = ax.set_xlabel("Year")
    (s,e) = ax.get_xlim()
    # Pad the x range by 5 on both sides (this line used to appear twice).
    _ = ax.set_xlim((s-5, e+5))
    # Raw string: same characters as before, without relying on "\P" and "\m"
    # being left alone as unrecognised escape sequences.
    ax.set_ylabel(r"$\Pr(Profession \mid Gender=Female)$", fontsize=12)
    handles, labels = ax.get_legend_handles_labels()
    labels = [x.capitalize().replace('_', ' / ') for x in labels]
    ax.legend(handles, labels, loc='upper right', ncol=1, bbox_to_anchor = (1.425, 1.025))
    plt.savefig("/data/csc/compsocial/freebase/figures/female_profession_gap_{}.pdf".format(i), bbox_inches='tight')# bbox_extra_artist=[lgd])