import cPickle as pickle
from io import open
import pandas as pd
import numpy as np
import pytz
from types import StringType

import datetime
from datetime import date, datetime
import dateutil
from dateutil import parser

%matplotlib inline
import matplotlib.pyplot as plt

from mpltools import style

from IPython.html.widgets import (interact, RadioButtons, IntSliderWidget, TextWidget,
                                  DropdownWidget, Dropdown, Select, CheckboxWidget, ToggleButtons)
from IPython.display import display

Loading Data

# NOTE(security): unpickling can execute arbitrary code embedded in the file,
# so this path must only ever point at a trusted pickle.
# The `with` block closes the file handle as soon as loading finishes.
with open("/data/csc/fb_persons/unified_100percentpeople.pkl", "rb") as fh:
  df = pickle.load(fh)

date_of_birth ethnicity gender name nationality place_of_birth profession religion
0 NaN None Female Courtney Jamieson None NaN music_art None
1 NaN None Male Robert Moir None NaN other None
2 1962-04-21 None Male Mehdi Jomaa middle eastern Mahdia (law_politics, stem) None
3 NaN None Male Victoria Shields None NaN None None
4 1992-06-24 None Male Terrick Colston None NaN None None

Considering only people born after 1800

# Cut-off date: rows are kept only when date_of_birth is strictly later.
victorian_age = date(1800, 1, 1)
# Step 1: discard rows whose date_of_birth is null.
df = df[~df.date_of_birth.isnull().values]
# Step 2: keep rows born after the cut-off.
df = df.loc[df.date_of_birth > victorian_age]

Cleaning Nationalities

def replace_nationality(x):
  """Normalise a nationality value so the United States is spelled "USA".

  Args:
    x: a nationality string, a tuple of nationality strings, or a
      missing/other value.

  Returns:
    * np.nan for falsy input (None, "", empty tuple) and for any value that
      is neither a string nor a tuple.
    * "USA" for the string "united states of america" (case-insensitive);
      any other string is returned unchanged.
    * For a tuple: a tuple of the de-duplicated, lower-cased entries in
      which "united states of america" is replaced by "USA". Element order
      is not preserved because a set is used for de-duplication.
  """
  if not x:
    return np.nan
  usa_long = "united states of america"
  USA = "USA"
  if isinstance(x, str):
    return USA if x.lower() == usa_long else x
  if isinstance(x, tuple):
    nationalities = {y.lower() for y in x}
    if usa_long in nationalities:
      # Swap the long name for the short marker instead of just dropping it,
      # so a membership test for "USA" on the result can succeed.
      nationalities.discard(usa_long)
      nationalities.add(USA)
    return tuple(nationalities)
  return np.nan

# Overwrite the nationality column with its normalised values.
df["nationality"] = df["nationality"].apply(replace_nationality)

Considering Only Americans

# Boolean mask with one entry per row: True when the nationality is the
# string 'USA' or a tuple that contains 'USA'.
# NOTE(review): iterating over df.nationality is assumed here — confirm this
# is the intended column.
americans = list(map(lambda x: (isinstance(x, str) and x == 'USA') or
                               (isinstance(x, tuple) and 'USA' in x),
                     df.nationality.values))
df = df[americans]

# Disabled down-sampling: the commented-out line would draw 1,000,000 index
# labels at random from df.
#rows = np.random.choice(df.index.values, 1000000)
# `sample` is bound to the same object as `df` (no copy is made); the
# plotting code further down reads its data from `sample`.
sample = df

def remove_USA(x):
  """Strip the "USA" marker from a nationality value.

  Args:
    x: a nationality string, a tuple of nationality strings, or a
      missing/other value.

  Returns:
    * np.nan for falsy input, for the string "USA" (case-insensitive), for a
      tuple that contains nothing but "USA", and for any value that is
      neither a string nor a tuple.
    * Any other string unchanged.
    * For a tuple: the remaining nationalities after removing "USA" and
      de-duplicating — a bare string when exactly one remains, otherwise a
      tuple (element order is not preserved).
  """
  if not x:
    return np.nan
  USA = "USA"
  if isinstance(x, str):
    # Compare in upper case: a lower-cased string can never equal "USA".
    return np.nan if x.upper() == USA else x
  if isinstance(x, tuple):
    nationalities = set(x).difference([USA])
    if not nationalities:
      return np.nan
    if len(nationalities) == 1:
      return next(iter(nationalities))
    return tuple(nationalities)
  return np.nan

# Overwrite the nationality column with the "USA" marker stripped out.
df["nationality"] = df["nationality"].apply(remove_USA)

Attribute Distribution

def dist_plot(field):
  """Draw a bar chart of the ten leading values of `sample[field]`.

  Bars show normalised frequencies among the non-null entries; the x-axis
  label reports what percentage of the column is non-null.

  Args:
    field: name of the column of the module-level `sample` frame to plot.
  """
  plt.figure()
  column = sample[field]
  row_count = len(column)
  known = column.dropna()
  coverage = 100.0 * len(known) / row_count
  shares = known.value_counts(normalize=True, dropna=True)
  axis = shares[:10].plot(kind="bar", title=field)
  axis.set_xlabel("{:.1f}% of the {} data is available".format(coverage, field))

# Columns offered by the distribution widget; each option is a
# (label, value) pair in which both entries are the column name.
fields1 = ["religion", "ethnicity", "profession", "gender"]
_ = interact(
    dist_plot,
    field=RadioButtons(options=list(zip(fields1, fields1))),
)

Temporal Attribute Evolution

def temporal(df, fields, resolution=10, nan_included=False, start=None, end=None):
  """Count people per birth-year bucket, broken down by the given fields.

  Args:
    df: DataFrame with a `date_of_birth` column and the columns named in
      `fields`. Assumes non-null birth dates are `datetime.date` objects —
      TODO confirm against the loaded data.
    fields: list of column names to break the counts down by.
    resolution: bucket width in years; must be positive.
    nan_included: when True, missing field values are counted under the
      label "nan"; when False, rows with a missing field value are dropped
      by the grouping.
    start: inclusive lower bound on `date_of_birth`, or None for no bound.
    end: inclusive upper bound on `date_of_birth`, or None for no bound.

  Returns:
    A pair `(final, norm_final)`. `final` has one row per bucket, indexed by
    the datetime of 1 January of the bucket's first year, and one column per
    field value; cells hold counts, with 0 for combinations that do not
    occur. `norm_final` is `final` with every row divided by its row total.

  Raises:
    ValueError: if `resolution` is not positive.
  """
  if resolution <= 0:
    raise ValueError("resolution must be a positive number of years")

  df = df[df.date_of_birth.notnull().values]
  if start is not None:
    df = df[df.date_of_birth >= start]
  if end is not None:
    df = df[df.date_of_birth <= end]

  if nan_included:
    bla = df[fields].fillna("nan")
  else:
    # Copy so that replacing the index below never touches the caller's frame.
    bla = df[fields].copy()

  # Birth dates on or after 2015-01-01 are mapped to NaT and therefore fall
  # out of the grouping below.
  cutoff = date(2015, 1, 1)
  bla.index = pd.DatetimeIndex(
      [datetime(d.year, d.month, d.day) if d < cutoff else pd.NaT
       for d in df.date_of_birth.values])

  # Floor every birth year to the first year of its bucket.
  bucket = (np.asarray(bla.index.year) // resolution) * resolution

  # One count per (bucket, field value...) combination.
  counts = bla.groupby([bucket] + list(fields)).size()
  counts.index.names = ["yob"] + list(fields)
  counted = counts.reset_index(name="count")
  counted["yob"] = [datetime(int(y), 1, 1) for y in counted["yob"]]

  final = counted.pivot_table(index="yob", values="count", columns=fields)
  final.index.name = None
  final = final.fillna(0)
  final = final[np.logical_not(final.index.isin([pd.NaT]))]
  norm_final = final.div(final.sum(axis=1), axis=0)
  return final, norm_final

def temporal_plot(f1, category, f2, plot_type, start_year, resolution, end_year):
  """Plot raw and normalised counts of `f2` over birth-year buckets.

  Two figures are drawn: the counts returned by `temporal` and their
  row-normalised version, each limited to the six `f2` values with the
  largest overall counts.

  Args:
    f1: column used to restrict the data, or "Anyone" for no restriction.
    category: value of `f1` to keep; ignored when `f1` is "Anyone".
    f2: column whose value distribution is plotted.
    plot_type: forwarded to DataFrame.plot as `kind`.
    start_year: first birth year (1 January) to include.
    resolution: bucket width in years.
    end_year: last birth date is 1 January of this year.
  """
  fields = [f2]
  tmp = sample
  if f1 != "Anyone":
    tmp = sample.loc[sample[f1] == category]
    # Title suffix, e.g. "ethnicity:asian".
    category = ":" + category
  else:
    # No restriction applied, so the titles carry no category suffix.
    category = ""
  start = date(start_year, 1, 1)
  end = date(end_year, 1, 1)
  df1, df2 = temporal(tmp, fields, nan_included=False, resolution=resolution, start=start, end=end)
  totals = df1.sum().values
  percentages = totals / float(sum(totals))
  # argsort is ascending, so the last six positions are the largest columns.
  selected_cols = df1.columns[np.argsort(percentages).flatten()[-6:]].values
  ax1 = df1[selected_cols].plot(kind=plot_type, title="Counts of {} for {}{}".format(f2, f1, category))
  ax1.legend(loc='upper right', ncol=2, bbox_to_anchor = (2.0, 1.0))
  ax2 = df2[selected_cols].plot(kind=plot_type, title="Normalized Counts of {} for {}{}".format(f2, f1, category))
  ax2.legend(loc='upper right', ncol=2, bbox_to_anchor = (2.0, 1.0))

# Interactive front-end for temporal_plot: one widget per function argument.
fields = ["religion", "ethnicity", "profession", "gender"]
fields2 = ["Anyone"] + fields
_ = interact(temporal_plot,
         f1=ToggleButtons(options=list(zip(fields2, fields2))),
         # NOTE(review): a free-text widget is assumed for `category` —
         # confirm the intended widget type.
         category=TextWidget(value=""),
         f2=ToggleButtons(options=list(zip(fields, fields))),
         plot_type=ToggleButtons(options={"area": "area", "line": "line"}),
         start_year=IntSliderWidget(min=1800, max=2010, step=10, value=1800),
         end_year=IntSliderWidget(min=1800, max=2010, step=10, value=2000),
         # The minimum is 5 rather than 0: years are floor-divided by the
         # resolution, so a value of 0 cannot be bucketed.
         resolution=IntSliderWidget(min=5, max=50, step=5, value=5))

Final Plots

  1. ### Graph the gender gap by ethnicity (African Americans do not show a narrowing gender gap.)

# Gender gap (male share minus female share, in percentage points) over
# birth-year buckets, one line per ethnicity, saved as a PDF.
f1 = "ethnicity"
f2 = "gender"
start_year = 1750
end_year = 2000
plot_type = "line"
resolution = 20
fig, axes  = plt.subplots(nrows=1, ncols=1, sharex=False)
# First six string-valued ethnicities in value_counts order (pandas sorts
# value_counts by descending frequency by default).
categories = list(filter(lambda x: isinstance(x, str), sample[f1].value_counts().index.values))[:6]
# These do not change between iterations, so compute them once.
fields = [f2]
start = date(start_year, 1, 1)
end = date(end_year, 1, 1)
for category in categories:
  tmp = sample.loc[sample[f1] == category]
  # Turn e.g. "african_american" into the legend label "African American".
  category = " ".join([x.capitalize() for x in category.replace('_', ' ').split()])
  df1, df2 = temporal(tmp, fields, nan_included=False, resolution=resolution, start=start, end=end)
  df1[category] = df1["Male"] + df1["Female"]
  df2[category] = (df2["Male"] - df2["Female"]) * 100

  ## Cleaning data (Not kosher)
  # Buckets with a gap below -50 are replaced by 100. Only the plotted
  # column is overwritten, not the whole row.
  df2.loc[df2[category] < -50, category] = 100

  ax2 = df2[category].plot(kind=plot_type, ax=axes, alpha=0.75, x_compat=True, linewidth=2, marker="o")
lgd = ax2.legend(loc='upper right', ncol=6, bbox_to_anchor = (1.05, 1.15))
ax2.set_ylabel("Male - Female (%)")
ax2.set_ylim((-20, 120))
(s,e) = ax2.get_xlim()
#_ = ax2.set_xlim((s, e+2500))
_ = ax2.set_xlabel("Year")
fig.set_size_inches((12, 4))
#ax2.annotate('arrowstyle', xy=(0, 5),  xycoords='data', xytext=(0, 0), textcoords='offset points', arrowprops=dict(arrowstyle="->"))
plt.savefig("/data/csc/compsocial/freebase/figures/gender_ethnicity_gap.pdf", bbox_inches='tight')# bbox_extra_artist=[lgd])

Anyone by gender (actual/normalized)

# Overall gender plot for everyone in `sample`: raw counts in the top panel,
# percentage shares in the bottom panel, saved as a PDF.
f1 = "Anyone"
f2 = "gender"
start_year, end_year = 1800, 2020
plot_type = "line"
resolution = 10

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)

tmp = sample
fields = [f2]
start = date(start_year, 1, 1)
end = date(end_year, 1, 1)
df1, df2 = temporal(tmp, fields, start=start, end=end,
                    resolution=resolution, nan_included=False)

# Keep the two columns with the largest overall counts (argsort is ascending).
totals = df1.sum().values
percentages = totals / float(sum(totals))
selected_cols = df1.columns[np.argsort(percentages).ravel()[-2:]].values

# Top panel: counts per bucket, with extra headroom above the highest value.
ax1 = df1[selected_cols].plot(ax=axes[0], kind=plot_type, x_compat=True,
                              alpha=0.75, marker='.', linewidth=2)
lgd1 = ax1.legend(loc='upper right', ncol=1, bbox_to_anchor=(1.275, 1.025))
l, h = ax1.get_ylim()
ax1.set_ylim((-2500, h + .25 * h))

# Bottom panel: shares scaled from fractions to percentages.
df2["Male"] *= 100
df2["Female"] *= 100
ax2 = df2[selected_cols].plot(ax=axes[1], kind=plot_type, alpha=0.75,
                              x_compat=True, marker='.', linewidth=2)
lgd2 = ax2.legend(loc='center', ncol=2, bbox_to_anchor=(0.5, 1.25))
ax2.set_ylabel("Percentage (%)")
_ = ax2.set_xlabel("Year")
fig.set_size_inches((12, 4))
ax2.set_ylim((-5, 110))
_ = plt.savefig("/data/csc/compsocial/freebase/figures/gender_gap.pdf", bbox_inches='tight')# bbox_extra_artist=[lgd])

  1. ### Females by career over time (recently, since the 1950s). Can we see the counts of the other careers more clearly? Maybe we want a female-by-male comparison?

# Profession distribution among women over birth-year buckets. Professions
# are ranked by overall count and plotted in chunks, one PDF per chunk.
f1 = "gender"
f2 = "profession"
category = "Female"
start_year = 1950
end_year = 1990
plot_type = "line"
resolution = 5

fields = [f2]
tmp = sample.loc[sample[f1] == category]

start = date(start_year, 1, 1)
end = date(end_year, 1, 1)
df1, df2 = temporal(tmp, fields, nan_included=False, resolution=resolution, start=start, end=end)
totals = df1.sum().values
percentages = totals / float(sum(totals))
# Set to True to plot raw counts (df1) instead of normalised shares (df2).
counts = False
# Column positions ordered from the largest overall count to the smallest.
cats = np.argsort(percentages).flatten()[::-1]
cols = df1.columns[cats].values
cols = [x for x in cols if isinstance(x, str)]
# NOTE(review): the window is 7 columns wide but advances by 10, so ranks
# 7-9, 17-19, ... are never plotted — confirm this is intended.
for i, offset in enumerate(range(0, 100, 10)):
  selected_cols = cols[offset: offset+7]
  if not selected_cols: continue

  # Each chunk gets its own figure, which savefig below writes out.
  if counts:
    ax = df1[selected_cols].plot(kind=plot_type, marker='o', alpha=0.75, linewidth=2)
  else:
    ax = df2[selected_cols].plot(kind=plot_type, marker='o', alpha=0.75, linewidth=2)
  ax.set_yscale('log')
  _ = ax.set_xlabel("Year")
  (s,e) = ax.get_xlim()
  # Pad the x-range so the first and last markers are not clipped.
  _ = ax.set_xlim((s-5, e+5))
  ax.set_ylabel(r"$\Pr(Profession \mid Gender=Female)$", fontsize=12)
  handles, labels = ax.get_legend_handles_labels()
  labels = [x.capitalize().replace('_', ' / ') for x in labels]
  ax.legend(handles, labels, loc='upper right', ncol=1, bbox_to_anchor = (1.425, 1.025))
  plt.savefig("/data/csc/compsocial/freebase/figures/female_profession_gap_{}.pdf".format(i), bbox_inches='tight')# bbox_extra_artist=[lgd])