notebook.community

Edit and run



In [1]:

    
import cPickle as pickle
from io import open
import pandas as pd
import numpy as np
import pytz
from types import StringType



In [2]:

    
import datetime
from datetime import date, datetime
import dateutil
from dateutil import parser



In [3]:

    
%matplotlib inline
import matplotlib.pyplot as plt



In [4]:

    
from mpltools import style
# Select the "ggplot" style through mpltools.
style.use("ggplot")



In [5]:

    
from IPython.html.widgets import (interact, RadioButtons, IntSliderWidget, TextWidget,
                                  DropdownWidget, Dropdown, Select, CheckboxWidget, ToggleButtons)
from IPython.display import display









    



:0: FutureWarning: IPython widgets are experimental and may change in the future.

Loading Data



In [6]:

    
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
# The context manager closes the file handle as soon as the frame is loaded.
with open("/data/csc/fb_persons/unified_100percentpeople.pkl", "rb") as fh:
  df = pickle.load(fh)



In [7]:

    
# Row count of the frame as loaded from the pickle.
len(df)









    Out[7]:





3379996



In [8]:

    
# Preview the first rows of the loaded frame.
df.head()









    Out[8]:






  
    
      
      date_of_birth
      ethnicity
      gender
      name
      nationality
      place_of_birth
      profession
      religion
    
  
  
    
      0
              NaN
       None
       Female
       Courtney Jamieson
                 None
          NaN
                  music_art
       None
    
    
      1
              NaN
       None
         Male
             Robert Moir
                 None
          NaN
                      other
       None
    
    
      2
       1962-04-21
       None
         Male
             Mehdi Jomaa
       middle eastern
       Mahdia
       (law_politics, stem)
       None
    
    
      3
              NaN
       None
         Male
        Victoria Shields
                 None
          NaN
                       None
       None
    
    
      4
       1992-06-24
       None
         Male
         Terrick Colston
                 None
          NaN
                       None
       None

Considering only people born after 1800



In [9]:

    
# Keep only rows that have a birth date, then only those born after 1800-01-01.
victorian_age = date(1800, 1, 1)
df = df[df.date_of_birth.notnull().values]
df = df[df.date_of_birth > victorian_age]



In [10]:

    
# Row count after the date-of-birth filter.
len(df)









    Out[10]:





1300159

Cleaning Nationalities



In [11]:

    
def replace_nationality(x):
  """Abbreviate "United States of America" to "USA" in a nationality value.

  Args:
    x: a single nationality string, a tuple of nationality strings, or a
      missing value.

  Returns:
    np.nan for falsy input (None, "", empty tuple).
    For a string: "USA" when it matches "united states of america"
    case-insensitively, otherwise the string unchanged.
    For a tuple: a tuple of the de-duplicated, lower-cased members with the
    United States entry replaced by "USA" (member order is not preserved).
    Any other value is returned unchanged.
  """
  if not x: return np.nan
  USA = "USA"
  usa = "united states of america"
  if isinstance(x, str):
    # Other single nationalities are kept as they are instead of being
    # silently dropped.
    return USA if x.lower() == usa else x
  if isinstance(x, tuple):
    nationalities = set(y.lower() for y in x)
    if usa in nationalities:
      nationalities = nationalities.difference([usa])
      nationalities.add(USA)
    return tuple(nationalities)
  # Values of any other type (e.g. a float NaN) pass through untouched.
  return x



In [12]:

    
# Normalise every nationality value in place of the original column.
df["nationality"] = df["nationality"].apply(replace_nationality)

Considering Only Americans



In [13]:

    
# One boolean per row: True when the nationality is exactly 'USA' or is a
# tuple that contains 'USA'.
americans = list(
    (isinstance(nat, StringType) and nat == 'USA') or
    (isinstance(nat, tuple) and 'USA' in nat)
    for nat in df.nationality
)
df = df[americans]



In [14]:

    
#rows = np.random.choice(df.index.values, 1000000)
# `sample` is bound to the same object as `df` (no copy is made); it is the
# frame that the plotting helpers below read.
sample = df



In [15]:

    
def remove_USA(x):
  """Strip the "USA" entry from a nationality value, leaving the other ones.

  Args:
    x: a single nationality string, a tuple of nationality strings, or a
      missing value.

  Returns:
    np.nan for falsy input, for the string "USA", and for a tuple whose only
    distinct member is "USA".
    Any other string unchanged.
    For a tuple: the single remaining member when exactly one is left,
    otherwise a tuple of the remaining distinct members (order not preserved).
    Any other value is returned unchanged.
  """
  if not x: return np.nan
  USA = "USA"
  if isinstance(x, str):
    # Exact match, the same test the tuple branch applies to its members.
    return np.nan if x == USA else x
  if isinstance(x, tuple):
    others = set(x) - set([USA])
    if not others: return np.nan
    if len(others) == 1: return list(others)[0]
    return tuple(others)
  return x



In [16]:

    
# Replace each nationality with whatever remains once "USA" is taken out.
df["nationality"] = df["nationality"].apply(remove_USA)

Attribute Distribution



In [17]:

    
def dist_plot(field):
  """Bar-plot the relative frequency of the ten most common values of `field`.

  Reads the column from the module-level `sample` frame, ignores missing
  values, and labels the x axis with the percentage of rows that have a value.
  """
  plt.figure()
  column = sample[field]
  present = column.dropna()
  available_pct = 100.0 * len(present) / len(column)
  top_shares = present.value_counts(normalize=True, dropna=True)[:10]
  axis = top_shares.plot(kind="bar", title=field)
  axis.set_xlabel("{:.1f}% of the {} data is available".format(available_pct, field))



In [18]:

    
# Radio buttons choose which column dist_plot() draws; each option is passed
# as a (label, value) pair built from the same name.
fields1 = ["religion", "ethnicity", "profession", "gender"]
_ = interact(dist_plot,
         field=RadioButtons(options=zip(fields1, fields1)))









    



/usr/local/lib/python2.7/dist-packages/matplotlib/font_manager.py:1279: UserWarning: findfont: Font family [u'monospace'] not found. Falling back to Bitstream Vera Sans
  (prop.get_family(), self.defaultFamily[fontext]))

Temporal Attribute Evolution



In [142]:

    
def temporal(df, fields, resolution=10, nan_included=False, start=None, end=None):
  """Count rows per birth-year bucket for each value of the `fields` columns.

  Args:
    df: DataFrame with a `date_of_birth` column and every column in `fields`.
    fields: list of column names to break the counts down by.
    resolution: bucket width in years; birth years are floored to a multiple
      of it.
    nan_included: when true, missing values in `fields` are replaced by the
      string "nan" before grouping.
    start, end: inclusive bounds on `date_of_birth`.
      NOTE(review): both default to None yet are compared against the column
      unconditionally -- presumably callers must always pass dates; confirm.

  Returns:
    (final, norm_final): `final` is the pivoted table of counts, indexed by
    January 1st of each bucket year with columns taken from `fields` and
    missing entries filled with 0; `norm_final` is `final` with every row
    divided by its row sum.
  """
  # Drop rows without a birth date, then apply the inclusive date window.
  df = df[np.logical_not(df.date_of_birth.isnull().values)]
  df = df[df.date_of_birth >= start]
  df = df[df.date_of_birth <= end]    
  if nan_included: 
    bla = df[fields].fillna("nan")
  else:
    bla = df[fields]
  # Index by birth date as a datetime; dates on or after 2015-01-01 become NaT.
  bla.index = [datetime(d.year, d.month, d.day) if d < date(2015, 1, 1) else pd.NaT
               for d in df.date_of_birth.values]
  # Group key: birth year floored to a multiple of `resolution`, plus `fields`.
  # NOTE(review): resolution == 0 would divide by zero here -- confirm callers never pass it.
  criterion = [(bla.index.year//resolution)*resolution]
  criterion.extend(fields)
  year_grouped = bla.groupby(criterion)
  # agg(len) yields the number of rows in each group.
  counted = pd.DataFrame(year_grouped.agg(len))
  counted = counted.reset_index()
  # presumably 0 and "level_0" are the names pandas gives the count column and
  # the unnamed year key after reset_index -- TODO confirm for the pandas version in use
  counted = counted.rename(columns={0: "count", "level_0": "yob"})
  counted = counted.set_index("yob")
  # Represent each bucket year as January 1st of that year.
  counted.index = [datetime(int(y), 1, 1) for y in counted.index]
  final = counted.pivot_table(index=counted.index, values="count", columns=fields)
  final = final.fillna(0)
  # Discard rows whose index is NaT.
  final = final[np.logical_not(final.index.isin([pd.NaT]))]
  # Row-wise normalisation: each row is divided by its own total.
  norm_final = final.div(final.sum(axis=1), axis=0)
  return final, norm_final



In [20]:

    
def temporal_plot(f1, category, f2, plot_type, start_year, resolution, end_year):
  """Plot raw and normalised counts of `f2` over birth year.

  When `f1` is "Anyone" the whole module-level `sample` is used; otherwise
  only the rows whose `f1` column equals `category`. The six `f2` columns
  with the largest totals are drawn, once as counts and once normalised.
  """
  if f1 == "Anyone":
    subset = sample
    suffix = ""
  else:
    subset = sample.loc[sample[f1] == category]
    suffix = ":" + category
  window_start = date(start_year, 1, 1)
  window_end = date(end_year, 1, 1)
  count_table, share_table = temporal(subset, [f2], nan_included=False, resolution=resolution,
                                      start=window_start, end=window_end)
  column_totals = count_table.sum().values
  fractions = column_totals / float(sum(column_totals))
  # argsort is ascending, so the last six positions are the largest columns.
  top_cols = count_table.columns[np.argsort(fractions).flatten()[-6:]].values
  panels = ((count_table, "Counts of {} for {}{}"),
            (share_table, "Normalized Counts of {} for {}{}"))
  for table, title_template in panels:
    axis = table[top_cols].plot(kind=plot_type, title=title_template.format(f2, f1, suffix))
    axis.legend(loc='upper right', ncol=2, bbox_to_anchor = (2.0, 1.0))



In [318]:

    
# Interactive front-end for temporal_plot(): one widget per argument.
fields = ["religion", "ethnicity", "profession", "gender"]
fields2 = ["Anyone"] + fields 
_ = interact(temporal_plot,
         f1=ToggleButtons(options=zip(fields2, fields2)),
         category=TextWidget(value="christian"),
         f2=ToggleButtons(options=zip(fields, fields)),
         plot_type=ToggleButtons(options={"area": "area", "line": "line"}),
         start_year=IntSliderWidget(min=1800, max=2010, step=10, value=1800),
         end_year=IntSliderWidget(min=1800, max=2010, step=10, value=2000),
         # temporal() floor-divides the birth year by the resolution, so a
         # bucket width of 0 must not be selectable.
         resolution=IntSliderWidget(min=5, max=50, step=5, value=5),
        )

Final Plots

### Graph the gender gap by ethnicity (African Americans do not have a narrowing gender gap.)



In [373]:

    
# Gender gap (Male share minus Female share, in percentage points) over birth
# year, one line per ethnicity, all drawn on a single axes and saved as a PDF.
f1 = "ethnicity"
f2 = "gender"
start_year = 1750
end_year = 2000
plot_type = "line"
resolution = 20
#fft_axes.set_autoscaley_on(False)
fig, axes  = plt.subplots(nrows=1, ncols=1, sharex=False)
#axes.set_autoscalex_on(False)
# Keep the first six str-typed entries of the value_counts() index of the f1 column.
categories = filter(lambda x: isinstance(x, str), sample[f1].value_counts().index.values)[:6]
for category in categories:
  fields = [f2]
  tmp = sample
  tmp = sample.loc[sample[f1] == category]
  # Legend label: underscores become spaces and every word is capitalised
  # ("some_name" -> "Some Name").
  category = " ".join([x.capitalize() for x in category.replace('_', ' ').split()])
  fields = list(set(fields))
  start = date(start_year, 1, 1)
  end = date(end_year, 1, 1)
  df1, df2 = temporal(tmp, fields, nan_included=False, resolution=resolution, start=start, end=end)
  df1[category] = df1["Male"] + df1["Female"]
  # Difference between the normalised Male and Female columns, scaled by 100.
  df2[category] = (df2["Male"] - df2["Female"]) * 100

  ## Cleaning data (Not kosher)
  # Every column of the rows whose gap is below -50 is overwritten with 100.
  df2.loc[df2[category] < -50] = 100


  ax2 = df2[category].plot(kind=plot_type, ax=axes, alpha=0.75, x_compat=True, linewidth=2, marker="o")
# The statements below run once, after the loop, on the axes of the last plot call.
lgd = ax2.legend(loc='upper right', ncol=6, bbox_to_anchor = (1.05, 1.15))
ax2.set_ylabel("Male - Female (%)")
ax2.set_ylim((-20, 120))
(s,e) = ax2.get_xlim()
#_ = ax2.set_xlim((s, e+2500))
_ = ax2.set_xlabel("Year")
fig.set_size_inches((12, 4))
#ax2.annotate('arrowstyle', xy=(0, 5),  xycoords='data', xytext=(0, 0), textcoords='offset points', arrowprops=dict(arrowstyle="->"))
plt.savefig("/data/csc/compsocial/freebase/figures/gender_ethnicity_gap.pdf", bbox_inches='tight')# bbox_extra_artist=[lgd])

Anyone by gender (actual/normalized)



In [365]:

    
# Counts and percentages by gender over birth year for the whole sample,
# drawn as two stacked panels and saved as a PDF.
f1 = "Anyone"
f2 = "gender"
start_year = 1800
end_year = 2020
plot_type = "line"
resolution = 10

# Two rows sharing the x axis: counts go on axes[0], percentages on axes[1].
fig, axes  = plt.subplots(nrows=2, ncols=1, sharex=True)

fields = [f2]
tmp = sample
fields = list(set(fields))
start = date(start_year, 1, 1)
end = date(end_year, 1, 1)
df1, df2 = temporal(tmp, fields, nan_included=False, resolution=resolution, start=start, end=end)
totals = df1.sum().values
percentages = totals / float(sum(totals))
# argsort is ascending, so the last two positions are the two largest columns.
selected_cols = df1.columns[np.argsort(percentages).flatten()[-2:]].values  

ax1 = df1[selected_cols].plot(kind=plot_type, ax=axes[0], x_compat=True, alpha=0.75, marker='.', linewidth=2)
# The top panel's legend is created and then removed a few lines below.
lgd1 = ax1.legend(loc='upper right', ncol=1, bbox_to_anchor = (1.275, 1.025))
ax1.set_ylabel("Count")
lgd1.remove()
# Extend the y range: fixed lower bound of -2500 and 25% headroom above the current top.
l,h = ax1.get_ylim()
ax1.set_ylim((-2500, h+ .25*h))
# Scale the normalised Male/Female columns by 100 so they read as percentages.
df2["Male"] = df2["Male"] * 100
df2["Female"] = df2["Female"] * 100
ax2 = df2[selected_cols].plot(kind=plot_type, ax=axes[1], alpha=0.75, x_compat=True, marker='.', linewidth=2)
lgd2 = ax2.legend(loc='center', ncol=2, bbox_to_anchor = (0.5, 1.25))
ax2.set_ylabel("Percentage (%)")
_ = ax2.set_xlabel("Year")
#ax2.set_xticks()
fig.set_size_inches((12, 4))
ax2.set_ylim((-5, 110))
_ = plt.savefig("/data/csc/compsocial/freebase/figures/gender_gap.pdf", bbox_inches='tight')# bbox_extra_artist=[lgd])

### Females by career over time (recently, since the 1950s). Can we see the counts of the other careers more clearly? Do we want a female-by-male comparison, maybe?



In [421]:

    
# Share of each profession among the rows whose gender is "Female", by
# birth-year bucket. Professions are ranked by total count and drawn in
# groups on a log-scaled y axis; every group is saved as its own PDF.
f1 = "gender"
f2 = "profession"
category = "Female"
start_year = 1950
end_year = 1990
plot_type = "line"
resolution = 5

fields = [f2]
tmp = sample.loc[sample[f1] == category]

start = date(start_year, 1, 1)
end = date(end_year, 1, 1)
df1, df2 = temporal(tmp, fields, nan_included=False, resolution=resolution, start=start, end=end)
totals = df1.sum().values
percentages = totals / float(sum(totals))
# When False the normalised table (df2) is plotted instead of the raw counts (df1).
counts = False
# Column positions ordered from the largest total to the smallest.
cats = np.argsort(percentages).flatten()[::-1]
cols = df1.columns[cats].values
# Only plain-string column labels are kept; labels of any other type are skipped.
cols = [x for x in cols if isinstance(x, str)]
# NOTE: the loop variable reuses the name `start`, replacing the date bound above.
for i, start in enumerate(range(0, 100, 10)):
  # NOTE(review): windows are 7 wide but advance by 10, so three ranks out of
  # every ten are never drawn -- confirm this is intended.
  selected_cols = cols[start: start+7]
  if not selected_cols: continue

  if counts:
    # NOTE(review): `axes` is not created in this cell; this branch presumably
    # relies on the value left behind by an earlier cell -- confirm before enabling.
    ax = df1[selected_cols].plot(kind=plot_type, ax=axes[0], marker='o', alpha=0.75, linewidth=2)
  else:
    ax = df2[selected_cols].plot(kind=plot_type, marker='o', alpha=0.75, linewidth=2)
  ax.set_yscale('log')
  _ = ax.set_xlabel("Year")
  # Pad the x range by 5 units on each side.
  (s,e) = ax.get_xlim()
  _ = ax.set_xlim((s-5, e+5))
  # Raw string: the LaTeX backslashes must not be read as string escapes.
  ax.set_ylabel(r"$\Pr(Profession \mid Gender=Female)$", fontsize=12)
  # Prettify the legend entries: capitalise and show '_' as ' / '.
  handles, labels = ax.get_legend_handles_labels()
  labels = [x.capitalize().replace('_', ' / ') for x in labels]
  ax.legend(handles, labels, loc='upper right', ncol=1, bbox_to_anchor = (1.425, 1.025))
  plt.savefig("/data/csc/compsocial/freebase/figures/female_profession_gap_{}.pdf".format(i), bbox_inches='tight')

	date_of_birth	ethnicity	gender	name	nationality	place_of_birth	profession	religion
0	NaN	None	Female	Courtney Jamieson	None	NaN	music_art	None
1	NaN	None	Male	Robert Moir	None	NaN	other	None
2	1962-04-21	None	Male	Mehdi Jomaa	middle eastern	Mahdia	(law_politics, stem)	None
3	NaN	None	Male	Victoria Shields	None	NaN	None	None
4	1992-06-24	None	Male	Terrick Colston	None	NaN	None	None