In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import date
from collections import Counter
from pandas import Series
from patsy import dmatrices, dmatrix
from collections import Counter

Loading Data


In [2]:
# Load the pickled people table. The context manager closes the file handle
# as soon as the object has been read (the original left it open).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this pickle if its source is trusted.
with open("/data/csc/fb_persons/unified_100percentpeople.pkl", 'rb') as fh:
  df = pickle.load(fh)

In [3]:
# Number of rows in the freshly loaded frame.
len(df)


Out[3]:
3379996

In [4]:
# Preview the first rows to see the column names and the kinds of values
# (None, NaN, strings, tuples) before any filtering.
df.head()


Out[4]:
date_of_birth ethnicity gender name nationality place_of_birth profession religion
0 NaN None Female Courtney Jamieson None NaN music_art None
1 NaN None Male Robert Moir None NaN other None
2 1962-04-21 None Male Mehdi Jomaa middle eastern Mahdia (law_politics, stem) None
3 NaN None Male Victoria Shields None NaN None None
4 1992-06-24 None Male Terrick Colston None NaN None None

In [5]:
# Columns carried forward, in display order; any column not listed here
# is dropped from the working frame.
selected_columns = [
    "gender",
    "date_of_birth",
    "ethnicity",
    "name",
    "profession",
    "religion",
    "nationality",
]
df = df.loc[:, selected_columns]

In [6]:
# Confirm the column subset and order after the selection.
df.head()


Out[6]:
gender date_of_birth ethnicity name profession religion nationality
0 Female NaN None Courtney Jamieson music_art None None
1 Male NaN None Robert Moir other None None
2 Male 1962-04-21 None Mehdi Jomaa (law_politics, stem) None middle eastern
3 Male NaN None Victoria Shields None None None
4 Male 1992-06-24 None Terrick Colston None None None

Considering only people born after 1 January 1800


In [7]:
# Cut-off date: only rows with a birth date strictly later than 1800-01-01
# are kept. Rows without a birth date are removed first so that the
# comparison below only sees real dates.
victorian_age = date(1800, 1, 1)
df = df[df.date_of_birth.notnull()]
df = df[df.date_of_birth > victorian_age]

In [8]:
# Rows left after the birth-date filter.
len(df)


Out[8]:
1300159

In [9]:
# How many of the remaining rows have a non-null nationality.
len(df.nationality.dropna())


Out[9]:
703447

Considering Only Americans


In [10]:
def _is_american(nationality):
  """Return True for the US label itself or for a tuple that contains it."""
  us_label = 'united states of america'
  if isinstance(nationality, basestring):
    return nationality == us_label
  return isinstance(nationality, tuple) and us_label in nationality

# One boolean per row, in row order, used as a row mask.
americans = [_is_american(value) for value in df.nationality]
df = df[americans]

In [11]:
# Rows left after keeping only rows whose nationality includes the US label.
len(df)


Out[11]:
223632

In [12]:
# Spot-check the filtered rows; the original row labels are preserved,
# so the index is no longer contiguous.
df.head()


Out[12]:
gender date_of_birth ethnicity name profession religion nationality
11 Male 1995-04-09 None Justin Cook None None united states of america
44 Male 1991-08-29 None Brandon Cunningham None None united states of america
47 Male 1992-11-06 None Mike Davis None None united states of america
48 Male 1994-04-15 None Ricky Ortiz None None united states of america
53 Male 1993-06-03 None Jamal Douglas None None united states of america

Stats Models


In [13]:
import statsmodels as sm
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.tools import add_constant

Drop people whose gender is 'Other'


In [14]:
# Remove rows whose gender is 'Other'.
# A boolean mask is applied directly instead of converting positions back to
# index labels and dropping in place: `df` is itself the result of earlier
# filtering, so in-place mutation is fragile, and a label-based drop would
# remove extra rows if index labels were ever duplicated.
is_other_gender = (df.gender == 'Other').values
# Positional indices of the removed rows, kept under the original name.
other_index = np.where(is_other_gender)[0]
df = df[~is_other_gender]

In [15]:
def replace(x):
  """Normalise one raw religion value.

  Falsy values (None, empty string, empty tuple) become NaN, a non-empty
  tuple collapses to the single label "mixed", and any other value is
  returned unchanged.
  """
  if x and isinstance(x, tuple):
    return "mixed"
  return x if x else np.nan

In [16]:
# Recode every religion value with replace(); the function is passed directly,
# no wrapper needed.
df["religion"] = df.religion.apply(replace)

In [17]:
def replace_nationality(x):
  """Normalise one raw nationality value.

  - falsy value (None, "", ()) -> NaN
  - the US label, or a tuple holding nothing but the US label -> "USA"
  - a tuple with exactly one non-US entry -> that entry
  - a tuple with several distinct non-US entries -> "mixed"
  - any other value -> returned unchanged
  """
  us_label = "united states of america"
  if not x:
    return np.nan
  if x == us_label:
    return "USA"
  if not isinstance(x, tuple):
    return x
  # Distinct entries other than the US label.
  others = set(x) - {us_label}
  if len(others) == 0:
    return "USA"
  if len(others) == 1:
    return next(iter(others))
  return "mixed"

In [18]:
# Recode every nationality value with replace_nationality().
df["nationality"] = df.nationality.apply(replace_nationality)

Drop People without profession


In [19]:
# Flag rows whose profession is exactly None (any other value is kept).
profession_missing = [value is None for value in df.profession.values.flatten()]
empty_rows = np.where(profession_missing)[0]
# Drop those rows by label, then keep the response and the three predictors.
model_columns = ["profession", "gender", "religion", "nationality"]
dfXy = df.drop(df.index[empty_rows])[model_columns]

Map Professions to traditional vs non-traditional


In [20]:
# Lookup table from fine-grained profession label to coarse group.
# Two labels form the "traditional" group; the other five listed labels
# form the "non-traditional" group.
prof_map = dict.fromkeys(["music_art", "humanities"], "traditional")
prof_map.update(dict.fromkeys(
    ["athlete", "business", "law_politics", "religion", "stem"],
    "non-traditional"))

In [21]:
def clear_prof(x):
  """Collapse a profession value into its coarse group.

  A single string is looked up in prof_map; a label that is not in the map
  is returned as-is. Any other value is treated as an iterable of labels:
  each label is mapped the same way, and the result is the common group when
  all labels agree, otherwise "mixed".

  Relies on `basestring`, so this runs on Python 2 only.
  """
  if isinstance(x, basestring):
    return prof_map.get(x, x)
  groups = {prof_map.get(label, label) for label in x}
  return groups.pop() if len(groups) == 1 else "mixed"

In [22]:
# Replace each raw profession value with its coarse group via clear_prof().
dfXy["profession"] = dfXy.profession.apply(clear_prof)

Save Results


In [28]:
# Frequency of each recoded nationality label among the modelled rows
# (value_counts leaves NaN out by default).
dfXy.nationality.value_counts()


Out[28]:
USA                 154456
white                 5841
asian                  828
hispanic_latino        738
middle eastern         461
african_american       302
mixed                  112
indigenous              45
dtype: int64

In [26]:
# Persist the processed frame. The path is relative, so the file lands in the
# kernel's current working directory; the row index is written as well
# (pandas default).
dfXy.to_csv("Profession_Analysis_Processed.csv")

In [67]:
# Treatment (dummy) coding with Male / USA / christian as reference levels.
# NaNs are replaced by the literal level "missing" before the matrix is built,
# so a factor that had NaNs gets its own "missing" dummy column.
design_terms = [
    "C(gender, Treatment('Male'))",
    "C(nationality, Treatment('USA'))",
    "C(religion, Treatment('christian'))",
]
X = dmatrix(" + ".join(design_terms),
            data=dfXy.fillna("missing"), return_type='dataframe')

In [68]:
# Row count of the design matrix.
len(X)


Out[68]:
162783

In [69]:
# Design matrix without the "missing" dummy columns.
# NOTE(review): once a "missing" dummy is removed, rows that had a missing
# value in that factor have all-zero dummies for it, exactly like the
# reference level. Coefficients fitted on X2 are therefore relative to
# "reference level or missing", not to the reference level alone.
kept_columns = [name for name in X.columns if 'missing' not in name]
X2 = X[kept_columns]

In [70]:
# Inspect the dummy columns generated by patsy (this is X, which still
# includes the "missing" columns, not X2).
X.head()


Out[70]:
Intercept C(gender, Treatment('Male'))[T.Female] C(gender, Treatment('Male'))[T.missing] C(nationality, Treatment('USA'))[T.african_american] C(nationality, Treatment('USA'))[T.asian] C(nationality, Treatment('USA'))[T.hispanic_latino] C(nationality, Treatment('USA'))[T.indigenous] C(nationality, Treatment('USA'))[T.middle eastern] C(nationality, Treatment('USA'))[T.mixed] C(nationality, Treatment('USA'))[T.white] C(religion, Treatment('christian'))[T.asian religion] C(religion, Treatment('christian'))[T.atheist_spiritual] C(religion, Treatment('christian'))[T.buddhist] C(religion, Treatment('christian'))[T.hindu] C(religion, Treatment('christian'))[T.jewish] C(religion, Treatment('christian'))[T.missing] C(religion, Treatment('christian'))[T.mixed] C(religion, Treatment('christian'))[T.muslim] C(religion, Treatment('christian'))[T.other]
74 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
75 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
149 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
228 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0
377 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0

In [71]:
# Response variable: the coarse profession group of every modelled row.
y = dfXy["profession"]
# Integer code for each group; "traditional" gets the smallest code, 0.
ynames = {'traditional': 0, 'non-traditional': 1, 'mixed': 2, 'other': 3}
# Inverse lookup (code -> group label) for labelling model output.
ynames_reverse = dict((code, label) for label, code in ynames.items())
# Encoded response; a label missing from ynames raises KeyError.
y1 = y.apply(lambda label: ynames[label])

In [1]:
# NOTE(review): the output recorded for this cell is a NameError from a fresh
# kernel (execution count 1). Re-run it after the cell that defines `y` so the
# saved output matches the rest of the notebook.
y.head()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-073ab239a5bf> in <module>()
----> 1 y.head()

NameError: name 'y' is not defined

In [73]:
# Number of rows per profession group (class balance of the response).
dict(Counter(y.values))


Out[73]:
{'mixed': 11514,
 'non-traditional': 48507,
 'other': 2443,
 'traditional': 100319}

In [74]:
# np.linalg.cond returns the condition number of the matrix (2-norm by
# default); a larger value means the columns are closer to being linearly
# dependent. Compared with (X) and without (X2) the "missing" dummy columns.
print "Colinearity before removing the missing value", np.linalg.cond(X)
print "Colinearity after removing the missing value", np.linalg.cond(X2)


Colinearity before removing the missing value 113.900214164
Colinearity after removing the missing value 83.2171475383

In [75]:
# Multinomial logit of the integer-coded profession group on the design
# matrix without the "missing" dummies. X2 still contains patsy's Intercept
# column, so no constant is added separately.
model = MNLogit(y1, X2)
# Fit with the Newton-CG optimiser, capped at 100 iterations.
result = model.fit(method='ncg', maxiter=100)


Optimization terminated successfully.
         Current function value: 0.873872
         Iterations: 89
         Function evaluations: 90
         Gradient evaluations: 178
         Hessian evaluations: 89

In [76]:
# Labels for the outcome equations in the summary: every group except the
# one coded 0, in code order.
outcome_labels = [ynames_reverse[code] for code in range(1, len(ynames))]
result.summary(yname_list=outcome_labels)


Out[76]:
MNLogit Regression Results
Dep. Variable: profession No. Observations: 162783
Model: MNLogit Df Residuals: 162732
Method: MLE Df Model: 48
Date: Mon, 02 Feb 2015 Pseudo R-squ.: 0.03914
Time: 01:01:24 Log-Likelihood: -1.4225e+05
converged: True LL-Null: -1.4805e+05
LLR p-value: 0.000
non-traditional coef std err z P>|z| [95.0% Conf. Int.]
Intercept -0.4522 0.006 -72.912 0.000 -0.464 -0.440
C(gender, Treatment('Male'))[T.Female] -1.6275 0.018 -88.625 0.000 -1.664 -1.592
C(nationality, Treatment('USA'))[T.african_american] 0.5413 0.131 4.148 0.000 0.285 0.797
C(nationality, Treatment('USA'))[T.asian] -0.4445 0.095 -4.690 0.000 -0.630 -0.259
C(nationality, Treatment('USA'))[T.hispanic_latino] -0.0328 0.087 -0.379 0.705 -0.203 0.137
C(nationality, Treatment('USA'))[T.indigenous] 2.1484 0.424 5.065 0.000 1.317 2.980
C(nationality, Treatment('USA'))[T.middle eastern] -0.2090 0.116 -1.798 0.072 -0.437 0.019
C(nationality, Treatment('USA'))[T.mixed] 0.1930 0.218 0.883 0.377 -0.235 0.621
C(nationality, Treatment('USA'))[T.white] -0.0838 0.032 -2.643 0.008 -0.146 -0.022
C(religion, Treatment('christian'))[T.asian religion] 0.8817 0.451 1.954 0.051 -0.003 1.766
C(religion, Treatment('christian'))[T.atheist_spiritual] -0.2715 0.130 -2.081 0.037 -0.527 -0.016
C(religion, Treatment('christian'))[T.buddhist] -1.4632 0.354 -4.133 0.000 -2.157 -0.769
C(religion, Treatment('christian'))[T.hindu] 0.6567 0.211 3.112 0.002 0.243 1.070
C(religion, Treatment('christian'))[T.jewish] 0.6840 0.068 10.125 0.000 0.552 0.816
C(religion, Treatment('christian'))[T.mixed] -0.1477 0.229 -0.645 0.519 -0.596 0.301
C(religion, Treatment('christian'))[T.muslim] -0.6471 0.191 -3.383 0.001 -1.022 -0.272
C(religion, Treatment('christian'))[T.other] 0.2713 0.179 1.518 0.129 -0.079 0.621
mixed coef std err z P>|z| [95.0% Conf. Int.]
Intercept -2.0706 0.011 -180.641 0.000 -2.093 -2.048
C(gender, Treatment('Male'))[T.Female] -0.6307 0.025 -25.185 0.000 -0.680 -0.582
C(nationality, Treatment('USA'))[T.african_american] 1.0090 0.175 5.757 0.000 0.665 1.353
C(nationality, Treatment('USA'))[T.asian] 0.5254 0.110 4.773 0.000 0.310 0.741
C(nationality, Treatment('USA'))[T.hispanic_latino] 0.2543 0.133 1.909 0.056 -0.007 0.515
C(nationality, Treatment('USA'))[T.indigenous] 1.5635 0.629 2.484 0.013 0.330 2.797
C(nationality, Treatment('USA'))[T.middle eastern] 0.6410 0.140 4.593 0.000 0.367 0.915
C(nationality, Treatment('USA'))[T.mixed] 0.4499 0.320 1.407 0.160 -0.177 1.077
C(nationality, Treatment('USA'))[T.white] 0.3727 0.046 8.166 0.000 0.283 0.462
C(religion, Treatment('christian'))[T.asian religion] 0.7351 0.658 1.117 0.264 -0.555 2.025
C(religion, Treatment('christian'))[T.atheist_spiritual] 1.3621 0.123 11.084 0.000 1.121 1.603
C(religion, Treatment('christian'))[T.buddhist] 0.8253 0.240 3.436 0.001 0.355 1.296
C(religion, Treatment('christian'))[T.hindu] 0.7764 0.282 2.757 0.006 0.225 1.328
C(religion, Treatment('christian'))[T.jewish] 1.3252 0.082 16.186 0.000 1.165 1.486
C(religion, Treatment('christian'))[T.mixed] 1.9005 0.183 10.379 0.000 1.542 2.259
C(religion, Treatment('christian'))[T.muslim] 0.4177 0.218 1.914 0.056 -0.010 0.845
C(religion, Treatment('christian'))[T.other] 1.2020 0.202 5.948 0.000 0.806 1.598
other coef std err z P>|z| [95.0% Conf. Int.]
Intercept -3.5629 0.023 -153.305 0.000 -3.608 -3.517
C(gender, Treatment('Male'))[T.Female] -0.6510 0.053 -12.284 0.000 -0.755 -0.547
C(nationality, Treatment('USA'))[T.african_american] -4.4058 4.835 -0.911 0.362 -13.883 5.071
C(nationality, Treatment('USA'))[T.asian] -0.6227 0.383 -1.625 0.104 -1.374 0.128
C(nationality, Treatment('USA'))[T.hispanic_latino] -0.5996 0.412 -1.457 0.145 -1.406 0.207
C(nationality, Treatment('USA'))[T.indigenous] 1.6349 1.073 1.524 0.127 -0.467 3.737
C(nationality, Treatment('USA'))[T.middle eastern] -0.6392 0.507 -1.261 0.207 -1.633 0.354
C(nationality, Treatment('USA'))[T.mixed] -0.3855 1.009 -0.382 0.702 -2.363 1.592
C(nationality, Treatment('USA'))[T.white] 0.1094 0.106 1.028 0.304 -0.099 0.318
C(religion, Treatment('christian'))[T.asian religion] -1.7469 5.172 -0.338 0.736 -11.883 8.389
C(religion, Treatment('christian'))[T.atheist_spiritual] 0.1021 0.415 0.246 0.805 -0.710 0.915
C(religion, Treatment('christian'))[T.buddhist] -0.7045 1.006 -0.700 0.484 -2.677 1.268
C(religion, Treatment('christian'))[T.hindu] -0.1409 1.013 -0.139 0.889 -2.126 1.844
C(religion, Treatment('christian'))[T.jewish] -0.4215 0.358 -1.178 0.239 -1.123 0.280
C(religion, Treatment('christian'))[T.mixed] -0.5068 1.008 -0.503 0.615 -2.483 1.469
C(religion, Treatment('christian'))[T.muslim] 1.1908 0.332 3.591 0.000 0.541 1.841
C(religion, Treatment('christian'))[T.other] 1.7997 0.298 6.043 0.000 1.216 2.383

In [77]:
# model.J and model.K are read straight from the MNLogit instance;
# presumably J counts outcome categories and K counts design-matrix columns
# (the recorded 17 matches the columns of X2, Intercept included).
print "number of labels is", model.J, "number of features is", model.K


number of labels is 4.0 number of features is 17.0

Parameters


In [78]:
for i in range(1,len(ynames)):
  print i-1, ":", ynames_reverse[i]


0 : non-traditional
1 : mixed
2 : other

In [79]:
# Fitted coefficients: one row per design-matrix column, one column per
# outcome equation (columns 0-2 follow the legend printed just above).
params = result.params
params


Out[79]:
0 1 2
Intercept -0.452178 -2.070631 -3.562928
C(gender, Treatment('Male'))[T.Female] -1.627548 -0.630670 -0.650982
C(nationality, Treatment('USA'))[T.african_american] 0.541280 1.009002 -4.405795
C(nationality, Treatment('USA'))[T.asian] -0.444467 0.525377 -0.622702
C(nationality, Treatment('USA'))[T.hispanic_latino] -0.032839 0.254258 -0.599552
C(nationality, Treatment('USA'))[T.indigenous] 2.148423 1.563519 1.634937
C(nationality, Treatment('USA'))[T.middle eastern] -0.208955 0.640987 -0.639186
C(nationality, Treatment('USA'))[T.mixed] 0.192999 0.449902 -0.385529
C(nationality, Treatment('USA'))[T.white] -0.083787 0.372693 0.109439
C(religion, Treatment('christian'))[T.asian religion] 0.881683 0.735074 -1.746942
C(religion, Treatment('christian'))[T.atheist_spiritual] -0.271548 1.362085 0.102135
C(religion, Treatment('christian'))[T.buddhist] -1.463246 0.825275 -0.704541
C(religion, Treatment('christian'))[T.hindu] 0.656731 0.776449 -0.140925
C(religion, Treatment('christian'))[T.jewish] 0.683999 1.325236 -0.421531
C(religion, Treatment('christian'))[T.mixed] -0.147686 1.900456 -0.506792
C(religion, Treatment('christian'))[T.muslim] -0.647086 0.417721 1.190757
C(religion, Treatment('christian'))[T.other] 0.271282 1.202023 1.799701

Odds ratios


In [80]:
for i in range(1,len(ynames)):
  print i-1, ":", ynames_reverse[i]


0 : non-traditional
1 : mixed
2 : other

In [81]:
# Exponentiate the coefficients to move from the log-odds scale to the odds
# scale. For a dummy column the value is the multiplicative change in the odds
# of that outcome versus the omitted outcome; the Intercept row is the
# baseline odds itself, not a ratio.
odds = np.exp(result.params)
odds


Out[81]:
0 1 2
Intercept 0.636241 0.126106 0.028356
C(gender, Treatment('Male'))[T.Female] 0.196411 0.532235 0.521533
C(nationality, Treatment('USA'))[T.african_american] 1.718205 2.742861 0.012206
C(nationality, Treatment('USA'))[T.asian] 0.641166 1.691096 0.536493
C(nationality, Treatment('USA'))[T.hispanic_latino] 0.967695 1.289504 0.549057
C(nationality, Treatment('USA'))[T.indigenous] 8.571330 4.775597 5.129132
C(nationality, Treatment('USA'))[T.middle eastern] 0.811431 1.898354 0.527722
C(nationality, Treatment('USA'))[T.mixed] 1.212882 1.568158 0.680091
C(nationality, Treatment('USA'))[T.white] 0.919627 1.451638 1.115653
C(religion, Treatment('christian'))[T.asian religion] 2.414960 2.085636 0.174306
C(religion, Treatment('christian'))[T.atheist_spiritual] 0.762198 3.904325 1.107533
C(religion, Treatment('christian'))[T.buddhist] 0.231484 2.282509 0.494335
C(religion, Treatment('christian'))[T.hindu] 1.928478 2.173740 0.868555
C(religion, Treatment('christian'))[T.jewish] 1.981787 3.763073 0.656042
C(religion, Treatment('christian'))[T.mixed] 0.862702 6.688946 0.602425
C(religion, Treatment('christian'))[T.muslim] 0.523569 1.518498 3.289570
C(religion, Treatment('christian'))[T.other] 1.311644 3.326841 6.047837

p-values


In [82]:
# p-value of every coefficient, laid out like the parameter table.
result.pvalues


Out[82]:
0 1 2
Intercept 0.000000e+00 0.000000e+00 0.000000e+00
C(gender, Treatment('Male'))[T.Female] 0.000000e+00 5.850423e-140 1.105136e-34
C(nationality, Treatment('USA'))[T.african_american] 3.360756e-05 8.567901e-09 3.622180e-01
C(nationality, Treatment('USA'))[T.asian] 2.731140e-06 1.819300e-06 1.040626e-01
C(nationality, Treatment('USA'))[T.hispanic_latino] 7.050353e-01 5.628118e-02 1.452454e-01
C(nationality, Treatment('USA'))[T.indigenous] 4.080337e-07 1.298575e-02 1.274512e-01
C(nationality, Treatment('USA'))[T.middle eastern] 7.212110e-02 4.371501e-06 2.073249e-01
C(nationality, Treatment('USA'))[T.mixed] 3.770247e-01 1.595320e-01 7.023441e-01
C(nationality, Treatment('USA'))[T.white] 8.229002e-03 3.198316e-16 3.039058e-01
C(religion, Treatment('christian'))[T.asian religion] 5.068633e-02 2.641521e-01 7.355206e-01
C(religion, Treatment('christian'))[T.atheist_spiritual] 3.739104e-02 1.495481e-28 8.054083e-01
C(religion, Treatment('christian'))[T.buddhist] 3.584632e-05 5.895757e-04 4.838615e-01
C(religion, Treatment('christian'))[T.hindu] 1.860232e-03 5.825954e-03 8.893516e-01
C(religion, Treatment('christian'))[T.jewish] 4.271339e-24 6.341905e-59 2.388538e-01
C(religion, Treatment('christian'))[T.mixed] 5.188181e-01 3.075779e-25 6.151503e-01
C(religion, Treatment('christian'))[T.muslim] 7.175871e-04 5.560101e-02 3.297670e-04
C(religion, Treatment('christian'))[T.other] 1.289143e-01 2.720419e-09 1.508855e-09

In [83]:
# Standard error of every coefficient, laid out like the parameter table.
result.bse


Out[83]:
0 1 2
Intercept 0.006202 0.011463 0.023241
C(gender, Treatment('Male'))[T.Female] 0.018364 0.025041 0.052995
C(nationality, Treatment('USA'))[T.african_american] 0.130506 0.175269 4.835439
C(nationality, Treatment('USA'))[T.asian] 0.094768 0.110084 0.383090
C(nationality, Treatment('USA'))[T.hispanic_latino] 0.086752 0.133199 0.411630
C(nationality, Treatment('USA'))[T.indigenous] 0.424156 0.629396 1.072631
C(nationality, Treatment('USA'))[T.middle eastern] 0.116193 0.139561 0.506906
C(nationality, Treatment('USA'))[T.mixed] 0.218475 0.319840 1.008821
C(nationality, Treatment('USA'))[T.white] 0.031707 0.045642 0.106449
C(religion, Treatment('christian'))[T.asian religion] 0.451190 0.658296 5.171662
C(religion, Treatment('christian'))[T.atheist_spiritual] 0.130460 0.122884 0.414588
C(religion, Treatment('christian'))[T.buddhist] 0.354062 0.240159 1.006336
C(religion, Treatment('christian'))[T.hindu] 0.211053 0.281585 1.012939
C(religion, Treatment('christian'))[T.jewish] 0.067554 0.081876 0.357879
C(religion, Treatment('christian'))[T.mixed] 0.228910 0.183098 1.008068
C(religion, Treatment('christian'))[T.muslim] 0.191289 0.218228 0.331621
C(religion, Treatment('christian'))[T.other] 0.178663 0.202101 0.297795

Objectives

  1. Change the base cases: make traditional, male, white and Christian the reference levels
  2. Covariate plot matrix (religion, gender, nationality): percentages and maybe correlations
  3. Redo the regression analyses (review how to interpret the output); try mixed and accounting for multiple identities
  4. Stacked area plot for all demographics (religion, nationality, gender)