In [1]:
import pandas as pd
import numpy as np
import cPickle as pickle
from datetime import date
from collections import Counter
from pandas import Series

Loading Data


In [2]:
fh = open("/data/csc/fb_persons/unified_100percentpeople.pkl", 'rb')
df = pickle.load(fh)

In [3]:
df.head()


Out[3]:
date_of_birth ethnicity gender name nationality place_of_birth profession religion
0 NaN None Female Courtney Jamieson None NaN music_art None
1 NaN None Male Robert Moir None NaN other None
2 1962-04-21 None Male Mehdi Jomaa middle eastern Mahdia (law_politics, stem) None
3 NaN None Male Victoria Shields None NaN None None
4 1992-06-24 None Male Terrick Colston None NaN None None

In [4]:
selected_columns = ["gender", "date_of_birth", "ethnicity", "name", "profession",
                    "religion", "nationality"]
df = df[selected_columns]

In [5]:
df.head()


Out[5]:
gender date_of_birth ethnicity name profession religion nationality
0 Female NaN None Courtney Jamieson music_art None None
1 Male NaN None Robert Moir other None None
2 Male 1962-04-21 None Mehdi Jomaa (law_politics, stem) None middle eastern
3 Male NaN None Victoria Shields None None None
4 Male 1992-06-24 None Terrick Colston None None None

Considering only people after the 1800s


In [6]:
victorian_age = date(1800, 1, 1)
df = df[np.logical_not(df.date_of_birth.isnull().values)]
df = df[df.date_of_birth > victorian_age]

In [7]:
len(df)


Out[7]:
1300159

In [8]:
len(df.nationality.dropna())


Out[8]:
703447

Considering Only Americans


In [9]:
americans = map(lambda x: (isinstance(x, basestring) and x == 'united states of america') or
                          (isinstance(x, tuple) and  'united states of america' in x),
                          df.nationality)
df = df[americans]

In [10]:
len(df)


Out[10]:
223632

Feature Extraction


In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

In [12]:
def feature_matrix(df, field):
  features = df[[field]].fillna("nan").values.reshape(-1)
  vocab = []
  for value in features:
    if isinstance(value, tuple):
      vocab.extend(value)
    else:
      vocab.append(value)
  vocab = set(vocab)
  word_index = {w:i for i,w in enumerate(vocab)}
  matrix = np.zeros((len(features), len(vocab)))
  for i, value in enumerate(features):
    if isinstance(value, tuple):
      for v in value:
        matrix[i, word_index[v]] = 1.0/len(value)
    else:
      matrix[i, word_index[value]] = 1
  print "{}: persons, {}:number of categories".format(*matrix.shape)
  print "{} persons have mixed categories".format(np.argwhere((matrix > 0).sum(axis=1) > 1).shape[0])
  return matrix, word_index

In [13]:
gender_features, gender_index = feature_matrix(df, "gender")


223632: persons, 4:number of categories
0 persons have mixed categories

In [14]:
ethnicity_features, ethnicity_index = feature_matrix(df, "ethnicity")


223632: persons, 12:number of categories
795 persons have mixed categories

In [15]:
religion_features, religion_index = feature_matrix(df, "religion")


223632: persons, 9:number of categories
179 persons have mixed categories

In [16]:
nationality_features, nationality_index = feature_matrix(df, "nationality")


223632: persons, 7:number of categories
10910 persons have mixed categories

In [17]:
profession_features, profession_index = feature_matrix(df, "profession")


223632: persons, 9:number of categories
31897 persons have mixed categories

In [18]:
non_traditional_cols  = [profession_index[x] for x in ['athlete', 'business', 'law_politics',
                                                       'religion', 'stem']]
traditional_cols = [profession_index[x] for x in ['music_art', 'humanities']]

In [19]:
non_traditional = 1 * (profession_features[:, non_traditional_cols].sum(axis=1) >= 1)
traditional = 1 * (profession_features[:, traditional_cols].sum(axis=1) >= 1)
nan_features = profession_features[:, profession_index["nan"]]
other_features = profession_features[:, profession_index["other"]]
general_professions = np.vstack([non_traditional, traditional, nan_features, other_features]).T
general_professions_index = {"non_traditional": 0, "traditional": 1, "nan": 2, "other":3}

Correlation Analysis


In [20]:
from scipy.stats import pearsonr

In [21]:
def correlate(var1_features, var2_features, var1_index, var2_index):
  R = np.zeros((var1_features.shape[1], var2_features.shape[1]))
  P = np.zeros((var1_features.shape[1], var2_features.shape[1]))
  k = 0
  for i in range(var1_features.shape[1]):
    for j in range(var2_features.shape[1]):
      r, p =  pearsonr(var1_features[:, i], var2_features[:, j])
      R[i,j] = r
      P[i,j] = p
      print "\r%i" % k,
      k += 1
  idx = R.argsort(axis=1)
  index_var1 = {i:g for g,i in var1_index.iteritems()}
  index_var2 = {i:g for g,i in var2_index.iteritems()}
  print
  for i in range(len(index_var1)):
    print
    #if i in [0, 2]: continue
    for j in idx[i][::-1]:
      print  "{:<4}{:<4}{:<24}{:<24}{:>20.4f}{:>20.4f}".format(i, j, index_var1[i], index_var2[j], R[i,j], P[i,j])

In [22]:
correlate(gender_features, profession_features, gender_index, profession_index)


35

0   1   Other                   humanities                            0.0062              0.0031
0   6   Other                   other                                 0.0036              0.0921
0   0   Other                   business                              0.0020              0.3482
0   8   Other                   music_art                             0.0010              0.6288
0   5   Other                   religion                             -0.0006              0.7871
0   4   Other                   stem                                 -0.0013              0.5412
0   3   Other                   nan                                  -0.0018              0.4057
0   7   Other                   law_politics                         -0.0021              0.3149
0   2   Other                   athlete                              -0.0044              0.0358

1   2   Male                    athlete                               0.1379              0.0000
1   7   Male                    law_politics                          0.0586              0.0000
1   4   Male                    stem                                  0.0578              0.0000
1   3   Male                    nan                                   0.0459              0.0000
1   0   Male                    business                              0.0349              0.0000
1   5   Male                    religion                              0.0167              0.0000
1   6   Male                    other                                 0.0016              0.4430
1   1   Male                    humanities                           -0.0453              0.0000
1   8   Male                    music_art                            -0.1801              0.0000

2   3   nan                     nan                                   0.1112              0.0000
2   6   nan                     other                                 0.0074              0.0005
2   5   nan                     religion                              0.0004              0.8572
2   0   nan                     business                             -0.0062              0.0034
2   4   nan                     stem                                 -0.0109              0.0000
2   7   nan                     law_politics                         -0.0195              0.0000
2   2   nan                     athlete                              -0.0257              0.0000
2   1   nan                     humanities                           -0.0288              0.0000
2   8   nan                     music_art                            -0.0600              0.0000

3   8   Female                  music_art                             0.1999              0.0000
3   1   Female                  humanities                            0.0537              0.0000
3   6   Female                  other                                -0.0038              0.0754
3   5   Female                  religion                             -0.0172              0.0000
3   0   Female                  business                             -0.0340              0.0000
3   7   Female                  law_politics                         -0.0545              0.0000
3   4   Female                  stem                                 -0.0560              0.0000
3   3   Female                  nan                                  -0.0765              0.0000
3   2   Female                  athlete                              -0.1338              0.0000

In [23]:
correlate(gender_features, general_professions, gender_index, general_professions_index)


15

0   1   Other                   traditional                           0.0046              0.0291
0   3   Other                   other                                 0.0036              0.0921
0   2   Other                   nan                                  -0.0018              0.4057
0   0   Other                   non_traditional                      -0.0045              0.0324

1   0   Male                    non_traditional                       0.1708              0.0000
1   2   Male                    nan                                   0.0459              0.0000
1   3   Male                    other                                 0.0016              0.4430
1   1   Male                    traditional                          -0.1903              0.0000

2   2   nan                     nan                                   0.1112              0.0000
2   3   nan                     other                                 0.0074              0.0005
2   0   nan                     non_traditional                      -0.0324              0.0000
2   1   nan                     traditional                          -0.0661              0.0000

3   1   Female                  traditional                           0.2117              0.0000
3   3   Female                  other                                -0.0038              0.0754
3   2   Female                  nan                                  -0.0765              0.0000
3   0   Female                  non_traditional                      -0.1656              0.0000

In [24]:
correlate(ethnicity_features, profession_features, ethnicity_index, profession_index)


107

0   1   jewish                  humanities                            0.0442              0.0000
0   0   jewish                  business                              0.0334              0.0000
0   7   jewish                  law_politics                          0.0179              0.0000
0   4   jewish                  stem                                  0.0177              0.0000
0   5   jewish                  religion                              0.0048              0.0224
0   6   jewish                  other                                 0.0023              0.2686
0   8   jewish                  music_art                             0.0006              0.7813
0   2   jewish                  athlete                              -0.0213              0.0000
0   3   jewish                  nan                                  -0.0408              0.0000

1   7   african_american        law_politics                          0.0044              0.0375
1   5   african_american        religion                              0.0042              0.0462
1   1   african_american        humanities                            0.0032              0.1262
1   8   african_american        music_art                             0.0029              0.1714
1   0   african_american        business                              0.0002              0.9286
1   6   african_american        other                                -0.0007              0.7307
1   4   african_american        stem                                 -0.0010              0.6387
1   2   african_american        athlete                              -0.0038              0.0753
1   3   african_american        nan                                  -0.0041              0.0510

2   3   nan                     nan                                   0.1226              0.0000
2   2   nan                     athlete                               0.0098              0.0000
2   4   nan                     stem                                  0.0047              0.0272
2   5   nan                     religion                             -0.0050              0.0189
2   6   nan                     other                                -0.0075              0.0004
2   7   nan                     law_politics                         -0.0293              0.0000
2   1   nan                     humanities                           -0.0380              0.0000
2   0   nan                     business                             -0.0448              0.0000
2   8   nan                     music_art                            -0.0766              0.0000

3   1   middle eastern          humanities                            0.0152              0.0000
3   0   middle eastern          business                              0.0068              0.0013
3   7   middle eastern          law_politics                          0.0058              0.0059
3   4   middle eastern          stem                                  0.0038              0.0739
3   8   middle eastern          music_art                             0.0016              0.4471
3   6   middle eastern          other                                 0.0005              0.7998
3   5   middle eastern          religion                             -0.0003              0.8878
3   2   middle eastern          athlete                              -0.0091              0.0000
3   3   middle eastern          nan                                  -0.0114              0.0000

4   8   hispanic_latino         music_art                             0.0152              0.0000
4   1   hispanic_latino         humanities                            0.0081              0.0001
4   0   hispanic_latino         business                              0.0042              0.0462
4   7   hispanic_latino         law_politics                          0.0037              0.0827
4   6   hispanic_latino         other                                 0.0029              0.1750
4   5   hispanic_latino         religion                              0.0011              0.5983
4   4   hispanic_latino         stem                                 -0.0063              0.0030
4   2   hispanic_latino         athlete                              -0.0064              0.0025
4   3   hispanic_latino         nan                                  -0.0171              0.0000

5   1   indigenous              humanities                            0.0116              0.0000
5   8   indigenous              music_art                             0.0070              0.0009
5   7   indigenous              law_politics                          0.0055              0.0097
5   2   indigenous              athlete                               0.0025              0.2438
5   6   indigenous              other                                 0.0003              0.8726
5   5   indigenous              religion                             -0.0015              0.4665
5   0   indigenous              business                             -0.0039              0.0651
5   4   indigenous              stem                                 -0.0055              0.0093
5   3   indigenous              nan                                  -0.0154              0.0000

6   8   hispanic                music_art                             0.0132              0.0000
6   6   hispanic                other                                 0.0036              0.0900
6   1   hispanic                humanities                            0.0019              0.3734
6   0   hispanic                business                             -0.0004              0.8372
6   5   hispanic                religion                             -0.0006              0.7664
6   7   hispanic                law_politics                         -0.0032              0.1322
6   4   hispanic                stem                                 -0.0033              0.1177
6   2   hispanic                athlete                              -0.0037              0.0816
6   3   hispanic                nan                                  -0.0096              0.0000

7   6   other                   other                                 0.0060              0.0046
7   7   other                   law_politics                          0.0049              0.0211
7   8   other                   music_art                             0.0033              0.1193
7   1   other                   humanities                            0.0032              0.1250
7   0   other                   business                              0.0020              0.3405
7   4   other                   stem                                  0.0018              0.3855
7   5   other                   religion                             -0.0011              0.6145
7   2   other                   athlete                              -0.0043              0.0417
7   3   other                   nan                                  -0.0074              0.0005

8   8   african_americans       music_art                             0.0692              0.0000
8   2   african_americans       athlete                               0.0314              0.0000
8   7   african_americans       law_politics                          0.0219              0.0000
8   5   african_americans       religion                              0.0038              0.0710
8   6   african_americans       other                                -0.0039              0.0619
8   1   african_americans       humanities                           -0.0050              0.0190
8   0   african_americans       business                             -0.0112              0.0000
8   4   african_americans       stem                                 -0.0240              0.0000
8   3   african_americans       nan                                  -0.0855              0.0000

9   6   mixed                   other                                 0.0074              0.0005
9   8   mixed                   music_art                             0.0044              0.0366
9   0   mixed                   business                              0.0012              0.5823
9   5   mixed                   religion                             -0.0002              0.9083
9   7   mixed                   law_politics                         -0.0004              0.8345
9   1   mixed                   humanities                           -0.0008              0.7135
9   4   mixed                   stem                                 -0.0013              0.5439
9   2   mixed                   athlete                              -0.0023              0.2752
9   3   mixed                   nan                                  -0.0037              0.0769

10  0   white                   business                              0.0677              0.0000
10  8   white                   music_art                             0.0398              0.0000
10  1   white                   humanities                            0.0304              0.0000
10  6   white                   other                                 0.0135              0.0000
10  7   white                   law_politics                          0.0074              0.0004
10  4   white                   stem                                  0.0073              0.0006
10  5   white                   religion                              0.0023              0.2800
10  2   white                   athlete                              -0.0313              0.0000
10  3   white                   nan                                  -0.0683              0.0000

11  1   asian                   humanities                            0.0170              0.0000
11  4   asian                   stem                                  0.0129              0.0000
11  0   asian                   business                              0.0099              0.0000
11  8   asian                   music_art                             0.0072              0.0007
11  7   asian                   law_politics                          0.0057              0.0075
11  6   asian                   other                                 0.0021              0.3112
11  5   asian                   religion                             -0.0026              0.2196
11  3   asian                   nan                                  -0.0164              0.0000
11  2   asian                   athlete                              -0.0180              0.0000

In [25]:
correlate(religion_features, profession_features, religion_index, profession_index)


80

0   1   muslim                  humanities                            0.0104              0.0000
0   6   muslim                  other                                 0.0093              0.0000
0   7   muslim                  law_politics                          0.0028              0.1798
0   0   muslim                  business                              0.0011              0.6153
0   4   muslim                  stem                                  0.0005              0.8049
0   5   muslim                  religion                             -0.0014              0.5200
0   8   muslim                  music_art                            -0.0014              0.5153
0   3   muslim                  nan                                  -0.0035              0.0954
0   2   muslim                  athlete                              -0.0082              0.0001

1   7   christian               law_politics                          0.2328              0.0000
1   0   christian               business                              0.0550              0.0000
1   5   christian               religion                              0.0473              0.0000
1   6   christian               other                                 0.0395              0.0000
1   4   christian               stem                                  0.0021              0.3125
1   1   christian               humanities                           -0.0036              0.0876
1   3   christian               nan                                  -0.0183              0.0000
1   2   christian               athlete                              -0.0510              0.0000
1   8   christian               music_art                            -0.0855              0.0000

2   0   asian religion          business                              0.0100              0.0000
2   3   asian religion          nan                                   0.0057              0.0072
2   4   asian religion          stem                                  0.0034              0.1095
2   1   asian religion          humanities                            0.0010              0.6235
2   5   asian religion          religion                              0.0007              0.7570
2   7   asian religion          law_politics                         -0.0006              0.7722
2   6   asian religion          other                                -0.0022              0.3045
2   2   asian religion          athlete                              -0.0040              0.0586
2   8   asian religion          music_art                            -0.0071              0.0008

3   5   jewish                  religion                              0.0524              0.0000
3   7   jewish                  law_politics                          0.0353              0.0000
3   1   jewish                  humanities                            0.0336              0.0000
3   0   jewish                  business                              0.0333              0.0000
3   4   jewish                  stem                                  0.0225              0.0000
3   6   jewish                  other                                 0.0035              0.0960
3   2   jewish                  athlete                              -0.0112              0.0000
3   8   jewish                  music_art                            -0.0212              0.0000
3   3   jewish                  nan                                  -0.0334              0.0000

4   4   hindu                   stem                                  0.0219              0.0000
4   0   hindu                   business                              0.0071              0.0008
4   1   hindu                   humanities                            0.0070              0.0009
4   3   hindu                   nan                                   0.0024              0.2567
4   6   hindu                   other                                 0.0007              0.7581
4   5   hindu                   religion                             -0.0005              0.8004
4   7   hindu                   law_politics                         -0.0026              0.2154
4   8   hindu                   music_art                            -0.0096              0.0000
4   2   hindu                   athlete                              -0.0101              0.0000

5   1   buddhist                humanities                            0.0222              0.0000
5   5   buddhist                religion                              0.0048              0.0244
5   7   buddhist                law_politics                          0.0001              0.9476
5   3   buddhist                nan                                  -0.0003              0.8822
5   6   buddhist                other                                -0.0007              0.7491
5   4   buddhist                stem                                 -0.0017              0.4091
5   0   buddhist                business                             -0.0018              0.4073
5   8   buddhist                music_art                            -0.0057              0.0074
5   2   buddhist                athlete                              -0.0100              0.0000

6   8   nan                     music_art                             0.0881              0.0000
6   2   nan                     athlete                               0.0576              0.0000
6   3   nan                     nan                                   0.0326              0.0000
6   4   nan                     stem                                 -0.0214              0.0000
6   1   nan                     humanities                           -0.0243              0.0000
6   6   nan                     other                                -0.0406              0.0000
6   5   nan                     religion                             -0.0613              0.0000
6   0   nan                     business                             -0.0622              0.0000
6   7   nan                     law_politics                         -0.2168              0.0000

7   6   other                   other                                 0.0167              0.0000
7   1   other                   humanities                            0.0130              0.0000
7   7   other                   law_politics                          0.0127              0.0000
7   4   other                   stem                                  0.0112              0.0000
7   5   other                   religion                              0.0046              0.0313
7   0   other                   business                             -0.0002              0.9113
7   3   other                   nan                                  -0.0056              0.0076
7   8   other                   music_art                            -0.0097              0.0000
7   2   other                   athlete                              -0.0117              0.0000

8   1   atheist_spiritual       humanities                            0.0354              0.0000
8   4   atheist_spiritual       stem                                  0.0294              0.0000
8   0   atheist_spiritual       business                              0.0058              0.0065
8   6   atheist_spiritual       other                                 0.0055              0.0090
8   7   atheist_spiritual       law_politics                          0.0049              0.0200
8   5   atheist_spiritual       religion                             -0.0011              0.5958
8   8   atheist_spiritual       music_art                            -0.0108              0.0000
8   2   atheist_spiritual       athlete                              -0.0161              0.0000
8   3   atheist_spiritual       nan                                  -0.0178              0.0000

In [26]:
correlate(nationality_features, profession_features, nationality_index, profession_index)


62

0   2   african_american        athlete                               0.0220              0.0000
0   5   african_american        religion                              0.0013              0.5436
0   1   african_american        humanities                            0.0003              0.8795
0   0   african_american        business                             -0.0006              0.7901
0   8   african_american        music_art                            -0.0016              0.4574
0   4   african_american        stem                                 -0.0025              0.2403
0   6   african_american        other                                -0.0040              0.0572
0   7   african_american        law_politics                         -0.0058              0.0062
0   3   african_american        nan                                  -0.0096              0.0000

1   2   united states of americaathlete                               0.0253              0.0000
1   7   united states of americalaw_politics                          0.0181              0.0000
1   3   united states of americanan                                   0.0181              0.0000
1   8   united states of americamusic_art                            -0.0002              0.9364
1   6   united states of americaother                                -0.0033              0.1230
1   5   united states of americareligion                             -0.0065              0.0022
1   0   united states of americabusiness                             -0.0137              0.0000
1   1   united states of americahumanities                           -0.0221              0.0000
1   4   united states of americastem                                 -0.0620              0.0000

2   4   middle eastern          stem                                  0.0192              0.0000
2   1   middle eastern          humanities                            0.0146              0.0000
2   0   middle eastern          business                              0.0083              0.0001
2   5   middle eastern          religion                              0.0081              0.0001
2   8   middle eastern          music_art                            -0.0012              0.5860
2   6   middle eastern          other                                -0.0023              0.2680
2   7   middle eastern          law_politics                         -0.0031              0.1376
2   3   middle eastern          nan                                  -0.0084              0.0001
2   2   middle eastern          athlete                              -0.0121              0.0000

3   8   hispanic_latino         music_art                             0.0102              0.0000
3   2   hispanic_latino         athlete                               0.0100              0.0000
3   5   hispanic_latino         religion                              0.0004              0.8496
3   0   hispanic_latino         business                              0.0001              0.9699
3   6   hispanic_latino         other                                -0.0018              0.3831
3   1   hispanic_latino         humanities                           -0.0023              0.2733
3   4   hispanic_latino         stem                                 -0.0031              0.1406
3   7   hispanic_latino         law_politics                         -0.0073              0.0005
3   3   hispanic_latino         nan                                  -0.0107              0.0000

4   2   indigenous              athlete                               0.0194              0.0000
4   7   indigenous              law_politics                          0.0016              0.4629
4   6   indigenous              other                                 0.0001              0.9530
4   5   indigenous              religion                             -0.0007              0.7543
4   3   indigenous              nan                                  -0.0010              0.6287
4   0   indigenous              business                             -0.0029              0.1662
4   4   indigenous              stem                                 -0.0035              0.0992
4   1   indigenous              humanities                           -0.0041              0.0529
4   8   indigenous              music_art                            -0.0087              0.0000

5   4   asian                   stem                                  0.0210              0.0000
5   8   asian                   music_art                             0.0174              0.0000
5   1   asian                   humanities                            0.0070              0.0009
5   0   asian                   business                              0.0053              0.0130
5   6   asian                   other                                 0.0008              0.7050
5   5   asian                   religion                             -0.0016              0.4425
5   7   asian                   law_politics                         -0.0102              0.0000
5   2   asian                   athlete                              -0.0150              0.0000
5   3   asian                   nan                                  -0.0164              0.0000

6   4   white                   stem                                  0.0615              0.0000
6   1   white                   humanities                            0.0203              0.0000
6   0   white                   business                              0.0121              0.0000
6   6   white                   other                                 0.0057              0.0069
6   5   white                   religion                              0.0055              0.0090
6   3   white                   nan                                  -0.0070              0.0010
6   8   white                   music_art                            -0.0082              0.0001
6   7   white                   law_politics                         -0.0129              0.0000
6   2   white                   athlete                              -0.0309              0.0000

Classification


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

In [28]:
valid_rows = np.where([x is not None  for x in df.profession.values.flatten()])
X = np.hstack([gender_features, ethnicity_features, religion_features, nationality_features])[valid_rows]
profs = [(x,) if x is None else x for x in df.profession]
y = np.array([set([x]) if isinstance(x, basestring) else set(sorted(x)) for x in profs])[valid_rows]

In [29]:
binarizer = MultiLabelBinarizer()
y = binarizer.fit_transform(y)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [31]:
estimator = LogisticRegression()
model = OneVsRestClassifier(estimator)
model.fit(X_train, y_train)


Out[31]:
OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001),
          n_jobs=1)

In [32]:
y_pred = model.predict(X_test)
print classification_report(y_test, y_pred, labels=binarizer.classes_)


             precision    recall  f1-score   support

    athlete       0.44      0.00      0.00     11555
   business       0.00      0.00      0.00      3037
 humanities       0.60      0.01      0.03     16232
law_politics       0.56      0.16      0.25      5645
  music_art       0.57      0.97      0.72     36516
      other       0.00      0.00      0.00      1903
   religion       0.00      0.00      0.00       159
       stem       0.62      0.00      0.00      4155

avg / total       0.53      0.46      0.36     79202

/usr/local/lib/python2.7/dist-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Stats Models


In [33]:
import statsmodels as sm
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.tools import add_constant

In [34]:
valid_rows = np.where([x is not None  for x in df.profession.values.flatten()])
X = np.hstack([gender_features, religion_features, nationality_features])[valid_rows]
cols = []
for name, fs in [("gender", gender_index), ("religion", religion_index),
                 ("nationality", nationality_index)]:#, ('ethinicity', ethnicity_index)]:
  cols.extend([x+"_"+name[0] for x in zip(*sorted(fs.items(), key=lambda(x,y): y))[0]])
dfX = pd.DataFrame(X, columns=cols)

In [35]:
profs = ["nan" if x is None else x for x in df.profession]
prof_map = {'music_art': "traditional", 'humanities': "traditional",
            'athlete': "non-traditional", 'business': "non-traditional",
             'law_politics': "non-traditional", 'religion': "non-traditional",
             'stem': "non-traditional"}
profs2 = []
for x in profs:
  if isinstance(x, basestring):
    profs2.append(prof_map.get(x, x))
  else:
    tmp = set([prof_map.get(x_, x_) for x_ in x])
    if len(tmp) == 1:
      profs2.append("mixed")
    else:
      profs2.append("mixed")

y = np.array(profs2)[valid_rows]
classes = list(set(y))

In [81]:
y = Series(y, name='profession')
ynames = {'mixed': 2, 'non-traditional': 1, 'other': 3, 'traditional': 0}
ynames_reverse = {v:k for k,v in ynames.items()}

In [82]:
y1 = y.apply(lambda x: ynames[x])

In [65]:
from collections import Counter
dict(Counter(y.values))


Out[65]:
{'mixed': 32005, 'non-traditional': 46421, 'other': 2445, 'traditional': 81948}

In [66]:
dfX2  = dfX[[x  for x in dfX.columns if 'nan' not in x and x != 'united states of america_n']]

In [87]:
len(dfX2)


Out[87]:
162819

In [ ]:
dfX2

In [67]:
print "Colinearity before removing almost colinear features", np.linalg.cond(dfX)
print "Colinearity after removing almost colinear features", np.linalg.cond(dfX2)


Colinearity before removing almost colinear features 4.1051041016e+13
Colinearity after removing almost colinear features 104.837039165

In [68]:
#data = dfX.values
data = add_constant(dfX2)
model = MNLogit(y1, data)
result = model.fit(method='ncg')


Optimization terminated successfully.
         Current function value: 1.040340
         Iterations: 28
         Function evaluations: 35
         Gradient evaluations: 62
         Hessian evaluations: 28

In [69]:
print "number of labels is", model.J, "number of features is", model.K


number of labels is 4.0 number of features is 18.0

In [84]:
result.summary(yname_list=[ynames_reverse[i] for i in range(1,len(ynames))])


Out[84]:
MNLogit Regression Results
Dep. Variable: profession No. Observations: 162819
Model: MNLogit Df Residuals: 162765
Method: MLE Df Model: 51
Date: Sat, 31 Jan 2015 Pseudo R-squ.: 0.04217
Time: 17:03:20 Log-Likelihood: -1.6939e+05
converged: True LL-Null: -1.7684e+05
LLR p-value: 0.000
non-traditional coef std err z P>|z| [95.0% Conf. Int.]
const -0.3751 0.085 -4.426 0.000 -0.541 -0.209
Other_g -1.4733 0.548 -2.688 0.007 -2.548 -0.399
Male_g 0.0627 0.085 0.737 0.461 -0.104 0.229
Female_g -1.6708 0.087 -19.287 0.000 -1.841 -1.501
muslim_r -0.5786 0.208 -2.787 0.005 -0.986 -0.172
christian_r 1.4658 0.036 40.358 0.000 1.395 1.537
asian religion_r 0.8422 0.462 1.823 0.068 -0.063 1.748
jewish_r 0.8363 0.076 11.019 0.000 0.688 0.985
hindu_r 0.7343 0.225 3.266 0.001 0.294 1.175
buddhist_r -1.3731 0.379 -3.623 0.000 -2.116 -0.630
other_r 0.4216 0.202 2.089 0.037 0.026 0.817
atheist_spiritual_r -0.0238 0.148 -0.161 0.872 -0.313 0.265
african_american_n 1.4489 0.282 5.145 0.000 0.897 2.001
middle eastern_n -0.2828 0.239 -1.181 0.238 -0.752 0.187
hispanic_latino_n -0.1068 0.183 -0.583 0.560 -0.465 0.252
indigenous_n 5.3659 1.129 4.752 0.000 3.153 7.579
asian_n -0.6709 0.196 -3.416 0.001 -1.056 -0.286
white_n -0.0888 0.068 -1.314 0.189 -0.221 0.044
mixed coef std err z P>|z| [95.0% Conf. Int.]
const -1.5004 0.125 -11.989 0.000 -1.746 -1.255
Other_g 0.0132 0.478 0.028 0.978 -0.923 0.949
Male_g 0.6567 0.125 5.238 0.000 0.411 0.902
Female_g -0.0526 0.126 -0.418 0.676 -0.300 0.194
muslim_r 0.5156 0.164 3.146 0.002 0.194 0.837
christian_r 1.4537 0.038 38.326 0.000 1.379 1.528
asian religion_r 0.2799 0.537 0.521 0.602 -0.773 1.332
jewish_r 1.2064 0.073 16.543 0.000 1.063 1.349
hindu_r 0.6202 0.228 2.719 0.007 0.173 1.067
buddhist_r 0.8783 0.189 4.636 0.000 0.507 1.250
other_r 1.1994 0.174 6.886 0.000 0.858 1.541
atheist_spiritual_r 1.4330 0.113 12.670 0.000 1.211 1.655
african_american_n 1.6481 0.291 5.664 0.000 1.078 2.218
middle eastern_n 0.6948 0.223 3.111 0.002 0.257 1.132
hispanic_latino_n 0.2167 0.191 1.134 0.257 -0.158 0.591
indigenous_n 3.5284 1.265 2.790 0.005 1.050 6.007
asian_n 0.7708 0.167 4.613 0.000 0.443 1.098
white_n 0.5847 0.067 8.686 0.000 0.453 0.717
other coef std err z P>|z| [95.0% Conf. Int.]
const -1.8426 0.145 -12.725 0.000 -2.126 -1.559
Other_g -0.6818 0.751 -0.907 0.364 -2.154 0.791
Male_g -1.5744 0.147 -10.739 0.000 -1.862 -1.287
Female_g -2.3092 0.153 -15.139 0.000 -2.608 -2.010
muslim_r 1.3493 0.338 3.997 0.000 0.688 2.011
christian_r 1.6764 0.084 19.993 0.000 1.512 1.841
asian religion_r -2.7552 7.297 -0.378 0.706 -17.056 11.546
jewish_r -0.1678 0.354 -0.474 0.635 -0.862 0.526
hindu_r -0.0553 1.029 -0.054 0.957 -2.072 1.962
buddhist_r -0.5499 1.038 -0.530 0.596 -2.584 1.484
other_r 2.0502 0.308 6.654 0.000 1.446 2.654
atheist_spiritual_r 0.4990 0.412 1.210 0.226 -0.309 1.307
african_american_n -7.2252 6.620 -1.091 0.275 -20.200 5.750
middle eastern_n -1.3513 1.026 -1.317 0.188 -3.363 0.660
hispanic_latino_n -1.0741 0.787 -1.365 0.172 -2.617 0.468
indigenous_n 4.2822 2.279 1.879 0.060 -0.185 8.750
asian_n -1.1497 0.772 -1.490 0.136 -2.662 0.363
white_n 0.3061 0.214 1.433 0.152 -0.113 0.725

Parameters

Note: note that 0 = non-traditional, 2= traditional


In [85]:
params = result.params
params


Out[85]:
0 1 2
const -0.375106 -1.500449 -1.842576
Other_g -1.473276 0.013180 -0.681761
Male_g 0.062669 0.656687 -1.574357
Female_g -1.670828 -0.052630 -2.309196
muslim_r -0.578650 0.515620 1.349343
christian_r 1.465816 1.453665 1.676441
asian religion_r 0.842240 0.279932 -2.755202
jewish_r 0.836341 1.206423 -0.167813
hindu_r 0.734324 0.620205 -0.055339
buddhist_r -1.373070 0.878270 -0.549942
other_r 0.421563 1.199450 2.050216
atheist_spiritual_r -0.023815 1.432976 0.498957
african_american_n 1.448916 1.648092 -7.225231
middle eastern_n -0.282792 0.694810 -1.351258
hispanic_latino_n -0.106758 0.216727 -1.074113
indigenous_n 5.365942 3.528375 4.282204
asian_n -0.670932 0.770842 -1.149662
white_n -0.088781 0.584725 0.306136

Odd ratios


In [86]:
odds = np.exp(result.params)
odds


Out[86]:
0 1 2
const 0.687217 0.223030 0.158409
Other_g 0.229174 1.013267 0.505725
Male_g 1.064675 1.928393 0.207141
Female_g 0.188091 0.948731 0.099341
muslim_r 0.560655 1.674676 3.854893
christian_r 4.331077 4.278765 5.346491
asian religion_r 2.321562 1.323040 0.063596
jewish_r 2.307908 3.341509 0.845512
hindu_r 2.084073 1.859309 0.946164
buddhist_r 0.253328 2.406732 0.576983
other_r 1.524342 3.318291 7.769578
atheist_spiritual_r 0.976467 4.191153 1.647002
african_american_n 4.258495 5.197057 0.000728
middle eastern_n 0.753677 2.003328 0.258914
hispanic_latino_n 0.898743 1.242005 0.341601
indigenous_n 213.992693 34.068573 72.399821
asian_n 0.511232 2.161586 0.316744
white_n 0.915046 1.794498 1.358167

p-values


In [274]:
result.pvalues


Out[274]:
0 1 2
const 4.285576e-18 5.073674e-02 4.024810e-33
Other_g 2.331955e-02 4.012456e-01 9.709546e-01
Male_g 4.854284e-06 1.203844e-36 1.607785e-07
Female_g 8.445788e-35 2.016807e-35 6.783628e-01
muslim_r 4.449821e-07 1.545476e-02 1.683303e-03
christian_r 6.983251e-01 6.619225e-03 0.000000e+00
asian religion_r 2.846772e-01 6.748922e-01 5.993384e-01
jewish_r 1.267920e-07 9.929791e-05 1.900924e-61
hindu_r 6.315955e-01 5.116628e-01 6.607872e-03
buddhist_r 2.887008e-09 1.683594e-01 3.541388e-06
other_r 4.926627e-05 5.141816e-03 5.831586e-12
atheist_spiritual_r 4.406033e-27 2.191389e-02 8.674738e-37
african_american_n 4.886845e-01 1.778745e-01 1.569693e-08
middle eastern_n 1.695567e-04 4.722800e-02 1.873552e-03
hispanic_latino_n 1.306863e-01 1.051129e-01 2.557365e-01
indigenous_n 1.817902e-02 7.338761e-01 5.260055e-03
asian_n 1.096343e-11 1.290363e-02 3.793135e-06
white_n 2.659402e-19 1.981078e-01 3.937791e-18

In [278]:
result.bse


Out[278]:
0 1 2
const 0.129853 0.174932 0.125176
Other_g 0.658948 0.836619 0.476872
Male_g 0.130088 0.176473 0.125400
Female_g 0.131551 0.181704 0.125995
muslim_r 0.216909 0.344344 0.163880
christian_r 0.031338 0.082073 0.037929
asian religion_r 0.534197 6.927068 0.537921
jewish_r 0.069996 0.353049 0.072928
hindu_r 0.237263 1.036154 0.228070
buddhist_r 0.379209 1.042614 0.189438
other_r 0.191508 0.303982 0.174189
atheist_spiritual_r 0.135197 0.409229 0.113095
african_american_n 0.285622 6.552839 0.291010
middle eastern_n 0.260059 1.032312 0.223309
hispanic_latino_n 0.213800 0.794566 0.191155
indigenous_n 0.773668 2.136361 1.260262
asian_n 0.212253 0.779752 0.167078
white_n 0.074949 0.216352 0.067322

Objectives

  1. Change base case--make traditional, male, white and christian base cases
  2. Covariate plot matrix (religion, gender, nationality)--percentages and maybe correlations
  3. Redo regression analyses (review how to interpret info)--try mixed and accounting for multiple identities
  4. Stacked area plot for all demographics (religion, nationality, gender)

In [ ]: