In [1]:
import pandas as pd
import numpy as np
import cPickle as pickle
from datetime import date
from collections import Counter
from pandas import Series
In [2]:
# Load the pre-built persons DataFrame.
# NOTE(review): pickle.load executes arbitrary code from the file — only
# ever load this from a trusted source.
# Use a context manager so the file handle is closed after loading
# (the original left `fh` open for the rest of the session).
with open("/data/csc/fb_persons/unified_100percentpeople.pkl", 'rb') as fh:
    df = pickle.load(fh)
In [3]:
df.head()
Out[3]:
In [4]:
# Restrict the frame to the demographic fields used in the analysis below.
keep = ["gender", "date_of_birth", "ethnicity", "name", "profession",
        "religion", "nationality"]
df = df[keep]
In [5]:
df.head()
Out[5]:
In [6]:
# Keep persons with a known birth date after 1800.
# NOTE(review): the comparison assumes date_of_birth holds date-like
# values comparable with datetime.date — confirm against the pickle.
victorian_age = date(1800, 1, 1)
has_birth_date = df.date_of_birth.notnull().values
df = df[has_birth_date]
df = df[df.date_of_birth > victorian_age]
In [7]:
# Number of persons born after 1800.
len(df)
Out[7]:
In [8]:
# How many of those have a non-null nationality.
len(df.nationality.dropna())
Out[8]:
In [9]:
# Keep only persons whose nationality includes the USA. The nationality
# field is either a single string or a tuple of strings.
def _is_american(value):
    if isinstance(value, basestring):
        return value == 'united states of america'
    return isinstance(value, tuple) and 'united states of america' in value

americans = [_is_american(x) for x in df.nationality]
df = df[americans]
In [10]:
len(df)
Out[10]:
In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
In [12]:
def feature_matrix(df, field):
    """Build a membership matrix for one categorical column of `df`.

    Cell values may be a single label, a tuple of labels, or missing
    (missing values are mapped to the literal label "nan").

    Parameters
    ----------
    df : pandas.DataFrame
    field : str
        Column name to encode.

    Returns
    -------
    matrix : numpy.ndarray, shape (n_rows, n_labels)
        Single-label rows get a 1 in their label's column; tuple rows
        spread unit mass equally (1/len) over their labels.
    word_index : dict
        Maps label -> column index in `matrix`.
    """
    features = df[[field]].fillna("nan").values.reshape(-1)
    vocab = []
    for value in features:
        if isinstance(value, tuple):
            vocab.extend(value)
        else:
            vocab.append(value)
    vocab = set(vocab)
    word_index = {w: i for i, w in enumerate(vocab)}
    matrix = np.zeros((len(features), len(vocab)))
    for i, value in enumerate(features):
        if isinstance(value, tuple):
            # Multi-valued rows share unit mass equally across labels.
            for v in value:
                matrix[i, word_index[v]] = 1.0 / len(value)
        else:
            matrix[i, word_index[value]] = 1
    # Single-argument parenthesized print behaves identically under
    # Python 2 and Python 3 (the original py2-only print statements
    # blocked any py3 reuse of this function).
    print("{}: persons, {}:number of categories".format(*matrix.shape))
    print("{} persons have mixed categories".format(
        np.argwhere((matrix > 0).sum(axis=1) > 1).shape[0]))
    return matrix, word_index
In [13]:
# Membership matrix + label->column index for each demographic attribute.
gender_features, gender_index = feature_matrix(df, "gender")
In [14]:
ethnicity_features, ethnicity_index = feature_matrix(df, "ethnicity")
In [15]:
religion_features, religion_index = feature_matrix(df, "religion")
In [16]:
nationality_features, nationality_index = feature_matrix(df, "nationality")
In [17]:
profession_features, profession_index = feature_matrix(df, "profession")
In [18]:
# Column indexes of the profession categories grouped into the author's
# "non-traditional" vs "traditional" split (used by the correlation
# cells below).
non_traditional_cols = [profession_index[x] for x in ['athlete', 'business', 'law_politics',
                                                      'religion', 'stem']]
traditional_cols = [profession_index[x] for x in ['music_art', 'humanities']]
In [19]:
# Collapse the per-profession columns into 4 coarse indicator columns.
# Profession weights are fractional for multi-profession persons, so a
# person counts toward a group only when their weights in that group sum
# to at least 1 (i.e. all of their professions fall inside the group).
non_traditional = 1 * (profession_features[:, non_traditional_cols].sum(axis=1) >= 1)
traditional = 1 * (profession_features[:, traditional_cols].sum(axis=1) >= 1)
nan_features = profession_features[:, profession_index["nan"]]
other_features = profession_features[:, profession_index["other"]]
# Stack as columns: one row per person, one column per coarse group.
general_professions = np.vstack([non_traditional, traditional, nan_features, other_features]).T
general_professions_index = {"non_traditional": 0, "traditional": 1, "nan": 2, "other":3}
In [20]:
from scipy.stats import pearsonr
In [21]:
def correlate(var1_features, var2_features, var1_index, var2_index):
R = np.zeros((var1_features.shape[1], var2_features.shape[1]))
P = np.zeros((var1_features.shape[1], var2_features.shape[1]))
k = 0
for i in range(var1_features.shape[1]):
for j in range(var2_features.shape[1]):
r, p = pearsonr(var1_features[:, i], var2_features[:, j])
R[i,j] = r
P[i,j] = p
print "\r%i" % k,
k += 1
idx = R.argsort(axis=1)
index_var1 = {i:g for g,i in var1_index.iteritems()}
index_var2 = {i:g for g,i in var2_index.iteritems()}
print
for i in range(len(index_var1)):
print
#if i in [0, 2]: continue
for j in idx[i][::-1]:
print "{:<4}{:<4}{:<24}{:<24}{:>20.4f}{:>20.4f}".format(i, j, index_var1[i], index_var2[j], R[i,j], P[i,j])
In [22]:
# Pairwise correlation reports: each demographic attribute vs profession.
correlate(gender_features, profession_features, gender_index, profession_index)
In [23]:
correlate(gender_features, general_professions, gender_index, general_professions_index)
In [24]:
correlate(ethnicity_features, profession_features, ethnicity_index, profession_index)
In [25]:
correlate(religion_features, profession_features, religion_index, profession_index)
In [26]:
correlate(nationality_features, profession_features, nationality_index, profession_index)
In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
In [28]:
# Design matrix from the demographic indicators; keep only rows with a
# known profession. np.where returns a 1-tuple of indices, which here
# selects rows of the stacked 2-D array.
valid_rows = np.where([x is not None for x in df.profession.values.flatten()])
X = np.hstack([gender_features, ethnicity_features, religion_features, nationality_features])[valid_rows]
# Wrap None so every entry is a tuple, then turn each target into a set
# of profession labels (single string -> singleton set).
profs = [(x,) if x is None else x for x in df.profession]
y = np.array([set([x]) if isinstance(x, basestring) else set(sorted(x)) for x in profs])[valid_rows]
In [29]:
# Turn the label sets into a binary indicator matrix for multi-label
# classification.
binarizer = MultiLabelBinarizer()
y = binarizer.fit_transform(y)
In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
In [31]:
# One-vs-rest logistic regression for multi-label profession prediction.
estimator = LogisticRegression()
model = OneVsRestClassifier(estimator)
model.fit(X_train, y_train)
Out[31]:
In [32]:
# Per-label precision/recall/F1 on the held-out split (Python 2 print).
y_pred = model.predict(X_test)
print classification_report(y_test, y_pred, labels=binarizer.classes_)
In [33]:
import statsmodels as sm
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.tools import add_constant
In [34]:
# Rebuild the design matrix for the statsmodels fit, this time without
# the ethnicity block (the commented-out entry below).
valid_rows = np.where([x is not None for x in df.profession.values.flatten()])
X = np.hstack([gender_features, religion_features, nationality_features])[valid_rows]
cols = []
# Column labels: category name suffixed with the variable's first letter,
# ordered by column index so they line up with the hstack above.
# NOTE(review): lambda(x,y) is Python-2-only tuple-parameter syntax.
for name, fs in [("gender", gender_index), ("religion", religion_index),
                 ("nationality", nationality_index)]:#, ('ethinicity', ethnicity_index)]:
    cols.extend([x+"_"+name[0] for x in zip(*sorted(fs.items(), key=lambda(x,y): y))[0]])
dfX = pd.DataFrame(X, columns=cols)
In [35]:
# Map missing professions to "nan", then collapse the fine-grained
# professions into a coarse traditional / non-traditional split.
profs = ["nan" if x is None else x for x in df.profession]
prof_map = {'music_art': "traditional", 'humanities': "traditional",
            'athlete': "non-traditional", 'business': "non-traditional",
            'law_politics': "non-traditional", 'religion': "non-traditional",
            'stem': "non-traditional"}
profs2 = []
for x in profs:
    if isinstance(x, basestring):
        # Single profession: use its coarse category ("other"/"nan" pass
        # through unchanged).
        profs2.append(prof_map.get(x, x))
    else:
        # Tuple of professions: collapse each, then check whether they
        # all fall into the same coarse category.
        tmp = set([prof_map.get(x_, x_) for x_ in x])
        if len(tmp) == 1:
            # BUG FIX: previously both branches appended "mixed", so
            # every multi-profession person was labelled "mixed" and
            # computing `tmp` was pointless. A person whose professions
            # all collapse to one category now gets that category.
            # NOTE(review): assumes tuple entries only contain the seven
            # mapped professions plus "other" — confirm against the data.
            profs2.append(tmp.pop())
        else:
            profs2.append("mixed")
y = np.array(profs2)[valid_rows]
classes = list(set(y))
In [81]:
# Integer codes for the coarse profession classes; the reverse map is
# used to label the regression output below.
y = Series(y, name='profession')
ynames = {'mixed': 2, 'non-traditional': 1, 'other': 3, 'traditional': 0}
ynames_reverse = {v:k for k,v in ynames.items()}
In [82]:
y1 = y.apply(lambda x: ynames[x])
In [65]:
# Class balance of the coarse profession labels.
from collections import Counter  # already imported at the top; harmless re-import
dict(Counter(y.values))
Out[65]:
In [66]:
dfX2 = dfX[[x for x in dfX.columns if 'nan' not in x and x != 'united states of america_n']]
In [87]:
len(dfX2)
Out[87]:
In [ ]:
dfX2
In [67]:
# Condition numbers before/after dropping the near-collinear columns
# (Python 2 print statements).
print "Colinearity before removing almost colinear features", np.linalg.cond(dfX)
print "Colinearity after removing almost colinear features", np.linalg.cond(dfX2)
In [68]:
#data = dfX.values
# Multinomial logit of the coarse profession code on the demographic
# dummies; add_constant appends the intercept column. Fit via
# Newton-CG ('ncg').
data = add_constant(dfX2)
model = MNLogit(y1, data)
result = model.fit(method='ncg')
In [69]:
# Model dimensions (Python 2 print statement).
print "number of labels is", model.J, "number of features is", model.K
In [84]:
# Coefficient table. Equations are named for codes 1..3; presumably code
# 0 ('traditional') is MNLogit's reference category — confirm against
# the statsmodels docs.
result.summary(yname_list=[ynames_reverse[i] for i in range(1,len(ynames))])
Out[84]:
In [85]:
# Raw log-odds coefficients.
params = result.params
params
Out[85]:
In [86]:
# Odds ratios (exponentiated coefficients).
odds = np.exp(result.params)
odds
Out[86]:
In [274]:
# Per-coefficient p-values.
result.pvalues
Out[274]:
In [278]:
# Per-coefficient standard errors.
result.bse
In [ ]: