In [28]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns=1000

Load data


In [29]:
df = pd.read_csv('/gh/data/hcmst/1_cleaned.csv', index_col=0)
df.head(4)


Out[29]:
(4 rows × 49 columns; the preview is too wide to reproduce here. Columns: age, education, race, sex,
household_size, house_type, income, marital_status, in_metro, usa_region, house_payment, N_child,
work, has_internet, has_gay_friendsfam, politics, religion, in_relationship, partner_age,
N_minutes_survey, is_lgb, is_married, partner_race, partner_religion, partner_education,
USA_raised, N_marriages, cohabit, age_first_met, age_relationship_begin, age_married,
relative_income, same_high_school, same_college, same_hometown, age_difference,
relationship_quality, met_online, met_friends, met_family, met_work, relationship_excellent,
is_not_working, has_gay_friends, has_gay_family, religion_is_christian, religion_is_none,
partner_religion_is_christian, partner_religion_is_none)

Make some new features


In [30]:
df['met_to_marriage_years'] = df['age_married'] - df['age_first_met']
df['relationship_to_marriage_years'] = df['age_married'] - df['age_relationship_begin']
df['years_since_marriage'] = df['age'] - df['age_married']
df['same_race'] = df['race'] == df['partner_race']
df['same_religion'] = df['religion'] == df['partner_religion']
df['same_education'] = df['education'] == df['partner_education']
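
Quick sanity check on the derived durations (a sketch, not part of the original run): they should be non-negative if the underlying ages are consistent.

In [ ]:
# Summary stats for the new duration features; negative minima would flag inconsistent age data
df[['met_to_marriage_years', 'relationship_to_marriage_years', 'years_since_marriage']].describe()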

Choose target


In [31]:
df.dropna(subset=['relationship_quality'], inplace=True)
y = df['relationship_excellent'].values
df.drop(['relationship_excellent', 'relationship_quality'], axis=1, inplace=True)

Make model


In [78]:
# Model settings
model_config = {
    'algorithm': 'logit',
    'param_grid': {'penalty': ['l1', 'l2'], 'C': np.logspace(-6,3,10)},
#     'algorithm': 'rf',
#     'param_grid': {'n_estimators': [100, 1000], 'max_depth': [5, 7, 9], 'max_features': [.1, .3, .5]},
    'n_folds': 5
}

In [79]:
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import Imputer

from lime.lime_tabular import LimeTabularExplainer

In [80]:
class DataFrame_Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode the string columns of a mixed dataframe; pass numeric columns through."""

    def __init__(self, categorical_cols_=None, numeric_cols_=None):
        self.categorical_cols_ = categorical_cols_
        self.numeric_cols_ = numeric_cols_

    def fit(self, df, y=None):
        # df should be a dataframe that is a mix of categorical and numeric columns
        self.vec_ = DictVectorizer(sparse=False)
        if len(self.categorical_cols_) > 0:
            temp_data = df[self.categorical_cols_].astype(str)
            self.vec_.fit(temp_data.to_dict('records'))
            self.feature_names_ = list(self.numeric_cols_) + list(self.vec_.feature_names_)
        else:
            self.feature_names_ = list(self.numeric_cols_)
        return self

    def transform(self, df):
        # df should be a dataframe that is a mix of categorical and numeric columns
        if len(self.categorical_cols_) > 0:
            temp_data = df[self.categorical_cols_].astype(str)
            categorical_data = self.vec_.transform(temp_data.to_dict('records'))
            categorical_df = pd.DataFrame(categorical_data, columns=self.vec_.feature_names_, index=df.index)
            new_data = pd.concat([df[self.numeric_cols_], categorical_df], axis=1)
        else:
            new_data = df.copy()
        return new_data
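
In short: DictVectorizer one-hot encodes the string columns (producing names like 'color=blue') and the numeric columns pass through untouched. A toy sketch with made-up data, not from the survey:

In [ ]:
# Hypothetical toy frame to illustrate the encoder's output
toy = pd.DataFrame({'color': ['red', 'blue', 'red'], 'height': [1.0, 2.0, 3.0]})
enc = DataFrame_Encoder(categorical_cols_=['color'], numeric_cols_=['height'])
enc.fit(toy)
enc.transform(toy)  # columns: height, color=blue, color=red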

Prep model


In [81]:
# Train and test sets
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=.2, random_state=0)

# Algorithm type
if model_config['algorithm'] == 'lgbm':
    import lightgbm  # optional dependency, only needed for this branch
    base_clf = lightgbm.LGBMClassifier(is_unbalance=True, nthread=4)
elif model_config['algorithm'] == 'logit':
    base_clf = LogisticRegression(solver='liblinear')
elif model_config['algorithm'] == 'rf':
    base_clf = RandomForestClassifier(n_jobs=-1)

# Pipeline (tree models don't need feature scaling, so skip the StandardScaler for rf)
if model_config['algorithm'] == 'rf':
    pipeline = Pipeline([
        ('feature_encoder', DataFrame_Encoder()),
        ('imputer', Imputer()),
        ('clf', base_clf)
    ])
else:
    pipeline = Pipeline([
        ('feature_encoder', DataFrame_Encoder()),
        ('imputer', Imputer()),
        ('scaler', StandardScaler()),
        ('clf', base_clf)
    ])


# Pipeline parameters
string_features = X_train.select_dtypes(include=[object]).columns.values
numeric_features = X_train.select_dtypes(exclude=[object]).columns.values
pipeline.set_params(feature_encoder__categorical_cols_=string_features,
                    feature_encoder__numeric_cols_=numeric_features)

pipeline_params = {}
for key in model_config['param_grid']:
    pipeline_params['clf__'+key] = model_config['param_grid'][key]
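
A quick check of the column split the encoder will receive (a sketch, not in the original run):

In [ ]:
# How many columns go to the DictVectorizer vs. straight through as numeric
print(len(string_features), 'categorical columns,', len(numeric_features), 'numeric columns')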

Train model


In [82]:
pipeline


Out[82]:
Pipeline(steps=[('feature_encoder', DataFrame_Encoder(categorical_cols_=array(['education', 'race', 'sex', 'house_type', 'marital_status',
       'usa_region', 'house_payment', 'work', 'has_gay_friendsfam',
       'politics', 'religion', 'partner_race', 'partner_religion',
       'partner_education', 'N_marr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [83]:
pipeline_params


Out[83]:
{'clf__C': array([  1.00000000e-06,   1.00000000e-05,   1.00000000e-04,
          1.00000000e-03,   1.00000000e-02,   1.00000000e-01,
          1.00000000e+00,   1.00000000e+01,   1.00000000e+02,
          1.00000000e+03]), 'clf__penalty': ['l1', 'l2']}

In [84]:
clf = GridSearchCV(pipeline, pipeline_params, cv=model_config['n_folds'],
                   n_jobs=4, verbose=4, scoring='roc_auc')
clf.fit(X_train, y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] clf__C=1e-06, clf__penalty=l1 ...................................
[CV] clf__C=1e-06, clf__penalty=l2 ...................................
(interrupted by hand: KeyboardInterrupt raised in the joblib workers and in clf.fit; full traceback omitted)
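
The interrupted fit above left clf unfitted in this session; the scores below come from an earlier, completed run (note the lower execution counts). A minimal sketch for re-running the search in a single process, which interrupts more cleanly than the n_jobs=4 worker pool:

In [ ]:
# Sketch (assumes the same pipeline and pipeline_params as above): serial grid search.
# Slower than n_jobs=4, but there are no worker processes to kill on interrupt.
clf = GridSearchCV(pipeline, pipeline_params, cv=model_config['n_folds'],
                   n_jobs=1, verbose=1, scoring='roc_auc')
clf.fit(X_train, y_train)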

Evaluate model performance on test set

  • Logit
    • Not great: the violin plots of predicted probabilities mostly overlap
    • AUC = 0.615
    • precision = ??
  • RF
    • AUC = 0.636
    • precision =

Compute AUC and precision


In [75]:
# Predicted probability of the positive class
y_train_pred = clf.predict_proba(X_train)[:, 1]
y_test_pred = clf.predict_proba(X_test)[:, 1]
print('AUC=', roc_auc_score(y_test, y_test_pred))


AUC= 0.635638782577
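
The heading above promises precision as well; a minimal sketch, thresholding the predicted probabilities at 0.5 (the threshold is an assumption, not from the original notebook):

In [ ]:
from sklearn.metrics import precision_score

# Precision for the positive (relationship_excellent) class at a 0.5 cutoff
y_test_label = np.asarray(y_test_pred) > 0.5
print('precision=', precision_score(y_test, y_test_label))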

In [ ]:
frac_flag = .1

In [68]:
df_y = pd.DataFrame({'label': y_test, 'pred': y_test_pred})
sns.violinplot(y='pred', x='label', data=df_y)


/Users/scott/anaconda/lib/python3.6/site-packages/seaborn/categorical.py:588: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  kde_data = remove_na(group_data)
/Users/scott/anaconda/lib/python3.6/site-packages/seaborn/categorical.py:816: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  violin_data = remove_na(group_data)
Out[68]:
<matplotlib.axes._subplots.AxesSubplot at 0x11fc84898>

In [ ]: