Load packages


In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from scipy import interp
import pandas.core.algorithms as algos 
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time

Read meta-data (column names and drop list) from the clipboard


In [ ]:
col = pd.read_clipboard(header=None)  # column names for the raw sample

In [ ]:
droplist1 = pd.read_clipboard(header=None)[0].values  # variables to drop up front
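
read_clipboard requires the metadata to be on the clipboard at run time; for reproducibility, the same lists could come from flat files (the names meta_cols.txt and droplist1.txt are placeholders):


In [ ]:
# hypothetical file-based equivalent of the two clipboard reads above
# col = pd.read_table('meta_cols.txt', header=None)
# droplist1 = pd.read_table('droplist1.txt', header=None)[0].values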

In [ ]:
df1 = pd.read_csv('/data2/GMC/sample_dev_cln.csv', header=None)  #,nrows=5000)
df1.columns = col[0].values
df1 = df1.set_index(df1.Seqnum)
df1.shape

In [ ]:
df1 = df1.drop(droplist1, axis=1)
print df1.shape

In [ ]:
df1 = df1.replace('.',np.nan)
df1 = df1.apply(lambda x: pd.to_numeric(x, errors='ignore'))

In [ ]:
df1.head()

Separate numeric and non-numeric variables


In [ ]:
def var_classification(data):
    var_object = data.dtypes[data.dtypes == object].index
    # isinstance, not type(x)=='str': the original comparison was always False
    var_date = [x for x in var_object if isinstance(x, str) and x[-3:] == 'dte']
    var_nonnum = [x for x in var_object if x not in var_date]
    var_num = data.dtypes[data.dtypes != object].index
    return var_date, var_nonnum, var_num

var_date, var_nonnum, var_num = var_classification(df1)
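
A quick sanity check (added here as a verification step): the three groups should cover every column exactly once.


In [ ]:
print len(var_date), len(var_nonnum), len(var_num), df1.shape[1]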

In [ ]:
print var_nonnum

Inspect categorical variables: more than 10 levels, then 10 or fewer


In [ ]:
# variables with more than 10 levels (too many to dummy-encode)
for j in var_nonnum:
    ct = df1[j].nunique()
    if ct > 10:
        print j, ct

In [ ]:
# variables with 10 or fewer levels (candidates for dummy encoding)
for j in var_nonnum:
    ct = df1[j].nunique()
    if ct <= 10:
        print j, ct

In [ ]:
def sic2d(row_):
    # Map a 4-digit SIC code to one of 11 two-digit divisions; blanks and NaN go to 11
    if row_ != row_ or str(row_).strip() == '':
        return 11
    else:
        row = int(row_) // 100  # leading two digits of the SIC code
        if 0 < row <= 9:
            return 1
        elif row <= 14:
            return 2
        elif row <= 17:  # the original 'row < -17' could never match
            return 3
        elif row <= 39:
            return 4
        elif row <= 49:
            return 5
        elif row <= 51:
            return 6
        elif row <= 59:
            return 7
        elif row <= 67:
            return 8
        elif row <= 89:
            return 9
        elif row <= 97:
            return 10
        else:
            return 11

df1['sic2d'] = df1['SIC4'].map(sic2d)

def age(df):
    # blank start years count as 2016, so start_age comes out 0
    df.YRSTART = df.YRSTART.replace(to_replace=[' ', '  ', '   ', '    '], value=2016)
    df.YRSTART = df.YRSTART.astype(float)
    df['start_age'] = 2016 - df['YRSTART']
    # single .loc assignment; the original chained form silently changed nothing
    df.loc[df.start_age > 200, 'start_age'] = 200
    return df

age(df1)
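
A quick look at the two derived features (a verification sketch): sic2d should only take values 1-11, and start_age should be capped at 200.


In [ ]:
print df1.sic2d.value_counts().sort_index()
print df1.start_age.describe()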

Pull all categorical variables with 10 or fewer levels


In [ ]:
# exclude the date-like fields from the dummy candidates
var_nonnum.remove('BUSINESSDATE')
var_nonnum.remove('BUSINESS_DATE')

In [ ]:
dummylist1 = [j for j in var_nonnum if df1[j].nunique() <= 10]

Create dummy variables


In [ ]:
dummy_matrix = pd.DataFrame()
nan_map = lambda x: 0 if x != x else x  # recode NaN to 0 so missing gets its own dummy level
for j in dummylist1:
    dummy = pd.get_dummies(df1[j].map(nan_map), prefix=j)
    dummy_matrix = pd.concat([dummy_matrix, dummy], axis=1)
dummy_matrix.head()

In [ ]:
df2 = pd.concat([df1[var_num], dummy_matrix], axis=1)
df2.shape

Impute missing values with 0


In [ ]:
df3 = df2.fillna(0)

In [ ]:
df1.NEW_BAD.value_counts()
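
The raw counts are easier to read as a rate (a small addition, not in the original flow):


In [ ]:
print df1.NEW_BAD.value_counts(normalize=True)  # bad rate as a fraction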

In [ ]:
droplist2 = [i for i in df3.columns.values if i[4:15] == 'PDUE_BUCKET']
df3 = df3.drop(droplist2, axis=1)

In [ ]:
y = df3['NEW_BAD']
X = df3.drop(['NEW_BAD', 'SEGMENT_PM2016'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3243)
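
If NEW_BAD is heavily imbalanced, a stratified split keeps the bad rate comparable across the two samples; train_test_split in sklearn.model_selection accepts a stratify argument (an optional variant of the split above):


In [ ]:
# optional variant: preserve the class mix of NEW_BAD in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3243, stratify=y)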

Alternatively, impute to the median


In [ ]:
from sklearn.preprocessing import Imputer, MinMaxScaler
imp = Imputer(missing_values='NaN', strategy='median')
imp.fit(X_train)  # learn the medians on the training split only
X_train_med = imp.transform(X_train)  # imputed copies; the zero-filled
X_test_med = imp.transform(X_test)    # X_train/X_test above stay intact
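
MinMaxScaler is imported above but never used; if scaling were wanted (the forest below doesn't need it, but the LogisticRegression imported at the top would), a minimal sketch on scaled copies:


In [ ]:
# sketch: scale features to [0, 1], fitting on the training split only;
# keep the originals intact for the tree model below
scaler = MinMaxScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)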

Run a quick random forest model to see if the top predictors make sense


In [ ]:
forest = RandomForestClassifier(n_estimators=100,
                                #class_weight='balanced_subsample',
                                max_depth=80,
                                min_samples_leaf=500,
                                #max_features=100,
                                n_jobs=-1)

model = forest

start = time.time()
model.fit(X_train, y_train)
end = time.time()
print(end - start)

In [ ]:
proba = model.predict_proba(X_train)
print roc_auc_score(y_train, proba[:, 1])
proba = model.predict_proba(X_test)
print roc_auc_score(y_test, proba[:, 1])
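
roc_curve and auc are imported at the top but unused; a minimal sketch of the test-set ROC curve (at this point proba holds the test-set probabilities):


In [ ]:
fpr, tpr, _ = roc_curve(y_test, proba[:, 1])
plt.plot(fpr, tpr, label='RF (AUC = %0.3f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()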

In [ ]:
%matplotlib inline
feature_importance = model.feature_importances_
cols = X_train.columns
# make importances relative to the max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)[::-1]
top_sorted_idx = sorted_idx[:20]
pos = np.arange(top_sorted_idx.shape[0]) + .5
plt.barh(pos, feature_importance[top_sorted_idx][::-1], align='center')
plt.yticks(pos, cols[top_sorted_idx][::-1])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
#plt.savefig("var_importance1.png")
plt.show()

In [ ]:
top_sorted_idx = sorted_idx[:50]
t1 = cols[top_sorted_idx].values
t2 = feature_importance[top_sorted_idx]
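
The two arrays are easier to scan side by side as a frame (a convenience view, not part of the original flow):


In [ ]:
pd.DataFrame({'feature': t1, 'importance': t2}, columns=['feature', 'importance'])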

In [ ]:
# number of features with non-zero importance
(feature_importance > 0).sum()

In [ ]:
np.argsort(feature_importance)[::-1]

In [ ]:
df2.to_csv('/data2/GMC/sample_dev_cln2.csv', index=False)  # persist the cleaned development sample

In [ ]:
meta = pd.read_table('sample_oot_meta.txt')  # variable metadata for the out-of-time sample

In [ ]:
col = meta.Variable.values
meta['pd_type'] = meta['Type'].map({'Num': np.float, 'Char': str})
meta.loc[meta['Format'].astype(str).map(lambda x: x[:4]) == 'MMDD', 'pd_type'] = str  # dates stay text
converter1 = meta.pd_type.to_dict()  # column position -> type, numerics as float
meta.pd_type = str
converter2 = meta.pd_type.to_dict()  # column position -> str: read everything as text

In [ ]:
### holdout sample
start = time.time()
dfh = pd.read_csv('/data2/GMC/sample_cln.csv', na_values=['.'], header=None, converters=converter2)
dfh.columns = col
dfh = dfh.set_index(dfh.Seqnum)
dfh.shape
end = time.time()
print(end - start)

In [ ]:
#keeplist= col[0][~col[0].isin(droplist2)].values

In [ ]:
start = time.time()
dfh = dfh.replace('.', np.nan)
dfh = dfh.apply(lambda x: pd.to_numeric(x, errors='ignore'))
end = time.time()
print(end - start)
#dfh.fillna('0', inplace=True)
#dfh = dfh.replace(' ', '0')

In [ ]:
df1_short = pd.read_csv('/data2/GMC/sample_dev_cln2.csv', nrows=10)  # only need the dev column list

In [ ]:
def var_classification(data):
    var_object = data.dtypes[data.dtypes == object].index
    var_date = [x for x in var_object if isinstance(x, str) and x[-3:] == 'dte']
    var_nonnum = [x for x in var_object if x not in var_date]
    var_num = data.dtypes[data.dtypes != object].index
    return var_date, var_nonnum, var_num

var_date, var_nonnum, var_num = var_classification(dfh)

In [ ]:
dummylist2 = [j for j in var_nonnum if dfh[j].nunique() <= 10]
dummylist2

In [ ]:
# create dummy variables for the holdout sample
dummy_matrix = pd.DataFrame()
nan_map = lambda x: 0 if x != x else x
for j in dummylist2:
    dummy = pd.get_dummies(dfh[j].map(nan_map), prefix=j)
    dummy_matrix = pd.concat([dummy_matrix, dummy], axis=1)
dummy_matrix.head()

df_h2 = pd.concat([dfh[var_num], dummy_matrix], axis=1)
print df_h2.shape

In [ ]:
print set(df_h2.columns) - set(df1_short.columns)
print set(df1_short.columns) - set(df_h2.columns)

Check for differences in the attribute list between the training and validation datasets


In [ ]:
# dummy levels absent from the holdout get zero-filled columns,
# then the holdout is reordered to match the training layout
misscolumn = set(df1_short.columns) - set(df_h2.columns)
for a in misscolumn:
    df_h2[a] = 0
df_h2 = df_h2[df1_short.columns]

In [ ]:
droplist2 = [i for i in df_h2.columns.values if i[4:15] == 'PDUE_BUCKET']
df_h2 = df_h2.drop(droplist2, axis=1)
df_h2.shape

In [ ]:
df_h2.to_csv('/data2/GMC/sample_oot_cln2.csv',index=False)
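
With the holdout aligned to the training columns, the natural next step is to score it with the forest trained above; a sketch, assuming `model` is still in memory and that NEW_BAD and SEGMENT_PM2016 are present in the holdout just as in the development sample:


In [ ]:
# sketch: out-of-time validation of the forest
X_h = df_h2.drop(['NEW_BAD', 'SEGMENT_PM2016'], axis=1).fillna(0)
proba_h = model.predict_proba(X_h)
print roc_auc_score(df_h2['NEW_BAD'], proba_h[:, 1])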