In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from scipy import interp
import pandas.core.algorithms as algos
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time
In [ ]:
# Column names for the headerless dev CSV, pasted from the clipboard
# (one name per row, no header row).
col = pd.read_clipboard(header=None)
In [ ]:
# First drop-list: variable names pasted from the clipboard; take the first
# (only) column as an array of names.
droplist1 = pd.read_clipboard(header=None)[0].values
In [ ]:
# Development sample: headerless CSV; attach the pasted column names and
# index by Seqnum (record id). Shape echoed as a sanity check.
df1 = pd.read_csv('/data2/GMC/sample_dev_cln.csv',header= None)#,converters = {'.':np.nan})#,nrows=5000)
df1.columns = col[0].values
df1=df1.set_index(df1.Seqnum)
df1.shape
In [ ]:
# Drop the first pasted drop-list (columns, axis=1) and report the new shape.
df1 = df1.drop(droplist1,1)
print df1.shape
In [ ]:
# SAS-style missing marker '.' -> NaN, then coerce each column to numeric
# where possible (errors='ignore' leaves genuinely non-numeric columns as-is).
df1 = df1.replace('.',np.nan)
df1 = df1.apply(lambda x: pd.to_numeric(x, errors='ignore'))
In [ ]:
# Eyeball the cleaned frame.
df1.head()
In [ ]:
def var_classfication(data):
    """Split the columns of `data` into date-like, non-numeric and numeric.

    Returns a tuple (var_date, var_nonnum, var_num):
      var_date   -- object-dtype columns whose name ends in 'dte'
      var_nonnum -- remaining object-dtype columns
      var_num    -- columns of any non-object dtype (pandas Index)
    """
    var_object = data.dtypes[data.dtypes == object].index
    # BUG FIX: the original tested `type(x)=='str'`, comparing a *type object*
    # to a string -- always False, so var_date was always empty and date
    # columns leaked into var_nonnum. Use isinstance instead.
    var_date = [x for x in var_object if isinstance(x, str) and x[-3:] == 'dte']
    var_nonnum = [x for x in var_object if x not in var_date]
    var_num = data.dtypes[data.dtypes != object].index
    return var_date, var_nonnum, var_num
# Classify the development-sample columns by type.
var_date,var_nonnum,var_num = var_classfication(df1)
In [ ]:
# Inspect the non-numeric (categorical/text) column names.
print var_nonnum
In [ ]:
for i,j in enumerate(df1[var_nonnum]):
ct = df1[j].nunique()
if ct >10:
print j,ct#i
In [ ]:
for i,j in enumerate(df1[var_nonnum]):
ct = df1[j].nunique()
if ct <=10:
print j,ct#i
In [ ]:
def sic2d(row_):
    """Map a 4-digit SIC code (string or number) to one of 11 coarse buckets.

    Missing values (NaN or blank/empty strings) map to bucket 11. Otherwise
    the two leading digits (code // 100) select the bucket boundaries.
    """
    # NaN never equals itself; any all-whitespace or empty string is missing.
    # (Generalizes the original's hard-coded list of blank widths.)
    if row_ != row_ or str(row_).strip() == '':
        return 11
    # `//` keeps the original Python 2 floor-division behaviour under
    # Python 3 too (py2 `/` on ints floored silently).
    row = int(row_) // 100
    if row > 0 and row <= 9:
        return 1
    elif row <= 14:
        return 2
    # BUG FIX: the original read `row < -17` -- an unreachable typo for
    # `row <= 17` that silently merged SIC majors 15-17 into bucket 4.
    elif row <= 17:
        return 3
    elif row <= 39:
        return 4
    elif row <= 49:
        return 5
    elif row <= 51:
        return 6
    elif row <= 59:
        return 7
    elif row <= 67:
        return 8
    elif row <= 89:
        return 9
    elif row <= 97:
        return 10
    else:
        return 11
# Derive the coarse 2-digit SIC bucket from the raw 4-digit SIC code.
df1['sic2d'] = df1['SIC4'].map(sic2d)
def age(df):
    """Add a `start_age` column (years since YRSTART, as of 2016), capped at 200.

    Mutates `df` in place and also returns it. Blank YRSTART values are
    treated as 2016 (age 0).
    """
    # Any all-whitespace string means "missing start year" -> 2016.
    # (Regex generalizes the original's hard-coded list of blank widths.)
    df.YRSTART = df.YRSTART.replace(r'^\s*$', 2016, regex=True)
    df.YRSTART = df.YRSTART.astype(float)
    df['start_age'] = 2016 - df['YRSTART']
    # BUG FIX: the original wrote `df.loc[mask, :].start_age = 200` -- chained
    # assignment on a copy that never touched df, so the cap silently failed.
    # Assign through .loc with the column label instead.
    df.loc[df.start_age > 200, 'start_age'] = 200
    return df
# Add start_age to the development sample (mutates df1 in place).
age(df1)
In [ ]:
# These two date-like fields slipped past the 'dte' suffix check; exclude them
# from the dummy-variable candidates. Guarded so a missing name no longer
# raises ValueError (list.remove on an absent element).
for _c in ('BUSINESSDATE', 'BUSINESS_DATE'):
    if _c in var_nonnum:
        var_nonnum.remove(_c)
In [ ]:
# Low-cardinality (<=10 levels) categoricals -> one-hot candidates.
# (The original enumerate() index was unused.)
dummylist1 = [j for j in df1[var_nonnum] if df1[j].nunique() <= 10]
In [ ]:
# One-hot encode each low-cardinality categorical. NaN is mapped to 0 first so
# missing values get their own dummy level instead of being dropped.
# Collect the pieces and concat once: the original grew dummy_matrix with
# pd.concat inside the loop, copying the accumulated frame every iteration
# (quadratic in the number of dummy columns).
nan_map = lambda x: 0 if x != x else x
pieces = [pd.get_dummies(df1[j].map(nan_map), prefix=j) for j in dummylist1]
dummy_matrix = pd.concat(pieces, axis=1) if pieces else pd.DataFrame()
dummy_matrix.head()
In [ ]:
# Modelling frame: numeric columns side-by-side with the dummy matrix.
df2 = pd.concat([df1[var_num], dummy_matrix], axis=1)
df2.shape
In [ ]:
# Remaining missing numerics -> 0 for the tree model.
df3 = df2.fillna(0)
In [ ]:
# Target distribution check (NEW_BAD is the bad-loan flag used below).
df1.NEW_BAD.value_counts()
In [ ]:
# BUG FIX: droplist2 was used here while its definition was commented out,
# raising NameError. Restore the definition: drop the PDUE_BUCKET dummy
# columns (name characters 4..14) before modelling.
droplist2 = [i for i in df3.columns.values if i[4:15]=='PDUE_BUCKET']
df3 = df3.drop(droplist2,1)
In [ ]:
# Random 70/30 split with a fixed seed for reproducibility. The target and a
# segment label are excluded from the feature matrix.
y= df3['NEW_BAD']
X = df3.drop(['NEW_BAD','SEGMENT_PM2016'],1)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=3243)
In [ ]:
In [ ]:
from sklearn.preprocessing import Imputer, MinMaxScaler
# Median-impute missing values, learned on the training split only.
imp = Imputer(missing_values='NaN', strategy='median')
# BUG FIX: the original fitted/transformed `Xtrain`, an undefined name
# (NameError) -- the split above produced `X_train`. The transformed copy is
# kept under the original `Xtrain` name so downstream cells are unaffected.
imp.fit(X_train)
Xtrain = imp.transform(X_train)
# NOTE(review): df3 was fillna(0)'d above, so this imputation is likely a
# no-op, and `Xtrain` is not used by the model fit below -- confirm intent.
In [ ]:
# Random forest: 100 trees, deep (depth 80) but with large leaves (>=500
# samples) to keep the fit smooth on this large sample; use every core.
forest = RandomForestClassifier(n_estimators=100,
                                max_depth=80,
                                min_samples_leaf=500,
                                n_jobs=-1)
model = forest
import time
t0 = time.time()
model.fit(X_train, y_train)
# Elapsed training time in seconds.
print(time.time() - t0)
In [ ]:
proba= model.predict_proba(X_train)
print roc_auc_score(y_train,proba[:,1])
proba= model.predict_proba(X_test)
print roc_auc_score(y_test,proba[:,1])
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
# Horizontal bar chart of the top-20 feature importances.
feature_importance = model.feature_importances_
cols = X_train.columns
# make importances relative to max importance
# NOTE(review): the /max() normalisation is commented out, so this is just
# a x100 rescale, not "relative to max" as the comment above says.
feature_importance = 100.0 * (feature_importance )#/ feature_importance.max())
# Indices sorted by importance, descending; keep the top 20.
sorted_idx = np.argsort(feature_importance)[::-1]
top_sorted_idx = sorted_idx[:20]
pos = np.arange(top_sorted_idx.shape[0]) + .5
#fig = plt.figure()
# Reverse within the top-20 so the most important bar is drawn at the top.
plt.barh(pos, feature_importance[top_sorted_idx][::-1], align='center')
plt.yticks(pos, cols[top_sorted_idx][::-1])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
#plt.savefig("var_importance1.png")
plt.show()
In [ ]:
# Keep the top-50 feature names (t1) and their importances (t2) for later use.
top_sorted_idx = sorted_idx[:50]
t1 = cols[top_sorted_idx].values
t2 = feature_importance[top_sorted_idx]
In [ ]:
# Count features with non-zero importance; a generator expression avoids
# materializing an intermediate list just to sum it.
sum(i > 0.0 for i in feature_importance)
In [ ]:
# Full descending-importance index order (sanity check).
np.argsort(feature_importance)[::-1]
In [ ]:
# Persist the assembled development design matrix.
df2.to_csv('/data2/GMC/sample_dev_cln2.csv',index=False)
In [ ]:
# Variable metadata for the out-of-time sample (tab-delimited: Variable,
# Type, Format columns are used below).
meta = pd.read_table('sample_oot_meta.txt')
In [ ]:
# Column names for the holdout CSV come from the metadata file.
col = meta.Variable.values
# Map SAS metadata types to reader types: numeric -> float, char -> str.
# (np.float was a deprecated alias of the builtin float, removed in
# NumPy >= 1.24; the builtin is behaviourally identical.)
meta['pd_type'] = meta['Type'].map({'Num': float, 'Char': str})
# Date-formatted fields (SAS format starting 'MMDD') must be read as strings.
meta.loc[meta['Format'].astype(str).map(lambda x: x[:4]) == 'MMDD', 'pd_type'] = str
converter1 = meta.pd_type.to_dict()
# converter2: read *everything* as str; numeric coercion happens later.
meta.pd_type = str
converter2 = meta.pd_type.to_dict()
In [ ]:
###holdout_sample
# Out-of-time holdout: every column read as str (converter2), '.' treated as
# missing; indexed by Seqnum like the dev sample. Timed because it is large.
start = time.time()
dfh = pd.read_csv('/data2/GMC/sample_cln.csv',na_values=['.'],header= None,converters =converter2)
dfh.columns = col
dfh=dfh.set_index(dfh.Seqnum)
dfh.shape
end = time.time()
print(end - start)
In [ ]:
#keeplist= col[0][~col[0].isin(droplist2)].values
In [ ]:
# Same cleaning as the dev sample: '.' -> NaN, then best-effort numeric
# coercion column by column. Timed because the frame is large.
start = time.time()
dfh = dfh.replace('.',np.nan)
dfh = dfh.apply(lambda x: pd.to_numeric(x, errors='ignore'))
end = time.time()
print(end - start)
#df_h.fillna('0',inplace=True)
#df_h = df_h.replace(' ','0')
In [ ]:
# Only the header (10 rows) of the saved dev matrix -- used below to align
# the holdout columns to the dev column set/order.
df1_short = pd.read_csv('/data2/GMC/sample_dev_cln2.csv',nrows = 10)
In [ ]:
def var_classfication(data):
    """Split the columns of `data` into date-like, non-numeric and numeric.

    Re-definition for the holdout pass; returns (var_date, var_nonnum,
    var_num) exactly as the dev-sample version does.
    """
    var_object = data.dtypes[data.dtypes == object].index
    # BUG FIX: `type(x)=='str'` compared a type object to a string -- always
    # False, so var_date was always empty. Use isinstance instead.
    var_date = [x for x in var_object if isinstance(x, str) and x[-3:] == 'dte']
    var_nonnum = [x for x in var_object if x not in var_date]
    var_num = data.dtypes[data.dtypes != object].index
    return var_date, var_nonnum, var_num
# Classify the holdout columns by type.
var_date,var_nonnum,var_num = var_classfication(dfh)
In [ ]:
# Low-cardinality categoricals in the holdout -> one-hot candidates.
# (The original enumerate() index was unused.)
dummylist2 = [j for j in dfh[var_nonnum] if dfh[j].nunique() <= 10]
dummylist2
In [ ]:
# create dummy variable
dummy_matrix=pd.DataFrame()
nan_map = lambda x: 0 if x!= x else x
for j in dummylist2:
dummy = pd.get_dummies(dfh[j].map(nan_map),prefix=j)
dummy_matrix = pd.concat([dummy_matrix,dummy],1)
dummy_matrix.head()
df_h2=pd.concat([dfh[var_num],dummy_matrix],1)
print df_h2.shape
In [ ]:
# Symmetric difference of holdout vs. dev columns: dummy levels present in
# one sample but not the other.
print set(df_h2.columns) - set(df1_short.columns)
print set(df1_short.columns) - set(df_h2.columns)
In [ ]:
# Dummy levels seen in the dev sample but absent from the holdout get
# all-zero columns, then the holdout is reordered to the dev column layout
# (this also drops holdout-only columns).
for missing_col in set(df1_short.columns) - set(df_h2.columns):
    df_h2[missing_col] = 0
df_h2 = df_h2[df1_short.columns]
In [ ]:
# Drop the PDUE_BUCKET dummies (name characters 4..14), mirroring the dev-
# sample drop, and echo the final holdout shape.
droplist2 = [c for c in df_h2.columns if c[4:15] == 'PDUE_BUCKET']
df_h2 = df_h2.drop(droplist2, axis=1)
df_h2.shape
In [ ]:
# Persist the aligned out-of-time design matrix.
df_h2.to_csv('/data2/GMC/sample_oot_cln2.csv',index=False)