In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
In [3]:
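# Load the raw train/test CSVs.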
tr_data = pd.read_csv('../input/train.csv')
te_data = pd.read_csv('../input/test.csv')
print('train shape is: {}\ntest shape is: {}'.format(tr_data.shape,te_data.shape))
In [4]:
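# Split off the target and ids, then drop them from the feature frames.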
from sklearn.feature_extraction.text import TfidfTransformer
y = tr_data.target
tr_ids = tr_data.id
te_ids = te_data.id
tr_data.drop(['id','target'],axis=1,inplace=True)
te_data.drop(['id'],axis=1,inplace=True)
In [5]:
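# Row-wise count features: how many zeros and how many values above fixed thresholds.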
tr_data['count_zeros'] = (tr_data == 0).astype(int).sum(axis=1)
te_data['count_zeros'] = (te_data == 0).astype(int).sum(axis=1)
tr_data['num_greater_than_3'] = (tr_data > 3).astype(int).sum(axis=1)
te_data['num_greater_than_3'] = (te_data > 3).astype(int).sum(axis=1)
tr_data['num_greater_than_10'] = (tr_data > 10).astype(int).sum(axis=1)
te_data['num_greater_than_10'] = (te_data > 10).astype(int).sum(axis=1)
In [6]:
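# Build TF-IDF and log1p transformed copies of the features and concatenate them with the raw columns.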
tfidf = TfidfTransformer()
tr_data_tfidf = pd.DataFrame(tfidf.fit_transform(tr_data).toarray())
te_data_tfidf = pd.DataFrame(tfidf.transform(te_data).toarray())
tr_data_tfidf.columns = [str(c)+'tfidf' for c in tr_data.columns]
te_data_tfidf.columns = [str(c)+'tfidf' for c in te_data.columns]
tr_data_log1p = np.log1p(tr_data)
te_data_log1p = np.log1p(te_data)
tr_data_log1p.columns = [str(x)+'log1p' for x in tr_data_log1p.columns]
te_data_log1p.columns = [str(x)+'log1p' for x in te_data_log1p.columns]
tr_data_comb = pd.concat([tr_data,tr_data_tfidf,tr_data_log1p],axis=1)
te_data_comb = pd.concat([te_data,te_data_tfidf,te_data_log1p],axis=1)
In [7]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y_encoded = le.fit_transform(y)
# Map the string labels 'Class_1'..'Class_9' to integers 0-8 (equivalent to the commented-out LabelEncoder above).
y_encoded = [int(label.split('_')[1]) - 1 for label in y]
In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,log_loss
from sklearn.model_selection import train_test_split,KFold
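# Hold out 20% of the training rows for validation.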
X_train, X_val, y_train, y_val = train_test_split(tr_data_comb,y_encoded,test_size = 0.2,random_state =12345)
In [11]:
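# 5-fold cross-validation with XGBoost to gauge a reasonable number of boosting rounds.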
import xgboost as xgb
params = {'objective':'multi:softprob',
'learning_rate':0.1,
'subsample':0.8,
'colsample_bytree':0.9,
'colsample_bylevel':0.7,
'max_depth':7,
'nthread':4,
'eval_metric':'mlogloss',
'num_class':9,
'gamma':0.1,
'seed':1234}
bst_cv = xgb.cv(params=params,dtrain=xgb.DMatrix(tr_data_comb,label=y_encoded),verbose_eval=2,
nfold=5,early_stopping_rounds=20,num_boost_round=300)
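# Not in the original: a minimal sketch for reading the best round off the CV results,
# assuming bst_cv is the DataFrame returned by xgb.cv with a 'test-mlogloss-mean' column.
best_rounds = int(bst_cv['test-mlogloss-mean'].idxmin()) + 1
print('best CV mlogloss: {:.5f} at round {}'.format(bst_cv['test-mlogloss-mean'].min(), best_rounds))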
In [38]:
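# Train the final XGBoost model on all of the combined-feature training data.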
bst = xgb.train(params=params,dtrain=xgb.DMatrix(tr_data_comb,label=y_encoded),num_boost_round=400)
In [97]:
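# Predict on the test set and write the first submission.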
pred = bst.predict(xgb.DMatrix(te_data_comb))
subm = pd.DataFrame(pred)
subm.columns = ['Class_'+ str(x) for x in range(1,10)]
subm['id'] = te_ids
#subm.index_label = 'id'
subm.to_csv('../subm/tfidf_log1p_raw_xgb_sub1.csv',index=False)
In [1]:
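# Restarted kernel: reload the raw data (no engineered features this time) and encode the target with LabelEncoder.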
tr_data = pd.read_csv('../input/train.csv')
te_data = pd.read_csv('../input/test.csv')
y = tr_data.target
tr_data.drop(['id','target'],axis=1,inplace=True)
te_data.drop(['id'],axis=1,inplace=True)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)
In [113]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,log_loss
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(tr_data,y_encoded,test_size = 0.2,random_state =12345)
# Sweep over the number of neighbours to see how k affects the validation log loss.
for k in [2, 4, 8, 16, 32, 64, 128]:
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=4)
    knn.fit(X_train, y_train)
    knn_pred = knn.predict_proba(X_val)
    print('log loss for knn{}: {}'.format(k, log_loss(y_pred=knn_pred, y_true=y_val)))
In [101]:
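# Single decision tree with a hand-picked weight of 10 on class index 3.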
class_weights = {0:1,1:1,2:1,3:10,4:1,5:1,6:1,7:1,8:1}
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(class_weight=class_weights,max_depth=4,max_features=92,min_samples_split=2,random_state=12345)
dtc.fit(X_train,y_train)
tree_pred = dtc.predict_proba(X_val)
print('log loss for dtc: {}'.format(log_loss(y_pred = tree_pred,y_true = y_val)))
In [102]:
# class_weights = {0:1,1:1,2:1,3:1,4:1,5:1,6:10,7:1,8:1}
# Let's remove the class weights and check the score.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=4,max_features=92,min_samples_split=2,random_state=12345)
dtc.fit(X_train,y_train)
tree_pred = dtc.predict_proba(X_val)
print('log loss for dtc: {}'.format(log_loss(y_pred = tree_pred,y_true = y_val)))
In [114]:
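# Linear-kernel SVC baseline (probability=True is required for predict_proba).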
from sklearn.svm import SVC
svc = SVC(kernel='linear',C=0.1,max_iter=10000,random_state=12345,probability=True)
svc.fit(X_train,y_train)
svc_pred = svc.predict_proba(X_val)
print('log loss for svc: {}'.format(log_loss(y_pred = svc_pred,y_true = y_val)))
In [ ]:
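# The same check with an RBF kernel.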
from sklearn.svm import SVC
svc = SVC(kernel='rbf',C=0.1,max_iter=10000,random_state=12345,probability=True,)
svc.fit(X_train,y_train)
svc_pred = svc.predict_proba(X_val)
print('log loss for svc: {}'.format(log_loss(y_pred = svc_pred,y_true = y_val)))
In [109]:
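# Random forest baseline with 300 trees.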
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=4,n_estimators=300)
rfc.fit(X_train,y_train)
rfc_pred = rfc.predict_proba(X_val)
print('log loss for RFC: {}'.format(log_loss(y_pred = rfc_pred,y_true = y_val)))
In [61]:
rfc_test_pred = rfc.predict_proba(te_data)
In [92]:
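# XGBoost with a train/validation watchlist and early stopping.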
import xgboost as xgb
dtrain=xgb.DMatrix(X_train,label=y_train)
dval=xgb.DMatrix(X_val,label=y_val)
watchlist = [(dtrain, 'train'),(dval, 'eval')]
params = {'objective':'multi:softprob',
'learning_rate':0.05,
'subsample':0.7,
'colsample_bytree':0.8,
'colsample_bylevel':0.7,
'max_depth':6,
'nthread':3,
'eval_metric':'mlogloss',
'num_class':9,
'silent':0,
'seed':1234}
bst = xgb.train(params=params,dtrain=dtrain,num_boost_round=3000,early_stopping_rounds=80,
                evals=watchlist,verbose_eval=50)
In [111]:
xgb_pred = bst.predict(xgb.DMatrix(X_val))
print('log loss for XGB: {}'.format(log_loss(y_pred = xgb_pred,y_true = y_val)))
In [62]:
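# Blend the XGBoost and random forest test predictions 85/15 and write a submission.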
xgb_test_pred = bst.predict(xgb.DMatrix(te_data))
combined_pred = xgb_test_pred *0.85 + rfc_test_pred *0.15
subm = pd.DataFrame(combined_pred)
subm.columns = ['Class_'+ str(x) for x in range(1,10)]
subm['id'] = pd.read_csv('../input/test.csv',usecols=['id'])['id']
#subm.index_label = 'id'
subm.to_csv('../subm/rf_xgb_sub1.csv',index=False)
In [14]:
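# Keras models: one-hot encode the integer targets first.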
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Embedding,BatchNormalization
from keras.utils import np_utils
OHE_y_train = np_utils.to_categorical(y_train)
OHE_y_val = np_utils.to_categorical(y_val)
In [15]:
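# Drop the learning rate by 10x whenever the validation loss stops improving.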
from keras.callbacks import ReduceLROnPlateau
cb = ReduceLROnPlateau(patience=1,factor=0.1,epsilon=0.02)
In [28]:
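# Fully connected net, 500-250-50, with light dropout.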
model1 = Sequential()
model1.add(Dense(500,input_shape=(X_train.shape[1],),activation='relu'))
model1.add(Dropout(0.1))
model1.add(Dense(250,activation='relu'))
model1.add(Dropout(0.05))
model1.add(Dense(50,activation='relu'))
model1.add(Dense(9,activation='softmax'))
model1.compile(optimizer='adam',
loss='categorical_crossentropy')
model1.fit(X_train.values,OHE_y_train,validation_data=[X_val.values,OHE_y_val],callbacks=[cb],epochs=10)
Out[28]:
In [29]:
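# Validation diagnostics: confusion matrix, classification report and log loss.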
ann_pred = model1.predict_classes(X_val.values)
from sklearn.metrics import classification_report,log_loss
print(confusion_matrix(y_pred=ann_pred,y_true=y_val))
sns.heatmap(confusion_matrix(y_pred=ann_pred,y_true=y_val),cmap='Greens',xticklabels=range(1,10),yticklabels=range(1,10))
print('classification report results:\r\n' + classification_report(y_pred=ann_pred,y_true=y_val))
print('log-loss for classifier: {}'.format(log_loss(y_pred=model1.predict(X_val.values),y_true=y_val)))
In [30]:
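# Same architecture with heavier dropout.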
model1 = Sequential()
model1.add(Dense(500,input_shape=(X_train.shape[1],),activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(250,activation='relu'))
model1.add(Dropout(0.1))
model1.add(Dense(50,activation='relu'))
model1.add(Dense(9,activation='softmax'))
model1.compile(optimizer='adam',
loss='categorical_crossentropy')
model1.fit(X_train.values,OHE_y_train,validation_data=[X_val.values,OHE_y_val],callbacks=[cb],epochs=10)
Out[30]:
In [94]:
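# 5-fold training of a wider net (1024-1024-1024); collect out-of-fold and test predictions per fold.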
oof_preds = []
test_preds = []
kf = KFold(n_splits=5,shuffle=True,random_state=12345)
for train_index, val_index in kf.split(X=tr_data_comb,y=y_encoded):
print(train_index,val_index)
X_train, X_val = tr_data_comb.iloc[train_index,:], tr_data_comb.iloc[val_index,:]
y_train, y_val = np.array(y_encoded)[train_index], np.array(y_encoded)[val_index]
OHE_y_train = np_utils.to_categorical(y_train)
OHE_y_val = np_utils.to_categorical(y_val)
model1 = Sequential()
model1.add(Dense(1024,input_shape=(X_train.shape[1],),activation='relu'))
model1.add(BatchNormalization())
model1.add(Dropout(0.1))
model1.add(Dense(1024,activation='relu'))
model1.add(Dropout(0.1))
model1.add(Dense(1024,activation='relu'))
model1.add(Dense(9,activation='softmax'))
model1.compile(optimizer='adam',loss='categorical_crossentropy')
model1.fit(X_train.values,OHE_y_train,validation_data=[X_val.values,OHE_y_val],callbacks=[cb],epochs=10)
ann_pred = model1.predict(X_val.values)
ann_test_pred = model1.predict(te_data_comb.values)
oof_preds.append(ann_pred)
test_preds.append(ann_test_pred)
In [80]:
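# Average the five fold test predictions, clip them away from 0 and 1, and write the submission.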
test_pred_arr = np.array(test_preds)
subm = pd.DataFrame(np.clip(np.mean(test_pred_arr,axis=0),a_max=0.999,a_min=0.001))
subm.columns = ['Class_'+ str(x) for x in range(1,10)]
subm['id'] = te_ids
subm.to_csv('../subm/ANN_5fold.csv',index=False)
In [78]:
subm
Out[78]:
In [35]:
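# A wider, shallower net: 1500-750.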
model1 = Sequential()
model1.add(Dense(1500,input_shape=(X_train.shape[1],),activation='relu'))
model1.add(Dropout(0.3))
model1.add(Dense(750,activation='relu'))
model1.add(Dense(9,activation='softmax'))
model1.compile(optimizer='adam',
loss='categorical_crossentropy')
model1.fit(X_train.values,OHE_y_train,validation_data=[X_val.values,OHE_y_val],callbacks=[cb],epochs=10)
ann_pred = model1.predict_classes(X_val.values)
from sklearn.metrics import classification_report,log_loss
print(confusion_matrix(y_pred=ann_pred,y_true=y_val))
sns.heatmap(confusion_matrix(y_pred=ann_pred,y_true=y_val),cmap='Greens',xticklabels=range(1,10),yticklabels=range(1,10))
print('classification report results:\r\n' + classification_report(y_pred=ann_pred,y_true=y_val))
print('log-loss for classifier: {}'.format(log_loss(y_pred=model1.predict(X_val.values),y_true=y_val)))
In [42]:
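# Clipping the predicted probabilities away from 0 and 1 bounds the per-row log loss.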
print('log-loss for classifier: {}'.format(log_loss(y_pred=np.clip(model1.predict(X_val.values),a_min=0.0001,a_max=0.9999),y_true=y_val)))
In [43]:
# The first argument to np.clip was missing here; clipping the net's test-set predictions is the likely intent.
test_pred = np.clip(model1.predict(te_data_comb.values),a_min=0.001,a_max=0.999)
subm = pd.DataFrame(test_pred)
subm.columns = ['Class_'+ str(x) for x in range(1,10)]
subm['id'] = te_ids
subm.to_csv('../subm/ANN_0.4583.csv',index=False)