In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score  # sklearn.cross_validation was removed in newer scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from time import time
import pandas as pd
import numpy as np
%matplotlib inline
import pickle
In [3]:
def show_feat_importances(name, trained_model, feat_labels):
    """Bar-plot a fitted model's feature importances, sorted descending."""
    plt.figure(figsize=(12, 5))
    importances = trained_model.feature_importances_
    indices = np.argsort(importances)[::-1]  # most important feature first
    plt.bar(range(len(feat_labels)), importances[indices], color='lightblue', align='center')
    plt.xticks(range(len(feat_labels)), feat_labels[indices], rotation=90)
    plt.xlim([-1, len(feat_labels)])
    plt.ylabel('Importances')
    plt.title('FEATURE IMPORTANCE | MODEL:{}'.format(name))
    plt.grid()
    plt.tight_layout()
    plt.show()
In [4]:
df = pd.read_csv('https://s3.amazonaws.com/marweezys-bucket/all_state_insurance_prediction/train.csv')
ids = list(df['id'])                        # keep the row ids around; they carry no signal
df.drop(labels='id', axis=1, inplace=True)
one_hot_df = pd.get_dummies(df.iloc[:, :-1])  # .ix was removed from pandas; last column ('loss') is the target
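For reference, pd.get_dummies expands only the object-typed (categorical) columns and passes numeric columns through untouched; a tiny hypothetical frame shows the behavior:
In [ ]:
toy = pd.DataFrame({'cat1': ['A', 'B', 'A'], 'cont1': [0.5, 1.2, 0.7]})
pd.get_dummies(toy)
# returns cont1 unchanged plus indicator columns cat1_A and cat1_B
# (indicators are ints or bools depending on the pandas version)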
In [5]:
X = one_hot_df
y = df['loss']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
In [21]:
algo = GradientBoostingRegressor()
model = algo.fit(X_train, y_train)
train_score = model.score(X_train, y_train)  # R^2 on the training data
CV_score = np.mean(cross_val_score(algo, X_train, y_train, cv=10))  # mean R^2 across 10 folds
test_score = model.score(X_test, y_test)     # R^2 on the held-out 30%
print('train_score:{} \t CV_score:{} \t test_score:{}'.format(train_score, CV_score, test_score))
show_feat_importances(name='Gradient Boost', trained_model=model, feat_labels=one_hot_df.columns)
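mean_absolute_error is imported above but never used; since this dataset is typically scored on MAE, a quick sanity check on the held-out split could look like this (a sketch reusing the fitted model):
In [ ]:
preds = model.predict(X_test)
print('test MAE: {:.2f}'.format(mean_absolute_error(y_test, preds)))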
In [ ]:
FI = list(model.feature_importances_)
d = dict(zip(list(one_hot_df), FI))                    # feature name -> importance
drops = [key for key, val in d.items() if val == 0.0]  # features the trees never split on
one_hot_df.drop(labels=drops, axis=1, inplace=True)
X = one_hot_df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [30]:
algo = GradientBoostingRegressor()
model = algo.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
CV_score = np.mean(cross_val_score(algo, X_train, y_train, cv=10))
test_score = model.score(X_test, y_test)
print('train_score:{} \t CV_score:{} \t test_score:{}'.format(train_score, CV_score, test_score))
show_feat_importances(name='Gradient Boost', trained_model=model, feat_labels=one_hot_df.columns)
In [42]:
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
feats = one_hot_df.columns
drops = []
for i in indices:
    if importances[i] < 0.0025:  # prune anything below a small importance threshold
        drops.append(feats[i])
print(len(feats) - len(drops))   # number of features that survive the cut
one_hot_df.drop(labels=drops, axis=1, inplace=True)
X = one_hot_df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
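The same threshold pruning can also be done with scikit-learn's SelectFromModel, which wraps a fitted estimator and keeps only features whose importance clears the threshold; a minimal sketch, assuming it replaces the manual loop above:
In [ ]:
from sklearn.feature_selection import SelectFromModel

selector = SelectFromModel(model, threshold=0.0025, prefit=True)  # reuse the already-fitted model
kept_cols = one_hot_df.columns[selector.get_support()]            # names of the surviving features
X = one_hot_df[kept_cols]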
In [51]:
algo = GradientBoostingRegressor()
model = algo.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
CV_score = np.mean(cross_val_score(algo, X_train, y_train, cv=10))
test_score = model.score(X_test, y_test)
print('train_score:{} \t CV_score:{} \t test_score:{}'.format(train_score, CV_score, test_score))
show_feat_importances(name='Gradient Boost', trained_model=model, feat_labels=one_hot_df.columns)
In [61]:
feats = list(one_hot_df.columns)  # final feature list the model expects
with open('GB_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('GB_feats.pkl', 'wb') as f:
    pickle.dump(feats, f)
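To reuse these artifacts later, load them back and align any incoming frame to the saved column list before predicting; a sketch where new_df is a hypothetical frame of raw features:
In [ ]:
with open('GB_model.pkl', 'rb') as f:
    model = pickle.load(f)
with open('GB_feats.pkl', 'rb') as f:
    feats = pickle.load(f)

# one-hot encode the same way, then reindex to the training columns;
# dummy columns unseen at inference time are filled with 0
new_X = pd.get_dummies(new_df).reindex(columns=feats, fill_value=0)
preds = model.predict(new_X)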