In [1]:
import pandas as pd
import numpy as np
import pylab as plt
# Default (width, height) applied to every figure created from here on.
plt.rcParams['figure.figsize'] = (10, 5)

# Figure size reserved for figures that are laid out with subplots.
figsize_with_subplots = (10, 10)

# Bin size constant.
bin_size = 10
In [4]:
cd kaggle
In [5]:
# Read the training and test tables, in that order, from the working directory.
df, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
In [23]:
def convert_data(df):
    """Turn a quote table into a numeric feature matrix and optional labels.

    A column is kept only if it is not one of ``Original_Quote_Date``,
    ``QuoteConversion_Flag`` or ``QuoteNumber``, its name does not start with
    ``Field``, and its dtype is not ``object``.  Within the kept columns the
    values ``' '``, ``''`` and ``-1`` are treated as missing.  A ``NaNCount``
    column holding the per-row number of missing values is appended, missing
    values are filled with the column median, and any column that still
    contains NaN after filling is dropped.  The frame's shape is printed after
    each stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.  It is not modified; all work happens on a copy.

    Returns
    -------
    features : numpy.ndarray
        Values of the processed frame (``NaNCount`` is the last column added
        before encoding).
    labels : numpy.ndarray or None
        ``df['QuoteConversion_Flag'].values`` when that column exists in
        ``df``, otherwise ``None``.

    Notes
    -----
    NOTE(review): which columns survive ``dropna(axis=1)`` depends on the data
    passed in, so two different frames are not guaranteed to yield the same
    feature columns -- confirm train/test shapes agree before predicting.
    """
    bad_columns = {'Original_Quote_Date', 'QuoteConversion_Flag', 'QuoteNumber'}

    # A real list (not a lazy ``filter`` object) so that it is a valid column
    # indexer on both Python 2 and Python 3.
    good_columns = [
        col for col in df.columns
        if col not in bad_columns
        and not col.startswith('Field')
        and not df[col].dtype == object
    ]

    df1 = df[good_columns].copy()
    print(df1.shape)

    # Normalise the placeholder values to NaN, then record how many values
    # each row is missing before those gaps are filled in.
    df1 = df1.replace([' ', '', -1], np.nan)
    df1['NaNCount'] = df1.isnull().sum(axis=1)
    print(df1.shape)

    # One-hot encode any remaining categorical-like columns (object columns
    # were already filtered out above).
    df1 = pd.get_dummies(df1)
    print(df1.shape)

    # Median imputation; a column whose median is itself NaN cannot be filled
    # and is removed by the dropna below.
    df1 = df1.fillna(df1.median())
    df1 = df1.dropna(axis=1)
    print(df1.shape)

    if 'QuoteConversion_Flag' in df.columns:
        labels = df['QuoteConversion_Flag'].values
    else:
        labels = None
    return df1.values, labels
In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
# Gradient-boosted trees with default hyper-parameters. The seed is fixed so
# that repeated runs of the notebook fit the same model.
# Alternative estimator: RandomForestClassifier(n_estimators=50)
clf = GBC(random_state=0)
In [24]:
# Convert the training frame into a feature matrix and its label vector.
train_features, labels = convert_data(df)
In [25]:
# Convert the test frame and keep only the feature matrix. The tuple is
# indexed instead of unpacking into `_`, because assigning to `_` overrides
# IPython's last-output variable.
test_features = convert_data(df_test)[0]
In [26]:
from sklearn import metrics
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# now lives in `sklearn.model_selection`. Fall back for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the labelled rows for validation, with a fixed seed so the
# split is the same on every run.
train_x, test_x, train_y, test_y = train_test_split(
    train_features,
    labels,
    test_size=0.20,
    random_state=0)

# Shapes of the four resulting arrays.
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)
In [27]:
from sklearn.metrics import accuracy_score

# Fit on the training split, then predict the held-out split.
clf = clf.fit(train_x, train_y)
predict_y = clf.predict(test_x)

# Score on the training split first, then accuracy on the held-out split.
# The single-argument print(...) form runs on both Python 2 and Python 3.
print(clf.score(train_x, train_y))
print(accuracy_score(test_y, predict_y))
In [28]:
# Predict class labels for the test set. Reuses the `test_features` matrix
# that was already built from `df_test` instead of converting the frame again.
test_predict = clf.predict(test_features)
In [31]:
# Store the predicted labels on the test frame (adds/overwrites the column in
# place on `df_test`).
df_test['QuoteConversion_Flag'] = test_predict
In [32]:
# Write the two submission columns, in this order and without the index.
df_test.to_csv('gbt_submission.csv',
               columns=['QuoteNumber', 'QuoteConversion_Flag'],
               index=False)
In [38]:
import xgboost as xgb
In [34]:
# Probability estimates from the fitted classifier for the test features.
test_proba = clf.predict_proba(test_features)
# Show the shape of the probability array as the cell output.
test_proba.shape
Out[34]:
In [37]:
# Replace the submission column with column 1 of the probability array, then
# rewrite the submission file with the quote number and that column.
df_test['QuoteConversion_Flag'] = test_proba[:, 1]
df_test.to_csv('gbt_submission.csv',
               columns=['QuoteNumber', 'QuoteConversion_Flag'],
               index=False)
In [ ]: