In [1]:
import pandas as pd
import numpy as np
import pylab as plt
# Set the global default size of figures
plt.rc('figure', figsize = (10,5))
# figure size with subplots
figsize_with_subplots = (10, 10)
# bin size
bin_size = 10

In [4]:
cd kaggle


/Users/martiom/kaggle

In [5]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [23]:
def convert_data(df):
    bad_columns = set(['Original_Quote_Date' , 'QuoteConversion_Flag', 'QuoteNumber'])
    good_columns = filter(lambda x: x not in bad_columns and not x.startswith('Field') and not df[x].dtype == object, df.columns)

    df1 = df[good_columns].copy() 
    print df1.shape
    df1.replace([' ', '', -1], np.nan, inplace=True)
    df1['NaNCount'] = df1.isnull().sum(axis=1)
    #df1 = df[filter(lambda x: df[x].dtype != object and x.startswith('GeographicField'), 
    #                df.columns[3:])]
    print df1.shape
    df1 = pd.get_dummies(df1)
    print df1.shape
    df1 = df1.fillna(df1.median())
    df1 = df1.dropna(axis=1)
    #labels = df1.QuoteConversion_Flag.values
    print df1.shape
    if 'QuoteConversion_Flag' in df.columns:
        labels = df.QuoteConversion_Flag.values
    else:
        labels = None
    return df1.values, labels

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
clf = GBC() #RandomForestClassifier(n_estimators=50)

In [24]:
train_features, labels = convert_data(df)


(260753, 265)
(260753, 266)
(260753, 266)
(260753, 265)

In [25]:
test_features, _ = convert_data(df_test)


(173836, 265)
(173836, 266)
(173836, 266)
(173836, 265)

In [26]:
from sklearn import metrics
from sklearn.cross_validation import train_test_split
train_x, test_x, train_y, test_y = train_test_split(train_features, 
                                                    labels, 
                                                    test_size=0.20, 
                                                    random_state=0)
print train_x.shape
print test_x.shape
print train_y.shape
print test_y.shape


(208602, 265)
(52151, 265)
(208602,)
(52151,)

In [27]:
clf = clf.fit(train_x, train_y)
predict_y = clf.predict(test_x)
from sklearn.metrics import accuracy_score
print clf.score(train_x, train_y)
print accuracy_score(test_y, predict_y)


0.893816933682
0.891258077506

In [28]:
test_predict = clf.predict(convert_data(df_test)[0])


(173836, 265)
(173836, 266)
(173836, 266)
(173836, 265)

In [31]:
df_test['QuoteConversion_Flag'] = test_predict

In [32]:
df_test[['QuoteNumber', 'QuoteConversion_Flag']].to_csv('gbt_submission.csv', index=False)

In [38]:
import xgboost as xgb


---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-38-745aa3a2d734> in <module>()
----> 1 import xgboost as xgb

/Users/martiom/anaconda/lib/python2.7/site-packages/xgboost.py in <module>()
     80 
     81 # load the XGBoost library globally
---> 82 xglib = load_xglib()
     83 
     84 

/Users/martiom/anaconda/lib/python2.7/site-packages/xgboost.py in load_xglib()
     57     if len(dll_path) == 0:
     58         raise XGBoostLibraryNotFound('cannot find find the files in the candicate path ' + str(dll_path))
---> 59     lib = ctypes.cdll.LoadLibrary(lib_path[0])
     60 
     61     # DMatrix functions

/Users/martiom/anaconda/lib/python2.7/ctypes/__init__.pyc in LoadLibrary(self, name)
    441 
    442     def LoadLibrary(self, name):
--> 443         return self._dlltype(name)
    444 
    445 cdll = LibraryLoader(CDLL)

/Users/martiom/anaconda/lib/python2.7/ctypes/__init__.pyc in __init__(self, name, mode, handle, use_errno, use_last_error)
    363 
    364         if handle is None:
--> 365             self._handle = _dlopen(self._name, mode)
    366         else:
    367             self._handle = handle

OSError: dlopen(/Users/martiom/anaconda/lib/python2.7/site-packages/libxgboostwrapper.so, 6): Library not loaded: @rpath/./libgomp.1.dylib
  Referenced from: /Users/martiom/anaconda/lib/python2.7/site-packages/libxgboostwrapper.so
  Reason: image not found

In [34]:
test_proba = clf.predict_proba(test_features)
test_proba.shape


Out[34]:
(173836, 2)

In [37]:
df_test['QuoteConversion_Flag'] = test_proba[:,1]
df_test[['QuoteNumber', 'QuoteConversion_Flag']].to_csv('gbt_submission.csv', index=False)

In [ ]: