In [56]:
%matplotlib inline
from __future__ import absolute_import
from __future__ import print_function

In [63]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.svm import SVC

In [2]:
df = pd.read_csv("../data/Springleaf/train.csv")


/Library/Python/2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (8,9,10,11,12,43,157,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [76]:
df_one = df[(df.target==1)][:5000]
df_zero = df[(df.target==0)][:5000]

In [77]:
df_reduced = df_one.append(df_zero)

df_tmp = df_reduced[["VAR_0002","VAR_0003","VAR_0004","VAR_0006","VAR_0007","target"]].dropna()

df_train =df_tmp.values

In [79]:
df_train.size


Out[79]:
59982

In [80]:
np.random.shuffle(df_train)

X = df_train[:,:4]
Y = df_train[:,5]

model = RandomForestClassifier(n_estimators =100)
scores = cross_validation.cross_val_score(model, X, Y, cv=5)

In [84]:
print (scores)
print (scores.mean())


[ 0.582       0.596       0.59129565  0.58229115  0.5877939 ]
0.587876138069

In [82]:
model_svm= SVC()
cross_validation.cross_val_score(model_svm, X, Y, cv=5)


Out[82]:
array([ 0.503     ,  0.5045    ,  0.50425213,  0.50575288,  0.51825913])

In [ ]: