In [28]:
import os
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

# Model definition: extremely randomized trees classifier.
# Hyperparameters were chosen by hand; note no random_state is set,
# so repeated fits are not reproducible.
mdl_fit = ExtraTreesClassifier(
    n_estimators=700,
    max_features=50,
    criterion='entropy',
    min_samples_split=5,
    max_depth=50,
    min_samples_leaf=5,
)
   
def print_wd():
    """Print the current working directory and return it.

    Returns
    -------
    str
        The current working directory.

    Returning the path (instead of the implicit None) lets callers embed
    it in their own messages without a stray "None" being printed, which
    is what happened in the ``print "Here is", print_wd()`` cell below.
    The parenthesized print() form runs under both Python 2 and 3.
    """
    cwd = os.getcwd()
    print(cwd)
    return cwd


def train_model(train_data):
    """Read tab-separated training data and fit the module-level model.

    Parameters
    ----------
    train_data : str
        Path to a TSV file with feature columns plus a 'response'
        column holding the training labels (same layout as the
        py_train.tsv file loaded at the top level).
    """
    # pd.DataFrame.from_csv is deprecated/removed; pd.read_csv is the
    # supported API (and matches the top-level read of py_train.tsv).
    train = pd.read_csv(train_data, sep="\t")
    # Split features from the label instead of fitting on the whole
    # frame with an empty-string target, which raises in sklearn.
    predictors = [col for col in train.columns if col != 'response']
    X_train = train[predictors]
    y_train = train['response']
    mdl_fit.fit(X_train, y_train)


def save_model(file_name):
    """Placeholder for model persistence — not implemented yet.

    Currently only announces the call; actually serializing ``mdl_fit``
    (e.g. via joblib) is still TODO.

    Parameters
    ----------
    file_name : str
        Intended destination path for the saved model (unused for now).
    """
    # Parenthesized print() works identically under Python 2 and 3.
    print("save model")
    
    

# Report the working directory as a single string. The original
# `print "Here is", print_wd()` nested a printing function inside a
# Python 2 print statement, so print_wd()'s None return value was
# printed as well (visible as the trailing "None" in the cell output).
print("Here is " + os.getcwd())

# read in training data (tab-separated)
train = pd.read_csv("./py_train.tsv", sep="\t")


Here is /Users/jim/Desktop/Kaggle/BNPParibasCardif/src/sandbox
None

In [41]:
train.describe()


Out[41]:
v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 ... v123 v124 v125 v126 v127 v128 v129 v130 v131 all.var.na.count
count 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 ... 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000 229.000000
mean -457.196285 -453.935283 3.803493 -455.848313 -448.922292 -456.751555 -456.728075 -453.188346 -453.136161 2.025742 ... -464.849943 -453.319999 50.585153 -457.179424 -456.361006 -452.568517 0.358079 -457.003086 -457.174534 45.820961
std 499.662355 502.673814 0.743856 500.905473 502.849790 500.072037 500.093705 498.948690 503.407773 1.476173 ... 501.336131 498.828479 25.749433 499.677363 500.433242 499.514862 0.721346 499.840602 499.682392 49.565377
min -999.000000 -999.000000 1.000000 -999.000000 -999.000000 -999.000000 -999.000000 -999.000000 -999.000000 0.262583 ... -999.000000 -999.000000 2.000000 -999.000000 -999.000000 -999.000000 0.000000 -999.000000 -999.000000 0.000000
25% -999.000000 -999.000000 4.000000 -999.000000 -999.000000 -999.000000 -999.000000 -999.000000 -999.000000 1.050328 ... -999.000000 -999.000000 29.000000 -999.000000 -999.000000 -999.000000 0.000000 -999.000000 -999.000000 0.000000
50% 0.279245 3.322879 4.000000 2.465962 5.891056 1.623816 1.737342 0.019645 7.135679 1.312911 ... 1.249729 0.003253 54.000000 1.216116 1.254646 0.969578 0.000000 0.833333 0.336134 0.000000
75% 1.488934 7.860665 4.000000 4.264132 8.870209 2.452108 2.430972 0.258612 9.255320 2.428884 ... 2.890027 0.119788 73.000000 1.603019 3.084626 1.890949 0.000000 1.643242 1.666666 100.000000
max 5.870627 15.519629 4.000000 7.083427 16.597175 4.428996 4.349441 8.894753 13.684211 7.746171 ... 15.026228 10.837344 91.000000 2.956752 7.683966 12.003873 4.000000 6.466165 5.000001 100.000000

8 rows × 132 columns


In [42]:
# Build the feature list deterministically. The original
# list(set(...) - set(['response'])) returns columns in arbitrary,
# hash-dependent order, so the feature ordering — and therefore the
# fitted model — could vary between runs/sessions.
predictors = [col for col in train.columns if col != 'response']
Xtrain = train[predictors]
Ytrain = train['response']

In [43]:
mdl_fit.fit(Xtrain,Ytrain)


Out[43]:
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=50, max_features=50, max_leaf_nodes=None,
           min_samples_leaf=5, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [46]:
mdl_fit.score()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-46-6a4de20802b0> in <module>()
----> 1 mdl_fit.score()

TypeError: score() takes at least 3 arguments (1 given)

In [ ]: