w5-practice-01--random-forest-size



In [37]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.model_selection import cross_val_score

In [3]:
# Read the abalone dataset from the CSV file shipped alongside the notebook.
abalone_csv = 'resources/abalone.csv'
data = pd.read_csv(abalone_csv)

In [6]:
# Encode the categorical 'Sex' column as integers: 'M' -> 1, 'I' -> 0, anything else -> -1.
# The dtype guard makes re-running this cell a no-op. Without it, a second run would see
# only integers, match neither 'M' nor 'I', and silently collapse the whole column to -1.
SEX_CODES = {'M': 1, 'I': 0}
if not pd.api.types.is_numeric_dtype(data['Sex']):
    data['Sex'] = data['Sex'].map(lambda sex: SEX_CODES.get(sex, -1))

In [9]:
# Preview the first ten rows to check the 'Sex' encoding and the column layout.
data.head(n=10)


Out[9]:
Sex Length Diameter Height WholeWeight ShuckedWeight VisceraWeight ShellWeight Rings
0 1 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
1 1 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
2 -1 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
3 1 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
4 0 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7
5 0 0.425 0.300 0.095 0.3515 0.1410 0.0775 0.120 8
6 -1 0.530 0.415 0.150 0.7775 0.2370 0.1415 0.330 20
7 -1 0.545 0.425 0.125 0.7680 0.2940 0.1495 0.260 16
8 1 0.475 0.370 0.125 0.5095 0.2165 0.1125 0.165 9
9 -1 0.550 0.440 0.150 0.8945 0.3145 0.1510 0.320 19

In [11]:
# The value to predict is the 'Rings' column; show its first ten entries.
target = data.loc[:, 'Rings']
target.head(n=10)


Out[11]:
0    15
1     7
2     9
3    10
4     7
5     8
6    20
7    16
8     9
9    19
Name: Rings, dtype: int64

In [24]:
# Feature matrix: every column from the first one up to and including 'ShellWeight'
# (label-based .loc slices are inclusive of the end label).
last_feature = 'ShellWeight'
features = data.loc[:, :last_feature]
features.head(n=10)


Out[24]:
Sex Length Diameter Height WholeWeight ShuckedWeight VisceraWeight ShellWeight
0 1 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150
1 1 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070
2 -1 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210
3 1 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155
4 0 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055
5 0 0.425 0.300 0.095 0.3515 0.1410 0.0775 0.120
6 -1 0.530 0.415 0.150 0.7775 0.2370 0.1415 0.330
7 -1 0.545 0.425 0.125 0.7680 0.2940 0.1495 0.260
8 1 0.475 0.370 0.125 0.5095 0.2165 0.1125 0.165
9 -1 0.550 0.440 0.150 0.8945 0.3145 0.1510 0.320

In [30]:
# 5-fold cross-validation splitter with shuffling; the fixed seed keeps the folds
# identical between runs.
# KFold is imported here from sklearn.model_selection because the
# sklearn.cross_validation variant (with its `KFold(n, n_folds=...)` signature) was
# deprecated in scikit-learn 0.18 and removed in 0.20. The model_selection version
# takes `n_splits` and derives the sample count from the data passed at split time.
# NOTE(review): the notebook's top import cell still imports KFold from
# sklearn.cross_validation; drop that line when moving to scikit-learn >= 0.20.
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

In [38]:
# Mean cross-validated R^2 for random forests with 1..50 trees.
# Index 0 holds a 0.0 placeholder so that scores[k] is the score for k trees.
scores = [0.0]
for n in range(1, 51):
    model = RandomForestRegressor(random_state=1, n_estimators=n)
    # One R^2 value per fold of `kfold`; the forest's quality is their average.
    fold_scores = cross_val_score(model, features, target, scoring='r2', cv=kfold)
    score = fold_scores.mean()
    scores.append(score)

In [41]:
# List each position in `scores` next to its mean R^2
# (position 0 is the 0.0 placeholder the list was initialised with).
for n_trees, mean_r2 in enumerate(scores):
    print(n_trees, mean_r2)


(0, 0.0)
(1, 0.10213869487724367)
(2, 0.33841675515802144)
(3, 0.4035798494618691)
(4, 0.44272239896668103)
(5, 0.4640207660674969)
(6, 0.4705816327587792)
(7, 0.4758306163523006)
(8, 0.4817418456255852)
(9, 0.4883478130215681)
(10, 0.49446412480247826)
(11, 0.4933965550001963)
(12, 0.4979658763976154)
(13, 0.5021364605722853)
(14, 0.5064286962257328)
(15, 0.5083311970432101)
(16, 0.5105131438322147)
(17, 0.5138482947993028)
(18, 0.5163275412739493)
(19, 0.519034688136388)
(20, 0.5186735928723822)
(21, 0.5198354233542501)
(22, 0.5201583536722211)
(23, 0.5210172709366251)
(24, 0.5224031825808091)
(25, 0.5226174639676071)
(26, 0.5238061581717052)
(27, 0.5241223522188981)
(28, 0.525053745123244)
(29, 0.5259703756348931)
(30, 0.5265378216675755)
(31, 0.5270998544306023)
(32, 0.5283894082092784)
(33, 0.5296165561229277)
(34, 0.5295639554438379)
(35, 0.529517247440473)
(36, 0.5295996267890184)
(37, 0.52913141344746)
(38, 0.5291602576334833)
(39, 0.5292409810274339)
(40, 0.529067015963163)
(41, 0.5293420056911183)
(42, 0.5295732065337546)
(43, 0.5293754946648532)
(44, 0.5291192337563251)
(45, 0.5283754660277202)
(46, 0.5285204275493458)
(47, 0.5287049445791908)
(48, 0.5296127515437734)
(49, 0.5303944554434498)
(50, 0.5305963435394535)