In [258]:
# rattle package in R has weather dataset
#(see help at http://artax.karlin.mff.cuni.cz/r-help/library/rattle/html/weather.html)

In [259]:
import os as os

In [260]:
import pandas as pd

In [261]:
os.getcwd()


Out[261]:
'/home/ajayohri'

In [262]:
os.listdir()


Out[262]:
['.hplip',
 '.xsession-errors.old',
 'VirtualBox VMs',
 'filename.pkl_04.npy',
 '.thunderbird',
 'SVM.R',
 'R',
 'Desktop',
 'filename.pkl_07.npy',
 '.cache',
 '.webex',
 'file.R',
 '.ipython',
 'unique_ids_for_list.html',
 'filename.pkl_11.npy',
 '.Xauthority',
 'Dropbox',
 'examples.desktop',
 'machine learning-plot and bagged pima indians.ipynb',
 'date time.ipynb',
 'Untitled.ipynb',
 '.rstudio-desktop',
 'filename.pkl_01.npy',
 'anaconda3',
 '.dropbox',
 'Music',
 '.pki',
 'rsconnect',
 'GoodReads.ipynb',
 '.config',
 'diamsum.html',
 'filename.pkl_06.npy',
 'data inspection .ipynb',
 '.sudo_as_admin_successful',
 '.continuum',
 '.java',
 'unique ids for list.R',
 '.bashrc-anaconda3.bak',
 '.texmf-var',
 'numpy scipy pandas.ipynb',
 'mozilla.pdf',
 '.dropbox-dist',
 '.bash_logout',
 '.jupyter',
 '.ecryptfs',
 '.dbus',
 '.local',
 '.lyx',
 '.xsession-errors',
 'hebrew',
 'RCommanderMarkdown.Rmd',
 '.bash_history',
 'SAS',
 'nbr2mp4.sh',
 '.adobe',
 '.Skype',
 'filename.pkl_05.npy',
 '.wajig',
 'ajay ohri.odt',
 '.macromedia',
 '.gphoto',
 '.oracle_jre_usage',
 'machine learning-rattle dataset from R.ipynb',
 '.profile',
 'file operations.ipynb',
 'Documents',
 'filename.pkl_09.npy',
 'Videos',
 'RCommander.R',
 'filename.pkl_08.npy',
 '.gstreamer-0.10',
 'SVM.html',
 '.Private',
 'RCommander.txt',
 're for searching strings.ipynb',
 '.Rhistory',
 'filename.pkl_02.npy',
 'RcmdrMarkdown.Rmd',
 'Scikit Tutorial',
 'machine learning.ipynb',
 '.ivy2',
 'assignment2.R',
 'assignment2.html',
 'filename.pkl_03.npy',
 'Public',
 'nbr2mp4.tar',
 'RcmdrMarkdown.md',
 '.bashrc',
 '.mozilla',
 'Pictures',
 'Data Viz Tutorial.ipynb',
 'filename.pkl_10.npy',
 '.RData',
 '.gconf',
 'data transformations.ipynb',
 'RcmdrMarkdown.html',
 'file.html',
 'Scikit Tutorial.ipynb',
 'Strings, Lists and Maps.ipynb',
 'filename.pkl',
 'weather.csv',
 'Downloads',
 '.gnupg',
 '.nano',
 'variables in strings.ipynb',
 'Templates',
 '.ICEauthority',
 '.ipynb_checkpoints']

In [263]:
#Finding only csv files in a directory using os and glob packages
import glob

path = os.getcwd()
extension = 'csv'
os.chdir(path)
result = [i for i in glob.glob('*.{}'.format(extension))]
print(result)


['weather.csv']

In [264]:
dataframe=pd.read_csv("weather.csv")

In [265]:
dataframe.head()


Out[265]:
Unnamed: 0 Date Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed ... Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RISK_MM RainTomorrow
0 1 2007-11-01 Canberra 8.0 24.3 0.0 3.4 6.3 NW 30.0 ... 29 1019.7 1015.0 7 7 14.4 23.6 No 3.6 Yes
1 2 2007-11-02 Canberra 14.0 26.9 3.6 4.4 9.7 ENE 39.0 ... 36 1012.4 1008.4 5 3 17.5 25.7 Yes 3.6 Yes
2 3 2007-11-03 Canberra 13.7 23.4 3.6 5.8 3.3 NW 85.0 ... 69 1009.5 1007.2 8 7 15.4 20.2 Yes 39.8 Yes
3 4 2007-11-04 Canberra 13.3 15.5 39.8 7.2 9.1 NW 54.0 ... 56 1005.5 1007.0 2 7 13.5 14.1 Yes 2.8 Yes
4 5 2007-11-05 Canberra 7.6 16.1 2.8 5.6 10.6 SSE 50.0 ... 49 1018.3 1018.5 7 7 11.1 15.4 Yes 0.0 No

5 rows � 25 columns


In [266]:
dataframe.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 25 columns):
Unnamed: 0       366 non-null int64
Date             366 non-null object
Location         366 non-null object
MinTemp          366 non-null float64
MaxTemp          366 non-null float64
Rainfall         366 non-null float64
Evaporation      366 non-null float64
Sunshine         363 non-null float64
WindGustDir      363 non-null object
WindGustSpeed    364 non-null float64
WindDir9am       335 non-null object
WindDir3pm       365 non-null object
WindSpeed9am     359 non-null float64
WindSpeed3pm     366 non-null int64
Humidity9am      366 non-null int64
Humidity3pm      366 non-null int64
Pressure9am      366 non-null float64
Pressure3pm      366 non-null float64
Cloud9am         366 non-null int64
Cloud3pm         366 non-null int64
Temp9am          366 non-null float64
Temp3pm          366 non-null float64
RainToday        366 non-null object
RISK_MM          366 non-null float64
RainTomorrow     366 non-null object
dtypes: float64(12), int64(6), object(7)
memory usage: 71.6+ KB

In [267]:
dataframe=dataframe.drop('Unnamed: 0', 1)

In [268]:
dataframe.describe()


/home/ajayohri/anaconda3/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[268]:
MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RISK_MM
count 366.000000 366.000000 366.000000 366.000000 363.000000 364.000000 359.000000 366.000000 366.000000 366.000000 366.000000 366.000000 366.000000 366.000000 366.000000 366.000000 366.000000
mean 7.265574 20.550273 1.428415 4.521858 7.909366 39.840659 9.651811 17.986339 72.035519 44.519126 1019.709016 1016.810383 3.890710 4.024590 12.358470 19.230874 1.428415
std 6.025800 6.690516 4.225800 2.669383 3.481517 13.059807 7.951929 8.856997 13.137058 16.850947 6.686212 6.469422 2.956131 2.666268 5.630832 6.640346 4.225800
min -5.300000 7.600000 0.000000 0.200000 0.000000 13.000000 0.000000 0.000000 36.000000 13.000000 996.500000 996.800000 0.000000 0.000000 0.100000 5.100000 0.000000
25% 2.300000 15.025000 0.000000 2.200000 NaN NaN NaN 11.000000 64.000000 32.250000 1015.350000 1012.800000 1.000000 1.000000 7.625000 14.150000 0.000000
50% 7.450000 19.650000 0.000000 4.200000 NaN NaN NaN 17.000000 72.000000 43.000000 1020.150000 1017.400000 3.500000 4.000000 12.550000 18.550000 0.000000
75% 12.500000 25.500000 0.200000 6.400000 NaN NaN NaN 24.000000 81.000000 55.000000 1024.475000 1021.475000 7.000000 7.000000 17.000000 24.000000 0.200000
max 20.900000 35.800000 39.800000 13.800000 13.600000 98.000000 41.000000 52.000000 99.000000 96.000000 1035.700000 1033.200000 8.000000 8.000000 24.700000 34.500000 39.800000

In [269]:
dataframe['RainTomorrow'].unique()


Out[269]:
array(['Yes', 'No'], dtype=object)

In [270]:
dataframe['RainToday'].unique()


Out[270]:
array(['No', 'Yes'], dtype=object)

In [271]:
dataframe['Location'].unique()


Out[271]:
array(['Canberra'], dtype=object)

In [272]:
dataframe['Date'].unique()


Out[272]:
array(['2007-11-01', '2007-11-02', '2007-11-03', '2007-11-04',
       '2007-11-05', '2007-11-06', '2007-11-07', '2007-11-08',
       '2007-11-09', '2007-11-10', '2007-11-11', '2007-11-12',
       '2007-11-13', '2007-11-14', '2007-11-15', '2007-11-16',
       '2007-11-17', '2007-11-18', '2007-11-19', '2007-11-20',
       '2007-11-21', '2007-11-22', '2007-11-23', '2007-11-24',
       '2007-11-25', '2007-11-26', '2007-11-27', '2007-11-28',
       '2007-11-29', '2007-11-30', '2007-12-01', '2007-12-02',
       '2007-12-03', '2007-12-04', '2007-12-05', '2007-12-06',
       '2007-12-07', '2007-12-08', '2007-12-09', '2007-12-10',
       '2007-12-11', '2007-12-12', '2007-12-13', '2007-12-14',
       '2007-12-15', '2007-12-16', '2007-12-17', '2007-12-18',
       '2007-12-19', '2007-12-20', '2007-12-21', '2007-12-22',
       '2007-12-23', '2007-12-24', '2007-12-25', '2007-12-26',
       '2007-12-27', '2007-12-28', '2007-12-29', '2007-12-30',
       '2007-12-31', '2008-01-01', '2008-01-02', '2008-01-03',
       '2008-01-04', '2008-01-05', '2008-01-06', '2008-01-07',
       '2008-01-08', '2008-01-09', '2008-01-10', '2008-01-11',
       '2008-01-12', '2008-01-13', '2008-01-14', '2008-01-15',
       '2008-01-16', '2008-01-17', '2008-01-18', '2008-01-19',
       '2008-01-20', '2008-01-21', '2008-01-22', '2008-01-23',
       '2008-01-24', '2008-01-25', '2008-01-26', '2008-01-27',
       '2008-01-28', '2008-01-29', '2008-01-30', '2008-01-31',
       '2008-02-01', '2008-02-02', '2008-02-03', '2008-02-04',
       '2008-02-05', '2008-02-06', '2008-02-07', '2008-02-08',
       '2008-02-09', '2008-02-10', '2008-02-11', '2008-02-12',
       '2008-02-13', '2008-02-14', '2008-02-15', '2008-02-16',
       '2008-02-17', '2008-02-18', '2008-02-19', '2008-02-20',
       '2008-02-21', '2008-02-22', '2008-02-23', '2008-02-24',
       '2008-02-25', '2008-02-26', '2008-02-27', '2008-02-28',
       '2008-02-29', '2008-03-01', '2008-03-02', '2008-03-03',
       '2008-03-04', '2008-03-05', '2008-03-06', '2008-03-07',
       '2008-03-08', '2008-03-09', '2008-03-10', '2008-03-11',
       '2008-03-12', '2008-03-13', '2008-03-14', '2008-03-15',
       '2008-03-16', '2008-03-17', '2008-03-18', '2008-03-19',
       '2008-03-20', '2008-03-21', '2008-03-22', '2008-03-23',
       '2008-03-24', '2008-03-25', '2008-03-26', '2008-03-27',
       '2008-03-28', '2008-03-29', '2008-03-30', '2008-03-31',
       '2008-04-01', '2008-04-02', '2008-04-03', '2008-04-04',
       '2008-04-05', '2008-04-06', '2008-04-07', '2008-04-08',
       '2008-04-09', '2008-04-10', '2008-04-11', '2008-04-12',
       '2008-04-13', '2008-04-14', '2008-04-15', '2008-04-16',
       '2008-04-17', '2008-04-18', '2008-04-19', '2008-04-20',
       '2008-04-21', '2008-04-22', '2008-04-23', '2008-04-24',
       '2008-04-25', '2008-04-26', '2008-04-27', '2008-04-28',
       '2008-04-29', '2008-04-30', '2008-05-01', '2008-05-02',
       '2008-05-03', '2008-05-04', '2008-05-05', '2008-05-06',
       '2008-05-07', '2008-05-08', '2008-05-09', '2008-05-10',
       '2008-05-11', '2008-05-12', '2008-05-13', '2008-05-14',
       '2008-05-15', '2008-05-16', '2008-05-17', '2008-05-18',
       '2008-05-19', '2008-05-20', '2008-05-21', '2008-05-22',
       '2008-05-23', '2008-05-24', '2008-05-25', '2008-05-26',
       '2008-05-27', '2008-05-28', '2008-05-29', '2008-05-30',
       '2008-05-31', '2008-06-01', '2008-06-02', '2008-06-03',
       '2008-06-04', '2008-06-05', '2008-06-06', '2008-06-07',
       '2008-06-08', '2008-06-09', '2008-06-10', '2008-06-11',
       '2008-06-12', '2008-06-13', '2008-06-14', '2008-06-15',
       '2008-06-16', '2008-06-17', '2008-06-18', '2008-06-19',
       '2008-06-20', '2008-06-21', '2008-06-22', '2008-06-23',
       '2008-06-24', '2008-06-25', '2008-06-26', '2008-06-27',
       '2008-06-28', '2008-06-29', '2008-06-30', '2008-07-01',
       '2008-07-02', '2008-07-03', '2008-07-04', '2008-07-05',
       '2008-07-06', '2008-07-07', '2008-07-08', '2008-07-09',
       '2008-07-10', '2008-07-11', '2008-07-12', '2008-07-13',
       '2008-07-14', '2008-07-15', '2008-07-16', '2008-07-17',
       '2008-07-18', '2008-07-19', '2008-07-20', '2008-07-21',
       '2008-07-22', '2008-07-23', '2008-07-24', '2008-07-25',
       '2008-07-26', '2008-07-27', '2008-07-28', '2008-07-29',
       '2008-07-30', '2008-07-31', '2008-08-01', '2008-08-02',
       '2008-08-03', '2008-08-04', '2008-08-05', '2008-08-06',
       '2008-08-07', '2008-08-08', '2008-08-09', '2008-08-10',
       '2008-08-11', '2008-08-12', '2008-08-13', '2008-08-14',
       '2008-08-15', '2008-08-16', '2008-08-17', '2008-08-18',
       '2008-08-19', '2008-08-20', '2008-08-21', '2008-08-22',
       '2008-08-23', '2008-08-24', '2008-08-25', '2008-08-26',
       '2008-08-27', '2008-08-28', '2008-08-29', '2008-08-30',
       '2008-08-31', '2008-09-01', '2008-09-02', '2008-09-03',
       '2008-09-04', '2008-09-05', '2008-09-06', '2008-09-07',
       '2008-09-08', '2008-09-09', '2008-09-10', '2008-09-11',
       '2008-09-12', '2008-09-13', '2008-09-14', '2008-09-15',
       '2008-09-16', '2008-09-17', '2008-09-18', '2008-09-19',
       '2008-09-20', '2008-09-21', '2008-09-22', '2008-09-23',
       '2008-09-24', '2008-09-25', '2008-09-26', '2008-09-27',
       '2008-09-28', '2008-09-29', '2008-09-30', '2008-10-01',
       '2008-10-02', '2008-10-03', '2008-10-04', '2008-10-05',
       '2008-10-06', '2008-10-07', '2008-10-08', '2008-10-09',
       '2008-10-10', '2008-10-11', '2008-10-12', '2008-10-13',
       '2008-10-14', '2008-10-15', '2008-10-16', '2008-10-17',
       '2008-10-18', '2008-10-19', '2008-10-20', '2008-10-21',
       '2008-10-22', '2008-10-23', '2008-10-24', '2008-10-25',
       '2008-10-26', '2008-10-27', '2008-10-28', '2008-10-29',
       '2008-10-30', '2008-10-31'], dtype=object)

In [273]:
# Bagged Decision Trees for Classification
from sklearn import cross_validation
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [274]:
dataframe.columns


Out[274]:
Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')

In [275]:
del dataframe['Date']

In [276]:
del dataframe['Location']

In [277]:
del dataframe['WindDir9am']

In [278]:
del dataframe['WindSpeed3pm']

In [279]:
del dataframe['WindGustDir']
del dataframe['WindDir3pm']
del dataframe['RISK_MM']

In [280]:
dataframe=dataframe.replace(['Yes', 'No'], [1, 0]) #using replace to change string to numeric values

In [281]:
dataframe=dataframe.dropna()

In [282]:
dataframe.head()


Out[282]:
MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed WindSpeed9am Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow
0 8.0 24.3 0.0 3.4 6.3 30.0 6.0 68 29 1019.7 1015.0 7 7 14.4 23.6 0 1
1 14.0 26.9 3.6 4.4 9.7 39.0 4.0 80 36 1012.4 1008.4 5 3 17.5 25.7 1 1
2 13.7 23.4 3.6 5.8 3.3 85.0 6.0 82 69 1009.5 1007.2 8 7 15.4 20.2 1 1
3 13.3 15.5 39.8 7.2 9.1 54.0 30.0 62 56 1005.5 1007.0 2 7 13.5 14.1 1 1
4 7.6 16.1 2.8 5.6 10.6 50.0 20.0 68 49 1018.3 1018.5 7 7 11.1 15.4 1 0

In [283]:
len(dataframe)


Out[283]:
354

In [284]:
len(dataframe.columns)


Out[284]:
17

In [285]:
names=dataframe.columns
names


Out[285]:
Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [286]:
dataframe.describe()


Out[286]:
MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed WindSpeed9am Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow
count 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000 354.000000
mean 7.362429 20.601412 1.420904 4.558192 7.925424 40.011299 9.666667 71.875706 44.454802 1019.562147 1016.692090 3.920904 4.019774 12.438701 19.271469 0.180791 0.180791
std 6.010927 6.708966 4.235358 2.667877 3.510039 13.034488 7.978489 13.161939 16.944316 6.602685 6.373679 2.962363 2.672312 5.630160 6.663681 0.385390 0.385390
min -5.300000 7.600000 0.000000 0.200000 0.000000 13.000000 0.000000 36.000000 13.000000 996.500000 996.800000 0.000000 0.000000 0.100000 5.100000 0.000000 0.000000
25% 2.400000 15.100000 0.000000 2.400000 5.925000 31.000000 6.000000 64.000000 32.000000 1015.225000 1012.725000 1.000000 1.000000 7.725000 14.300000 0.000000 0.000000
50% 7.500000 19.750000 0.000000 4.200000 8.650000 39.000000 7.000000 72.000000 43.000000 1020.000000 1017.200000 4.000000 4.000000 12.600000 18.600000 0.000000 0.000000
75% 12.500000 25.500000 0.200000 6.400000 10.600000 46.000000 13.000000 80.000000 54.750000 1024.400000 1021.350000 7.000000 7.000000 17.000000 24.000000 0.000000 0.000000
max 20.900000 35.800000 39.800000 13.800000 13.600000 98.000000 41.000000 99.000000 96.000000 1035.700000 1033.200000 8.000000 8.000000 24.700000 34.500000 1.000000 1.000000

In [287]:
type(dataframe)


Out[287]:
pandas.core.frame.DataFrame

In [288]:
array = dataframe.values

In [289]:
pd.value_counts(dataframe["RainTomorrow"])


Out[289]:
0    290
1     64
Name: RainTomorrow, dtype: int64

In [290]:
array


Out[290]:
array([[  8. ,  24.3,   0. , ...,  23.6,   0. ,   1. ],
       [ 14. ,  26.9,   3.6, ...,  25.7,   1. ,   1. ],
       [ 13.7,  23.4,   3.6, ...,  20.2,   1. ,   1. ],
       ..., 
       [ 12.5,  19.9,   0. , ...,  18.3,   0. ,   0. ],
       [ 12.5,  26.9,   0. , ...,  25.9,   0. ,   0. ],
       [ 12.3,  30.2,   0. , ...,  28.6,   0. ,   0. ]])

In [291]:
X = array[:,0:16]
Y = array[:,16]
num_folds = 10
num_instances = len(X)
seed = 7

In [292]:
type(X)


Out[292]:
numpy.ndarray

In [293]:
X


Out[293]:
array([[  8. ,  24.3,   0. , ...,  14.4,  23.6,   0. ],
       [ 14. ,  26.9,   3.6, ...,  17.5,  25.7,   1. ],
       [ 13.7,  23.4,   3.6, ...,  15.4,  20.2,   1. ],
       ..., 
       [ 12.5,  19.9,   0. , ...,  14.5,  18.3,   0. ],
       [ 12.5,  26.9,   0. , ...,  15.8,  25.9,   0. ],
       [ 12.3,  30.2,   0. , ...,  23.8,  28.6,   0. ]])

In [294]:
#Y[Y == "Yes"] = 1 An alternative way to make a NumPy arraye change values
#Y[Y == "No"] = 0
Y


Out[294]:
array([ 1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  1.,  1.,  1.,
        0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        1.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.])

In [295]:
dtr = tree.DecisionTreeRegressor(max_depth=3)
dtr.fit(X, Y)


Out[295]:
DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [296]:
# from sklearn.metrics import roc_curve, auc

In [297]:
#!sudo pip install pydotplus
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
# http://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/
# http://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/

In [298]:
#!pip freeze
#checking if we have the right packages

In [299]:
#!pip install --upgrade pip

In [300]:
#!pip install pydotplus

In [301]:
import pydotplus as pydot

from IPython.display import Image

from sklearn.externals.six import StringIO

In [302]:
# Graphviz
#sudo add-apt-repository ppa:gviz-adm/graphviz-dev
# sudo apt-get update
# http://www.graphviz.org/Download_linux_ubuntu.php

In [303]:
dot_data = StringIO()

In [304]:
tree.export_graphviz(dtr, out_file=dot_data,feature_names=names[:-1])

In [305]:
graph = pydot.graph_from_dot_data(dot_data.getvalue())

In [306]:
Image(graph.create_png())


Out[306]:

In [307]:
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)

In [308]:
model


Out[308]:
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=100, n_jobs=1, oob_score=False,
         random_state=7, verbose=0, warm_start=False)

In [309]:
kfold


Out[309]:
sklearn.cross_validation.KFold(n=354, n_folds=10, shuffle=False, random_state=7)

In [310]:
results = cross_validation.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


0.850873015873

In [311]:
results


Out[311]:
array([ 0.75      ,  0.86111111,  0.69444444,  0.88888889,  0.88571429,
        0.82857143,  0.91428571,  0.85714286,  0.94285714,  0.88571429])