In [8]:
# NOTE: `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and
# removed in 0.20; KFold, cross_val_score and train_test_split now live in
# `sklearn.model_selection`. Be aware that the new KFold takes `n_splits`
# (not the sample count) and yields indices via `.split(X)`.
import numpy as np
import pandas as pd
from sklearn import metrics, svm
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [4]:
# Pima Indians diabetes data set. Source:
# https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/
# Column labels are supplied here via `names` and passed to read_csv.
names = [
    'num_times_pregnant',
    'glucose_level',
    'blood_pres',
    'skin_thickness',
    'insulin',
    'bmi',
    'dia_pedigree',
    'age',
    'has_diabetes',
]
data = pd.read_csv("pima-indians-diabetes.data", names=names)

# Every column except the last one is a model input; the last one is the label.
feature_columns = names[:-1]
target_column = names[-1]
input_data = data[feature_columns]
expected_output = data[target_column]

# Show the first rows of the inputs and of the label as a quick sanity check.
for preview in (input_data.head(), expected_output.head()):
    print(preview)


   num_times_pregnant  glucose_level  blood_pres  skin_thickness  insulin  \
0                   6            148          72              35        0   
1                   1             85          66              29        0   
2                   8            183          64               0        0   
3                   1             89          66              23       94   
4                   0            137          40              35      168   

    bmi  dia_pedigree  age  
0  33.6         0.627   50  
1  26.6         0.351   31  
2  23.3         0.672   32  
3  28.1         0.167   21  
4  43.1         2.288   33  
0    1
1    0
2    1
3    0
4    1
Name: has_diabetes, dtype: int64

In [ ]:
# Create a Random Forest, a Logistic Regression and an SVM instance:

In [6]:
# Print the mean cross-validation score of each of the 3 algorithms, using the cross_val_score function


Random Forest: 0.7486842105263157
SVM:  0.651059466849
Logistic:  0.766968557758

Now actually run the 3 machine learning algorithms


In [10]:
# Create the train/test split

In [16]:
# Run the data with Random forest


Out[16]:
0.74015748031496065

In [17]:
# Run the data with Logistic Regression


Out[17]:
0.76771653543307083

In [18]:
# Run the data with SVM


Out[18]:
0.67322834645669294