In [1]:
# loading the libraries
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; train_test_split now lives in sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
# Feature Importance
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
In [2]:
#Load the dataset and drop any row with a missing value
gapdata = pd.read_csv("gap.csv", low_memory=False)
# .copy() makes data_clean an independent frame rather than a view derived
# from gapdata, so the column assignments in later cells do not raise
# pandas SettingWithCopyWarning
data_clean = gapdata.dropna().copy()
In [3]:
# Data pre-processing: coerce the three analysis variables to numeric.
# DataFrame.convert_objects was deprecated in pandas 0.17 and later removed;
# pd.to_numeric(..., errors='coerce') is the supported replacement and,
# like convert_numeric=True, turns unparseable values into NaN.
for col in ['breastcancerper100th', 'femaleemployrate', 'alcconsumption']:
    data_clean[col] = pd.to_numeric(data_clean[col], errors='coerce')
In [4]:
#Create binary Breast Cancer Rate
def bin2cancer(row, threshold=20):
    """Binary-code a row's breast cancer rate.

    Returns 0 when breastcancerper100th is <= threshold (default 20 new
    cases per 100,000), 1 when it is above.  If the value is NaN neither
    comparison holds and the function falls through, returning None —
    same behavior as the original two-branch version.
    """
    if row['breastcancerper100th'] <= threshold:
        return 0
    elif row['breastcancerper100th'] > threshold:
        return 1
#Create binary Alcohol consumption
def bin2alcohol(row, threshold=5):
    """Binary-code a row's alcohol consumption.

    Returns 0 when alcconsumption is <= threshold (default 5 litres),
    1 when it is above.  A NaN value satisfies neither branch, so the
    function returns None — matching the original behavior.
    """
    if row['alcconsumption'] <= threshold:
        return 0
    elif row['alcconsumption'] > threshold:
        return 1
# create binary Female employee rate
def bin2femalemployee(row, threshold=50):
    """Binary-code a row's female employment rate.

    Returns 0 when femaleemployrate is <= threshold (default 50 percent),
    1 when it is above.  A NaN value satisfies neither branch, so the
    function returns None — matching the original behavior.
    """
    if row['femaleemployrate'] <= threshold:
        return 0
    elif row['femaleemployrate'] > threshold:
        return 1
# Add the three binary recodes (bin2femalemployee, bin2alcohol, bin2cancer)
# as new columns; DataFrame.apply with axis=1 hands each row to the function
# directly, so no lambda wrapper is needed.
data_clean['bin2femalemployee'] = data_clean.apply(bin2femalemployee, axis=1)
data_clean['bin2alcohol'] = data_clean.apply(bin2alcohol, axis=1)
data_clean['bin2cancer'] = data_clean.apply(bin2cancer, axis=1)
In [5]:
# Inspect the column dtypes after the numeric coercion and binary recoding
data_clean.dtypes
Out[5]:
In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
data_clean.describe()
Out[6]:
In [7]:
# Assign predictor and target variables: the two binary explanatory
# columns predict the binary breast-cancer outcome
predictors=data_clean[['bin2alcohol','bin2femalemployee']]
target=data_clean.bin2cancer
In [8]:
#Split into 60% training and 40% testing sets.
# random_state pins the shuffle so the accuracies reported below are
# reproducible across kernel restarts (the original split changed on
# every run because no seed was set).
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, target, test_size=0.4, random_state=42)
In [9]:
# Training predictors — expect ~60% of the cleaned rows
pred_train.shape
Out[9]:
In [33]:
#tar_test.head
In [10]:
# Test predictors — expect ~40% of the cleaned rows
pred_test.shape
Out[10]:
In [11]:
# Training target — row count must match pred_train
tar_train.shape
Out[11]:
In [12]:
# Test target — row count must match pred_test
tar_test.shape
Out[12]:
Out[12]:
In [24]:
#Build model on training data: fit a single decision tree and predict
# the hold-out set.  The RandomForestClassifier import is kept here
# because the tree-count sweep cell further down uses it.
from sklearn.ensemble import RandomForestClassifier
tree_model = DecisionTreeClassifier()
tree_model = tree_model.fit(pred_train, tar_train)
predictions = tree_model.predict(pred_test)
In [25]:
# Confusion matrix on the hold-out set: rows = true class, columns = predicted
sklearn.metrics.confusion_matrix(tar_test,predictions)
Out[25]:
In [26]:
# Proportion of test observations classified correctly
sklearn.metrics.accuracy_score(tar_test, predictions)
Out[26]:
In [27]:
# fit an Extra Trees model to the data to estimate the relative
# importance of the two predictors
model = ExtraTreesClassifier()
model.fit(pred_train,tar_train)
Out[27]:
In [22]:
# display the relative importance of each attribute
# NOTE(review): this cell's execution count (In[22]) precedes the model
# fit above (In[27]) — the notebook was run out of order; restart the
# kernel and run all cells top-to-bottom to guarantee a valid result.
print(model.feature_importances_)
In [28]:
"""
Running a different number of trees and see the effect
of that on the accuracy of the prediction
"""
trees=range(25)
accuracy=np.zeros(25)
for idx in range(len(trees)):
classifier=RandomForestClassifier(n_estimators=idx + 1)
classifier=classifier.fit(pred_train,tar_train)
predictions=classifier.predict(pred_test)
accuracy[idx]=sklearn.metrics.accuracy_score(tar_test, predictions)
In [29]:
%matplotlib inline
plt.cla()
plt.plot(trees, accuracy)
Out[29]:
In [ ]: