notebook.community

Edit and run



In [1]:

    
# loading the libraries
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
 # Feature Importance
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier



In [2]:

    
#Load the dataset
gapdata= pd.read_csv("gap.csv", low_memory=False)
data_clean=gapdata.dropna()



In [3]:

    
# Data pre-processing tasks
data_clean['breastcancerper100th']= data_clean['breastcancerper100th'].convert_objects(convert_numeric=True)
data_clean['femaleemployrate']= data_clean['femaleemployrate'].convert_objects(convert_numeric=True)
data_clean['alcconsumption']= data_clean['alcconsumption'].convert_objects(convert_numeric=True)









    



C:\Anaconda2\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':
C:\Anaconda2\lib\site-packages\ipykernel\__main__.py:2: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  from ipykernel import kernelapp as app
C:\Anaconda2\lib\site-packages\ipykernel\__main__.py:3: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  app.launch_new_instance()



In [4]:

    
#Create binary Breast Cancer Rate
def bin2cancer (row):
   if row['breastcancerper100th'] <= 20 :
      return 0
   elif row['breastcancerper100th'] > 20 :
      return 1

#Create binary Alcohol consumption
def bin2alcohol(row):
   if row['alcconsumption'] <= 5 :
      return 0
   elif row['alcconsumption'] > 5 :
      return 1
# create binary Female employee rate
def bin2femalemployee(row):
   if row['femaleemployrate'] <= 50 :
      return 0
   elif row['femaleemployrate'] > 50 :
      return 1
#Apply the new variables bin2alcohol,bin2femalemployee, bin2cancer to the gapmind dataset
data_clean['bin2femalemployee'] = data_clean.apply (lambda row: bin2femalemployee (row),axis=1)
data_clean['bin2alcohol'] = data_clean.apply (lambda row: bin2alcohol (row),axis=1)
data_clean['bin2cancer']=data_clean.apply(lambda row: bin2cancer(row),axis=1)



In [5]:

    
data_clean.dtypes









    Out[5]:





alcconsumption          int64
breastcancerper100th    int64
femaleemployrate        int64
bin2femalemployee       int64
bin2alcohol             int64
bin2cancer              int64
dtype: object



In [6]:

    
data_clean.describe()









    Out[6]:






  
    
      
      alcconsumption
      breastcancerper100th
      femaleemployrate
      bin2femalemployee
      bin2alcohol
      bin2cancer
    
  
  
    
      count
      165.000000
      165.000000
      165.000000
      165.000000
      165.000000
      165.000000
    
    
      mean
      6.672727
      37.521212
      47.951515
      0.424242
      0.539394
      0.751515
    
    
      std
      4.985545
      23.038035
      14.789089
      0.495732
      0.499963
      0.433450
    
    
      min
      0.000000
      4.000000
      12.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      3.000000
      21.000000
      39.000000
      0.000000
      0.000000
      1.000000
    
    
      50%
      6.000000
      30.000000
      48.000000
      0.000000
      1.000000
      1.000000
    
    
      75%
      10.000000
      50.000000
      56.000000
      1.000000
      1.000000
      1.000000
    
    
      max
      23.000000
      101.000000
      83.000000
      1.000000
      1.000000
      1.000000



In [7]:

    
# Assign predictor and traget variable
predictors=data_clean[['bin2alcohol','bin2femalemployee']]
target=data_clean.bin2cancer



In [8]:

    
#Split into training and testing sets
pred_train,pred_test,tar_train,tar_test=train_test_split(predictors,target,test_size=0.4)



In [9]:

    
pred_train.shape









    Out[9]:





(99, 2)



In [33]:

    
#tar_test.head



In [10]:

    
pred_test.shape









    Out[10]:





(66, 2)



In [11]:

    
tar_train.shape









    Out[11]:





(99L,)



In [12]:

    
tar_test.shape









    Out[12]:





(66L,)



In [24]:

    
#Build model on training data
from sklearn.ensemble import RandomForestClassifier
classifier=DecisionTreeClassifier()
classifier=classifier.fit(pred_train,tar_train)
predictions=classifier.predict(pred_test)



In [25]:

    
sklearn.metrics.confusion_matrix(tar_test,predictions)









    Out[25]:





array([[ 8,  7],
       [ 4, 47]])



In [26]:

    
sklearn.metrics.accuracy_score(tar_test, predictions)









    Out[26]:





0.83333333333333337



In [27]:

    
# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(pred_train,tar_train)









    Out[27]:





ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)



In [22]:

    
# display the relative importance of each attribute
print(model.feature_importances_)









    



[ 0.49408698  0.50591302]



In [28]:

    
"""
Running a different number of trees and see the effect
 of that on the accuracy of the prediction
"""
trees=range(25)
accuracy=np.zeros(25)
for idx in range(len(trees)):
   classifier=RandomForestClassifier(n_estimators=idx + 1)
   classifier=classifier.fit(pred_train,tar_train)
   predictions=classifier.predict(pred_test)
   accuracy[idx]=sklearn.metrics.accuracy_score(tar_test, predictions)



In [29]:

    
%matplotlib inline
plt.cla()
plt.plot(trees, accuracy)









    Out[29]:





[<matplotlib.lines.Line2D at 0xaa53ef0>]



In [ ]:

	alcconsumption	breastcancerper100th	femaleemployrate	bin2femalemployee	bin2alcohol	bin2cancer
count	165.000000	165.000000	165.000000	165.000000	165.000000	165.000000
mean	6.672727	37.521212	47.951515	0.424242	0.539394	0.751515
std	4.985545	23.038035	14.789089	0.495732	0.499963	0.433450
min	0.000000	4.000000	12.000000	0.000000	0.000000	0.000000
25%	3.000000	21.000000	39.000000	0.000000	0.000000	1.000000
50%	6.000000	30.000000	48.000000	0.000000	1.000000	1.000000
75%	10.000000	50.000000	56.000000	1.000000	1.000000	1.000000
max	23.000000	101.000000	83.000000	1.000000	1.000000	1.000000