In [1]:
##############################
#Author Skye Ouyang
#Date: 24th Apr
##############################
import pandas as pd
import numpy as np
from pandas import DataFrame
import sklearn
import requests
import StringIO
import json
import re
In [2]:
# Load the text-analytics feature table, using the 'BOOK ID' column as the index.
# pd.read_csv(..., parse_dates=True) is the documented equivalent of the
# deprecated DataFrame.from_csv, which newer pandas releases no longer provide.
base_path = './'
whole_data = pd.read_csv(base_path + 'Text Analytics.csv',
                         index_col='BOOK ID', parse_dates=True)
# Show only the dimensions here instead of echoing the whole frame.
whole_data.shape
Out[2]:
In [3]:
# Preview the first rows of the loaded table.
whole_data.head()
Out[3]:
In [4]:
# Summary statistics of the loaded (unscaled) table.
whole_data.describe()
Out[4]:
In [6]:
# Min-max scale every column: (x - min) / (max - min).
# NOTE: a column whose max equals its min divides 0 by 0 and becomes NaN.
col_min = whole_data.min()
col_range = whole_data.max() - col_min
Whole_data = (whole_data - col_min) / col_range
# Preview the *scaled* frame; the cell previously echoed the unscaled
# `whole_data`, so the displayed output did not reflect the computation.
Whole_data.head()
Out[6]:
In [7]:
# Summary statistics of the min-max scaled frame.
Whole_data.describe()
Out[7]:
In [8]:
# Pairwise column correlations of the scaled frame (DataFrame.corr with its default method).
corrmat = Whole_data.corr()
corrmat
Out[8]:
In [9]:
import seaborn as sns
import matplotlib.pyplot as plt

# Create a 12x9 inch figure and draw the correlation matrix on its axes.
f, ax = plt.subplots(figsize=(12, 9))
# Colour scale is capped at 0.8; cells are drawn square.
sns.heatmap(data=corrmat, vmax=0.8, square=True, ax=ax)
plt.show()
In [10]:
# List column pairs whose absolute correlation exceeds 0.9.
c = Whole_data.corr().abs()
# Flatten the square matrix into a Series indexed by (column, column) pairs.
s = c.unstack()
# Series.order is deprecated (and absent from newer pandas); sort_values is
# its direct replacement.
so = s.sort_values(kind='quicksort')
# The flattened matrix still contains each column paired with itself and both
# orderings of every pair, so those entries show up in this listing as well.
so[so > 0.9]
Out[10]:
Separate the independent variables (features) from the dependent variable (label) for modelling.
In [11]:
# Target: the 'Label' column of the scaled frame.
data_y = Whole_data['Label']
# Features: every remaining column except 'Avg_sen_len' and 'Blank'.
# Index.difference returns the column names in sorted order.
feature_columns = Whole_data.columns.difference(['Label', 'Avg_sen_len', 'Blank'])
# .loc replaces the deprecated .ix indexer for label-based selection.
data_X = Whole_data.loc[:, feature_columns]
# Sanity-check shapes and dtypes of target and features.
print(data_y.shape)
print(data_X.shape)
print(data_y.dtype)
print(data_X.dtypes)
In [12]:
# Summary statistics of the selected feature columns.
data_X.describe()
Out[12]:
In [13]:
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
In [14]:
# Prepare the candidate classifiers as (short name, estimator) pairs.
X = data_X
Y = data_y
seed = 5
# The forest, boosting and neural-network estimators are stochastic; seed them
# so that the comparison is reproducible across kernel restarts.
models = [
    ('LR', LogisticRegression()),
    ('RF', RandomForestClassifier(random_state=seed)),
    ('GB', GradientBoostingClassifier(random_state=seed)),
    ('NN', MLPClassifier(random_state=seed)),
]
In [15]:
# Evaluate every candidate model with 10-fold cross-validated accuracy.
results = []
names = []
scoring = 'accuracy'
# shuffle=True is required for random_state to have any effect; without it
# KFold ignores the seed (and newer scikit-learn versions raise an error).
# With an integer seed the splitter yields the same folds for every model,
# so it can be built once outside the loop.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy and its standard deviation across the folds.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
In [16]:
# Box plot of the per-fold scores, one box per model.
fig = plt.figure()
fig.suptitle('Model Comparison')
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
In [17]:
# Split into training (70%) and test (30%) sets with a fixed seed.
# train_test_split lives in sklearn.model_selection; the older
# sklearn.cross_validation module is deprecated and absent from newer releases.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data_X, data_y, test_size=0.3, random_state=0)
print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))
In [21]:
# Measure test-set accuracy of a random forest for 1..25 trees.
# `trees` holds the actual n_estimators values; it used to be range(25)
# (0..24) while the forests were fitted with idx + 1 trees, which shifted the
# x axis of the follow-up plot by one.
trees = list(range(1, 26))
accuracy = np.zeros(len(trees))
for idx, n_trees in enumerate(trees):
    # Fixed seed so the curve is reproducible and comparable between runs.
    classifier = RandomForestClassifier(n_estimators=n_trees, random_state=0)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracy[idx] = accuracy_score(y_test, predictions)
In [22]:
# Plot accuracy against the swept tree counts on freshly cleared current axes.
tree_ax = plt.gca()
tree_ax.cla()
tree_ax.plot(trees, accuracy)
plt.show()
In [50]:
# Fit a 17-tree random forest and report its test-set metrics.
print('Random Forest')
# random_state=0 (as in the tuned gradient-boosting cell) makes the fitted
# forest, and therefore every metric below, reproducible.
rf = RandomForestClassifier(n_estimators=17, min_samples_leaf=1, random_state=0)
print('Fitting model')
rf_fit = rf.fit(X_train, y_train)
print('Predcting on test set')
y_pred_rf = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred_rf))
print(accuracy_score(y_test, y_pred_rf))
print(f1_score(y_test, y_pred_rf))
In [51]:
# Fit the tuned gradient-boosting classifier and report its test-set metrics.
print('Gradient Boosting')
gb_params = dict(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=10,
    max_features=0.8,
    min_samples_leaf=4,
    random_state=0,
    subsample=0.9
)
gb = GradientBoostingClassifier(**gb_params)
print('Fitting model')
gb.fit(X_train, y_train)
print('Predicting on test set')
y_pred_gb = gb.predict(X_test)
print(confusion_matrix(y_test, y_pred_gb))
print(accuracy_score(y_test, y_pred_gb))
print(f1_score(y_test, y_pred_gb))
In [52]:
# ROC curves and AUC for the random forest and gradient-boosting models.
# roc_curve needs continuous scores: feeding it hard class predictions yields
# a single operating point, so use the predicted probability instead.
# Column 1 of predict_proba belongs to the larger class label (classes_ is
# sorted) - assumed here to be the positive class; confirm against the data.
rf_scores = rf.predict_proba(X_test)[:, 1]
gb_scores = gb.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(y_test, rf_scores)
fpr_gb, tpr_gb, thresholds_gb = metrics.roc_curve(y_test, gb_scores)
plt.plot(fpr_rf, tpr_rf, label='Random Forest')
plt.plot(fpr_gb, tpr_gb, color='orange', label='Gradient Boosting')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve')
plt.xlabel('False Positive Rate(1-Specificity)')
plt.ylabel('True Positive Rate(Sensitivity)')
plt.grid(True)
# Diagonal reference line joining (0, 0) and (1, 1).
plt.plot((0.0, 1.0), (0.0, 1.0), color='grey', linewidth=1, linestyle='--')
plt.legend(loc='lower right')
plt.show()
print('Auc for random forest is ' + str(auc(fpr_rf, tpr_rf)))
print('Auc for Gradient Boosting is ' + str(auc(fpr_gb, tpr_gb)))
In [53]:
# Rank the features by random-forest importance (rounded to 4 decimals),
# highest first, and print the (importance, name) pairs.
names = data_X.columns.values
print("Features sorted by their score")
ranked = sorted(
    ((round(weight, 4), feature)
     for weight, feature in zip(rf.feature_importances_, names)),
    reverse=True
)
print(ranked)
In [54]:
# Inputs for the importance plot: the forest's importances, the feature
# positions ordered from most to least important, and the per-feature
# standard deviation of the importances across the individual trees.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
per_tree = [tree.feature_importances_ for tree in rf.estimators_]
std = np.std(per_tree, axis=0)
for summary in (importances, indices, std):
    print(summary)
In [55]:
# Bar chart of the random-forest feature importances, most important first,
# with the spread across individual trees drawn as error bars.
import matplotlib.pyplot as plt
n_features = data_X.shape[1]
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices],
        color="r", yerr=std[indices], align="center")
# Label the bars with feature names rather than bare column positions so the
# chart can be read without cross-referencing the printed index array.
plt.xticks(range(n_features), data_X.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.ylabel("Importance")
plt.tight_layout()
plt.show()
In [56]:
# Log loss is defined on predicted probabilities; hard 0/1 predictions get
# clipped to the numerical limits and produce a meaningless value.
log_loss(y_test, rf.predict_proba(X_test), normalize=True)
Out[56]:
In [57]:
def score(sss, yyy):
    """Compare two sequences position by position.

    Parameters
    ----------
    sss : indexable sequence
        Its length decides how many positions are compared.
    yyy : indexable sequence
        Must hold at least ``len(sss)`` items; extra items are ignored.

    Returns
    -------
    list of int
        One entry per position of ``sss``: 1 where ``sss[i] == yyy[i]``,
        otherwise 0.

    Raises
    ------
    IndexError
        If ``yyy`` is shorter than ``sss``.
    """
    # Index both sequences explicitly (rather than zip) so that a too-short
    # ``yyy`` still raises instead of silently truncating the comparison.
    return [1 if sss[i] == yyy[i] else 0 for i in range(len(sss))]
In [58]:
# Ground-truth labels for the held-out rows. Taking Whole_data['Label'] here
# would line the test-set predictions up against the first rows of the full
# dataset, which are not the rows that were predicted.
ytest = y_test.values
In [59]:
# Per-position match flags (1 = equal, 0 = different) between the random-forest
# predictions and the labels in `ytest`.
# NOTE: this rebinds `c`, which the correlation cell earlier used for the
# absolute correlation matrix.
c=score(y_pred_rf,ytest)
In [60]:
# Fraction of positions flagged as matching.
sum(c) / float(len(c))
Out[60]:
In [61]:
# Fit a gradient-boosting classifier with default hyper-parameters.
# NOTE: this rebinds `gb` and `y_pred_gb`, replacing the tuned model and its
# predictions from the earlier gradient-boosting cell.
print('GradientBoosting')
# Seeded like the tuned model so the predictions are reproducible.
gb = GradientBoostingClassifier(random_state=0)
print('Fitting model')
gb.fit(X_train, y_train)
print('Predicting on test set')
y_pred_gb = gb.predict(X_test)
In [63]:
# Full metric report for the random-forest predictions on the test set.
from sklearn.metrics import precision_score,recall_score, confusion_matrix, classification_report,accuracy_score, f1_score
# Each line is built with %-formatting and printed as a single argument, so
# the text is the same whether print is a statement (Python 2) or a function
# (Python 3).
print('Accuracy: %s' % (accuracy_score(y_test, y_pred_rf),))
print('F1 score: %s' % (f1_score(y_test, y_pred_rf),))
print('Recall: %s' % (recall_score(y_test, y_pred_rf),))
print('Precision: %s' % (precision_score(y_test, y_pred_rf),))
# No space after the header's trailing newline: the Python 2 print statement
# does not insert one at the start of a line.
print('\n clasification report:\n%s' % (classification_report(y_test, y_pred_rf),))
print('\n confussion matrix:\n%s' % (confusion_matrix(y_test, y_pred_rf),))