In [61]:
import gender_classifier
from pylab import *
data_set = gender_classifier.load_gender_with_comments_from_file("ModifiedDataSet")
data_set_original = gender_classifier.load_gender_with_comments_from_file("OriginalDataSet")
#print data_set[0]
#print data_set[2]
sentiment_danish = gender_classifier.sentiment_danish_words()
gender_classifier.preprocessing(data_set[0][0])
cleaned_data_set = gender_classifier.clean_comments(data_set)
#print cleaned_data_set[2]
feature_set = gender_classifier.generate_feature_set(cleaned_data_set, sentiment_danish)
#print feature_set[0]
X, y, an, cn = gender_classifier.feature_extractor_to_scikitLearn(feature_set)
X = gender_classifier.standardize_features(X)
trainAcAB, testAcAB, featureImportanceAB, cmAB = gender_classifier.classification(X, y, "ada_boost")
trainAcRF, testAcRF, featureImportanceRF, cmRF = gender_classifier.classification(X, y, "random_forest")
trainAcSVM, testAcSVM, featureImportanceSVM, cmSVM = gender_classifier.classification(X, y, "svm")
trainAcLR, testAcLR, featureImportanceLR, cmLR = gender_classifier.classification(X, y, "logistic_regression")
print an, cn
print trainAcAB, testAcAB, featureImportanceAB, cmAB
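The gender_classifier module itself is not shown in this notebook. As a point of reference only, a classification(X, y, name)-style helper with 10-fold cross-validation could look roughly like the sketch below; the function name, the 100-tree forest and the percent scaling are assumptions here, not the module's actual code, and X/y are assumed to be NumPy arrays.

# Hypothetical sketch -- not the real gender_classifier.classification
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

def classification_sketch(X, y, n_splits=10):
    """Return per-fold train/test accuracies (in %), mean feature importances
    and a summed confusion matrix, mirroring the four values used above."""
    classes = np.unique(y)
    train_acc, test_acc = [], []
    cm = np.zeros((len(classes), len(classes)), dtype=int)
    importances = np.zeros(X.shape[1])
    for train_idx, test_idx in StratifiedKFold(n_splits=n_splits).split(X, y):
        clf = RandomForestClassifier(n_estimators=100).fit(X[train_idx], y[train_idx])
        train_acc.append(100 * clf.score(X[train_idx], y[train_idx]))
        test_acc.append(100 * clf.score(X[test_idx], y[test_idx]))
        cm += confusion_matrix(y[test_idx], clf.predict(X[test_idx]), labels=classes)
        importances += clf.feature_importances_
    return train_acc, test_acc, importances / n_splits, cm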
In [6]:
%matplotlib inline
#data
#np.random.seed(42)
data = featureImportanceRF
names = an
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
ax = plt.subplot(111)
width=0.7
bins = [x - width/2 for x in range(1, len(data)+1)]
ax.bar(bins, data, width=width)
ax.set_xticks(range(1, len(data)+1))
ax.set_xticklabels(names,rotation=20, rotation_mode="anchor", ha="right")
ylabel('Feature importance (%)');
title('Feature importance for Random Forest classifier');
#fig.set_size_inches(8,6)
fig.savefig('feature_importance.png', format='png')
#plt.show()
In [34]:
import numpy as np
import matplotlib.pyplot as plt
accuracy = sum(testAcRF)/len(testAcRF)
conf_arr = cmRF
labels = cn
print labels
norm_conf = []
for i in conf_arr:
    tmp_arr = []
    a = sum(i, 0)
    for j in i:
        tmp_arr.append(float(j)/float(a))
    norm_conf.append(tmp_arr)
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
#fig = plt.figure()
plt.clf()
ax = fig.add_subplot(111)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
#ax.set_aspect(1)
res = ax.imshow(np.array(norm_conf), cmap=plt.cm.jet, interpolation='nearest')
width = len(conf_arr)
height = len(conf_arr[0])
for x in xrange(width):
    for y in xrange(height):
        ax.annotate(str(conf_arr[x][y]), xy=(y, x),
                    horizontalalignment='center',
                    verticalalignment='center')
cb = fig.colorbar(res)
xlabel('Predicted class'); ylabel('Actual class');
title('Confusion matrix for Random Forest classifier (Accuracy: {0}%)'.format(accuracy));
fig.savefig('confusion_matrix.png', format='png')
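As an aside, the row-normalisation loop above can be written in vectorised form (a sketch, assuming conf_arr can be converted to a NumPy array):

# Vectorised equivalent of the per-row normalisation loop
norm_conf = np.asarray(conf_arr, dtype=float)
norm_conf = norm_conf / norm_conf.sum(axis=1, keepdims=True)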
In [60]:
labels = cn
cm = cmRF
conf_arr = cmRF
print(cm)
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax.set_aspect(1)
# Normalise each row of the confusion matrix by its row sum
norm_conf = []
for i in conf_arr:
    tmp_arr = []
    a = sum(i, 0)
    for j in i:
        tmp_arr.append(float(j)/float(a))
    norm_conf.append(tmp_arr)
cax = ax.imshow(np.array(norm_conf), cmap=plt.cm.jet, interpolation='nearest')
# Annotate each cell with the raw count
width = len(conf_arr)
height = len(conf_arr[0])
for x in xrange(width):
    for y in xrange(height):
        ax.annotate(str(conf_arr[x][y]), xy=(y, x),
                    horizontalalignment='center',
                    verticalalignment='center')
title('Confusion matrix of the classifier')
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
xlabel('Predicted')
ylabel('True')
show()
In [31]:
from scipy import stats
# Use T-test to check if classifiers are significantly different
[tstatistic, pvalue] = stats.ttest_ind(testAcAB, testAcRF)
if pvalue <= 0.05:
    print('Classifiers are significantly different. (p={0})'.format(pvalue))
else:
    print('Classifiers are not significantly different. (p={0})'.format(pvalue))
# Boxplot to compare classifier error distributions
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
boxplot(np.bmat('testAcAB, testAcRF, testAcSVM, testAcLR'))
xlabel('AdaBoost | Random Forest | Support Vector Machine | Logistic Regression')
ylabel('Cross-validation accuracy [%]')
title('Accuracy of the four classifiers with 10-fold CV');
fig.savefig('accuracy_boxplot.png', format='png')
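Note that np.bmat with a string argument resolves the variable names from the caller's namespace and concatenates them as matrix blocks. A plainer equivalent, assuming the testAc* variables are equal-length sequences of fold accuracies, would be:

# One column per classifier, one row per CV fold
scores = np.column_stack([testAcAB, testAcRF, testAcSVM, testAcLR])
boxplot(scores)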
In [41]:
%matplotlib inline
# Compute values of N, M and C.
N = len(y)
M = len(an)
C = len(cn)
# Subtract mean value from data
Y = X - np.ones((N, 1))*X.mean(0)
# PCA by computing SVD of Y
U,S,V = linalg.svd(Y,full_matrices=False)
V = mat(V).T
# Project the centered data onto principal component space
Z = Y * V
# Indices of the principal components to be plotted
i = 0
j = 1
# Plot PCA of the data
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
fig.hold()
title('PCA projection of the data')
for c in range(C):
    # select indices belonging to class c:
    class_mask = y.ravel()==c
    plot(Z[class_mask,i], Z[class_mask,j], 'o')
legend(cn)
xlabel('PC{0}'.format(i+1))
ylabel('PC{0}'.format(j+1))
# Output result to screen
fig.savefig('PCAplot.png', format='png')
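As an optional sanity check (a sketch, assuming scikit-learn is available), the SVD-based projection above should agree with scikit-learn's PCA up to the sign of each component:

from sklearn.decomposition import PCA
Z_sk = PCA(n_components=2).fit_transform(np.asarray(X))
# Signs of individual components are arbitrary, so compare absolute values
print np.allclose(np.abs(np.asarray(Z[:, :2])), np.abs(Z_sk))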
In [27]:
# Compute variance explained by principal components
rho = (S*S) / (S*S).sum()
# Plot variance explained
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
plot(rho,'o-')
title('Variance explained by principal components');
xlabel('Principal component');
ylabel('Variance explained value');
fig.savefig('VariancePCAplot.png', format='png')
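The cumulative sum of rho shows how many components are needed to reach a given share of the variance (a small follow-up sketch; the 90% threshold is just an example):

# Index of the first component at which the cumulative variance reaches 90%
print 'Components needed for 90% of the variance:', np.argmax(np.cumsum(rho) >= 0.9) + 1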
In [59]:
from __future__ import division
import nltk
cFemale = 0
cMale = 0
sum_male_likes = 0
sum_female_likes = 0
sum_total_likes = 0
sum_words_male = 0
sum_words_female = 0
#data_set = data_set_original[1:]
#print data_set[0]
for t in data_set:
    words = nltk.word_tokenize(t[1])
    if t[0] == "Female":
        cFemale += 1
        sum_female_likes += int(t[3])
        sum_words_female += len(words)
    else:
        cMale += 1
        sum_male_likes += int(t[2])
        sum_words_male += len(words)
    sum_total_likes += int(t[4])
    #print t[1]
    #print len(t[1])
total = cFemale + cMale
procent_male = cMale/total
procent_female = cFemale/total
print "Procent male: %s" %procent_male
print "Procent female: %s" %procent_female
print "Total comments: %s" %total
print "Male comments: %s" %cFemale
print "Female comments: %s" %cMale
avg_male_likes = sum_male_likes/cMale
avg_female_likes = sum_female_likes/cFemale
avg_total_likes = sum_total_likes/total
avg_length_comments_female = sum_words_female / cFemale
avg_length_comments_male = sum_words_male / cMale
print "average male likes pr comment: %s" %avg_male_likes
print "average female likes pr comment: %s" %avg_female_likes
print "average total likes pr comment: %s" %avg_total_likes
print "average number of words pr comment pr female: %s" %avg_length_comments_female
print "average number of words pr comment pr male: %s" %avg_length_comments_male