In [4]:
import gender_classifier_gammel 
import sys

from pylab import *

data_set = gender_classifier.load_gender_with_comments_from_file("ModifiedDataSet")
data_set_original = gender_classifier.load_gender_with_comments_from_file("OriginalDataSet") 
#print data_set[0]
#print data_set[2]

sentiment_danish = gender_classifier.sentiment_danish_words()
gender_classifier.preprocessing(data_set[0][0])

cleaned_data_set = gender_classifier.clean_comments(data_set)
#print cleaned_data_set[2]
feature_set = gender_classifier.generate_feature_set(cleaned_data_set, sentiment_danish)
#print feature_set[0]
X, y, an, cn = gender_classifier.feature_extractor_to_scikitLearn(feature_set)

X = gender_classifier.standardize_features(X)
trainAcAB, testAcAB, featureImportanceAB, cmAB = gender_classifier.classification(X, y, "ada_boost")
trainAcRD, testAcRF, featureImportanceRF, cmRF = gender_classifier.classification(X, y, "random_forest")
trainAcSVM, testAcSVM, featureImportance, cmSVM = gender_classifier.classification(X, y, "svm")
trainAcLR, testAcLR, featureImportance, cmLR = gender_classifier.classification(X, y, "logistic_regression")

print an, cn
print trainAcAB, testAcAB, featureImportanceAB, cmAB


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-4-27d3e5bc97b6> in <module>()
----> 1 import gender_classifier_gammel
      2 import sys
      3 
      4 from pylab import *
      5 

/Users/Henrik/InfoMine/infomine/gender_classifier_gammel.py in <module>()
    302         #print featuresets
    303 
--> 304 data_set = load_gender_with_comments_from_file("ModifiedDataSet")
    305 print data_set[0]
    306 print data_set[2]

/Users/Henrik/InfoMine/infomine/gender_classifier_gammel.py in load_gender_with_comments_from_file(filename)
     33     data_dir = os.path.join(os.path.dirname(__file__), '../data')
     34 
---> 35     with open(os.path.join(data_dir, data_set_file), 'r') as in_file:
     36         for line in csv.reader(in_file):
     37             data_set.append((line[0].decode("utf-8"), line[1].decode("utf-8"), line[2], line[3], line[4], line[5]))

IOError: [Errno 2] No such file or directory: '/Users/Henrik/InfoMine/data/ModifiedDataSet.csv'

In [42]:
%matplotlib inline 
#data
#np.random.seed(42)
data = featureImportanceRF
names = an
fig = figure(num=None, figsize=(18, 6), dpi=100, facecolor='w', edgecolor='k')
ax = plt.subplot(111)
width=0.8
bins = map(lambda x: x-width/2,range(1,len(data)+1))
ax.bar(bins,data,width=width)
ax.set_xticks(map(lambda x: x, range(1,len(data)+1)))
ax.set_xticklabels(names,rotation=20, rotation_mode="anchor", ha="right")
ylabel('Feature importance (%)');
title('Feature importance for Random Forest classifier');
#fig.set_size_inches(8,6)
fig.savefig('feature_importance.png', format='png')
#plt.show()



In [2]:
#import numpy as np
#import matplotlib.pyplot as plt

accuracy = round(100*(sum(testAcRF)/len(testAcRF)))
print accuracy
conf_arr = cmRF
labels = cn
print labels
norm_conf = []

for i in conf_arr:
    a = 0
    tmp_arr = []
    a = sum(i, 0)
    for j in i:
        tmp_arr.append(float(j)/float(a))
    norm_conf.append(tmp_arr)

fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
#fig = plt.figure()
plt.clf()
ax = fig.add_subplot(111)

ax.set_aspect(1)
res = ax.imshow(np.array(norm_conf), cmap=plt.cm.jet, 
               interpolation='nearest')

width = len(conf_arr)
height = len(conf_arr[0])

for x in xrange(width):
    for y in xrange(height):
        ax.annotate(str(conf_arr[x][y]), xy=(y, x), 
                    horizontalalignment='center',
                    verticalalignment='center')

cb = fig.colorbar(res)
alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

#ax.set_xticklabels(labels)
#ax.set_yticklabels(labels)
xticks(range(C)); yticks(range(C));
xlabel('Predicted class'); ylabel('Actual class');
title('Confusion matrix for Random Forest classifier (Accuracy: {0}%)'.format(accuracy));
fig.savefig('confusion_matrix.png', format='png')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-50f7a8062f8d> in <module>()
      2 #import matplotlib.pyplot as plt
      3 
----> 4 accuracy = round(100*(sum(testAcRF)/len(testAcRF)))
      5 print accuracy
      6 conf_arr = cmRF

NameError: name 'testAcRF' is not defined

In [1]:
# Alternative confusion-matrix rendering for the Random Forest
# classifier, with the class names written on the axes.
labels = cn
cm = cmRF
conf_arr = cmRF  # fixed typo: the original bound `con_arr` but later read `conf_arr`
print(cm)
# Average the Random Forest test accuracies to match the matrix and the
# title; the original mistakenly averaged testAcAB (the AdaBoost scores).
accuracy = round(100*(sum(testAcRF)/len(testAcRF)))
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')

# Create the axes BEFORE drawing on them: the original called
# ax.set_aspect/ax.imshow/ax.annotate before `ax = fig.add_subplot(111)`
# existed in this cell.  The dead row-normalisation + imshow pass was
# dropped -- its output was immediately painted over by matshow anyway.
ax = fig.add_subplot(111)
ax.set_aspect(1)
cax = ax.matshow(cm)

# Overlay the raw counts; matshow draws row x along the y axis, hence
# the swapped annotation position xy=(y, x).
for x in xrange(len(conf_arr)):
    for y in xrange(len(conf_arr[0])):
        ax.annotate(str(conf_arr[x][y]), xy=(y, x), 
                    horizontalalignment='center',
                    verticalalignment='center')

title('Confusion matrix for Random Forest classifier (Accuracy: {0}%)'.format(accuracy));
fig.colorbar(cax)
# Leading '' because matshow places the first tick before cell 0.
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
xlabel('Predicted')
ylabel('True')
fig.savefig('confusion_matrix.png', format='png')
show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-cb566cafdf32> in <module>()
----> 1 labels = cn
      2 cm = cmRF
      3 con_arr = cmRF
      4 print(cm)
      5 accuracy = round(100*(sum(testAcAB)/len(testAcAB)))

NameError: name 'cn' is not defined

In [48]:
from scipy import stats
#print Error_dectree
# Use T-test to check if classifiers are significantly different.
# NOTE(review): pvalue is later indexed with [0], which implies the
# testAc* accuracy arrays are column vectors rather than flat lists --
# confirm their shape in gender_classifier.classification.
[tstatistic, pvalue] = stats.ttest_ind(testAcSVM,testAcLR)
if pvalue<=0.05:
    print('Classifiers are significantly different. (p={0})'.format(pvalue[0]))
else:
    print('Classifiers are not significantly different (p={0})'.format(pvalue[0]))        
    
# Boxplot to compare classifier error distributions.
# np.bmat given a string resolves the names in the caller's frame and
# stacks the four accuracy vectors side by side: one box per classifier,
# in the order named in the xlabel below.
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
boxplot(np.bmat('testAcAB, testAcRF, testAcSVM, testAcLR'))
xlabel('AdaBoost | Random Forest | Support Vector Machine | Logistic Regression')
ylabel('Cross-validation accuracy [%]')
title('Accuracy of the four classifiers with 10-fold CV');
fig.savefig('accuracy_boxplot.png', format='png')


Classifiers are not significantly different (p=0.0854817999509)

In [17]:
# Re-extract the scikit-learn style arrays: X feature matrix, y labels,
# an attribute (feature) names, cn class names -- then standardize X and
# show the label column.
X, y, an, cn = gender_classifier.feature_extractor_to_scikitLearn(feature_set)
X = gender_classifier.standardize_features(X)
print y


[[0]
 [0]
 [0]
 ..., 
 [1]
 [0]
 [0]]

In [18]:
%matplotlib inline  

# Compute values of N, M and C.
N = len(y)
M = len(an)
C = len(cn)

# Subtract mean value from data
Y = X - np.ones((N, 1))*X.mean(0)

# PCA by computing SVD of Y
U,S,V = linalg.svd(Y,full_matrices=False)
V = mat(V).T

# Project the centered data onto principal component space
Z = Y * V

# Indices of the principal components to be plotted
i = 0
j = 1

# Plot PCA of the data
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')

fig.hold()
title('Susceptibility data')
for c in range(C):
    # select indices belonging to class c:
    class_mask = y.ravel()==c
    plot(Z[class_mask,i], Z[class_mask,j], 'o')
legend(cn)
xlabel('PC{0}'.format(i+1))
ylabel('PC{0}'.format(j+1))

# Output result to screen
fig.savefig('PCAplot.png', format='png')



In [19]:
# Compute variance explained by principal components: squared singular
# values normalised to sum to 1 (S comes from the SVD in the PCA cell
# above -- this cell will NameError on a fresh kernel if run first).
rho = (S*S) / (S*S).sum() 


# Plot variance explained, one point per principal component.
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
plot(rho,'o-')
title('Variance explained by principal components');
xlabel('Principal component');
ylabel('Variance explained value');
fig.savefig('VariancePCAplot.png', format='png')



In [59]:
from __future__ import division
import nltk

cFemale = 0
cMale = 0
sum_male_likes = 0
sum_female_likes = 0
sum_total_likes = 0
sum_words_male = 0
sum_words_female = 0

#data_set = data_set_original[1:]
#print data_set[0]

for t in data_set:
    if t[0] == "Female":
        cFemale += 1
        sum_female_likes += int(t[3])
        words = nltk.word_tokenize(t[1])
        sum_words_female = sum_words_female + len(words)
    else:
        cMale += 1
        sum_male_likes += int(t[2])
        sum_words_male = sum_words_male + len(words)

    sum_total_likes += int(t[4])

#print t[1]
#print len(t[1])
total = cFemale + cMale
procent_male = cMale/total
procent_female = cFemale/total
print "Procent male: %s" %procent_male
print "Procent female: %s" %procent_female
print "Total comments: %s" %total
print "Male comments: %s" %cFemale
print "Female comments: %s" %cMale

avg_male_likes = sum_male_likes/cMale
avg_female_likes = sum_female_likes/cFemale
avg_total_likes = sum_total_likes/total
avg_length_comments_female = sum_words_female / cFemale
avg_length_comments_male = sum_words_male / cMale
print "average male likes pr comment: %s" %avg_male_likes
print "average female likes pr comment: %s" %avg_female_likes
print "average total likes pr comment: %s" %avg_total_likes
print "average number of words pr comment pr female: %s" %avg_length_comments_female
print "average number of words pr comment pr male: %s" %avg_length_comments_male


Procent male: 0.5484
Procent female: 0.4516
Total comments: 2500
Male comments: 1129
Female comments: 1371
average male likes pr comment: 2.95185995624
average female likes pr comment: 0.82019486271
average total likes pr comment: 3.7472
average number of words pr comment pr female: 108.781222321
average number of words pr comment pr male: 116.218818381

In [ ]: