In [61]:
import gender_classifier
from pylab import *
data_set = gender_classifier.load_gender_with_comments_from_file("ModifiedDataSet")
data_set_original = gender_classifier.load_gender_with_comments_from_file("OriginalDataSet") 
#print data_set[0]
#print data_set[2]

sentiment_danish = gender_classifier.sentiment_danish_words()
gender_classifier.preprocessing(data_set[0][0])

cleaned_data_set = gender_classifier.clean_comments(data_set)
#print cleaned_data_set[2]
feature_set = gender_classifier.generate_feature_set(cleaned_data_set, sentiment_danish)
#print feature_set[0]
X, y, an, cn = gender_classifier.feature_extractor_to_scikitLearn(feature_set)

X = gender_classifier.standardize_features(X)
trainAcAB, testAcAB, featureImportanceAB, cmAB = gender_classifier.classification(X, y, "ada_boost")
trainAcRD, testAcRF, featureImportanceRF, cmRF = gender_classifier.classification(X, y, "random_forest")
trainAcSVM, testAcSVM, featureImportance, cmSVM = gender_classifier.classification(X, y, "svm")
trainAcLR, testAcLR, featureImportance, cmLR = gender_classifier.classification(X, y, "logistic_regression")

print an, cn
print trainAcAB, testAcAB, featureImportanceAB, cmAB


['number_of_words', 'number_of_sentences', 'lexical_diversity', 'average_sentiment', 'maximum_sentiment', 'minimum_sentiment', 'male_likes', 'female_likes', 'total_likes', 'male_female_likes_ratio'] ['Male', 'Female']
[[ 0.59629815]
 [ 0.614     ]
 [ 0.616     ]
 [ 0.6135    ]
 [ 0.62418791]] [[ 0.57884232]
 [ 0.542     ]
 [ 0.57      ]
 [ 0.526     ]
 [ 0.49498998]] [ 0.196  0.1    0.368  0.088  0.056  0.024  0.032  0.044  0.064  0.028] [[184  90]
 [162  63]]

In [ ]:


In [6]:
%matplotlib inline 
#data
#np.random.seed(42)
data = featureImportanceRF
names = an
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
ax = plt.subplot(111)
width=0.7
bins = map(lambda x: x-width/2,range(1,len(data)+1))
ax.bar(bins,data,width=width)
ax.set_xticks(map(lambda x: x, range(1,len(data)+1)))
ax.set_xticklabels(names,rotation=20, rotation_mode="anchor", ha="right")
ylabel('Feature importance (%)');
title('Feature importance for Random Forest classifier');
#fig.set_size_inches(8,6)
fig.savefig('feature_importance.png', format='png')
#plt.show()



In [34]:
import numpy as np
import matplotlib.pyplot as plt

accuracy = sum(testAcRF)/len(testAcRF)
conf_arr = cmRF
labels = cn
print labels
norm_conf = []

for i in conf_arr:
    a = 0
    tmp_arr = []
    a = sum(i, 0)
    for j in i:
        tmp_arr.append(float(j)/float(a))
    norm_conf.append(tmp_arr)

fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
#fig = plt.figure()
plt.clf()
ax = fig.add_subplot(111)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
#ax.set_aspect(1)
res = ax.imshow(np.array(norm_conf), cmap=plt.cm.jet, 
               interpolation='nearest')

width = len(conf_arr)
height = len(conf_arr[0])

for x in xrange(width):
    for y in xrange(height):
        ax.annotate(str(conf_arr[x][y]), xy=(y, x), 
                    horizontalalignment='center',
                    verticalalignment='center')

cb = fig.colorbar(res)
alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

#xticks(range(C)); yticks(range(C));
xlabel('Predicted class'); ylabel('Actual class');
title('Confusion matrix for Random Forest classifier (Accuracy: {0}%)'.format(accuracy));
fig.savefig('confusion_matrix.png', format='png')


['Male', 'Female']

In [60]:
# Simpler confusion-matrix figure for the same RF results, drawn with matshow.
labels = cn
cm = cmRF
conf_arr = cmRF  # fixed typo: was `con_arr`, which silently reused the stale
                 # `conf_arr` left over from the previous cell
print(cm)

fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
# Create the axes BEFORE drawing on them.  Previously `ax` leaked in from the
# preceding cell, so the aspect/imshow/annotations landed on the wrong figure
# and were then discarded when `ax` was reassigned below.
ax = fig.add_subplot(111)
ax.set_aspect(1)

cax = ax.matshow(cm)
fig.colorbar(cax)

# Annotate each cell with its raw count.  Loop variables avoid `x`/`y` so the
# globals used by later cells are not clobbered.
for row in xrange(len(conf_arr)):
    for col in xrange(len(conf_arr[0])):
        ax.annotate(str(conf_arr[row][col]), xy=(col, row),
                    horizontalalignment='center',
                    verticalalignment='center')

title('Confusion matrix of the classifier')
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
xlabel('Predicted')
ylabel('True')
show()


[[192  82]
 [158  67]]

In [31]:
from scipy import stats

# Two-sample t-test: are the AdaBoost and Random Forest CV accuracies
# significantly different?
[tstatistic, pvalue] = stats.ttest_ind(testAcAB, testAcRF)
# Index with [0]: the accuracy arrays are column vectors, so ttest_ind returns
# length-1 arrays; truth-testing the bare array is deprecated in NumPy.
if pvalue[0] <= 0.05:
    print('Classifiers are significantly different. (p={0})'.format(pvalue[0]))
else:
    print('Classifiers are not significantly different (p={0})'.format(pvalue[0]))

# Boxplot comparing the CV accuracy distributions of the four classifiers.
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
# np.hstack replaces np.bmat('a, b, ...'): the string form resolves the names
# via caller-frame introspection, and np.bmat is deprecated (removed in
# NumPy 2.0).  Stacking the column vectors gives one boxplot column per model.
boxplot(np.hstack((testAcAB, testAcRF, testAcSVM, testAcLR)))
xlabel('AdaBoost | Random Forest | Support Vector Machine | Logistic Regression')
# Label fixed: the plotted values are fractions in [0, 1], not percentages.
ylabel('Cross-validation accuracy')
title('Accuracy of the four classifiers with 10-fold CV');
fig.savefig('accuracy_boxplot.png', format='png')


Classifiers are significantly different. (p=0.0472021532754)

In [41]:
%matplotlib inline  

# Compute values of N, M and C.
N = len(y)
M = len(an)
C = len(cn)

# Subtract mean value from data
Y = X - np.ones((N, 1))*X.mean(0)

# PCA by computing SVD of Y
U,S,V = linalg.svd(Y,full_matrices=False)
V = mat(V).T

# Project the centered data onto principal component space
Z = Y * V

# Indices of the principal components to be plotted
i = 0
j = 1

# Plot PCA of the data
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')

fig.hold()
title('Susceptibility data')
for c in range(C):
    # select indices belonging to class c:
    class_mask = y.ravel()==c
    plot(Z[class_mask,i], Z[class_mask,j], 'o')
legend(cn)
xlabel('PC{0}'.format(i+1))
ylabel('PC{0}'.format(j+1))

# Output result to screen
fig.savefig('PCAplot.png', format='png')


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-41-28ffae1dc8f4> in <module>()
      3 
      4 # Compute values of N, M and C.
----> 5 N = len(y)
      6 M = len(an)
      7 C = len(cn)

TypeError: object of type 'int' has no len()

In [27]:
# Fraction of the total variance captured by each principal component:
# squared singular values, normalised to sum to one.
squared_singular = S * S
rho = squared_singular / squared_singular.sum()

# Scree plot of the explained-variance fractions.
fig = figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
plot(rho, 'o-')
title('Variance explained by principal components');
xlabel('Principal component');
ylabel('Variance explained value');
fig.savefig('VariancePCAplot.png', format='png')



In [59]:
from __future__ import division
import nltk

cFemale = 0
cMale = 0
sum_male_likes = 0
sum_female_likes = 0
sum_total_likes = 0
sum_words_male = 0
sum_words_female = 0

#data_set = data_set_original[1:]
#print data_set[0]

for t in data_set:
    if t[0] == "Female":
        cFemale += 1
        sum_female_likes += int(t[3])
        words = nltk.word_tokenize(t[1])
        sum_words_female = sum_words_female + len(words)
    else:
        cMale += 1
        sum_male_likes += int(t[2])
        sum_words_male = sum_words_male + len(words)

    sum_total_likes += int(t[4])

#print t[1]
#print len(t[1])
total = cFemale + cMale
procent_male = cMale/total
procent_female = cFemale/total
print "Procent male: %s" %procent_male
print "Procent female: %s" %procent_female
print "Total comments: %s" %total
print "Male comments: %s" %cFemale
print "Female comments: %s" %cMale

avg_male_likes = sum_male_likes/cMale
avg_female_likes = sum_female_likes/cFemale
avg_total_likes = sum_total_likes/total
avg_length_comments_female = sum_words_female / cFemale
avg_length_comments_male = sum_words_male / cMale
print "average male likes pr comment: %s" %avg_male_likes
print "average female likes pr comment: %s" %avg_female_likes
print "average total likes pr comment: %s" %avg_total_likes
print "average number of words pr comment pr female: %s" %avg_length_comments_female
print "average number of words pr comment pr male: %s" %avg_length_comments_male


Procent male: 0.5484
Procent female: 0.4516
Total comments: 2500
Male comments: 1129
Female comments: 1371
average male likes pr comment: 2.95185995624
average female likes pr comment: 0.82019486271
average total likes pr comment: 3.7472
average number of words pr comment pr female: 108.781222321
average number of words pr comment pr male: 116.218818381

In [ ]: