In [1]:
#build a map from review id to movie title by parsing the downloaded html pages
import os
import numpy as np
#get titles
from BeautifulSoup import BeautifulSoup
moviehtmldir = './movie/'
moviedict = {}
for filename in [f for f in os.listdir(moviehtmldir) if f[0]!='.']:
    id = filename.split('.')[0]
    f = open(moviehtmldir+'/'+filename)
    parsed_html = BeautifulSoup(f.read())
    try:
        title = parsed_html.body.h1.text
    except AttributeError:
        title = 'none'
    moviedict[id] = title
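A quick way to sanity-check the title extraction above is to run the same BeautifulSoup call on an inline snippet instead of a file from ./movie/ (the snippet and title below are made up; the real entries in moviedict depend on the downloaded pages):

sample_html = '<html><body><h1>The Matrix (1999)</h1></body></html>'
print BeautifulSoup(sample_html).body.h1.text
#prints: The Matrix (1999)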

In [2]:
def ListDocs(dirname):
    docs = []
    titles = []
    for filename in [f for f in os.listdir(dirname) if str(f)[0]!='.']:
        f = open(dirname+'/'+filename,'r')
        id = filename.split('.')[0].split('_')[1]
        titles.append(moviedict[id])
        docs.append(f.read())
    return docs,titles

dir = './review_polarity/txt_sentoken/'
pos_textreviews,pos_titles = ListDocs(dir+'pos/')
neg_textreviews,neg_titles = ListDocs(dir+'neg/')
tot_textreviews = pos_textreviews+neg_textreviews
tot_titles = pos_titles+neg_titles

In [6]:
#LDA
from gensim import corpora, models

from nltk.tokenize import RegexpTokenizer
tknzr = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

class GenSimCorpus(object):
    def __init__(self, texts, stoplist=[], bestwords=[], stem=False):
        self.texts = texts
        self.stoplist = stoplist
        self.stem = stem
        self.bestwords = bestwords
        self.dictionary = corpora.Dictionary(self.iter_docs(texts, stoplist))

    def __len__(self):
        return len(self.texts)

    def __iter__(self):
        for tokens in self.iter_docs(self.texts, self.stoplist):
            yield self.dictionary.doc2bow(tokens)

    def iter_docs(self, texts, stoplist):
        #yield one token stream per document: stemmed, restricted to bestwords, or just stopword-filtered
        for text in texts:
            if self.stem:
                yield (stemmer.stem(w) for w in [x for x in tknzr.tokenize(text) if x not in stoplist])
            elif len(self.bestwords) > 0:
                yield (x for x in tknzr.tokenize(text) if x in self.bestwords)
            else:
                yield (x for x in tknzr.tokenize(text) if x not in stoplist)

num_topics = 10
#note: stoplist is defined in cell In [3] below (the notebook cells were executed out of order)
corpus = GenSimCorpus(tot_textreviews, stoplist, [], False)
dict_lda = corpus.dictionary
lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dict_lda, passes=10, iterations=50)
print lda.show_topics(num_topics=num_topics)


[(0, u'0.006*movie + 0.004*paulie + 0.003*leila + 0.003*one + 0.003*film + 0.003*shrek + 0.003*story + 0.002*get + 0.002*like + 0.002*good'), (1, u'0.014*film + 0.007*one + 0.005*like + 0.005*movie + 0.003*even + 0.003*would + 0.003*time + 0.003*good + 0.003*first + 0.002*much'), (2, u'0.011*film + 0.006*one + 0.004*movie + 0.003*like + 0.003*even + 0.002*way + 0.002*two + 0.002*batman + 0.002*character + 0.002*films'), (3, u'0.011*film + 0.006*one + 0.005*movie + 0.004*like + 0.003*time + 0.003*first + 0.003*get + 0.003*good + 0.003*would + 0.003*story'), (4, u'0.013*film + 0.007*one + 0.007*movie + 0.005*like + 0.003*story + 0.003*even + 0.003*much + 0.003*character + 0.003*time + 0.003*well'), (5, u'0.012*film + 0.008*one + 0.007*movie + 0.005*like + 0.003*good + 0.003*also + 0.003*time + 0.003*first + 0.003*would + 0.003*story'), (6, u'0.010*film + 0.007*one + 0.006*movie + 0.004*like + 0.004*story + 0.003*even + 0.003*character + 0.003*characters + 0.003*much + 0.003*time'), (7, u'0.017*film + 0.010*movie + 0.010*one + 0.006*like + 0.004*even + 0.004*good + 0.004*time + 0.003*well + 0.003*would + 0.003*characters'), (8, u'0.010*film + 0.006*one + 0.005*movie + 0.004*time + 0.004*joe + 0.003*like + 0.003*even + 0.003*life + 0.002*love + 0.002*story'), (9, u'0.011*movie + 0.009*film + 0.008*one + 0.006*like + 0.004*good + 0.004*even + 0.004*time + 0.003*would + 0.003*much + 0.003*story')]
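The trained model can also be queried for the topic mixture of a single document. A minimal sketch, reusing the tokenizer above and the stoplist defined in cell In [3] below, for the first review in the corpus:

#topic distribution of a single review as (topic_id, probability) pairs
sample_bow = dict_lda.doc2bow([x for x in tknzr.tokenize(tot_textreviews[0]) if x not in stoplist])
print lda[sample_bow]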

In [7]:
import copy
#filter out very common words like movie and film, and very infrequent terms
out_ids = [tokenid for tokenid, docfreq in dict_lda.dfs.iteritems() if docfreq > 1000 or docfreq < 3 ]
dict_lfq = copy.deepcopy(dict_lda)
dict_lfq.filter_tokens(out_ids)
dict_lfq.compactify()
corpus = [dict_lfq.doc2bow(tknzr.tokenize(text)) for text in tot_textreviews]
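It is worth checking how much the frequency filter shrinks the vocabulary; a quick check (the counts depend on the corpus):

print 'tokens before filtering: ',len(dict_lda)
print 'tokens after filtering: ',len(dict_lfq)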

In [8]:
lda_lfq = models.LdaModel(corpus, num_topics=num_topics, id2word=dict_lfq,passes=10, iterations=50,alpha=0.01,eta=0.01)
for t in range(num_topics):
    print 'topic ',t,'  words: ',lda_lfq.print_topic(t,topn=10)
    print


topic  0   words:  0.004*characters + 0.003*bad + 0.003*plot + 0.003*star + 0.003*horror + 0.003*series + 0.003*movies + 0.003*scream + 0.002*really + 0.002*know

topic  1   words:  0.004*see + 0.003*people + 0.003*characters + 0.003*new + 0.003*never + 0.003*show + 0.003*know + 0.003*plot + 0.003*something + 0.002*really

topic  2   words:  0.003*life + 0.003*love + 0.003*scene + 0.003*see + 0.003*director + 0.003*best + 0.003*little + 0.003*alien + 0.003*man + 0.002*films

topic  3   words:  0.005*harry + 0.003*never + 0.003*carter + 0.003*wrestling + 0.003*see + 0.003*scenes + 0.003*life + 0.003*characters + 0.003*people + 0.003*williams

topic  4   words:  0.003*jackie + 0.003*director + 0.003*characters + 0.003*films + 0.003*never + 0.002*action + 0.002*scene + 0.002*plot + 0.002*comedy + 0.002*could

topic  5   words:  0.005*really + 0.005*bad + 0.004*see + 0.003*little + 0.003*funny + 0.003*films + 0.003*know + 0.003*could + 0.003*characters + 0.003*people

topic  6   words:  0.003*action + 0.003*bad + 0.003*man + 0.003*life + 0.003*characters + 0.003*scenes + 0.002*world + 0.002*movies + 0.002*many + 0.002*could

topic  7   words:  0.004*life + 0.003*little + 0.003*disney + 0.003*plot + 0.003*best + 0.002*could + 0.002*really + 0.002*man + 0.002*see + 0.002*people

topic  8   words:  0.004*characters + 0.003*films + 0.003*see + 0.003*life + 0.002*scene + 0.002*best + 0.002*love + 0.002*never + 0.002*great + 0.002*people

topic  9   words:  0.003*life + 0.003*scene + 0.003*see + 0.003*really + 0.003*characters + 0.003*new + 0.003*go + 0.003*people + 0.002*ryan + 0.002*love
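The filtered model and dictionary can be persisted for later reuse with gensim's save/load methods; a sketch with hypothetical file names:

lda_lfq.save('lda_reviews.model')
dict_lfq.save('lda_reviews.dict')
#reload later with models.LdaModel.load('lda_reviews.model') and corpora.Dictionary.load('lda_reviews.dict')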


In [9]:
#topics for each doc: print the titles of the first 10 reviews whose dominant topic is topic 6
def GenerateDistrArrays(corpus):
    for i, dist in enumerate(corpus[:10]):
        dist_array = np.zeros(num_topics)
        for d in dist:
            dist_array[d[0]] = d[1]
        if dist_array.argmax() == 6:
            print tot_titles[i]

corpus_lda = lda_lfq[corpus]
GenerateDistrArrays(corpus_lda)


From Hell (2001)
Lumumba (2000)
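Instead of only checking the argmax, the full document-topic matrix can be collected from the same transformed corpus, for instance to use the topic proportions as features; a minimal sketch:

doc_topic = np.zeros((len(tot_textreviews), num_topics))
for i, dist in enumerate(corpus_lda):
    for topic_id, prob in dist:
        doc_topic[i, topic_id] = prob
print doc_topic.shape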

In [3]:
import nltk
from nltk.corpus import stopwords
#reuse the same regexp tokenizer as in the LDA cells above
from nltk.tokenize import RegexpTokenizer
tknzr = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)

nltk.download('stopwords')
stoplist = stopwords.words('english')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()


from collections import namedtuple

def PreprocessReviews(text,stop=[],stem=False):
    #tokenize, drop stopwords, lowercase and optionally stem
    words = tknzr.tokenize(text)
    if stem:
       words_clean = [stemmer.stem(w) for w in [i.lower() for i in words if i not in stop]]
    else:
       words_clean = [i.lower() for i in words if i not in stop]
    return words_clean

Review = namedtuple('Review','words title tags')
dir = './review_polarity/txt_sentoken/'
do2vecstem = True
reviews_pos = []
cnt = 0
for filename in [f for f in os.listdir(dir+'pos/') if str(f)[0]!='.']:
    f = open(dir+'pos/'+filename,'r')
    id = filename.split('.')[0].split('_')[1]
    reviews_pos.append(Review(PreprocessReviews(f.read(),stoplist,do2vecstem),moviedict[id],['pos_'+str(cnt)]))
    cnt+=1
    
reviews_neg = []
cnt= 0
for filename in [f for f in os.listdir(dir+'neg/') if str(f)[0]!='.']:
    f = open(dir+'neg/'+filename,'r')
    id = filename.split('.')[0].split('_')[1]
    reviews_neg.append(Review(PreprocessReviews(f.read(),stoplist,do2vecstem),moviedict[id],['neg_'+str(cnt)]))
    cnt+=1

tot_reviews = reviews_pos + reviews_neg


[nltk_data] Downloading package 'stopwords' to
[nltk_data]     /Users/andrea/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
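Note that the stopword check in PreprocessReviews runs on the original-cased token, so capitalized sentence-initial stopwords (e.g. 'This') slip through. A quick check on a made-up sentence:

print PreprocessReviews('the movie was surprisingly good , i loved it !', stoplist, do2vecstem)
#punctuation and stopwords are dropped, the remaining words are lowercased and Porter-stemmed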

In [11]:
#split into training and test sets
def word_features(words):
    return dict([(word, True) for word in words])
negfeatures = [(word_features(r.words), 'neg') for r in reviews_neg]
posfeatures = [(word_features(r.words), 'pos') for r in reviews_pos]
portionpos = int(len(posfeatures)*0.8)
portionneg = int(len(negfeatures)*0.8)
print portionpos,'-',portionneg
trainfeatures = negfeatures[:portionneg] + posfeatures[:portionpos]
print len(trainfeatures)
testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
#shuffle(testfeatures)


800 - 800
1600
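word_features encodes each review as a bag-of-words presence dictionary, the featureset format NLTK classifiers expect; a tiny illustration (duplicates collapse, only presence is recorded):

print word_features(['great', 'plot', 'great'])
#{'plot': True, 'great': True} (dict ordering may vary)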

In [12]:
from nltk.classify import NaiveBayesClassifier
#training naive bayes 
classifier = NaiveBayesClassifier.train(trainfeatures)
##testing
err = 0
print 'test on: ',len(testfeatures)
for r in testfeatures:
    sent = classifier.classify(r[0])
    if sent != r[1]:
       err +=1.
print 'error rate: ',err/float(len(testfeatures))


test on:  400
error rate:  0.2975
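As a cross-check, NLTK can compute the same accuracy directly, and the trained NaiveBayesClassifier can list the features that most strongly separate the two classes; a short sketch (the exact feature list depends on the split):

import nltk.classify.util
print 'accuracy: ',nltk.classify.util.accuracy(classifier, testfeatures)
classifier.show_most_informative_features(10)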

In [16]:
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from random import shuffle

#bigram feature extraction:
def bigrams_words_features(words, nbigrams=200,measure=BigramAssocMeasures.chi_sq):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(measure, nbigrams)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

negfeatures = [(bigrams_words_features(r.words,500), 'neg') for r in reviews_neg]
posfeatures = [(bigrams_words_features(r.words,500), 'pos') for r in reviews_pos]
portionpos = int(len(posfeatures)*0.8)
portionneg = int(len(negfeatures)*0.8)
print portionpos,'-',portionneg
trainfeatures = negfeatures[:portionneg] + posfeatures[:portionpos]
print len(trainfeatures)
classifier = NaiveBayesClassifier.train(trainfeatures)
##test bigram
testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
shuffle(testfeatures)
err = 0
print 'test on: ',len(testfeatures)
for r in testfeatures:
    sent = classifier.classify(r[0])
    #print r[1],'-pred: ',sent
    if sent != r[1]:
       err +=1.
print 'error rate: ',err/float(len(testfeatures))


800 - 800
1600
test on:  400
error rate:  0.2075
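To see what bigrams_words_features actually adds, the top chi-square bigrams of a single review can be inspected directly (which review is used here is arbitrary):

finder = BigramCollocationFinder.from_words(reviews_pos[0].words)
print finder.nbest(BigramAssocMeasures.chi_sq, 10)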

In [21]:
import nltk.classify.util, nltk.metrics
tot_poswords = [val for l in [r.words for r in reviews_pos] for val in l]
tot_negwords = [val for l in [r.words for r in reviews_neg] for val in l]
from nltk.probability import FreqDist, ConditionalFreqDist
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
 
for word in tot_poswords:
    word_fd[word.lower()] +=1
    label_word_fd['pos'][word.lower()] +=1
 
for word in tot_negwords:
    word_fd[word.lower()] +=1
    label_word_fd['neg'][word.lower()] +=1
pos_words = len(tot_poswords)
neg_words = len(tot_negwords)

tot_words = pos_words + neg_words
#select the best words in terms of information contained in the two classes pos and neg
word_scores = {}
 
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                (freq, pos_words), tot_words)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                (freq, neg_words), tot_words)
    word_scores[word] = pos_score + neg_score
print 'total: ',len(word_scores)
best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
bestwords = set([w for w, s in best])


total:  26060
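A quick look at the top of the ranking helps verify that the selected words are sentiment-bearing rather than purely topical (the exact list depends on the corpus):

for w, s in best[:10]:
    print w, s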

In [22]:
#training naive bayes with chi square feature selection of best words
def best_words_features(words):
    return dict([(word, True) for word in words if word in bestwords])

negfeatures = [(best_words_features(r.words), 'neg') for r in reviews_neg]
posfeatures = [(best_words_features(r.words), 'pos') for r in reviews_pos]
portionpos = int(len(posfeatures)*0.8)
portionneg = int(len(negfeatures)*0.8)
print portionpos,'-',portionneg
trainfeatures = negfeatures[:portionneg] + posfeatures[:portionpos]
print len(trainfeatures)
classifier = NaiveBayesClassifier.train(trainfeatures)
##test with feature chi square selection
testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
shuffle(testfeatures)
err = 0
print 'test on: ',len(testfeatures)
for r in testfeatures:
    sent = classifier.classify(r[0])
    #print r[1],'-pred: ',sent
    if sent != r[1]:
       err +=1.
print 'error rate: ',err/float(len(testfeatures))


800 - 800
1600
test on:  400
error rate:  0.13
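Beyond the raw error rate, a confusion matrix shows whether the errors are skewed towards one class; a minimal sketch using NLTK's ConfusionMatrix:

ref = [label for feats, label in testfeatures]
pred = [classifier.classify(feats) for feats, label in testfeatures]
print nltk.ConfusionMatrix(ref, pred)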

In [23]:
from gensim.models import Doc2Vec

import multiprocessing

shuffle(tot_reviews)
cores = multiprocessing.cpu_count()
vec_size = 500
model_d2v = Doc2Vec(dm=1, dm_concat=0, size=vec_size, window=5, negative=0, hs=0, min_count=1, workers=cores)

#build vocab
model_d2v.build_vocab(tot_reviews)
#train
numepochs= 20
for epoch in range(numepochs):
    try:
        print 'epoch %d' % (epoch)
        model_d2v.train(tot_reviews)
        model_d2v.alpha *= 0.99
        model_d2v.min_alpha = model_d2v.alpha
    except (KeyboardInterrupt, SystemExit):
        break


epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9
epoch 10
epoch 11
epoch 12
epoch 13
epoch 14
epoch 15
epoch 16
epoch 17
epoch 18
epoch 19
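The learned document vectors can be inspected by querying the most similar reviews to a given tag (the tags 'pos_0', 'pos_1', ..., 'neg_0', ... were assigned in cell In [3] above); a sketch whose output depends on the training run:

print model_d2v.docvecs.most_similar('pos_0', topn=5)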

In [24]:
#split train,test sets
trainingsize = 2*int(len(reviews_pos)*0.8)

train_d2v = np.zeros((trainingsize, vec_size))
train_labels = np.zeros(trainingsize)
test_size = len(tot_reviews)-trainingsize
test_d2v = np.zeros((test_size, vec_size))
test_labels = np.zeros(test_size)

cnt_train = 0
cnt_test = 0
for r in reviews_pos:
    name_pos = r.tags[0]
    if int(name_pos.split('_')[1])>= int(trainingsize/2.):
        test_d2v[cnt_test] = model_d2v.docvecs[name_pos]
        test_labels[cnt_test] = 1
        cnt_test +=1
    else:
        train_d2v[cnt_train] = model_d2v.docvecs[name_pos]
        train_labels[cnt_train] = 1
        cnt_train +=1

for r in reviews_neg:
    name_neg = r.tags[0]
    if int(name_neg.split('_')[1])>= int(trainingsize/2.):
        test_d2v[cnt_test] = model_d2v.docvecs[name_neg]
        test_labels[cnt_test] = 0
        cnt_test +=1
    else:
        train_d2v[cnt_train] = model_d2v.docvecs[name_neg]       
        train_labels[cnt_train] = 0
        cnt_train +=1

In [27]:
#train a logistic regression and an SVM on the doc2vec vectors
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(train_d2v, train_labels)
print 'accuracy:',classifier.score(test_d2v,test_labels)

from sklearn.svm import SVC
clf = SVC()
clf.fit(train_d2v, train_labels)
print 'accuracy:',clf.score(test_d2v,test_labels)


accuracy: 0.5175
accuracy: 0.5275
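Both classifiers above use default hyperparameters; a small, purely illustrative sweep over the SVM regularisation parameter C (values chosen arbitrarily, results not reported here):

for C in [0.01, 0.1, 1., 10.]:
    clf = SVC(kernel='linear', C=C)
    clf.fit(train_d2v, train_labels)
    print 'C =',C,' accuracy:',clf.score(test_d2v,test_labels)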

In [108]:
#linear-kernel svm
clf = SVC(kernel='linear')
clf.fit(train_d2v, train_labels)
print 'test on: ',len(test_labels)
print 'error rate: ',1.-clf.score(test_d2v,test_labels)


test on:  400
error rate:  0.1275