In [1]:
from tqdm import tqdm

In [2]:
import pandas as pd

# Read data from files 
train = pd.read_csv( "cs231n/datasets/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )
test = pd.read_csv( "cs231n/datasets/testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "cs231n/datasets/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )
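# (quoting=3 corresponds to csv.QUOTE_NONE: quote characters inside the review text are left as-is)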

# Verify the number of reviews that were read (100,000 in total)
print "Read %d labeled train reviews, %d labeled test reviews, " \
 "and %d unlabeled reviews\n" % (train["review"].size,  test["review"].size, unlabeled_train["review"].size )


Read 25000 labeled train reviews, 25000 test reviews, and 50000 unlabeled train reviews


In [3]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
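    # (no parser is named here, so BeautifulSoup falls back to whichever is installed;
    # in this session that is html5lib -- see the note after the interrupted cell below)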
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return words
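
A quick sanity check of the cleaner (a hypothetical cell, not part of the original run; the sample string is made up for illustration):

In [ ]:
# Hypothetical example: clean a toy snippet with and without stop-word removal
sample = "<b>An AWFUL movie!!</b> I would not watch it again."
print review_to_wordlist(sample)
print review_to_wordlist(sample, remove_stopwords=True)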

In [4]:
# Use NLTK's punkt tokenizer for sentence splitting
# (if the punkt data is not installed yet, download it once with nltk.download('punkt'))
import nltk.data

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
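
To make the list-of-lists structure concrete, a hypothetical (unexecuted) cell like this would print one inner word list per sentence:

In [ ]:
# Hypothetical example: two sentences in, two word lists out
print review_to_sentences("The plot was thin. The acting saved it.", tokenizer)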

In [6]:
sentences = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for review in tqdm(train["review"]):
    try:
        sentences += review_to_sentences(review, tokenizer)
    except Exception:
        pass  # skip the occasional review that fails to parse

print "Parsing sentences from unlabeled set"
for review in tqdm(unlabeled_train["review"]):
    try:
        sentences += review_to_sentences(review, tokenizer)
    except Exception:
        pass  # skip the occasional review that fails to parse


Parsing sentences from training set
Parsing sentences from unlabeled set
                                                                           

In [7]:
print len(sentences)
print sentences[0]
print sentences[100]


778569
[u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the', u'odd', u'documentary', u'here', u'and', u'there', u'watched', u'the', u'wiz', u'and', u'watched', u'moonwalker', u'again']
[u'the', u'young', u'actors', u'some', u'of', u'whom', u'i', u'recognized', u'from', u'british', u'tv', u'such', u'as', u'shameless', u'were', u'exuberant', u'in', u'representing', u'the', u'usual', u'range', u'of', u'junior', u'high', u'social', u'pressures']

In [9]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "cs231n/datasets/300features_40minwords_10context"
model.save(model_name)


Training model...
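
With logging at the INFO level, gensim reports vocabulary building and training progress while this cell runs. Because min_count is 40, only words occurring at least 40 times across the ~778K sentences are kept; the retained vocabulary works out to 16,247 words, matching the syn0 shape printed further below. A hypothetical check (not run in the original session):

In [ ]:
# Hypothetical: size of the vocabulary the model actually kept
print len(model.index2word)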

In [5]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words
from gensim.models import Word2Vec
model = Word2Vec.load("cs231n/datasets/300features_40minwords_10context")

In [6]:
model.doesnt_match("man woman child kitchen".split())


Out[6]:
'kitchen'

In [7]:
model.doesnt_match("france england germany berlin".split())


Out[7]:
'berlin'

In [8]:
model.most_similar("man")


Out[8]:
[(u'woman', 0.6292121410369873),
 (u'guy', 0.4850691556930542),
 (u'boy', 0.47825199365615845),
 (u'person', 0.47457921504974365),
 (u'men', 0.45443958044052124),
 (u'girl', 0.45102375745773315),
 (u'lady', 0.4452756643295288),
 (u'young', 0.4034602642059326),
 (u'himself', 0.40146955847740173),
 (u'son', 0.38437342643737793)]

In [9]:
model.most_similar("queen")


Out[9]:
[(u'latifah', 0.4675486981868744),
 (u'princess', 0.46697551012039185),
 (u'bee', 0.4544464945793152),
 (u'victoria', 0.41605091094970703),
 (u'prince', 0.415138304233551),
 (u'alice', 0.38413459062576294),
 (u'throne', 0.38237684965133667),
 (u'marie', 0.3734177350997925),
 (u'bride', 0.3647081255912781),
 (u'king', 0.35882601141929626)]

In [10]:
model.most_similar("awful")


Out[10]:
[(u'terrible', 0.676790177822113),
 (u'horrible', 0.6072491407394409),
 (u'dreadful', 0.5976172685623169),
 (u'atrocious', 0.5666143894195557),
 (u'horrendous', 0.5406253933906555),
 (u'lousy', 0.5127788782119751),
 (u'laughable', 0.5019239187240601),
 (u'abysmal', 0.48727792501449585),
 (u'lame', 0.48325756192207336),
 (u'unbelievably', 0.47077295184135437)]
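
The classic word-analogy query also works with this version of the gensim API; a hypothetical (unexecuted) cell, using only words that the queries above confirm are in the vocabulary:

In [ ]:
# Hypothetical analogy query: vector("king") - vector("man") + vector("woman")
print model.most_similar(positive=["woman", "king"], negative=["man"], topn=5)

# Pairwise cosine similarity between two words is also available
print model.similarity("man", "woman")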

In [11]:
print type(model.syn0)
print model.syn0.shape


<type 'numpy.ndarray'>
(16247, 300)
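
Each row of syn0 is the 300-dimensional float32 vector for one vocabulary word, in the same order as model.index2word, which is why the first dimension (16247) equals the vocabulary size. Individual vectors can be pulled out by indexing the model; a hypothetical example (the word "movie" is assumed to be in the vocabulary, which is safe for this corpus):

In [ ]:
# Hypothetical: look up one word's vector directly
vec = model["movie"]
print vec.shape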

In [6]:
import numpy as np  # Make sure that numpy is imported

def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    # (guard against a review with no in-vocabulary words, which would
    # otherwise divide by zero and return a vector of NaNs)
    if nwords > 0.:
        featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter (an integer, since it is used as an array index)
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    print "Averaging %d reviews" % len(reviews)
    for review in tqdm(reviews):
        # Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        #
        # Increment the counter
        counter = counter + 1
    return reviewFeatureVecs
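
makeFeatureVec is just the mean of the in-vocabulary word vectors, so a more compact equivalent is possible; a hypothetical sketch (not the implementation used in this notebook), assuming the same model and num_features:

In [ ]:
# Hypothetical vectorized equivalent of makeFeatureVec
def make_feature_vec_mean(words, model, num_features):
    vocab = set(model.index2word)                     # words the model knows
    vecs = [model[w] for w in words if w in vocab]    # their vectors
    if len(vecs) == 0:                                # no known words: return zeros
        return np.zeros((num_features,), dtype="float32")
    return np.mean(vecs, axis=0).astype("float32")    # element-wise average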

In [7]:
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop word
# removal.

print "Creating average feature vecs for training reviews"
clean_train_reviews = []
for review in tqdm(train["review"]):
    clean_train_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

print "Creating average feature vecs for test reviews"
clean_test_reviews = []
for review in tqdm(test["review"]):
    clean_test_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )


Creating average feature vecs for training reviews
Averaging 25000 reviews
Creating average feature vecs for test reviews
|###-------| 9739/25000  38% [elapsed: 01:22 left: 02:09, 117.81 iters/sec]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-7-9282b170b207> in <module>()
     14 clean_test_reviews = []
     15 for review in tqdm(test["review"]):
---> 16     clean_test_reviews.append( review_to_wordlist( review,         remove_stopwords=True ))
     17 
     18 testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

<ipython-input-3-d75a96fa4f7f> in review_to_wordlist(review, remove_stopwords)
      9     #
     10     # 1. Remove HTML
---> 11     review_text = BeautifulSoup(review).get_text()
     12     #
     13     # 2. Remove non-letters

/usr/lib/python2.7/dist-packages/bs4/__init__.pyc in __init__(self, markup, features, builder, parse_only, from_encoding, **kwargs)
    170 
    171         try:
--> 172             self._feed()
    173         except StopParsing:
    174             pass

/usr/lib/python2.7/dist-packages/bs4/__init__.pyc in _feed(self)
    183         self.builder.reset()
    184 
--> 185         self.builder.feed(self.markup)
    186         # Close out any unfinished strings and close all the open tags.
    187         self.endData()

/usr/lib/python2.7/dist-packages/bs4/builder/_html5lib.pyc in feed(self, markup)
     35             warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
     36         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
---> 37         doc = parser.parse(markup, encoding=self.user_specified_encoding)
     38 
     39         # Set the character encoding detected by the tokenizer.

/usr/lib/python2.7/dist-packages/html5lib/html5parser.pyc in parse(self, stream, encoding, parseMeta, useChardet)
    222         """
    223         self._parse(stream, innerHTML=False, encoding=encoding,
--> 224                     parseMeta=parseMeta, useChardet=useChardet)
    225         return self.tree.getDocument()
    226 

/usr/lib/python2.7/dist-packages/html5lib/html5parser.pyc in _parse(self, stream, innerHTML, container, encoding, parseMeta, useChardet, **kwargs)
     86                                               parseMeta=parseMeta,
     87                                               useChardet=useChardet,
---> 88                                               parser=self, **kwargs)
     89         self.reset()
     90 

/usr/lib/python2.7/dist-packages/html5lib/tokenizer.pyc in __init__(self, stream, encoding, parseMeta, useChardet, lowercaseElementName, lowercaseAttrName, parser)
     38                  lowercaseElementName=True, lowercaseAttrName=True, parser=None):
     39 
---> 40         self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
     41         self.parser = parser
     42 

/usr/lib/python2.7/dist-packages/html5lib/inputstream.pyc in HTMLInputStream(source, encoding, parseMeta, chardet)
    135         return HTMLUnicodeInputStream(source)
    136     else:
--> 137         return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
    138 
    139 

/usr/lib/python2.7/dist-packages/html5lib/inputstream.pyc in __init__(self, source, encoding, parseMeta, chardet)
    414         # Detect encoding iff no explicit "transport level" encoding is supplied
    415         if (self.charEncoding[0] is None):
--> 416             self.charEncoding = self.detectEncoding(parseMeta, chardet)
    417 
    418         # Call superclass

/usr/lib/python2.7/dist-packages/html5lib/inputstream.pyc in detectEncoding(self, parseMeta, chardet)
    469                         break
    470                     buffers.append(buffer)
--> 471                     detector.feed(buffer)
    472                 detector.close()
    473                 encoding = detector.result['encoding']

/usr/lib/python2.7/dist-packages/chardet/universaldetector.pyc in feed(self, aBuf)
    113                 self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
    114             for prober in self._mCharSetProbers:
--> 115                 if prober.feed(aBuf) == constants.eFoundIt:
    116                     self.result = {'encoding': prober.get_charset_name(),
    117                                    'confidence': prober.get_confidence()}

/usr/lib/python2.7/dist-packages/chardet/charsetgroupprober.pyc in feed(self, aBuf)
     57             if not prober: continue
     58             if not prober.active: continue
---> 59             st = prober.feed(aBuf)
     60             if not st: continue
     61             if st == constants.eFoundIt:

/usr/lib/python2.7/dist-packages/chardet/utf8prober.pyc in feed(self, aBuf)
     50     def feed(self, aBuf):
     51         for c in aBuf:
---> 52             codingState = self._mCodingSM.next_state(c)
     53             if codingState == eError:
     54                 self._mState = constants.eNotMe

KeyboardInterrupt: 
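
The traceback shows where the time went: with no parser passed to BeautifulSoup, bs4 fell back to html5lib, whose input stream runs chardet encoding detection on every review, which is likely why cleaning the test set was slow enough to interrupt at 38%. A hypothetical tweak to the HTML-stripping step in review_to_wordlist that should avoid this ("html.parser" ships with Python; "lxml" is another option if that library is installed):

In [ ]:
# Hypothetical tweak: pin BeautifulSoup to an explicit, lighter-weight parser
from bs4 import BeautifulSoup

def strip_html(review):
    return BeautifulSoup(review, "html.parser").get_text()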

In [16]:
# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier( n_estimators = 100 )

print "Fitting a random forest to labeled training data..."
forest = forest.fit( trainDataVecs, train["sentiment"] )

# Test & extract results 
result = forest.predict( testDataVecs )

# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "cs231n/datasets/Word2Vec_AverageVectors.csv", index=False, quoting=3 )


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-16-f00e4e34a734> in <module>()
      1 # Fit a random forest to the training data, using 100 trees
----> 2 from sklearn.ensemble import RandomForestClassifier
      3 forest = RandomForestClassifier( n_estimators = 100 )
      4 
      5 print "Fitting a random forest to labeled training data..."

ImportError: No module named sklearn.ensemble
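
The ImportError just means scikit-learn is not available in this environment; installing it (for example, pip install scikit-learn, depending on how packages are managed here) and re-running the cell should clear it. Note also that the averaging cell for the test set was interrupted above, so testDataVecs was never created; that cell needs to be re-run to completion before forest.predict(testDataVecs) can succeed.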

In [ ]: