In [1]:
import pandas as pd

In [3]:
train = pd.read_csv('../Datasets/IMDB/labeledTrainData.tsv', delimiter='\t')

In [4]:
test = pd.read_csv('../Datasets/IMDB/testData.tsv', delimiter='\t')

In [5]:
train.head()


Out[5]:
id sentiment review
0 5814_8 1 With all this stuff going down at the moment w...
1 2381_9 1 \The Classic War of the Worlds\" by Timothy Hi...
2 7759_3 0 The film starts with a manager (Nicholas Bell)...
3 3630_4 0 It must be assumed that those who praised this...
4 9495_8 1 Superbly trashy and wondrously unpretentious 8...

In [7]:
test.head()


Out[7]:
id review
0 12311_10 Naturally in a film who's main themes are of m...
1 8348_2 This movie is a disaster within a disaster fil...
2 5828_4 All in all, this is a movie for kids. We saw i...
3 7186_2 Afraid of the Dark left me with the impression...
4 12128_7 A very accurate depiction of small time mob li...

In [8]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [9]:
def review_to_text(review, remove_stopwords):
    raw_text = BeautifulSoup(review, 'html').get_text()
    
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    
    words = letters.lower().split()
    
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    
    return words

In [11]:
# import nltk
# nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /Users/jinze/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Out[11]:
True

In [12]:
X_train = []

for review in train['review']:
    X_train.append(' '.join(review_to_text(review, True)))
    
y_train = train['sentiment']
    
X_test = []

for review in test['review']:
    X_test.append(' '.join(review_to_text(review, True)))

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV


/Users/jinze/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/Users/jinze/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

In [14]:
pip_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')), ('mnb', MultinomialNB())])

pip_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')), ('mnb', MultinomialNB())])

params_count = {'count_vec__binary':[True, False], 'count_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}
params_tfidf = {'tfidf_vec__binary':[True, False], 'tfidf_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}

gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)

In [15]:
gs_count.fit(X_train, y_train)

print gs_count.best_score_
print gs_count.best_params_


Fitting 4 folds for each of 12 candidates, totalling 48 fits
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.8min finished
0.88204
{'mnb__alpha': 1.0, 'count_vec__binary': True, 'count_vec__ngram_range': (1, 2)}

In [16]:
count_y_predict = gs_count.predict(X_test)

In [17]:
gs_tfidf.fit(X_train, y_train)

print gs_tfidf.best_score_
print gs_tfidf.best_params_


Fitting 4 folds for each of 12 candidates, totalling 48 fits
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.9min finished
0.88712
{'tfidf_vec__ngram_range': (1, 2), 'tfidf_vec__binary': True, 'mnb__alpha': 0.1}

In [18]:
tfidf_y_predict = gs_tfidf.predict(X_test)

In [19]:
submission_count = pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})

submission_tfidf= pd.DataFrame({'id': test['id'], 'sentiment': tfidf_y_predict})


submission_count.to_csv('../Datasets/IMDB/submission_count.csv', index=False)
submission_tfidf.to_csv('../Datasets/IMDB/submission_tfidf.csv', index=False)

In [20]:
unlabeled_train = pd.read_csv('../Datasets/IMDB/unlabeledTrainData.tsv', delimiter='\t', quoting=3)

In [22]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/jinze/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Out[22]:
True

In [23]:
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [24]:
def review_to_sentences(review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_text(raw_sentence, False))
    
    return sentences

In [25]:
corpora = []  
    
for review in unlabeled_train['review']:
    corpora += review_to_sentences(review.decode('utf8'), tokenizer)


/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:219: UserWarning: "." looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.archive.org/details/LovefromaStranger"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.loosechangeguide.com/LooseChangeGuide.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.msnbc.msn.com/id/4972055/site/newsweek/"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:219: UserWarning: ".." looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.youtube.com/watch?v=a0KSqelmgN8"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://jake-weird.blogspot.com/2007/08/beneath.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup

In [117]:
# Set values for various parameters
num_features = 32    # Word vector dimensionality                      
min_word_count = 20   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

In [118]:
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(corpora, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

model.init_sims(replace=True)

model_name = "../Datasets/IMDB/300features_20minwords_10context"
model.save(model_name)


Training model...

In [119]:
from gensim.models import Word2Vec
model = Word2Vec.load("../Datasets/IMDB/300features_20minwords_10context")
model.most_similar("man")


/Users/jinze/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  This is separate from the ipykernel package so we can avoid doing imports until
Out[119]:
[(u'doctor', 0.8096544146537781),
 (u'woman', 0.7690240144729614),
 (u'boy', 0.7499479055404663),
 (u'soldier', 0.7452120780944824),
 (u'priest', 0.7431017160415649),
 (u'lady', 0.7341427803039551),
 (u'angus', 0.7268729209899902),
 (u'scientist', 0.7172072529792786),
 (u'assassin', 0.7108720541000366),
 (u'monk', 0.7091008424758911)]

In [31]:
import numpy as np  

def makeFeatureVec(words, model, num_features):

    featureVec = np.zeros((num_features,),dtype="float32")

    nwords = 0.

    index2word_set = set(model.wv.index2word)

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
            
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):

    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")

    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        
        counter += 1

    return reviewFeatureVecs

In [120]:
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_text( review, remove_stopwords=True ))

trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )
trainDataVecsCopy = trainDataVecs

clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_text( review, remove_stopwords=True ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )
testDataVecsCopy = testDataVecs


/Users/jinze/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:14: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  
/Users/jinze/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:16: RuntimeWarning: invalid value encountered in divide
  app.launch_new_instance()

In [143]:
trainDataVecs = trainDataVecsCopy
y_train = train['sentiment']
testDataVecs = testDataVecsCopy


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

gbc = GradientBoostingClassifier()

params_gbc = {'n_estimators':[10, 50, 100], 'learning_rate':[0.01, 0.1, 1.0], 'max_depth': [2, 3, 4]}
gs = GridSearchCV(gbc, params_gbc, cv=3, n_jobs=-1, verbose=1)

print np.isnan(trainDataVecs).sum()
print trainDataVecs.shape
# print trainDataVecs[0]
print len(X_train)
print len(trainDataVecs)
print y_train.shape
idx = 0
idxList = []
for x in trainDataVecs:
    if np.isnan(x).any():
#         trainDataVecs = np.delete(trainDataVecs, idx)
#         y_train = np.delete(y_train, idx)
        idxList.append(idx)
        print idx,x
        idx = idx + 1
    else:
        idx = idx + 1

idxList.reverse()
for i in idxList:
    print i
    trainDataVecs = np.delete(trainDataVecs, i, axis = 0)
#     y_train = np.delete(y_train, i, axis = None)
    y_train = y_train.drop(i)
idxList=[]
# x = trainDataVecs[np.isnan(trainDataVecs)]
# print x
# y_train=y_train[~np.isnan(trainDataVecs)]


# trainDataVecs.dropna(inplace=True)
# trainDataVecs.fillna('0.1')
# trainDataVecs.replace([NaN],0.1)

gs.fit(trainDataVecs, y_train)

print gs.best_score_
print gs.best_params_

idx = 0
idxList = []
for x in testDataVecs:
    if np.isnan(x).any():
        idxList.append(idx)
        print idx,x
        idx = idx + 1
    else:
        idx = idx + 1

idxList.reverse()
for i in idxList:
    print i
    testDataVecs = np.delete(testDataVecs, i, axis = 0)
    test = test.drop(i)
idxList=[]
result = gs.predict(testDataVecs)
# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "../Datasets/IMDB/submission_w2v.csv", index=False, quoting=3)


32
(25000, 32)
25000
25000
(25000,)
24008 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
24008
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   52.5s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  1.8min finished
0.813952558102
{'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 4}
1065 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
1073 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
20459 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
20459
1073
1065

In [98]:
print idxList


[24008]

In [122]:
idxList=[]
trainDataVecs = trainDataVecsCopy
y_train = train['sentiment']
print y_train.shape
print type(y_train)
# x1 = y_train.drop(24008)
# np.delete(y_train,24008)
print x1.shape
print y_train.shape
# print x1


(25000,)
<class 'pandas.core.series.Series'>
(24999,)
(25000,)

In [124]:
testDataVecsCopy = testDataVecs

In [133]:
print testDataVecs.shape
print testDataVecsCopy.shape
print idxList


(24997, 32)
(25000, 32)
[]

In [132]:
testDataVecs[np.isnan(testDataVecs)]


Out[132]:
array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
      dtype=float32)

In [138]:
idx = 0
idxList = []
for x in testDataVecs:
    if np.isnan(x).any():
        idxList.append(idx)
        print idx,x
        idx = idx + 1
    else:
        idx = idx + 1
print idxList
print idxList.reverse()
print reversed(idxList)
print type(idxList)
idxList = []


1072 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
20457 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
[1072, 20457]
None
<listreverseiterator object at 0x1a9afb8ad0>
<type 'list'>

In [142]:
print test.shape
type(test)


(25000, 2)
Out[142]:
pandas.core.frame.DataFrame

In [ ]: