notebook.community

Edit and run



In [1]:

    
import pandas as pd



In [3]:

    
train = pd.read_csv('../Datasets/IMDB/labeledTrainData.tsv', delimiter='\t')



In [4]:

    
test = pd.read_csv('../Datasets/IMDB/testData.tsv', delimiter='\t')



In [5]:

    
train.head()









    Out[5]:







  
    
      
      id
      sentiment
      review
    
  
  
    
      0
      5814_8
      1
      With all this stuff going down at the moment w...
    
    
      1
      2381_9
      1
      \The Classic War of the Worlds\" by Timothy Hi...
    
    
      2
      7759_3
      0
      The film starts with a manager (Nicholas Bell)...
    
    
      3
      3630_4
      0
      It must be assumed that those who praised this...
    
    
      4
      9495_8
      1
      Superbly trashy and wondrously unpretentious 8...



In [7]:

    
test.head()









    Out[7]:







  
    
      
      id
      review
    
  
  
    
      0
      12311_10
      Naturally in a film who's main themes are of m...
    
    
      1
      8348_2
      This movie is a disaster within a disaster fil...
    
    
      2
      5828_4
      All in all, this is a movie for kids. We saw i...
    
    
      3
      7186_2
      Afraid of the Dark left me with the impression...
    
    
      4
      12128_7
      A very accurate depiction of small time mob li...



In [8]:

    
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords



In [9]:

    
def review_to_text(review, remove_stopwords):
    raw_text = BeautifulSoup(review, 'html').get_text()
    
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    
    words = letters.lower().split()
    
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    
    return words



In [11]:

    
# import nltk
# nltk.download('stopwords')









    



[nltk_data] Downloading package stopwords to /Users/jinze/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.






    Out[11]:





True



In [12]:

    
X_train = []

for review in train['review']:
    X_train.append(' '.join(review_to_text(review, True)))
    
y_train = train['sentiment']
    
X_test = []

for review in test['review']:
    X_test.append(' '.join(review_to_text(review, True)))



In [13]:

    
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV









    



/Users/jinze/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/Users/jinze/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)



In [14]:

    
pip_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')), ('mnb', MultinomialNB())])

pip_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')), ('mnb', MultinomialNB())])

params_count = {'count_vec__binary':[True, False], 'count_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}
params_tfidf = {'tfidf_vec__binary':[True, False], 'tfidf_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}

gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)



In [15]:

    
gs_count.fit(X_train, y_train)

print gs_count.best_score_
print gs_count.best_params_









    



Fitting 4 folds for each of 12 candidates, totalling 48 fits






    



[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.8min finished






    



0.88204
{'mnb__alpha': 1.0, 'count_vec__binary': True, 'count_vec__ngram_range': (1, 2)}



In [16]:

    
count_y_predict = gs_count.predict(X_test)



In [17]:

    
gs_tfidf.fit(X_train, y_train)

print gs_tfidf.best_score_
print gs_tfidf.best_params_









    



Fitting 4 folds for each of 12 candidates, totalling 48 fits






    



[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.9min finished






    



0.88712
{'tfidf_vec__ngram_range': (1, 2), 'tfidf_vec__binary': True, 'mnb__alpha': 0.1}



In [18]:

    
tfidf_y_predict = gs_tfidf.predict(X_test)



In [19]:

    
submission_count = pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})

submission_tfidf= pd.DataFrame({'id': test['id'], 'sentiment': tfidf_y_predict})


submission_count.to_csv('../Datasets/IMDB/submission_count.csv', index=False)
submission_tfidf.to_csv('../Datasets/IMDB/submission_tfidf.csv', index=False)



In [20]:

    
unlabeled_train = pd.read_csv('../Datasets/IMDB/unlabeledTrainData.tsv', delimiter='\t', quoting=3)



In [22]:

    
import nltk
nltk.download('punkt')









    



[nltk_data] Downloading package punkt to /Users/jinze/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.






    Out[22]:





True



In [23]:

    
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')



In [24]:

    
def review_to_sentences(review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_text(raw_sentence, False))
    
    return sentences



In [25]:

    
corpora = []  
    
for review in unlabeled_train['review']:
    corpora += review_to_sentences(review.decode('utf8'), tokenizer)









    



/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:219: UserWarning: "." looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.archive.org/details/LovefromaStranger"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.loosechangeguide.com/LooseChangeGuide.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.msnbc.msn.com/id/4972055/site/newsweek/"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:219: UserWarning: ".." looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.youtube.com/watch?v=a0KSqelmgN8"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/jinze/anaconda2/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://jake-weird.blogspot.com/2007/08/beneath.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup



In [117]:

    
# Set values for various parameters
num_features = 32    # Word vector dimensionality                      
min_word_count = 20   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words



In [118]:

    
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(corpora, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

model.init_sims(replace=True)

model_name = "../Datasets/IMDB/300features_20minwords_10context"
model.save(model_name)









    



Training model...



In [119]:

    
from gensim.models import Word2Vec
model = Word2Vec.load("../Datasets/IMDB/300features_20minwords_10context")
model.most_similar("man")









    



/Users/jinze/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  This is separate from the ipykernel package so we can avoid doing imports until






    Out[119]:





[(u'doctor', 0.8096544146537781),
 (u'woman', 0.7690240144729614),
 (u'boy', 0.7499479055404663),
 (u'soldier', 0.7452120780944824),
 (u'priest', 0.7431017160415649),
 (u'lady', 0.7341427803039551),
 (u'angus', 0.7268729209899902),
 (u'scientist', 0.7172072529792786),
 (u'assassin', 0.7108720541000366),
 (u'monk', 0.7091008424758911)]



In [31]:

    
import numpy as np  

def makeFeatureVec(words, model, num_features):

    featureVec = np.zeros((num_features,),dtype="float32")

    nwords = 0.

    index2word_set = set(model.wv.index2word)

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
            
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):

    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")

    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        
        counter += 1

    return reviewFeatureVecs



In [120]:

    
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_text( review, remove_stopwords=True ))

trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )
trainDataVecsCopy = trainDataVecs

clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_text( review, remove_stopwords=True ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )
testDataVecsCopy = testDataVecs









    



/Users/jinze/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:14: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  
/Users/jinze/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:16: RuntimeWarning: invalid value encountered in divide
  app.launch_new_instance()



In [143]:

    
trainDataVecs = trainDataVecsCopy
y_train = train['sentiment']
testDataVecs = testDataVecsCopy


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

gbc = GradientBoostingClassifier()

params_gbc = {'n_estimators':[10, 50, 100], 'learning_rate':[0.01, 0.1, 1.0], 'max_depth': [2, 3, 4]}
gs = GridSearchCV(gbc, params_gbc, cv=3, n_jobs=-1, verbose=1)

print np.isnan(trainDataVecs).sum()
print trainDataVecs.shape
# print trainDataVecs[0]
print len(X_train)
print len(trainDataVecs)
print y_train.shape
idx = 0
idxList = []
for x in trainDataVecs:
    if np.isnan(x).any():
#         trainDataVecs = np.delete(trainDataVecs, idx)
#         y_train = np.delete(y_train, idx)
        idxList.append(idx)
        print idx,x
        idx = idx + 1
    else:
        idx = idx + 1

idxList.reverse()
for i in idxList:
    print i
    trainDataVecs = np.delete(trainDataVecs, i, axis = 0)
#     y_train = np.delete(y_train, i, axis = None)
    y_train = y_train.drop(i)
idxList=[]
# x = trainDataVecs[np.isnan(trainDataVecs)]
# print x
# y_train=y_train[~np.isnan(trainDataVecs)]


# trainDataVecs.dropna(inplace=True)
# trainDataVecs.fillna('0.1')
# trainDataVecs.replace([NaN],0.1)

gs.fit(trainDataVecs, y_train)

print gs.best_score_
print gs.best_params_

idx = 0
idxList = []
for x in testDataVecs:
    if np.isnan(x).any():
        idxList.append(idx)
        print idx,x
        idx = idx + 1
    else:
        idx = idx + 1

idxList.reverse()
for i in idxList:
    print i
    testDataVecs = np.delete(testDataVecs, i, axis = 0)
    test = test.drop(i)
idxList=[]
result = gs.predict(testDataVecs)
# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "../Datasets/IMDB/submission_w2v.csv", index=False, quoting=3)









    



32
(25000, 32)
25000
25000
(25000,)
24008 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
24008
Fitting 3 folds for each of 27 candidates, totalling 81 fits






    



[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   52.5s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  1.8min finished






    



0.813952558102
{'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 4}
1065 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
1073 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
20459 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
20459
1073
1065



In [98]:

    
print idxList



In [122]:

    
idxList=[]
trainDataVecs = trainDataVecsCopy
y_train = train['sentiment']
print y_train.shape
print type(y_train)
# x1 = y_train.drop(24008)
# np.delete(y_train,24008)
print x1.shape
print y_train.shape
# print x1









    



(25000,)
<class 'pandas.core.series.Series'>
(24999,)
(25000,)



In [124]:

    
testDataVecsCopy = testDataVecs



In [133]:

    
print testDataVecs.shape
print testDataVecsCopy.shape
print idxList









    



(24997, 32)
(25000, 32)
[]



In [132]:

    
testDataVecs[np.isnan(testDataVecs)]









    Out[132]:





array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
      dtype=float32)



In [138]:

    
idx = 0
idxList = []
for x in testDataVecs:
    if np.isnan(x).any():
        idxList.append(idx)
        print idx,x
        idx = idx + 1
    else:
        idx = idx + 1
print idxList
print idxList.reverse()
print reversed(idxList)
print type(idxList)
idxList = []









    



1072 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
20457 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
[1072, 20457]
None
<listreverseiterator object at 0x1a9afb8ad0>
<type 'list'>



In [142]:

    
print test.shape
type(test)









    



(25000, 2)






    Out[142]:





pandas.core.frame.DataFrame



In [ ]:

	id	sentiment	review
0	5814_8	1	With all this stuff going down at the moment w...
1	2381_9	1	\The Classic War of the Worlds\" by Timothy Hi...
2	7759_3	0	The film starts with a manager (Nicholas Bell)...
3	3630_4	0	It must be assumed that those who praised this...
4	9495_8	1	Superbly trashy and wondrously unpretentious 8...

	id	review
0	12311_10	Naturally in a film who's main themes are of m...
1	8348_2	This movie is a disaster within a disaster fil...
2	5828_4	All in all, this is a movie for kids. We saw i...
3	7186_2	Afraid of the Dark left me with the impression...
4	12128_7	A very accurate depiction of small time mob li...