In [1]:
import pandas as pd
In [3]:
train = pd.read_csv('../Datasets/IMDB/labeledTrainData.tsv', delimiter='\t')
In [4]:
test = pd.read_csv('../Datasets/IMDB/testData.tsv', delimiter='\t')
In [5]:
train.head()
Out[5]:
In [7]:
test.head()
Out[7]:
In [8]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
In [9]:
def review_to_text(review, remove_stopwords):
    # Strip the HTML markup that IMDB reviews contain.
    raw_text = BeautifulSoup(review, 'html.parser').get_text()
    # Keep letters only, then lowercase and split on whitespace.
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    words = letters.lower().split()
    # Optionally drop English stop words.
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    return words
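A quick illustration of what review_to_text returns; the sample string below is made up for this note, not taken from the dataset:

review_to_text('<b>This movie was great!</b>', True)
# -> ['movie', 'great']   ('this' and 'was' are NLTK English stop words)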
In [11]:
# import nltk
# nltk.download('stopwords')
Out[11]:
In [12]:
X_train = []
for review in train['review']:
    X_train.append(' '.join(review_to_text(review, True)))
y_train = train['sentiment']

X_test = []
for review in test['review']:
    X_test.append(' '.join(review_to_text(review, True)))
In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
In [14]:
pip_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')), ('mnb', MultinomialNB())])
pip_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')), ('mnb', MultinomialNB())])
params_count = {'count_vec__binary':[True, False], 'count_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}
params_tfidf = {'tfidf_vec__binary':[True, False], 'tfidf_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}
gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)
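Note that sklearn.grid_search is the pre-0.18 home of GridSearchCV; on current scikit-learn the equivalent setup, with the same arguments, would be:

from sklearn.model_selection import GridSearchCV

gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)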
In [15]:
gs_count.fit(X_train, y_train)
print gs_count.best_score_
print gs_count.best_params_
In [16]:
count_y_predict = gs_count.predict(X_test)
In [17]:
gs_tfidf.fit(X_train, y_train)
print gs_tfidf.best_score_
print gs_tfidf.best_params_
In [18]:
tfidf_y_predict = gs_tfidf.predict(X_test)
In [19]:
submission_count = pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})
submission_tfidf = pd.DataFrame({'id': test['id'], 'sentiment': tfidf_y_predict})
submission_count.to_csv('../Datasets/IMDB/submission_count.csv', index=False)
submission_tfidf.to_csv('../Datasets/IMDB/submission_tfidf.csv', index=False)
In [20]:
unlabeled_train = pd.read_csv('../Datasets/IMDB/unlabeledTrainData.tsv', delimiter='\t', quoting=3)
In [22]:
import nltk
nltk.download('punkt')
Out[22]:
In [23]:
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
In [24]:
def review_to_sentences(review, tokenizer):
    # Split a review into sentences, then each sentence into a word list.
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_text(raw_sentence, False))
    return sentences
In [25]:
corpora = []
for review in unlabeled_train['review']:
    corpora += review_to_sentences(review.decode('utf8'), tokenizer)
In [117]:
# Set values for various parameters
num_features = 32 # Word vector dimensionality
min_word_count = 20 # Minimum word count
num_workers = 4 # Number of threads to run in parallel
context = 10 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words
In [118]:
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(corpora, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)
# Training is done: trim memory (no further training possible after this).
model.init_sims(replace=True)
# Note: the file name says 300features, but num_features above is set to 32.
model_name = "../Datasets/IMDB/300features_20minwords_10context"
model.save(model_name)
In [119]:
from gensim.models import Word2Vec
model = Word2Vec.load("../Datasets/IMDB/300features_20minwords_10context")
model.most_similar("man")
Out[119]:
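For reference, on gensim 4.x the same training and query would look roughly like this (a sketch based on the 4.x renames, not run as part of this notebook):

from gensim.models import Word2Vec

# gensim 4.x renamed the `size` argument to `vector_size`,
# and similarity queries go through the .wv KeyedVectors.
model = Word2Vec(corpora, workers=num_workers, vector_size=num_features,
                 min_count=min_word_count, window=context, sample=downsampling)
model.wv.most_similar("man")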
In [31]:
import numpy as np
def makeFeatureVec(words, model, num_features):
    # Average the word2vec vectors of all in-vocabulary words in one review.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model.wv[word])
    featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features):
    # Build one averaged feature vector per review.
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter += 1
    return reviewFeatureVecs
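A review with no in-vocabulary words leaves nwords at 0, so the division produces a NaN vector; that is what the NaN-row clean-up further down has to remove. A guarded variant (makeFeatureVecSafe is a hypothetical name, shown only as a sketch) would avoid the NaNs altogether:

def makeFeatureVecSafe(words, model, num_features):
    # Same averaging as above, but fall back to a zero vector
    # when none of the words are in the word2vec vocabulary.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            nwords += 1.
            featureVec = np.add(featureVec, model.wv[word])
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec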
In [120]:
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(review_to_text(review, remove_stopwords=True))
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)
# Keep a reference to the full matrix so it can be restored before clean-up below.
trainDataVecsCopy = trainDataVecs

clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_to_text(review, remove_stopwords=True))
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)
testDataVecsCopy = testDataVecs
In [143]:
trainDataVecs = trainDataVecsCopy
y_train = train['sentiment']
testDataVecs = testDataVecsCopy
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
gbc = GradientBoostingClassifier()
params_gbc = {'n_estimators':[10, 50, 100], 'learning_rate':[0.01, 0.1, 1.0], 'max_depth': [2, 3, 4]}
gs = GridSearchCV(gbc, params_gbc, cv=3, n_jobs=-1, verbose=1)
print np.isnan(trainDataVecs).sum()
print trainDataVecs.shape
# print trainDataVecs[0]
print len(X_train)
print len(trainDataVecs)
print y_train.shape
idx = 0
idxList = []
for x in trainDataVecs:
if np.isnan(x).any():
# trainDataVecs = np.delete(trainDataVecs, idx)
# y_train = np.delete(y_train, idx)
idxList.append(idx)
print idx,x
idx = idx + 1
else:
idx = idx + 1
idxList.reverse()
for i in idxList:
print i
trainDataVecs = np.delete(trainDataVecs, i, axis = 0)
# y_train = np.delete(y_train, i, axis = None)
y_train = y_train.drop(i)
idxList=[]
# x = trainDataVecs[np.isnan(trainDataVecs)]
# print x
# y_train=y_train[~np.isnan(trainDataVecs)]
# trainDataVecs.dropna(inplace=True)
# trainDataVecs.fillna('0.1')
# trainDataVecs.replace([NaN],0.1)
gs.fit(trainDataVecs, y_train)
print gs.best_score_
print gs.best_params_
idx = 0
idxList = []
for x in testDataVecs:
if np.isnan(x).any():
idxList.append(idx)
print idx,x
idx = idx + 1
else:
idx = idx + 1
idxList.reverse()
for i in idxList:
print i
testDataVecs = np.delete(testDataVecs, i, axis = 0)
test = test.drop(i)
idxList=[]
result = gs.predict(testDataVecs)
# Write the test results
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "../Datasets/IMDB/submission_w2v.csv", index=False, quoting=3)
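The same row filtering can be written more compactly with a boolean mask; this is only a sketch of an alternative, under the assumption that y_train still carries the default 0..N-1 index, and is not the code that was run:

nan_rows = np.isnan(trainDataVecs).any(axis=1)
trainDataVecs = trainDataVecs[~nan_rows]
y_train = y_train[~nan_rows]

nan_rows = np.isnan(testDataVecs).any(axis=1)
testDataVecs = testDataVecs[~nan_rows]
test = test[~nan_rows]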
In [98]:
print idxList
In [122]:
idxList=[]
trainDataVecs = trainDataVecsCopy
y_train = train['sentiment']
print y_train.shape
print type(y_train)
# x1 = y_train.drop(24008)
# np.delete(y_train,24008)
print x1.shape
print y_train.shape
# print x1
In [124]:
testDataVecsCopy = testDataVecs
In [133]:
print testDataVecs.shape
print testDataVecsCopy.shape
print idxList
In [132]:
testDataVecs[np.isnan(testDataVecs)]
Out[132]:
In [138]:
idx = 0
idxList = []
for x in testDataVecs:
if np.isnan(x).any():
idxList.append(idx)
print idx,x
idx = idx + 1
else:
idx = idx + 1
print idxList
print idxList.reverse()
print reversed(idxList)
print type(idxList)
idxList = []
In [142]:
print test.shape
type(test)
Out[142]:
In [ ]: