Please first download the data from here: https://www.kaggle.com/c/word2vec-nlp-tutorial/data
In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from os.path import join
from bs4 import BeautifulSoup
In [2]:
# Location of the Kaggle "Bag of Words Meets Bags of Popcorn" data files.
root_dir = '/Users/arman/kaggledata/popcorn'

# Load the labeled training set and the unlabeled test set.
# quoting=3 (csv.QUOTE_NONE) keeps embedded double quotes inside reviews intact.
dfTrain = pd.read_csv(
    join(root_dir, 'labeledTrainData.tsv'),
    header=0,
    delimiter="\t",
    quoting=3,
)
dfTest = pd.read_csv(
    join(root_dir, 'testData.tsv'),
    header=0,
    delimiter="\t",
    quoting=3,
)
In [3]:
# Peek at the first five labeled training rows (expects id, sentiment, review columns).
dfTrain.head(5)
Out[3]:
In [4]:
# Peek at the first five test rows (no sentiment column -- this is what we predict).
dfTest.head(5)
Out[4]:
In [5]:
# Inspect one raw review to see the HTML markup and punctuation we must clean.
dfTrain['review'][11]
Out[5]:
In [6]:
# Binary sentiment labels used as the classification target.
target = dfTrain['sentiment']
In [7]:
def review_to_wordlist(review, remove_stopwords=False, split=False):
    """Clean one raw review string.

    Steps: strip HTML with BeautifulSoup, drop every non-alphabetic
    character, lower-case, and optionally remove English stopwords.

    Parameters
    ----------
    review : str
        Raw review text; may contain HTML markup.
    remove_stopwords : bool
        If True, filter out NLTK English stopwords.
    split : bool
        If True return the token list; otherwise return the tokens
        re-joined into a single space-separated string.
    """
    # Extract the visible text content from the HTML markup.
    review_text = BeautifulSoup(review, 'lxml').get_text()
    # Keep letters only; every other character becomes a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        # NOTE: the stopword set is rebuilt on every call; cheap enough
        # here, but hoist it if this ever becomes a hot path.
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words if split else ' '.join(words)
In [8]:
# Example: cleaned review with stopwords kept.
review_to_wordlist(dfTrain['review'][11])
Out[8]:
In [9]:
# Example: same review with English stopwords removed.
review_to_wordlist(dfTrain['review'][11],remove_stopwords=True)
Out[9]:
In [10]:
# Example: cleaned review returned as a token list instead of one string.
token = review_to_wordlist(dfTrain['review'][11],remove_stopwords=True, split=True)
print(token)
In [11]:
# Clean every review in both sets, keeping stopwords (TfidfVectorizer
# will remove its own stopword list later). Remember where the training
# rows end so the combined feature matrix can be split back apart.
dfTrain['review'] = dfTrain['review'].apply(review_to_wordlist)
dfTest['review'] = dfTest['review'].apply(review_to_wordlist)
train_len = len(dfTrain)
In [12]:
# Fit the vectorizer on train + test together so both share one vocabulary.
corpus = dfTrain['review'].tolist() + dfTest['review'].tolist()
In [13]:
# Unigram + bigram TF-IDF features over the combined corpus.
# min_df=3 drops terms seen in fewer than 3 documents;
# sublinear_tf applies 1 + log(tf) damping to raw term counts.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)
tfv.fit(corpus)
Out[13]:
In [14]:
# Sparse TF-IDF matrix for all documents (train rows first, then test).
X_all = tfv.transform(corpus)
In [15]:
# Sanity check: (n_train + n_test) documents by vocabulary size.
print(X_all.shape)
In [16]:
# Split the combined TF-IDF matrix back into its train and test parts.
# (A stray bare `c` statement was removed here -- it would raise a
# NameError, since `c` is not defined until the CV loop below.)
train = X_all[:train_len]
test = X_all[train_len:]
In [17]:
# Grid search over the inverse regularization strength C,
# scored by 5-fold cross-validated ROC AUC.
# NOTE(review): dual=True presumably relies on the liblinear solver
# being the default in this sklearn version -- confirm before upgrading.
Cs = [1, 3, 10, 30, 100, 300]
for c in Cs:
    clf = LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                             C=c, fit_intercept=True, intercept_scaling=1.0,
                             class_weight=None, random_state=None)
    scores = cross_val_score(clf, train, target, cv=5, scoring='roc_auc')
    print("c:", c, " score:", np.mean(scores))
In [18]:
# Refit on the full training set with the best C found above (C=30).
clf = LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                         C=30, fit_intercept=True, intercept_scaling=1.0,
                         class_weight=None, random_state=None)
clf.fit(train, target)
Out[18]:
In [19]:
# Probability of the positive class for each test review, written out
# in the Kaggle submission format (quoting=3 avoids quoting the ids).
preds = clf.predict_proba(test)[:, 1]
dfOut = pd.DataFrame(data={"id": dfTest["id"], "sentiment": preds})
dfOut.to_csv(join(root_dir, 'submission.csv'), index=False, quoting=3)
"!" has sentimental value!)
In [ ]: