In [1]:
# Record the environment for reproducibility: author (-a), date (-d),
# Python version (-v) and the versions of the listed packages (-p).
%load_ext watermark
%watermark -a 'Vahid Mirjalili' -d -p scikit-learn,numpy,numexpr,pandas,matplotlib,plotly -v
In [2]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import scipy
import sklearn
%matplotlib inline
In [3]:
# Load the labeled training set; read_table parses the file as tab-separated by default.
df = pd.read_table('../data/labeledTrainData.tsv')
# Preview the first rows to check how the columns were parsed.
df.head()
Out[3]:
In [4]:
# Load the test set; read_table parses the file as tab-separated by default.
df_test = pd.read_table('../data/testData.tsv')
# Preview the first rows to check how the columns were parsed.
df_test.head()
Out[4]:
In [5]:
# Split the training frame into the raw review text (model input)
# and the sentiment column (target), then preview the text.
data_train = df['review']
y_train = df['sentiment']
data_train.head()
Out[5]:
In [6]:
# Raw review text of the test set; show the last rows as a sanity check.
data_test = df_test['review']
data_test.tail()
Out[6]:
In [7]:
import nltk
import string
import re
from collections import Counter
from nltk.corpus import stopwords
Transform to lower-case
Remove punctuation
Remove the stopwords
Tokenize the remaining string
In [8]:
## For more info, see http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = nltk.stem.porter.PorterStemmer()

# Built once at definition time: stopwords.words('english') re-reads the
# corpus on every call, and the original evaluated it once per token inside
# the filter. A frozenset also makes each membership test O(1).
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# A stem is kept only when it consists solely of ASCII letters; this is what
# drops punctuation tokens and contraction fragments such as "'s".
_LETTERS_ONLY = re.compile(r'^[a-zA-Z]+$')


def get_tokens(inp_txt):
    """Turn raw text into a list of lower-cased, stemmed word tokens.

    Steps, in order: lower-case the text, tokenize it with
    ``nltk.word_tokenize``, drop English stop words, Porter-stem the
    remaining tokens, and finally discard every stem that is not purely
    alphabetic (ASCII letters only).

    Parameters
    ----------
    inp_txt : str
        The text to tokenize.

    Returns
    -------
    list of str
        The surviving stems, in their original order (duplicates kept).
    """
    tokens = nltk.word_tokenize(inp_txt.lower())
    stems = [stemmer.stem(tok) for tok in tokens if tok not in _ENGLISH_STOPWORDS]
    return [stem for stem in stems if _LETTERS_ONLY.match(stem) is not None]
In [9]:
get_tokens("What's in a name? That which we call a rose by any other name would smell as sweet.")
## Note: get_tokens relies on NLTK data packages that must be downloaded once:
# import nltk
# nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
# nltk.download('stopwords')  # word list read through nltk.corpus.stopwords
Out[9]:
In [10]:
# Import the class from its submodule explicitly: a bare `import sklearn` does
# not guarantee that `sklearn.feature_extraction.text` has been loaded, so the
# attribute chain only resolves if another import happened to pull it in.
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer whose tokenization is delegated to get_tokens
# (stop-word removal, stemming and punctuation filtering happen there).
tfidf = TfidfVectorizer(
    encoding='utf-8',
    decode_error='replace',   # undecodable bytes are replaced instead of raising
    strip_accents='ascii',
    analyzer='word',
    smooth_idf=True,
    tokenizer=get_tokens,
)
tfidf
Out[10]:
In [11]:
## Shakespeare quotes, used as a two-document toy corpus
example_txt_1 = "What's in a name? That which we call a rose by any other name would smell as sweet."
example_txt_2 = "To be, or not to be: that is the question."

# Fit on the quotes as two separate documents. Joining them with `+` glued
# "sweet." directly onto "To" (no separator) and produced a one-document
# corpus, for which the IDF weights carry no information.
tfidf = tfidf.fit([example_txt_1, example_txt_2])
example1 = tfidf.transform([example_txt_1])
example2 = tfidf.transform([example_txt_2])

# vocabulary_ maps each term to its column index; sorting the terms by that
# index lists the features in column order on every scikit-learn version
# (get_feature_names() was removed in scikit-learn 1.2).
feature_names = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
print('Features: %s' %feature_names)
print('Example1: %s' %example1.toarray())
print('Example2: %s' %example2.toarray())
In [12]:
# Re-fit the vectorizer on the full set of training reviews. The Series is
# passed as-is (the same way the later TfidfVectorizer cell does): fit() only
# iterates over the documents, and Series.ravel() is deprecated in recent pandas.
tfidf_train = tfidf.fit(data_train)
# One vocabulary_ entry per feature; avoids get_feature_names(), which was
# removed in scikit-learn 1.2.
print('Feature-set size: %s' %len(tfidf_train.vocabulary_))
In [ ]:
import pickle

# Pickle streams are bytes, so the file must be opened in binary mode ('wb'):
# text mode raises TypeError on Python 3 and can corrupt the stream on
# platforms that translate newlines. `with` closes the handle even if dump fails.
# NOTE: the vectorizer references get_tokens, which pickle stores by name only;
# the function must be defined again before the object is unpickled.
with open('../data/pickle/tfidf_object.pkl', 'wb') as pkl_out:
    pickle.dump(tfidf, pkl_out)
In [13]:
### Vectorizing the training set:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
X_train = vectorizer.fit_transform(data_train)
print("Number of samples N= %d, Number of features d= %d" % X_train.shape)
### Transforming the test dataset:
X_test = vectorizer.transform(data_test)
print("Number of Test Documents: %d, Number of features: %d" %X_test.shape)
In [14]:
from sklearn import metrics


def apply_classifier(clf):
    """Fit ``clf`` on the training data and predict labels for the test set.

    Reads the notebook-level ``X_train``, ``y_train`` and ``X_test``; the
    classifier is fitted in place.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``

    Returns
    -------
    The value returned by ``clf.predict(X_test)``.
    """
    clf.fit(X_train, y_train)
    return clf.predict(X_test)
In [15]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from timeit import timeit
In [16]:
# Benchmark fit + predict only. %timeit executes the statement in its own
# scope, so pred_multNB is NOT bound in the notebook namespace by this cell.
%timeit pred_multNB = apply_classifier(MultinomialNB(alpha=.01))
In [35]:
# Fit the model and predict the test labels, then pair every test id with its
# prediction: after the transpose there is one row per review, with the id in
# the first column and the predicted label in the second.
pred_multNB = apply_classifier(MultinomialNB(alpha=.01))
pred_multNB = np.vstack((df_test['id'], pred_multNB)).transpose()
pred_multNB.shape
Out[35]:
In [38]:
# Write the predictions as "id,sentiment" rows. comments='' stops np.savetxt
# from prefixing the header with its default '# ' marker, so the first line of
# the file is exactly "id,sentiment" and parses as a normal CSV header.
# fmt is a complete row template, so `delimiter` does not affect the row layout.
np.savetxt('../results/pred.multinomialNB.csv', pred_multNB, fmt='%s,%1d',
           delimiter=',', header='id,sentiment', comments='')
In [ ]: