In [2]:
# Record the environment this notebook was run in: author, date, Python
# version and the versions of the packages listed after -p.
%load_ext watermark
%watermark -a 'Vahid Mirjalili' -d -p scikit-learn,numpy,numexpr,pandas,matplotlib,plotly -v
In [3]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import scipy
import sklearn
%matplotlib inline
In [4]:
# Read the labelled training reviews (tab-separated file) and preview the first rows.
df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t')
df.head()
Out[4]:
In [5]:
# Split the frame into the raw review text (features) and the sentiment label (target).
X_train = df['review']
y_train = df['sentiment']
X_train.head()
Out[5]:
In [6]:
import nltk
import string
import re
from collections import Counter
from nltk.corpus import stopwords
Transform to lower-case
Remove punctuation
Remove the stopwords
Tokenize the remaining string
In [7]:
## For more info, see http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = nltk.stem.porter.PorterStemmer()

# Build the stop-word collection once. Calling stopwords.words('english')
# inside the token loop rebuilds the list and scans it linearly for every
# single token; a frozenset gives constant-time membership tests instead.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Keeps only stems made purely of ASCII letters; this is what drops
# punctuation tokens, numbers and contraction fragments such as "'s".
ALPHA_ONLY = re.compile('^[a-zA-Z]+$')


def get_tokens(inp_txt):
    """Turn a raw text string into a list of cleaned, stemmed word tokens.

    Steps, in order:
      1. lower-case the text,
      2. tokenize it with nltk.word_tokenize,
      3. drop English stop-words,
      4. stem the remaining tokens with the Porter stemmer,
      5. keep only stems consisting solely of the letters a-z / A-Z.

    Parameters
    ----------
    inp_txt : str
        The text to tokenize.

    Returns
    -------
    list of str
        The surviving stems, in their original order.
    """
    tokens = nltk.word_tokenize(inp_txt.lower())
    # Stop-words are filtered *before* stemming, as in the original pipeline.
    stems = [stemmer.stem(tok) for tok in tokens if tok not in ENGLISH_STOPWORDS]
    return [s for s in stems if ALPHA_ONLY.match(s) is not None]
In [8]:
# Quick check of the tokenizer on a short quote.
get_tokens("What's in a name? That which we call a rose by any other name would smell as sweet.")
## Note: get_tokens needs the NLTK 'punkt' tokenizer models and the 'stopwords' corpus:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
Out[8]:
In [9]:
# Import the vectorizer explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.feature_extraction.text` submodule has been loaded, so
# attribute access through the top-level package can fail.
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that delegates tokenization to get_tokens, so stop-word
# removal and stemming are applied to every document.
tfidf = TfidfVectorizer(
    encoding='utf-8',
    decode_error='replace',
    strip_accents='ascii',
    analyzer='word',
    smooth_idf=True,
    tokenizer=get_tokens,
)
tfidf
Out[9]:
In [10]:
## Shakespeare quotes
example_txt_1 = "What's in a name? That which we call a rose by any other name would smell as sweet."
example_txt_2 = "To be, or not to be: that is the question."

# Fit on the two quotes as separate documents. Joining them with `+` put
# "sweet." directly against "To" with no whitespace, so the tokenizer could
# see "sweet.to" as one token, which the letters-only filter then discards.
tfidf.fit([example_txt_1, example_txt_2])
example1 = tfidf.transform([example_txt_1])
example2 = tfidf.transform([example_txt_2])

# Feature names in column order, taken from the fitted vocabulary
# (a dict mapping term -> column index).
feature_names = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
print('Features: %s' % feature_names)
print('Example1: %s' % example1.toarray())
print('Example2: %s' % example2.toarray())
In [12]:
# Re-fit the vectorizer on the full training corpus and build the
# document-term matrix in one pass.
tfidf_train = tfidf.fit_transform(X_train.values)

# fit_transform returns a sparse matrix, which has no get_feature_names()
# method; its column count is the size of the learned feature set.
print('Feature-set size: %s' % tfidf_train.shape[1])
In [ ]:
import pickle

# Persist the fitted vectorizer. Pickle streams are binary, so the file is
# opened in 'wb' mode (text mode 'w' raises TypeError on Python 3); the
# `with` block closes the file even if dumping fails.
# NOTE: the vectorizer references get_tokens, and pickle stores functions by
# name only, so get_tokens must be defined/importable wherever this is loaded.
with open('../data/pickle/tfidf_object.pkl', 'wb') as pkl_out:
    pickle.dump(tfidf, pkl_out)