Adapted from NLP Crash Course by Charlie Greenbacker and Introduction to NLP by Dan Jurafsky
NLP requires an understanding of the language and the world.
In [1]:
!pip install textblob
In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import sparse  # ensure sp.sparse is available
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
In [3]:
# read yelp.csv into a DataFrame
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/yelp.csv'
yelp = pd.read_csv(url)
# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
# define X and y
X = yelp_best_worst.text
y = yelp_best_worst.stars
# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
In [4]:
# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
In [5]:
# rows are documents, columns are terms (aka "tokens" or "features")
X_train_dtm.shape
Out[5]:
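Most entries in a document-term matrix are zero, so scikit-learn stores it as a sparse matrix. A quick check of the density, using the matrix's stored-element count (nnz):
# fraction of non-zero entries in the document-term matrix
X_train_dtm.nnz / (X_train_dtm.shape[0] * X_train_dtm.shape[1])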
In [6]:
# last 50 features
print(vect.get_feature_names_out()[-50:])
In [7]:
# show vectorizer options
vect
Out[7]:
In [8]:
# don't convert to lowercase
vect = CountVectorizer(lowercase=False)
X_train_dtm = vect.fit_transform(X_train)
print(X_train_dtm.shape)
vect.get_feature_names_out()[-10:]
Out[8]:
In [9]:
# include 1-grams and 2-grams, and only keep terms that appear in at least 5 documents
vect = CountVectorizer(ngram_range=(1, 2), min_df=5)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape
Out[9]:
In [10]:
# last 50 features
print(vect.get_feature_names_out()[-50:])
Predicting the star rating:
In [11]:
# use default options for CountVectorizer
vect = CountVectorizer()
# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
# use Naive Bayes to predict the star rating
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)
# calculate accuracy
print(metrics.accuracy_score(y_test, y_pred_class))
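Accuracy alone hides where the mistakes are; a confusion matrix breaks them out (rows are the actual classes 1 and 5, columns are the predictions):
# confusion matrix for the 1-star/5-star classifier
print(metrics.confusion_matrix(y_test, y_pred_class))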
In [12]:
# calculate null accuracy: the accuracy from always predicting the majority class
y_test_binary = np.where(y_test==5, 1, 0)
print(y_test_binary.mean())
print(1 - y_test_binary.mean())
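Equivalently, pandas can report the class balance directly; the larger proportion is the null accuracy:
# class distribution of y_test: the max is the null accuracy
y_test.value_counts(normalize=True)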
In [13]:
# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    X_train_dtm = vect.fit_transform(X_train)
    print('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
In [14]:
# include 1-grams through 3-grams, require each term to appear at least twice, and cap at 10,000 features
vect = CountVectorizer(ngram_range=(1, 3), min_df=2, max_features=10000)
tokenize_test(vect)
In [15]:
# show vectorizer options
vect
Out[15]:
In [16]:
# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)
vect.get_params()
Out[16]:
In [17]:
# set of stop words
print(vect.get_stop_words())
In [18]:
# remove English stop words and only keep 100 features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)
In [19]:
# all 100 features
print(vect.get_feature_names_out())
In [20]:
# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 2), max_features=100000)
tokenize_test(vect)
In [21]:
# include 1-grams and 2-grams, and only include terms that appear at least 2 times
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)
TextBlob: "Simplified Text Processing"
In [22]:
# print the first review
print(yelp_best_worst.text[0])
In [23]:
# save it as a TextBlob object
review = TextBlob(yelp_best_worst.text[0])
In [24]:
# list the words
review.words
Out[24]:
In [25]:
# list the sentences
review.sentences
Out[25]:
In [26]:
# some string methods are available
review.lower()
Out[26]:
Stemming:
In [27]:
# initialize stemmer
stemmer = SnowballStemmer('english')
# stem each word
print([stemmer.stem(word) for word in review.words])
Lemmatization:
In [28]:
# assume every word is a noun
print([word.lemmatize() for word in review.words])
In [29]:
# assume every word is a verb
print([word.lemmatize(pos='v') for word in review.words])
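To see how the two differ, compare the stemmer and the lemmatizer side by side on a few words (an illustrative check using the stemmer and Word imported above):
# stemming chops suffixes; lemmatization maps words to dictionary forms
for w in ['running', 'leaves', 'studies']:
    print(w, '->', stemmer.stem(w), '|', Word(w).lemmatize('v'))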
In [30]:
# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    # optionally lowercase first: text = text.lower()
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]
In [31]:
# use split_into_lemmas as the analyzer (WARNING: SLOW!)
vect = CountVectorizer(analyzer=split_into_lemmas, decode_error='replace')
tokenize_test(vect)
In [32]:
# last 50 features
print(vect.get_feature_names_out()[-50:])
In [33]:
# example documents
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']
In [34]:
# Term Frequency
vect = CountVectorizer()
tf = pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names_out())
tf
Out[34]:
In [35]:
# Document Frequency
vect = CountVectorizer(binary=True)
df = vect.fit_transform(simple_train).toarray().sum(axis=0)
pd.DataFrame(df.reshape(1, -1), columns=vect.get_feature_names_out())
Out[35]:
In [36]:
# Term Frequency-Inverse Document Frequency (simple version)
tf/df
Out[36]:
In [37]:
# TfidfVectorizer
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names_out())
Out[37]:
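The same numbers can be reconstructed by hand. A sketch of TfidfVectorizer's default weighting (smooth_idf=True, norm='l2'), reusing the tf and df objects computed above:
# smoothed idf, as in scikit-learn: ln((1 + n) / (1 + df)) + 1
n_docs = tf.shape[0]
idf = np.log((1 + n_docs) / (1 + df)) + 1
tfidf = tf * idf                                      # broadcasts idf across columns
tfidf.div(np.sqrt((tfidf ** 2).sum(axis=1)), axis=0)  # L2-normalize each row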
More details: TF-IDF is about what matters
Reddit's autotldr uses the SMMRY algorithm, which is based on TF-IDF!
In [38]:
# create a document-term matrix using TF-IDF
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(yelp.text)
features = list(vect.get_feature_names_out())
dtm.shape
Out[38]:
In [39]:
def summarize():
    # choose a random review that is at least 300 characters
    review_length = 0
    while review_length < 300:
        review_id = np.random.randint(0, len(yelp))
        review_text = yelp.text[review_id]
        review_length = len(review_text)
    # create a dictionary of words and their TF-IDF scores
    word_scores = {}
    for word in TextBlob(review_text).words:
        word = word.lower()
        if word in features:
            word_scores[word] = dtm[review_id, features.index(word)]
    # print words with the top 5 TF-IDF scores
    print('TOP SCORING WORDS:')
    top_scores = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5]
    for word, score in top_scores:
        print(word)
    # print 5 random words
    print('\n' + 'RANDOM WORDS:')
    random_words = np.random.choice(list(word_scores.keys()), size=5, replace=False)
    for word in random_words:
        print(word)
    # print the review
    print('\n' + review_text)
In [40]:
summarize()
Sentiment analysis:
In [41]:
# print the first review again
print(review)
In [42]:
# polarity ranges from -1 (most negative) to 1 (most positive)
review.sentiment.polarity
Out[42]:
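TextBlob's sentiment also reports subjectivity, from 0 (very objective) to 1 (very subjective); the full named tuple carries both values:
# Sentiment(polarity, subjectivity)
review.sentiment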
In [43]:
# understanding the apply method
yelp['length'] = yelp.text.apply(len)
yelp.head(1)
Out[43]:
In [44]:
# define a function that accepts text and returns the polarity
def detect_sentiment(text):
    return TextBlob(text).sentiment.polarity
In [45]:
# create a new DataFrame column for sentiment (WARNING: SLOW!)
yelp['sentiment'] = yelp.text.apply(detect_sentiment)
In [46]:
# box plot of sentiment grouped by stars
yelp.boxplot(column='sentiment', by='stars')
Out[46]:
In [47]:
# reviews with most positive sentiment
yelp[yelp.sentiment == 1].text.head()
Out[47]:
In [48]:
# reviews with most negative sentiment
yelp[yelp.sentiment == -1].text.head()
Out[48]:
In [49]:
# widen the column display
pd.set_option('display.max_colwidth', 500)
In [50]:
# negative sentiment in a 5-star review
yelp[(yelp.stars == 5) & (yelp.sentiment < -0.3)].head(1)
Out[50]:
In [51]:
# positive sentiment in a 1-star review
yelp[(yelp.stars == 1) & (yelp.sentiment > 0.5)].head(1)
Out[51]:
In [52]:
# reset the column display width
pd.reset_option('display.max_colwidth')
In [53]:
# create a DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
# define X and y
feature_cols = ['text', 'sentiment', 'cool', 'useful', 'funny']
X = yelp_best_worst[feature_cols]
y = yelp_best_worst.stars
# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
In [54]:
# use CountVectorizer with text column only
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train.text)
X_test_dtm = vect.transform(X_test.text)
print(X_train_dtm.shape)
print(X_test_dtm.shape)
In [55]:
# shape of other four feature columns
X_train.drop('text', axis=1).shape
Out[55]:
In [56]:
# cast other feature columns to float and convert to a sparse matrix
extra = sp.sparse.csr_matrix(X_train.drop('text', axis=1).astype(float))
extra.shape
Out[56]:
In [57]:
# combine sparse matrices
X_train_dtm_extra = sp.sparse.hstack((X_train_dtm, extra))
X_train_dtm_extra.shape
Out[57]:
In [58]:
# repeat for testing set
extra = sp.sparse.csr_matrix(X_test.drop('text', axis=1).astype(float))
X_test_dtm_extra = sp.sparse.hstack((X_test_dtm, extra))
X_test_dtm_extra.shape
Out[58]:
In [59]:
# use logistic regression with text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
print(metrics.accuracy_score(y_test, y_pred_class))
In [60]:
# use logistic regression with all features
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm_extra, y_train)
y_pred_class = logreg.predict(X_test_dtm_extra)
print(metrics.accuracy_score(y_test, y_pred_class))
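The manual hstack above can also be written as a pipeline. A sketch using ColumnTransformer (added in scikit-learn 0.20), which vectorizes the text column and passes the numeric columns through unchanged:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# vectorize 'text'; pass sentiment/cool/useful/funny through as-is
ct = ColumnTransformer([('text', CountVectorizer(), 'text')],
                       remainder='passthrough')
pipe = make_pipeline(ct, LogisticRegression(C=1e9, solver='liblinear'))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))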
In [61]:
# spelling correction
TextBlob('15 minuets late').correct()
Out[61]:
In [62]:
# spellcheck
Word('parot').spellcheck()
Out[62]:
In [63]:
# definitions of 'bank' as a verb
Word('bank').define('v')
Out[63]:
In [64]:
# language identification (requires an internet connection; deprecated in newer TextBlob releases)
TextBlob('Hola amigos').detect_language()
Out[64]:
In [65]:
import re
# compile a pattern matching common punctuation characters
p = re.compile(r'[\'!@#$%^&*(),<>.?/:"\|}{};]')
# usage: p.sub('', text).lower().strip() removes punctuation, lowercases, and trims whitespace
In [66]:
text = 'TTThe other one tttthe re, the blithe one.'
# match one to three t's or T's followed by 'he' (note: this also matches inside 'blithe')
reg = re.compile('[tT]{1,3}he')
reg.sub('', text)
Out[66]: