by Alejandro Correa Bahnsen and Jesus Solano
version 1.5, March 2019
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. Special thanks go to Kevin Markham.
NLP requires an understanding of the language and the world.
In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
In [2]:
df = pd.read_csv('https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/mashable_texts.csv', index_col=0)
In [3]:
df.head()
Out[3]:
In [5]:
y = df.shares
y.describe()
Out[5]:
In [6]:
# bin shares into 4 ordered categories using fixed cut points
y = pd.cut(y, [0, 893, 1200, 2275, 63200], labels=[0, 1, 2, 3])
In [7]:
y.value_counts()
Out[7]:
In [8]:
df['y'] = y
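As a point of reference (an added aside, not part of the original walkthrough), the accuracy of always predicting the most frequent class, the null accuracy, is the number every vectorizer/model combination below should beat. A minimal sketch using the binned target:
In [ ]:
# baseline: accuracy of always predicting the most frequent class (null accuracy)
y.value_counts(normalize=True).max()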
In [9]:
X = df.text
In [10]:
# use CountVectorizer to create a document-term matrix from X
vect = CountVectorizer()
X_dtm = vect.fit_transform(X)
In [16]:
# dense view of the document-term matrix (memory-heavy for large corpora)
temp = X_dtm.todense()
In [17]:
# mapping of terms to their column indices
vect.vocabulary_
Out[17]:
In [10]:
# rows are documents, columns are terms (aka "tokens" or "features")
X_dtm.shape
Out[10]:
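Document-term matrices are mostly zeros, which is why scikit-learn returns them as sparse matrices; a quick sparsity check (an added aside, not in the original notebook) makes this concrete:
In [ ]:
# fraction of cells in the document-term matrix that are non-zero
X_dtm.nnz / float(X_dtm.shape[0] * X_dtm.shape[1])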
In [11]:
# a slice of 50 features near the end of the alphabetically sorted vocabulary
print(vect.get_feature_names()[-150:-100])
In [12]:
# show vectorizer options
vect
Out[12]:
In [19]:
# keep the original casing instead of lowercasing
vect = CountVectorizer(lowercase=False)
X_dtm = vect.fit_transform(X)
X_dtm.shape
Out[19]:
In [25]:
# column index of the most frequent token in the first document
X_dtm.todense()[0].argmax()
Out[25]:
In [26]:
# look up the corresponding term
vect.get_feature_names()[8097]
Out[26]:
In [29]:
# include 1-grams through 4-grams
vect = CountVectorizer(ngram_range=(1, 4))
X_dtm = vect.fit_transform(X)
X_dtm.shape
Out[29]:
In [ ]:
In [30]:
# a slice of 50 n-gram features near the end of the vocabulary
print(vect.get_feature_names()[-1000:-950])
In [31]:
# Default CountVectorizer
vect = CountVectorizer()
X_dtm = vect.fit_transform(X)
# use Naive Bayes to predict the shares category
nb = MultinomialNB()
pd.Series(cross_val_score(nb, X_dtm, y, cv=10)).describe()
Out[31]:
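LogisticRegression is imported above but never used; as a comparison (an added aside, and the solver choice below is an assumption, not part of the original notebook), the same document-term matrix can be cross-validated with it:
In [ ]:
# compare Naive Bayes against logistic regression on the same document-term matrix
logreg = LogisticRegression(solver='liblinear')
pd.Series(cross_val_score(logreg, X_dtm, y, cv=10)).describe()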
In [32]:
# define a function that accepts a vectorizer and reports the cross-validated accuracy
def tokenize_test(vect):
    X_dtm = vect.fit_transform(X)
    print('Features: ', X_dtm.shape[1])
    nb = MultinomialNB()
    print(pd.Series(cross_val_score(nb, X_dtm, y, cv=10)).describe())
In [33]:
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)
In [34]:
# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)
In [35]:
# set of stop words
print(vect.get_stop_words())
In [36]:
# remove English stop words and keep only the 100 most frequent terms
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)
In [37]:
# all 100 features
print(vect.get_feature_names())
In [38]:
# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 2), max_features=1000)
tokenize_test(vect)
In [24]:
# include 1-grams and 2-grams, and only include terms that appear in at least 2 documents
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)
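min_df has a mirror-image parameter, max_df, which drops terms that appear in too many documents; combining the two (an added variation, not part of the original sequence) trims both very rare and very common terms:
In [ ]:
# drop terms that appear in fewer than 2 documents or in more than half of them
vect = CountVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.5)
tokenize_test(vect)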
Stemming
In [39]:
# initialize stemmer
stemmer = SnowballStemmer('english')
# words
In [40]:
vect = CountVectorizer()
vect.fit(X)
Out[40]:
In [41]:
# take the first 100 words from the learned vocabulary
words = list(vect.vocabulary_.keys())[:100]
In [42]:
# stem each word
print([stemmer.stem(word) for word in words])
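To use stemming inside the bag-of-words pipeline, a custom analyzer can stem each token before counting, mirroring the split_into_lemmas approach used below for lemmatization. A minimal sketch (split_into_stems is a hypothetical helper, and fitting it will be slow):
In [ ]:
# hypothetical helper: accept text and return a list of stems
def split_into_stems(text):
    words = text.lower().split()
    return [stemmer.stem(word) for word in words]

# use split_into_stems as the feature extraction function (slow, like the lemma version below)
vect = CountVectorizer(analyzer=split_into_stems)
tokenize_test(vect)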
Lemmatization
In [29]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
In [30]:
import nltk
nltk.download('wordnet')
Out[30]:
In [31]:
# assume every word is a noun
print([wordnet_lemmatizer.lemmatize(word) for word in words])
In [32]:
# assume every word is a verb
print([wordnet_lemmatizer.lemmatize(word, pos='v') for word in words])
In [33]:
# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    text = text.lower()
    words = text.split()
    return [wordnet_lemmatizer.lemmatize(word) for word in words]
In [34]:
# use split_into_lemmas as the feature extraction function (WARNING: SLOW!)
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)
In [43]:
# example documents
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']
In [44]:
# Term Frequency
vect = CountVectorizer()
tf = pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
tf
Out[44]:
In [45]:
# Document Frequency
vect = CountVectorizer(binary=True)
df_ = vect.fit_transform(simple_train).toarray().sum(axis=0)
pd.DataFrame(df_.reshape(1, 6), columns=vect.get_feature_names())
Out[45]:
In [46]:
# Term Frequency-Inverse Document Frequency (simple version)
tf/df_
Out[46]:
In [47]:
# TfidfVectorizer
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
Out[47]:
More details: TF-IDF is about what matters
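The simple tf/df ratio above does not match TfidfVectorizer's output because, by default, scikit-learn uses a smoothed idf, idf(t) = ln((1 + n) / (1 + df(t))) + 1, and then L2-normalizes each row. A minimal sketch reproducing that on the toy corpus (reusing the tf and df_ objects computed above; default TfidfVectorizer settings are assumed):
In [ ]:
# reproduce scikit-learn's default TF-IDF by hand on the toy corpus
n_docs = tf.shape[0]
idf = np.log((1 + n_docs) / (1 + df_)) + 1            # smoothed idf
tfidf = tf * idf                                      # raw tf-idf, still a DataFrame
tfidf.div(np.sqrt((tfidf ** 2).sum(axis=1)), axis=0)  # L2-normalize each row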
In [48]:
# create a document-term matrix using TF-IDF
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(X)
features = vect.get_feature_names()
dtm.shape
Out[48]:
In [49]:
# choose one text by position
review_id = 40
review_text = X[review_id]
review_length = len(review_text)
In [50]:
# create a dictionary of words and their TF-IDF scores
word_scores = {}
for word in vect.vocabulary_.keys():
    word = word.lower()
    if word in features:
        word_scores[word] = dtm[review_id, features.index(word)]
In [43]:
# print words with the top 5 TF-IDF scores
print('TOP SCORING WORDS:')
top_scores = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5]
for word, score in top_scores:
    print(word)
In [51]:
# print 5 random words
print('\n' + 'RANDOM WORDS:')
random_words = np.random.choice(list(word_scores.keys()), size=5, replace=False)
for word in random_words:
    print(word)
In [ ]: