Import the needed modules.
In [1]:
import os
import pickle
# visualization
import matplotlib.pyplot as plt
# pandas
import pandas as pd
# sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
# nlp
import spacy
import textacy
Set the project path to your local folder.
In [2]:
# set path to project -> change if needed
project_path = '/Users/EB/Google Drive/Projects/breweries'
Configure matplotlib: use a cleaner plot style.
In [3]:
# configure matplotlib
plt.style.use('seaborn-white')
Define functions.
In [4]:
def spacy_pipe(nlp):
    """Custom spacy pipeline."""
    return (nlp.tagger, nlp.parser)


def get_lem(doc):
    """Return lemmata of a spacy doc for nouns and adjectives."""
    interesting_pos = ('NOUN', 'PROPN', 'ADJ')
    lems = [word.lemma_ for word in doc if word.pos_ in interesting_pos]
    return lems


def get_chunk(noun_chunk):
    """Return interesting parts of noun chunks."""
    interesting_pos = ('NOUN', 'PROPN', 'ADJ', 'ADV', 'VERB')
    chunk = [tok.lemma_ for tok in noun_chunk if tok.pos_ in interesting_pos]
    if len(chunk) > 1:
        return ' '.join(chunk)
    else:
        return ''


def term_list(doc):
    """Return term list used to create the document-term matrix."""
    tl = []
    # lemmata of nouns and adjectives
    tl.extend(get_lem(doc))
    # noun chunks
    chunks = [get_chunk(chunk) for chunk in doc.noun_chunks]
    chunks = [chunk for chunk in chunks if chunk]
    tl.extend(chunks)
    return tl


def get_top_topic(model, doc_topic_matrix):
    """Return the top topic per document of the estimated topic model."""
    top_topics = model.top_doc_topics(doc_topic_matrix, top_n=1)
    top_topics = [topics[0] for doc_idx, topics in top_topics]
    return top_topics


def accuracy(pred, actual):
    """Calculate accuracy of predictions."""
    return sum(pred == actual) / len(pred)
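As an aside, scikit-learn already ships an equivalent metric, so the helper above is mainly for transparency; a minimal alternative would be:
In [ ]:
from sklearn.metrics import accuracy_score
# equivalent to accuracy(pred, actual) defined above:
# accuracy_score(actual, pred)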
Load pickled data.
In [5]:
os.chdir(os.path.join(project_path, 'data'))
with open('2styles_sample.p', 'rb') as f:
    data = pickle.load(f)
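The later cells suggest that data[0] holds the review metadata (dicts with a 'style_name' key) and data[1] the raw review texts; a quick sanity check under that assumption:
In [ ]:
# assumed layout: data[0] -> metadata dicts, data[1] -> review texts
print(len(data[0]), len(data[1]))
print(data[0][0]['style_name'])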
Load the spaCy pipeline for English and parse the reviews.
In [6]:
# Load spacy pipeline for English
nlp = spacy.load('en', create_pipeline=spacy_pipe)
# parse via pipeline
texts = [doc for doc in nlp.pipe(data[1], n_threads=-1)]
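To verify that the parsing and term extraction work as intended, it can help to peek at the terms of a single review (the exact output depends on the sample):
In [ ]:
# first ten terms (lemmata plus noun chunks) of the first review
print(term_list(texts[0])[:10])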
Create the term lists and the document-term matrix.
In [7]:
# create term list
tl = [term_list(doc) for doc in texts]
# document term matrix
dtm_specs = {'terms_lists': tl,
             'weighting': 'tfidf',
             'normalize': True,
             'smooth_idf': True,
             'min_df': 5,
             'max_df': 0.95}
dtm, id2t = textacy.vsm.doc_term_matrix(**dtm_specs)
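A quick look at the dimensions of the resulting matrix: rows are documents, columns are terms, and id2t maps column indices back to term strings.
In [ ]:
# documents x terms
print(dtm.shape)
# vocabulary size
print(len(id2t))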
We use Latent Dirichlet Allocation (LDA) as the model to group our texts into 5 different topics.
In [8]:
model = textacy.tm.TopicModel('lda', n_topics=5)
model.fit(dtm)
We can now inspect the top 6 terms for each topic.
In [9]:
for topic_idx, top_terms in model.top_topic_terms(id2t, top_n=6):
    print('Topic', topic_idx, ':', ', '.join(top_terms))
We can now assign each document its top topic and check how the topics relate to the beer styles.
In [10]:
doc_topic_matrix = model.transform(dtm)
In [11]:
df = pd.DataFrame(doc_topic_matrix)
df['style_name'] = [i['style_name'] for i in data[0]]
df['top_topic'] = get_top_topic(model, doc_topic_matrix)
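Peeking at the first rows shows the per-topic weights next to the style name and the assigned top topic:
In [ ]:
print(df.head())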
In [12]:
print(pd.crosstab(df['style_name'], df['top_topic']))
Topics 1 and 2 mostly relate to American Pale Lager, while topics 0, 3 and 4 seem to relate to American Porter. We can also visualize these frequencies as shares per topic.
In [13]:
ct = pd.crosstab(df['top_topic'], df['style_name'])
# convert counts to shares per topic
ct = ct.apply(lambda r: r / r.sum(), axis=1)
beer_plot = ct.plot.bar(stacked=True, color=['#ead61c', '#845422'],
                        edgecolor='none', lw=2, rot=0, figsize=(7, 6))
beer_plot.set_xlabel('Topic')
beer_plot.set_ylabel('Share')
beer_plot.legend(bbox_to_anchor=(1.12, 1.12))
plt.show()
We can use either the document-topic matrix or the document-term matrix as explanatory variables in the SVM model. We start with the document-topic matrix.
In [14]:
X = pd.DataFrame(doc_topic_matrix)
# numeric encoding for y variable (beer style)
bin_encode = {'American Pale Lager': 0,
              'American Porter': 1}
y = df['style_name'].map(bin_encode)
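A quick check of the class balance puts the accuracy numbers below into context; with two balanced classes, 0.5 would be the chance baseline:
In [ ]:
# counts per encoded class (0 = American Pale Lager, 1 = American Porter)
print(y.value_counts())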
In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)
In [16]:
seed = 123
folds = 5
In [17]:
# hyperparameter grid -> usually you would tune more settings; kept small for illustration
parameters = {'clf__kernel': ('rbf', 'poly', 'linear', 'sigmoid'),
              'clf__gamma': ('auto', 1),
              'clf__C': (10, 1.0, 0.1)}
In [18]:
piper = Pipeline([('clf', SVC(random_state=seed))])
grid_search = GridSearchCV(piper, parameters, n_jobs=3, verbose=1,
                           refit=True, cv=folds)
grid_search.fit(X_train, y_train)
print('Best score: %0.3f' % grid_search.best_score_)
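If you only need the winning hyperparameter combination rather than the full pipeline printout, best_params_ is more compact:
In [ ]:
print(grid_search.best_params_)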
In [19]:
print(grid_search.best_estimator_)
In [20]:
y_pred = grid_search.predict(X_test)
res = pd.DataFrame({'y_test': pd.Series(y_test)})
res['y_pred'] = y_pred
print(pd.crosstab(res['y_test'], res['y_pred'], rownames=['True'],
                  colnames=['Predicted']))
print('Accuracy in test set: %0.3f' % accuracy(res['y_pred'], res['y_test']))
Now we repeat the exercise using the document-term matrix as input.
In [21]:
X = dtm
# split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.1,
                                                    random_state=123)
For the much larger document-term matrix we restrict the grid to the kernel choice.
In [22]:
parameters = {'clf__kernel': ('rbf', 'linear')}
piper = Pipeline([('clf', SVC(C=1.0, gamma='auto', random_state=seed))])
grid_search = GridSearchCV(piper, parameters, n_jobs=3, verbose=1,
                           refit=True, cv=folds)
grid_search.fit(X_train, y_train)
print('Best score: %0.3f' % grid_search.best_score_)
In [23]:
print(grid_search.best_estimator_)
In [24]:
y_pred = grid_search.predict(X_test)
res = pd.DataFrame({'y_test': pd.Series(y_test)})
res['y_pred'] = y_pred
In [25]:
print(pd.crosstab(res['y_test'], res['y_pred'], rownames=['True'],
                  colnames=['Predicted']))
In [26]:
print('Accuracy in test set: %0.3f' % accuracy(res['y_pred'], res['y_test']))