Mostly adapted from: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
In [1]:
%matplotlib inline
import numpy as np
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import patsy
import statsmodels.api as sm
from sklearn import tree, linear_model, metrics, dummy, naive_bayes, neighbors
from IPython.display import Image
import pydotplus
import nltk
import gensim
import wordcloud
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.gensim
In [2]:
sns.set_context("paper")
sns.set_style("ticks")
def get_confusion_matrix(clf, X, y, verbose=True, classes=None):
    # Predict on X, then return a classification report and the
    # confusion matrix as a labeled DataFrame.
    y_pred = clf.predict(X)
    cm = metrics.confusion_matrix(y_true=y, y_pred=y_pred)
    clf_report = metrics.classification_report(y, y_pred)
    if classes is None:
        classes = clf.classes_
    df_cm = pd.DataFrame(cm, columns=classes, index=classes)
    if verbose:
        print(clf_report)
        print(df_cm)
    return clf_report, df_cm
In [4]:
from sklearn.datasets import fetch_20newsgroups
In [5]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
In [6]:
len(twenty_train.data), len(twenty_test.data)
Out[6]:
In [11]:
twenty_train.target_names
Out[11]:
In [13]:
print(twenty_train.data[0])
In [18]:
"\n".join(twenty_train.data[0].splitlines()[6:-5])
Out[18]:
In [21]:
twenty_train.target[0]
Out[21]:
In [22]:
twenty_train.target_names
Out[22]:
In [23]:
classification_categories = ["soc.religion.christian", "sci.med"]
In [24]:
classes = np.array(twenty_train.target_names)
In [25]:
classes
Out[25]:
In [26]:
from sklearn.feature_extraction.text import CountVectorizer
In [27]:
CountVectorizer?
In [28]:
count_vec = CountVectorizer(stop_words="english", token_pattern="[a-z]+")
In [29]:
X_train_counts = count_vec.fit_transform(twenty_train.data)
X_test_counts = count_vec.transform(twenty_test.data)
In [30]:
print "X_train_counts.shape =", X_train_counts.shape
print "X_test_counts.shape =", X_test_counts.shape
In [31]:
list(count_vec.vocabulary_.items())[:10]
Out[31]:
In [32]:
count_vec.get_feature_names()[100:110]
Out[32]:
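As a quick sanity check on the bag-of-words matrix, this illustrative cell (not in the original notebook) sums each column to find the most frequent vocabulary terms in the training set; it assumes `count_vec` and `X_train_counts` from the cells above.
In [ ]:
# Illustrative: the ten most frequent tokens across the training set.
word_totals = np.asarray(X_train_counts.sum(axis=0)).ravel()
vocab = np.array(count_vec.get_feature_names())
top = word_totals.argsort()[::-1][:10]
list(zip(vocab[top], word_totals[top]))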
In [33]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train_counts, twenty_train.target)
report, df_cm = get_confusion_matrix(clf, X_train_counts, twenty_train.target, classes=twenty_train.target_names)
df_cm
Out[33]:
In [34]:
report, df_cm = get_confusion_matrix(clf, X_test_counts, twenty_test.target, classes=twenty_train.target_names)
df_cm
Out[34]:
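A single summary number can be handy alongside the confusion matrix; a hedged one-liner using the same `metrics` module:
In [ ]:
# Illustrative: overall test accuracy of the count-based naive Bayes model.
metrics.accuracy_score(twenty_test.target, clf.predict(X_test_counts))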
In [35]:
from sklearn.pipeline import Pipeline
In [36]:
clf = Pipeline([
    ("vect", CountVectorizer(stop_words="english", token_pattern="[a-z]+")),
    ("nb_clf", naive_bayes.MultinomialNB())
])
In [37]:
X = twenty_train.data
y = twenty_train.target
classes = twenty_train.target_names
clf.fit(X, y)
report, df_cm = get_confusion_matrix(clf, X, y, classes=classes)
df_cm
Out[37]:
In [38]:
clf.classes_
Out[38]:
In [39]:
clf.predict(twenty_test.data[:10])
Out[39]:
In [40]:
report, df_cm = get_confusion_matrix(clf, twenty_test.data, twenty_test.target, classes=classes)
df_cm
Out[40]:
In [41]:
clf = Pipeline([
    ("vect", CountVectorizer(stop_words="english", token_pattern="[a-z]+")),
    ("logreg_clf", linear_model.LogisticRegression(multi_class="multinomial", solver="lbfgs"))
])
In [42]:
X = twenty_train.data
y = twenty_train.target
classes = twenty_train.target_names
clf.fit(X, y)
report, df_cm = get_confusion_matrix(clf, X, y, classes=classes)
df_cm
Out[42]:
In [43]:
report, df_cm = get_confusion_matrix(clf, twenty_test.data, twenty_test.target, classes=classes)
df_cm
Out[43]:
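To see what the logistic-regression pipeline has learned, a hedged sketch: pull the fitted vectorizer and classifier out of `named_steps` (step names as defined above) and list the strongest positive weights per class.
In [ ]:
# Illustrative: ten most positively weighted tokens per class in the fitted pipeline.
vect = clf.named_steps["vect"]
logreg = clf.named_steps["logreg_clf"]
feature_names = np.array(vect.get_feature_names())
for i, name in enumerate(classes):
    top = logreg.coef_[i].argsort()[::-1][:10]
    print(name, list(feature_names[top]))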
In [44]:
text = " ".join(twenty_train.data)
In [45]:
wc = wordcloud.WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
In [46]:
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()
In [47]:
def get_words_of_class(data, labels, c=0, ax=None):
    # Build a word cloud from all documents whose label equals c.
    if ax is None:
        fig, ax = plt.subplots()
    labels = np.array(labels)
    idx = np.where(labels == c)[0]
    text = " ".join(data[i] for i in idx)
    wc = wordcloud.WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
    ax.imshow(wc)
    ax.axis("off")
    return ax
In [48]:
fig, ax = plt.subplots(2, 2, figsize=(10, 10))
ax = ax.flatten()
labels = twenty_train.target
data = twenty_train.data
classes = twenty_train.target_names
for i, axi in enumerate(ax):
    get_words_of_class(data, labels, c=i, ax=axi)
    axi.set_title(classes[i])
fig.tight_layout()
In [49]:
from nltk.corpus import movie_reviews
In [50]:
movie_reviews.categories()
Out[50]:
In [51]:
movie_reviews.words()
Out[51]:
In [52]:
sents = movie_reviews.sents()
In [53]:
len(sents)
Out[53]:
In [55]:
sents[0]
Out[55]:
In [57]:
movie_reviews.categories()[0]
Out[57]:
In [58]:
for i, s in enumerate(sents[:10]):
    print("S[%s]:\t%s" % (i, " ".join(s)))
In [61]:
bigrams = gensim.models.Phrases(sents[:1000])
In [62]:
list(bigrams.vocab.items())[:10]
Out[62]:
In [63]:
sorted(bigrams.vocab.items(), key=lambda x: x[1], reverse=True)[:10]
Out[63]:
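Applying the fitted phrase model to a tokenized sentence joins detected collocations with an underscore; a small illustrative check:
In [ ]:
# Illustrative: run one tokenized sentence through the phrase model;
# any bigram above the scoring threshold comes back joined with "_".
print(bigrams[sents[0]])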
In [64]:
word_frequencies = list(bigrams.vocab.values())
In [65]:
plt.hist(word_frequencies, bins=range(0,100), log=True)
plt.xscale("symlog")
In [66]:
sorted(((w, c) for w, c in bigrams.vocab.items() if isinstance(w, str) and "_" in w),
       key=lambda x: x[1], reverse=True)[:30]
Out[66]:
In [68]:
corpus = bigrams[sents[:1000]]
id2word = gensim.corpora.Dictionary(corpus)
In [69]:
len(id2word.keys())
Out[69]:
In [70]:
corpus_processed = [id2word.doc2bow(k) for k in corpus]
print(len(corpus_processed))
In [71]:
corpus_processed[0]
Out[71]:
In [72]:
corpus[0]
Out[72]:
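Each `doc2bow` entry is a `(token_id, count)` pair; the dictionary maps ids back to tokens, as this illustrative cell shows.
In [ ]:
# Illustrative: translate the first document's (token_id, count) pairs back to words.
[(id2word[i], n) for i, n in corpus_processed[0]]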
In [73]:
LDA_model = gensim.models.ldamodel.LdaModel(corpus_processed, num_topics=10, id2word=id2word)
In [76]:
LDA_model.print_topics(num_words=20)
Out[76]:
In [75]:
LDA_model.get_document_topics(corpus_processed[0])
Out[75]:
In [77]:
doc_topics = LDA_model[corpus_processed]
In [78]:
doc_topics[1]
Out[78]:
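Since each document gets a sparse topic distribution, a hedged sketch of pulling out the dominant topic for the first few documents:
In [ ]:
# Illustrative: the most probable topic for each of the first five documents.
for i in range(5):
    topic, prob = max(doc_topics[i], key=lambda t: t[1])
    print("doc %d -> topic %d (p=%.2f)" % (i, topic, prob))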
In [79]:
pyLDAvis.gensim.prepare(LDA_model, corpus_processed, id2word)
Out[79]:
In [81]:
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)
Out[81]:
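NLTK can also describe an unfamiliar Penn Treebank tag (requires the `tagsets` resource; illustrative):
In [ ]:
# Illustrative: describe the "RB" (adverb) tag; needs nltk.download("tagsets").
nltk.help.upenn_tagset("RB")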
In [82]:
text = nltk.word_tokenize("US president Barack Obama signed a new treaty with the Indian prime minister Narendra Modi, in New Delhi.")
pos_tags = nltk.pos_tag(text)
print(pos_tags)
In [84]:
try:
    chunk_tags = nltk.ne_chunk(pos_tags, binary=False)
    print(chunk_tags)
except LookupError as e:
    # ne_chunk needs the "maxent_ne_chunker" and "words" NLTK resources.
    print(e)
In [85]:
from nltk.corpus import wordnet as wn
In [88]:
# synsets() takes a plain word, not a sense key, and lists all senses of "dog".
wn.synsets('dog')
Out[88]:
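A specific sense can be fetched with `wn.synset` (singular) using its `lemma.pos.nn` key; an illustrative look at its gloss and hypernyms:
In [ ]:
# Illustrative: definition and hypernyms of the first noun sense of "dog".
dog = wn.synset('dog.n.01')
print(dog.definition())
print(dog.hypernyms())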