In [ ]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
Now we need to read in some data. In this case we can use a built-in dataset that is packaged with scikit-learn. We split the dataset into a training set and a test set.
Typically the test set is held out until after the model is fully trained, and a separate validation set is pulled out from the training set to evaluate performance at training time.
In [ ]:
# Fetch the two predefined partitions of the 20 Newsgroups corpus: the
# training partition first, then the test partition.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=part) for part in ('train', 'test')
)
In [ ]:
# Quick look at the dataset. Each print() takes a single pre-formatted
# argument so the cell produces the same text on Python 2 and Python 3.

# Fields exposed by the dataset object.
print('{0} \n'.format(list(newsgroups_train.keys())))

# Raw text of the first two training documents.
print('{0} \n'.format(newsgroups_train['data'][:2]))

# Labels of those same two documents. Each numeric target is an index into
# target_names, so look the name up by target value; zipping the first two
# targets with the first two names would pair unrelated entries.
sample_targets = newsgroups_train['target'][:2]
print([(int(t), newsgroups_train['target_names'][t]) for t in sample_targets])
Text must be converted into a numerical representation before it can be fed to our chosen machine learning algorithm. A TF-IDF transformation counts word/phrase frequencies and down-weights very frequent, non-descriptive words such as 'the', 'as', and 'and'.
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vectorizer on the training documents and encode them in the
# same pass; the fitted vectorizer is reused later for the test documents.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(newsgroups_train['data'])  # training data matrix (sparse)
y_train = np.array(newsgroups_train['target'])  # target values the model learns to associate with the data

# Single-argument print() is valid on both Python 2 and Python 3.
print(x_train)
print(y_train)
In [ ]:
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
# TruncatedSVD defaults to 2 components, which would hand t-SNE an input that
# is already 2-D. Keep 50 components instead (the pre-reduction size the
# scikit-learn t-SNE documentation suggests) and seed both estimators, as the
# classifier later in the notebook is seeded, so the embedding is reproducible.
svd = TruncatedSVD(n_components=50, random_state=0)
tsne = TSNE(n_components=2, random_state=0)  # 2-D output, as the plot expects
In [ ]:
# Embed only the first 5000 training rows: the SVD output is fed straight
# into t-SNE, and the t-SNE coordinates are what gets plotted.
x_red = tsne.fit_transform(
    svd.fit_transform(x_train[:5000])
)
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
def plot_embedding(var_embed, labels, target_names=None):
    """Scatter-plot a 2-D embedding with one colour and legend entry per class.

    Parameters
    ----------
    var_embed : array-like, at least two columns
        Embedded points; column 0 is used as x and column 1 as y.
    labels : array-like of int
        Class index of each row of ``var_embed``. Indices are zero-based
        positions into ``target_names``.
    target_names : sequence of str, optional
        Display name for each class index. Defaults to
        ``newsgroups_train['target_names']`` so existing two-argument calls
        behave as before.

    Both axes are min-max scaled to [0, 1] before plotting, and the figure is
    shown immediately via ``plt.show()``.
    """
    if target_names is None:
        target_names = newsgroups_train['target_names']

    var_embed = np.asarray(var_embed)
    labels = np.asarray(labels)

    var_x = _rescale_unit(var_embed[:, 0])
    var_y = _rescale_unit(var_embed[:, 1])

    plt.figure(figsize=(12, 12))
    ax = plt.subplot(aspect='equal')
    colors = plt.cm.rainbow(np.linspace(0, 1, len(target_names)))

    # Class indices are zero-based, so enumerate target_names directly. Keying
    # the names from 1 would drop class 0 and shift every legend entry by one.
    for k, name in enumerate(target_names):
        mask = labels == k
        # `color=` marks this as one RGBA value for all points; a 4-element
        # `c=` is ambiguous and is value-mapped when a class has 4 points.
        ax.scatter(var_x[mask], var_y[mask], color=colors[k],
                   edgecolors=None, alpha=0.7, label=name)

    plt.legend(loc=3)
    ax.axis('off')
    ax.axis('tight')
    plt.show()


def _rescale_unit(values):
    """Min-max scale a 1-D array into [0, 1]; a constant array maps to zeros."""
    values = np.asarray(values, dtype=float)
    low = values.min()
    span = values.max() - low
    if span == 0:
        # Avoid 0/0 -> NaN when every point shares the same coordinate.
        return np.zeros_like(values)
    return (values - low) / span
Now we can visualize the results.
In [ ]:
# Plot the embedding of the same 5000 rows that were reduced above.
plot_embedding(x_red, y_train[:5000])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print(np.asarray(feature_names))
The data is somewhat separated by class, and similar class types tend to fall into the same large-scale clusters. Since the clusters aren't linearly separable, we will choose a model that can handle non-linear data.
In [ ]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees; random_state is pinned so repeated runs
# build the same forest.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
)
# Train on the full TF-IDF training matrix and its labels.
clf.fit(x_train, y_train)
In [ ]:
# Expected labels for the test documents.
y_test = np.array(newsgroups_test['target'])
# Encode the test documents with the already-fitted vectorizer: transform()
# only, so nothing is re-fitted on the test data.
x_test = vectorizer.transform(newsgroups_test['data'])
In [ ]:
# Predict a class index for every vectorized test document.
predictions = clf.predict(x_test)
In [ ]:
Let's examine how the model performed. First we will look at some specific examples.
In [ ]:
# Map each class index to its newsgroup name.
names = dict(enumerate(newsgroups_train['target_names']))

# Show the first ten test documents (truncated to 500 characters) next to the
# predicted and expected newsgroup. zip() keeps the three sequences aligned
# without manual indexing, and single-argument print() runs on both Python 2
# and Python 3.
samples = zip(newsgroups_test['data'][:10], predictions, y_test)
for text, predicted, expected in samples:
    print(text[:500])
    print('Predicted:  {0} ; Expected:  {1}'.format(names[predicted], names[expected]))
    print('=========================')
In [ ]:
from sklearn.metrics import precision_recall_fscore_support as score
# `score` is precision_recall_fscore_support; unpack its four results for the
# true test labels versus the model's predictions.
precision, recall, f_score, support = score(y_test, predictions)
In [ ]:
# Report the metrics label by label. zip() walks the parallel sequences
# together, replacing the Python-2-only xrange(len(...)) index loop, and
# single-argument print() runs on both Python 2 and Python 3.
per_label = zip(newsgroups_train['target_names'], precision, recall, f_score)
for label, label_precision, label_recall, label_f1 in per_label:
    print("Label: {0} | Precision: {1}, Recall: {2}, F1: {3}".format(
        label, label_precision, label_recall, label_f1))
In [ ]: