In [7]:
# load dataset
from sklearn.datasets import fetch_20newsgroups
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')
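Before parsing anything, it helps to peek at what fetch_20newsgroups returns: a Bunch with .data (raw articles), .target (integer labels) and .target_names. A minimal sketch:

# quick look at the loaded Bunch objects
print len(train.data), len(test.data)  # number of raw articles in each split
print train.target_names[:3]           # a few of the 20 category names
print train.data[0][:200]              # start of the first article; headers come first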
In [26]:
# observe the frequency of header keywords
import re
from collections import Counter

def to_unicode(string):
    '''Decode a UTF-8 byte string to unicode; pass unicode through unchanged.'''
    if type(string) == str:
        return string.decode('utf8')
    elif type(string) == unicode:
        return string
    else:
        raise TypeError('wrong type for string')

count = Counter()
regex = re.compile(u'^(\w+): .*$')  # matches header lines such as 'Subject: ...'
for article in train.data:
    article = to_unicode(article)
    for line in article.split('\n'):
        match = regex.match(line)
        if match is not None:
            count[match.groups()[0]] += 1
print 'Keyword\tCount'
for key, value in count.items():
    if value >= 100:
        print '%s\t%d' % (key, value)
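The same tallies can be read off more directly with Counter.most_common, e.g. the five most frequent header keywords:

# top five header keywords by frequency
for key, value in count.most_common(5):
    print '%s\t%d' % (key, value)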
In [9]:
# extract features from the structured header data
def parse_text(article):
    '''Return True if the article contains a 'Lines: <n>' header.'''
    regex = re.compile(u'^Lines: [0-9]+$')
    for line in article.split('\n'):
        if regex.match(line) is not None:
            return True
    return False

def get_field(field_name, article):
    '''Return the value of the given header field, or None if absent.'''
    regex = re.compile(u'^%s: (.*)$' % field_name)
    for line in article.split('\n'):
        match = regex.match(line)
        if match is not None:
            return match.groups()[0]
    return None

features = ['From', 'Subject', 'Organization', 'Distribution', 'Lines']
for article in train.data[:6]:
    article = to_unicode(article)
    for feature in features:
        print '%s:' % feature, get_field(feature, article)
    print
In [10]:
# generate a feature matrix from the structured data
import numpy as np
from sklearn.feature_extraction import DictVectorizer

def to_major_key_vec(vec, threshold):
    '''
    Map each token to a one-entry dict, replacing rare tokens with None.

    :param vec: [token]
    :return: [{token: 1}]
    '''
    count = Counter(vec)
    major_keys = set([key for key, value in count.items() if value > threshold])
    vec_major_keys = []
    for key in vec:
        if key in major_keys:
            vec_major_keys.append({key: 1})
        else:
            vec_major_keys.append({None: 1})
    return vec_major_keys

def get_feature_matrix(articles, features, threshold, vectorizers=None):
    matrix = None
    if vectorizers is None:
        vectorizers = [DictVectorizer(sparse=False) for i in range(len(features))]
    for feature, vctr in zip(features, vectorizers):
        vec = [get_field(feature, article) for article in articles]
        vec_major_keys = to_major_key_vec(vec, threshold)
        try:
            # an unfitted DictVectorizer has no feature names yet
            vctr.get_feature_names()
        except AttributeError:
            vctr.fit(vec_major_keys)
        feat_vec = vctr.transform(vec_major_keys)
        if matrix is None:
            matrix = feat_vec
        else:
            matrix = np.concatenate((matrix, feat_vec), axis=1)
    return matrix, vectorizers
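As a sanity check on the encoding above, here is a toy run (the token values are made up for illustration): rare tokens collapse to a shared None feature before one-hot encoding, matching how the notebook feeds {None: 1} entries to DictVectorizer.

# 'sci' occurs only once, so it falls at threshold 1 and maps to None
demo = to_major_key_vec(['comp', 'comp', 'sci', 'comp'], 1)
print demo  # [{'comp': 1}, {'comp': 1}, {None: 1}, {'comp': 1}]
dv = DictVectorizer(sparse=False)
print dv.fit_transform(demo)  # one column per surviving feature value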
In [11]:
def get_text(article):
    '''Return the substring from the line after the `Lines:` header to the end of the article.'''
    tokens = article.split('Lines:')
    if len(tokens) == 2:
        tokens = tokens[1].split('\n')
        if len(tokens) > 1:
            return '\n'.join(tokens[1:])
    return article
print get_text(train.data[7])
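The splitting logic is easier to see on a made-up two-line article (hypothetical input):

# everything after the 'Lines:' header line is treated as the article body
demo_article = 'From: someone@example.com\nLines: 2\nfirst body line\nsecond body line'
print get_text(demo_article)  # first body line\nsecond body line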
In [21]:
# evaluation by cross-validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import KFold
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support
from pandas import Series
import matplotlib.pyplot as plt

def do_cross_validation(k_fold, verbose=False):
    precisions, recalls, f1s = Series(), Series(), Series()
    for fold, (train_index, test_index) in enumerate(k_fold):
        # get the articles of this fold
        train_data = [train.data[i] for i in train_index]
        test_data = [train.data[i] for i in test_index]
        if verbose:
            print '# of (train/test): %d/%d' % (len(train_data), len(test_data))
        # feature extraction from the structured headers
        X_train, vctrs = get_feature_matrix(train_data, features, 1)
        X_test, vctrs = get_feature_matrix(test_data, features, 1, vctrs)
        if verbose:
            print 'X.shape of (train/test): %s/%s' % (X_train.shape, X_test.shape)
        # feature extraction from the article text
        train_text = map(get_text, train_data)
        test_text = map(get_text, test_data)
        vctr = TfidfVectorizer()
        vctr.fit(train_text)
        X_train = np.concatenate((X_train, vctr.transform(train_text).toarray()), axis=1)
        X_test = np.concatenate((X_test, vctr.transform(test_text).toarray()), axis=1)
        Y_train, Y_test = train.target[train_index], train.target[test_index]
        # model training and evaluation; index by fold number so every fold is recorded
        svc = LinearSVC()
        svc.fit(X_train, Y_train)
        Y_predict = svc.predict(X_test)
        precisions = precisions.set_value(fold, precision_score(Y_test, Y_predict))
        recalls = recalls.set_value(fold, recall_score(Y_test, Y_predict))
        f1s = f1s.set_value(fold, f1_score(Y_test, Y_predict))
        if verbose:
            print precision_score(Y_test, Y_predict), recall_score(Y_test, Y_predict), f1_score(Y_test, Y_predict)
    return precisions.mean(), recalls.mean(), f1s.mean()

fig, axs = plt.subplots(1, 3)
fig.set_figwidth(15)
precisions, recalls, f1s = Series(), Series(), Series()
# run 5-fold cross-validation on the first n training articles, for growing n
for n in range(100, 2000, 100):
    prec, recl, f1 = do_cross_validation(KFold(n, 5), False)
    precisions = precisions.set_value(n, prec)
    recalls = recalls.set_value(n, recl)
    f1s = f1s.set_value(n, f1)
precisions.plot(ax=axs[0], title='Precision')
recalls.plot(ax=axs[1], title='Recall')
f1s.plot(ax=axs[2], title='F1')
Out[21]: [figure: line plots of Precision, Recall and F1 against the number of training articles]
In [27]:
# demo of feature extraction from text
from sklearn.feature_extraction.text import TfidfVectorizer
vctr = TfidfVectorizer(ngram_range=(2,2))
string = 'I was wondering if anyone out there could enlighten me on this car I saw the other day.'
print 'Original:', string
print
print 'Preprocess:', vctr.build_preprocessor()(string)
print
print 'Tokenize:', vctr.build_tokenizer()(string)
print
print 'Analyze:', vctr.build_analyzer()(string)
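Fitting the same vectorizer on the sentence shows which bigrams make it into the vocabulary (a sketch on this single string):

# the analyzer's bigrams become the vocabulary entries
vctr.fit([string])
print len(vctr.vocabulary_), 'bigram features'
print sorted(vctr.vocabulary_.keys())[:5]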
In [28]:
# demo of evaluation
from sklearn.svm import LinearSVC
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.metrics import precision_score
train = fetch_20newsgroups_vectorized()  # note: rebinds `train` to the pre-vectorized dataset
a = LinearSVC()
a.fit(train.data[:100], train.target[:100])
print a.score(train.data[100:200], train.target[100:200])  # mean accuracy
print precision_score(train.target[100:200], a.predict(train.data[100:200]))
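For a multiclass target, the precision above is an average over classes; the strategy can be made explicit (a sketch, assuming a scikit-learn version whose precision_score accepts the average keyword):

# older scikit-learn defaults to 'weighted' for multiclass input
y_true = train.target[100:200]
y_pred = a.predict(train.data[100:200])
print precision_score(y_true, y_pred, average='macro')     # unweighted mean over classes
print precision_score(y_true, y_pred, average='weighted')  # weighted by class support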
In [29]:
# demo of cross validation
from sklearn.cross_validation import KFold
for train_index, test_index in KFold(10, 2):
    print train_index
    print test_index
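Note that the folds above are contiguous index blocks. For randomized folds, KFold can shuffle first (assuming a scikit-learn version whose cross_validation.KFold accepts the shuffle keyword):

# shuffled variant of the same 2-fold split
for train_index, test_index in KFold(10, 2, shuffle=True, random_state=0):
    print train_index
    print test_index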