In [7]:
# load dataset
# Fetch the 20 Newsgroups corpus (downloaded on first use, then cached).
# Each bunch exposes .data (list of raw article strings, headers included)
# and .target (integer newsgroup labels), used by the cells below.
from sklearn.datasets import fetch_20newsgroups
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

In [26]:
# observe the frequency of keywords
import re
def to_unicode(string):
    '''Decode a UTF-8 byte string to unicode; pass unicode through unchanged.

    :param string: `str` (assumed UTF-8 encoded) or `unicode`
    :return: `unicode`
    :raises TypeError: if `string` is neither `str` nor `unicode`
    '''
    if type(string) == str:
        return string.decode('utf8')
    elif type(string) == unicode:
        # bug fix: this was `elif type(string):`, which is truthy for
        # every type and made the TypeError branch below unreachable
        return string
    else:
        raise TypeError('wrong type for string')

from collections import Counter
count = Counter()
for article in train.data:
    article = to_unicode(article)
    regex = re.compile(u'^(\w+): .*$')
    for line in article.split('\n'):
        match = regex.match(line)
        if match is not None:
            count[match.groups()[0]] += 1

print 'Keyword\tCount'
for key, value in count.items():
    if value >= 100:
        print '%s\t' % key, value


Keyword	Count
Distribution	2549
Summary	397
Disclaimer	125
File	257
Expires	116
Subject	11612
From	11398
Keywords	943
Originator	291
Organization	10872
Lines	11317
Internet	140
To	106

In [9]:
# extract feature from structured data
def parse_text(article):
    '''Return True iff the article contains a "Lines: <number>" header line.'''
    lines_header = re.compile(u'^Lines: [0-9]+$')
    return any(lines_header.match(line) is not None
               for line in article.split('\n'))

def get_field(field_name, article):
    '''Return the value of the first "field_name: value" header line.

    :param field_name: header name to look for, e.g. 'Subject'
    :param article: full article text (headers + body)
    :return: text following "field_name: " on the first matching line,
        or None when the field is absent
    '''
    # re.escape guards against field names containing regex
    # metacharacters (e.g. 'C++'), which previously crashed re.compile;
    # plain alphabetic names behave exactly as before
    regex = re.compile(u'^%s: (.*)$' % re.escape(field_name))
    for line in article.split('\n'):
        match = regex.match(line)
        if match is not None:
            return match.groups()[0]
    return None

features = ['From', 'Subject', 'Organization', 'Distribution', 'Lines']
for article in train.data[:6]:
    article = to_unicode(article)
    for feature in features:
        print '%s:' % (feature) , get_field(feature, article)
    print


From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Organization: University of Maryland, College Park
Distribution: None
Lines: 15

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Organization: University of Washington
Distribution: None
Lines: 11

From: twillis@ec.ecn.purdue.edu (Thomas E Willis)
Subject: PB questions...
Organization: Purdue University Engineering Computer Network
Distribution: usa
Lines: 36

From: jgreen@amber (Joe Green)
Subject: Re: Weitek P9000 ?
Organization: Harris Computer Systems Division
Distribution: world
Lines: 14

From: jcm@head-cfa.harvard.edu (Jonathan McDowell)
Subject: Re: Shuttle Launch Question
Organization: Smithsonian Astrophysical Observatory, Cambridge, MA,  USA
Distribution: sci
Lines: 23

From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)
Subject: Re: Rewording the Second Amendment (ideas)
Organization: VTT
Distribution: None
Lines: 58


In [10]:
#generate feature matrix from structure data
import numpy as np
from sklearn.feature_extraction import DictVectorizer

def to_major_key_vec(vec, threshold):
    '''Map each token in `vec` to a one-hot dict, collapsing rare tokens.

    Tokens occurring strictly more than `threshold` times keep their
    identity; all others share the single key None (an "other" bucket).

    :param vec: list of hashable tokens (may contain None)
    :param threshold: count a token must exceed to be kept
    :return: list of {token: 1} dicts, same length and order as `vec`
    '''
    # Counter(vec) replaces the manual counting loop; the dead
    # `major_keys = []` initialisation (immediately overwritten) was removed
    count = Counter(vec)
    major_keys = set(key for key, value in count.items() if value > threshold)
    vec_major_keys = []
    for key in vec:
        if key in major_keys:
            vec_major_keys.append({key: 1})
        else:
            vec_major_keys.append({None: 1})
    return vec_major_keys

def get_feature_matrix(articles, features, threshold, vectorizers=None):
    '''Build a dense one-hot matrix from structured header fields.

    :param articles: list of article texts
    :param features: list of header field names to extract
    :param threshold: passed to to_major_key_vec; values rarer than this
        are collapsed into a shared "other" column
    :param vectorizers: optional list of already-fitted DictVectorizers
        (one per feature), e.g. returned by a previous call on training
        data so the test matrix gets identical columns; when None, fresh
        vectorizers are created and fitted here
    :return: (matrix, vectorizers) — matrix has one row per article and
        the per-feature one-hot columns concatenated horizontally
    '''
    matrix = None
    if vectorizers is None:
        vectorizers = [DictVectorizer(sparse=False) for i in range(len(features))]
    for feature, vctr in zip(features, vectorizers):
        vec = [get_field(feature, article) for article in articles]
        vec_major_keys = to_major_key_vec(vec, threshold)
        # DictVectorizer.fit sets `feature_names_`; checking for it
        # replaces the original bare `except:`, which silently swallowed
        # *every* error rather than just "not fitted yet"
        if not hasattr(vctr, 'feature_names_'):
            vctr.fit(vec_major_keys)
        feat_vec = vctr.transform(vec_major_keys)
        if matrix is None:
            matrix = feat_vec
        else:
            matrix = np.concatenate((matrix, feat_vec), axis=1)
    return matrix, vectorizers

In [11]:
def get_text(article):
    '''Return the article body: everything after the `Lines:` header line.

    Falls back to the whole article when the `Lines:` marker is missing,
    occurs more than once, or has nothing after it.
    '''
    head_and_rest = article.split('Lines:')
    if len(head_and_rest) != 2:
        # zero or several markers: give the article back untouched
        return article
    body_lines = head_and_rest[1].split('\n')
    if len(body_lines) <= 1:
        return article
    return '\n'.join(body_lines[1:])
        
# preview: show the stripped body text of one training article
print get_text(train.data[7])


Distribution: world
NNTP-Posting-Host: dante.nmsu.edu

DXB132@psuvm.psu.edu writes:
>In article <1qlbrlINN7rk@dns1.NMSU.Edu>, bgrubb@dante.nmsu.edu (GRUBB) says:
>>In PC Magazine April 27, 1993:29 "Although SCSI is twice as fasst as ESDI,
>>20% faster than IDE, and support up to 7 devices its acceptance ...has   
>>long been stalled by incompatability problems and installation headaches."
                                                                      
>I love it when magazine writers make stupid statements like that re:      
>performance. Where do they get those numbers? I'll list the actual
>performance ranges, which should convince anyone that such a               
>statement is absurd:                                                     
>SCSI-I ranges from 0-5MB/s.                                                
>SCSI-II ranges from 0-40MB/s.            
>IDE ranges from 0-8.3MB/s.                          
>ESDI is always 1.25MB/s (although there are some non-standard versions)
ALL this shows is that YOU don't know much about SCSI.

SCSI-1 {with a SCSI-1 controler chip} range is indeed 0-5MB/s
and that is ALL you have right about SCSI
SCSI-1 {With a SCSI-2 controller chip}: 4-6MB/s with 10MB/s burst {8-bit}
 Note the INCREASE in SPEED, the Mac Quadra uses this version of SCSI-1
 so it DOES exist. Some PC use this set up too.
SCSI-2 {8-bit/SCSI-1 mode}:          4-6MB/s with 10MB/s burst
SCSI-2 {16-bit/wide or fast mode}:  8-12MB/s with 20MB/s burst
SCSI-2 {32-bit/wide AND fast}:     15-20MB/s with 40MB/s burst
 
By your OWN data the "Although SCSI is twice as fast as ESDI" is correct
With a SCSI-2 controller chip SCSI-1 can reach 10MB/s which is indeed
"20% faster than IDE" {120% of 8.3 is 9.96}. ALL these SCSI facts have been
posted to this newsgroup in my Mac & IBM info sheet {available by FTP on 
sumex-aim.stanford.edu (36.44.0.6) in the info-mac/report as 
mac-ibm-compare[version #].txt (It should be 173 but 161 may still be there)}

Part of this problem is both Mac and IBM PC are inconsiant about what SCSI
is which.  Though it is WELL documented that the Quadra has a SCSI-2 chip
an Apple salesperson said "it uses a fast SCSI-1 chip" {Not at a 6MB/s,
10MB/s burst it does not. SCSI-1 is 5MB/s maximum synchronous and Quadra
uses ANsynchronous SCSI which is SLOWER}  It seems that Mac and IBM see
SCSI-1 interface and think 'SCSI-1' when it maybe a SCSI-1 interface driven
in the machine by a SCSi-2 controller chip in 8-bit mode {Which is MUCH
FASTER then true SCSI-1 can go}.

Don't slam an article because you don't understand what is going on.
One reference for the Quadra's SCSI-2 controller chip is 
(Digital Review, Oct 21, 1991 v8 n33 p8(1)).


In [21]:
# evaluation by cross validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import KFold
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support
from pandas import Series
import matplotlib.pyplot as plt

def do_cross_validation(k_fold, verbose=False):
    precisions, recalls, f1s = Series(), Series(), Series()
    for train_index, test_index in k_fold:
        # get articles of this fold
        train_data = [x for (i, x) in enumerate(train.data) if i in train_index]
        test_data = [x for (i, x) in enumerate(train.data) if i in test_index]
        if verbose:
            print '# of (train/test): %d/%d' % (len(train_data), len(test_data))
            
        # feature extraction from structured data
        X_train, vctrs = get_feature_matrix(train_data, features, 1)
        X_test, vctrs = get_feature_matrix(test_data, features, 1, vctrs)
        if verbose:
            print 'X.shape of (train/test): %s/%s' % (X_train.shape, X_test.shape)
        
        # feature extraction from text
        train_text = map(get_text, train_data)
        test_text = map(get_text, test_data)
        vctr = TfidfVectorizer()
        vctr.fit(train_text)
        X_train = np.concatenate((X_train, vctr.transform(train_text).toarray()), axis=1)
        X_test = np.concatenate((X_test, vctr.transform(test_text).toarray()), axis=1)
        Y_train, Y_test = train.target[train_index], train.target[test_index]
        
        # model training and evaluation
        svc = LinearSVC()
        svc.fit(X_train, Y_train)
        Y_predict = svc.predict(X_test)
        precisions = precisions.set_value(k_fold.n, precision_score(Y_test, Y_predict))
        recalls = recalls.set_value(k_fold.n, recall_score(Y_test, Y_predict))
        f1s = f1s.set_value(k_fold.n, f1_score(Y_test, Y_predict))
        if verbose:
            print precision_score(Y_test, Y_predict), recall_score(Y_test, Y_predict), f1_score(Y_test, Y_predict)
    return precisions.mean(), recalls.mean(), f1s.mean()
        
# Sweep the number of articles used and plot mean precision/recall/F1
# from 5-fold cross validation at each size.
fig, axs = plt.subplots(1, 3)
fig.set_figwidth(15)
results = {}
for size in range(100, 2000, 100):
    results[size] = do_cross_validation(KFold(size, 5), False)

precisions = Series(dict((size, scores[0]) for size, scores in results.items()))
recalls = Series(dict((size, scores[1]) for size, scores in results.items()))
f1s = Series(dict((size, scores[2]) for size, scores in results.items()))

precisions.plot(ax=axs[0], title='Precision')
recalls.plot(ax=axs[1], title='Recall')
f1s.plot(ax=axs[2], title='F1')


Out[21]:
<matplotlib.axes.AxesSubplot at 0x5518990>

In [27]:
# demo of feature extraction from text
from sklearn.feature_extraction.text import TfidfVectorizer
vctr = TfidfVectorizer(ngram_range=(2,2))
string = 'I was wondering if anyone out there could enlighten me on this car I saw the other day.'
print 'Origin:', string
print
print 'Preprocess:', vctr.build_preprocessor()(string)
print
print 'Tokenize:', vctr.build_tokenizer()(string)
print
print 'Analyze:', vctr.build_analyzer()(string)


Origin: I was wondering if anyone out there could enlighten me on this car I saw the other day.

Preprocess: i was wondering if anyone out there could enlighten me on this car i saw the other day.

Tokenize: ['was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day']

Analyze: [u'was wondering', u'wondering if', u'if anyone', u'anyone out', u'out there', u'there could', u'could enlighten', u'enlighten me', u'me on', u'on this', u'this car', u'car saw', u'saw the', u'the other', u'other day']

In [28]:
# demo of evaluation
from sklearn.svm import LinearSVC
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.metrics import precision_score
train = fetch_20newsgroups_vectorized()
a = LinearSVC()
a.fit(train.data[:100], train.target[:100])
print a.score(train.data[100:200], train.target[100:200])
print precision_score(train.target[100:200], a.predict(train.data[100:200]))


0.24
0.2061053245

In [29]:
# demo of cross validation
from sklearn.cross_validation import KFold
for train_index, test_index in KFold(10, 2):
    print train_index
    print test_index


[5 6 7 8 9]
[0 1 2 3 4]
[0 1 2 3 4]
[5 6 7 8 9]