In [1]:
from words import split_words

In [2]:
import samples
reload(samples)
data = samples.load_samples(["Keywords", "Mexico", "EU", "Georgia", "Canada"], cache=True)
headers = data.keys()
print headers


['notice', 'good', 'solicitation', 'contract', 'supplier', 'authority', 'buyer', '?']

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction import text

# Two-step classifier: documents are hashed into a feature vector (tokenised
# by the project's `split_words` analyzer), then labelled by a linear SVM.
vectorize_step = ('vec', text.HashingVectorizer(analyzer=split_words))
classify_step = ('svm', LinearSVC())
pipe = Pipeline([vectorize_step, classify_step])

def predict(name):
    """Return the label that `pipe` predicts for a single header name.

    The name is wrapped in a one-element list to form one document, and that
    document is wrapped again to form a batch of one for `pipe.predict`.
    (Presumably this mirrors the training documents, which are the values of
    `data` -- confirm against `samples.load_samples`.)

    Returns the first (and only) entry of the prediction result.
    """
    document = [name]
    batch = [document]
    predictions = pipe.predict(batch)
    return predictions[0]


('vec', 'svm')

In [4]:
# One training document per label: the samples stored under each key of
# `data`, in the same order as `headers` so documents and labels line up.
training_docs = [data[label] for label in headers]
pipe.fit(training_docs, headers)


Out[4]:
Pipeline(steps=[('vec', HashingVectorizer(analyzer=<function split_words at 0x106a27758>,
         binary=False, charset=None, charset_error=None,
         decode_error=u'strict', dtype=<type 'numpy.float64'>,
         encoding=u'utf-8', input=u'content', lowercase=True,
         n_features=1048576, ngram_ra...ling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0))])

In [5]:
for header in samples.load_headers('UK'):
    print header.ljust(50), predict(header)


NOTICEID                                           notice
REFERENCENUMBER                                    notice
DATEPUBLISHED                                      solicitation
VALUEMIN                                           notice
VALUEMAX                                           notice
STATUS                                             notice
URL                                                buyer
ORG_NAME                                           authority
ORG_CONTACTEMAIL                                   buyer
TITLE                                              good
DESCRIPTION                                        good
NOTICETYPE                                         notice
REGION                                             buyer
NOTICE_STATE                                       ?
NOTICE_STATE_CHANGE_DATE                           notice
CLASSIFICATION                                     buyer
NUM_DOCS                                           buyer

In [6]:
# Disabled: the same spot-check as the previous cell, run against the 'EU' headers.
#for header in samples.load_headers('EU'):
#    print header.ljust(50), predict(header)

In [ ]: