notebook.community

Edit and run



In [1]:

    
from words import split_words



In [2]:

    
import samples
reload(samples)  # re-execute the module so edits to it are picked up without a kernel restart

# Sources whose samples are loaded; `data` is later indexed by each of its keys,
# and those keys double as the class labels for training.
sample_sources = ["Keywords", "Mexico", "EU", "Georgia", "Canada"]
data = samples.load_samples(sample_sources, cache=True)
headers = data.keys()
print(headers)









    



['notice', 'good', 'solicitation', 'contract', 'supplier', 'authority', 'buyer', '?']



In [3]:

    
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction import text

# Two-stage classifier: a hashing vectorizer that uses split_words as its
# analyzer, followed by a linear SVM with default settings.
vectorizer = text.HashingVectorizer(analyzer=split_words)
classifier = LinearSVC()
pipe = Pipeline([('vec', vectorizer), ('svm', classifier)])

def predict(name):
    """Return the label that ``pipe`` predicts for a single header name.

    ``name`` is wrapped in a one-element list to form one document, and that
    document is submitted as a batch of one; the first (and only) prediction
    is returned. ``pipe`` must already have been fitted.
    """
    # NOTE(review): wrapping the name in a list presumably mirrors the shape of
    # the training documents (data[k]) -- confirm against samples.load_samples.
    document = [name]
    labels = pipe.predict([document])
    return labels[0]









    



('vec', 'svm')



In [4]:

    
# One training document per label: the entry of `data` stored under that label.
training_docs = [data[label] for label in headers]
# Keep the fit call as the last expression so the fitted pipeline is displayed.
pipe.fit(training_docs, headers)









    Out[4]:





Pipeline(steps=[('vec', HashingVectorizer(analyzer=<function split_words at 0x106a27758>,
         binary=False, charset=None, charset_error=None,
         decode_error=u'strict', dtype=<type 'numpy.float64'>,
         encoding=u'utf-8', input=u'content', lowercase=True,
         n_features=1048576, ngram_ra...ling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0))])



In [5]:

    
# Show each UK header, left-justified to 50 columns, next to its predicted label.
for header in samples.load_headers('UK'):
    label = predict(header)
    print("%s %s" % (header.ljust(50), label))









    



NOTICEID                                           notice
REFERENCENUMBER                                    notice
DATEPUBLISHED                                      solicitation
VALUEMIN                                           notice
VALUEMAX                                           notice
STATUS                                             notice
URL                                                buyer
ORG_NAME                                           authority
ORG_CONTACTEMAIL                                   buyer
TITLE                                              good
DESCRIPTION                                        good
NOTICETYPE                                         notice
REGION                                             buyer
NOTICE_STATE                                       ?
NOTICE_STATE_CHANGE_DATE                           notice
CLASSIFICATION                                     buyer
NUM_DOCS                                           buyer



In [6]:

    
#for header in samples.load_headers('EU'):
#    print header.ljust(50), predict(header)



In [ ]: