In [1]:
from words import split_words
In [2]:
# Load labeled sample headers for the listed categories/regions.
# NOTE: this notebook is Python 2 (`print` statement syntax, builtin reload()).
import samples
reload(samples)  # pick up edits to samples.py without restarting the kernel
data = samples.load_samples(["Keywords", "Mexico", "EU", "Georgia", "Canada"], cache=True)
# data presumably maps label -> collection of sample header strings — verify in samples.py.
# In Python 2, dict.keys() returns a plain list, so `headers` is a list of labels.
headers = data.keys()
print headers
In [3]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction import text
# Text-classification pipeline: a hashing bag-of-words vectorizer (tokenized
# by the custom split_words analyzer) feeding a linear SVM.
hasher = text.HashingVectorizer(analyzer=split_words)
classifier = LinearSVC()
pipe = Pipeline([
    ('vec', hasher),
    ('svm', classifier),
])
def predict(name):
    """Return the predicted label for a single header string.

    The name is wrapped in an inner list so the document passed to the
    pipeline has the same shape as the training documents (each training
    document is a list of strings — see the fit call).
    """
    labels = pipe.predict([[name]])
    return labels[0]
In [4]:
pipe.fit([data[k] for k in headers], headers)
Out[4]:
In [5]:
# Spot-check the model on UK headers, which were NOT in the training set
# (load_samples above used Keywords/Mexico/EU/Georgia/Canada only).
# Python 2 print statement: comma separates the padded header and prediction.
for header in samples.load_headers('UK'):
    print header.ljust(50), predict(header)
In [6]:
#for header in samples.load_headers('EU'):
#    print header.ljust(50), predict(header)
# NOTE: disabled — 'EU' is one of the training labels passed to load_samples
# above, so running this would only evaluate on training data.
In [ ]: