In [ ]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
In [133]:
%load ../../ocdata_categorize/words.py
In [134]:
%load ../../ocdata_categorize/samples.py
In [135]:
# Per-source CSV paths for the sprint data. The 'Keywords' file doubles
# as the seed vocabulary set loaded separately below.
raw_data = {
    'Keywords': '../../data/pycon-sprint/keywords.csv',
    'UK': '../../data/pycon-sprint/UK.csv',
    'Georgia': '../../data/pycon-sprint/Georgia.csv',
    'Mexico': '../../data/pycon-sprint/Mexico.csv',
    'EU': '../../data/pycon-sprint/EU.csv',
    'Canada': '../../data/pycon-sprint/Canada.csv',
    'Moldova': '../../data/pycon-sprint/Moldova.csv',
    'UNOPS': '../../data/pycon-sprint/UNOPS.csv',
}
original_key = 'English Name'
entity_key = 'Entity'
# Full labelled sample set versus the keyword-only subset; entities is
# the deduplicated set of entity labels seen in the keyword samples.
data = load_samples(raw_data, original_key, entity_key)
keywords = load_samples({'Keywords': raw_data['Keywords']}, original_key, entity_key)
entities = list({row['entity'] for row in keywords})
In [137]:
# Map each source name ('UK', 'Georgia', ...) to the list of row
# positions it occupies in `data`, for per-source evaluation later.
slices = {}
for position, row in enumerate(data):
    key = row['sample']
    if key not in slices:
        slices[key] = []
    slices[key].append(position)
In [138]:
def organize_data(data):
    """Flatten a {entity: [header, ...]} mapping into a list of
    {'entity': ..., 'header': ...} records, one per header, preserving
    the mapping's iteration order."""
    return [
        {'entity': entity, 'header': header}
        for entity, headers in data.items()
        for header in headers
    ]
In [139]:
def length(df):
    """Feature: character count of each value in the 'header' column."""
    return df['header'].map(len)
def word_count(df):
    """Feature: number of tokens split_words yields for each header."""
    return df['header'].apply(lambda header: sum(1 for _ in split_words(header)))
def header_in_entity(df):
    """Feature: True where the lower-cased header exactly matches one of
    the lower-cased entity values in the frame.

    NOTE(review): isin is an exact-equality membership test, not a
    substring check -- confirm the function name's intent matches that.
    """
    lowered_headers = df['header'].str.lower()
    lowered_entities = df['entity'].str.lower()
    return lowered_headers.isin(lowered_entities)
def entity_in_header(df):
    """Feature: True where the lower-cased entity value exactly matches
    one of the lower-cased headers in the frame.

    NOTE(review): despite the name, isin tests equality membership, not
    substring containment -- confirm intent.
    """
    lowered_entities = df['entity'].str.lower()
    lowered_headers = df['header'].str.lower()
    return lowered_entities.isin(lowered_headers)
def entity_feature(name):
    """Build a feature callable that scores each header's word overlap
    ("subsetness") against the set of headers already labelled `name`
    in the module-level `data`.  The returned callable carries a
    func_name of 'entity_<name>' so Model.build can label its column."""
    entity_set = set(x['header'] for x in data if x['entity'] == name)
    def score(header):
        # words.subsetness can fail on unexpected input; treat any
        # failure as zero overlap rather than aborting the whole build.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        try:
            return words.subsetness(header, entity_set)
        except Exception:
            return 0
    # Renamed from `entity_feature`: the inner def used to shadow this
    # factory's own name, which read like recursion but wasn't.
    def feature(df):
        return df['header'].apply(score)
    feature.func_name = 'entity_%s' % name
    return feature
entity_features = [entity_feature(name) for name in entities]
In [140]:
class Model(object):
def __init__(self, samples, outcome_key='entity', svm=RandomForestClassifier(n_estimators=10)):
self.samples = samples
self.svm = svm
self.frame = pd.DataFrame(self.samples)
self.outcome_key = outcome_key
self.features_built = set()
def test(self, features, iterations=5, train_size=0.35, test_size=.25, seed=0):
X = self.build(self.frame, features)
y = self.frame[self.outcome_key]
rs = cross_validation.ShuffleSplit(len(X), n_iter=iterations, train_size=train_size, test_size=test_size, random_state=seed)
accuracies = []
for train_index, test_index in rs:
model = self.svm.fit(X.ix[train_index], y.ix[train_index])
actual = y.ix[test_index].values
predicted = model.predict(X.ix[test_index])
accuracies.append( self.score_model(actual, predicted) )
print "Avg Accuracy: %%%.2f" % np.mean(accuracies)
def test_sample(self, slice, features):
X = self.build(self.frame, features)
y = self.frame[self.outcome_key]
model = self.svm.fit(X, y)
actual = y.ix[slice].values
predicted = model.predict(X.ix[slice])
accuracy = self.score_model(actual, predicted)
for i, a, p in zip(slice, actual, predicted):
print self.samples[i]['header'].ljust(50), a.ljust(20), p
print "Accuracy: %%%.2f" % accuracy
def test_data(self, data, features):
X = self.build(self.frame, features)
y = self.frame[self.outcome_key]
model = self.svm.fit(X, y)
df = pd.DataFrame(data)
z = self.build(df, features)
actual = df.entity
predicted = model.predict(z)
accuracy = self.score_model(actual, predicted)
for dct, a, p in zip(data, actual, predicted):
print dct['header'].ljust(50), a.ljust(20), p
print "Accuracy: %%%.2f" % accuracy
def score_model(self, actual, predicted):
score_df = pd.DataFrame([actual, predicted], index=['actual', 'predicted']).T
correct = sum(score_df.actual == score_df.predicted)
incorrect = sum(score_df.actual != score_df.predicted)
total = correct + incorrect
accuracy = float(correct) / float(total) * 100
return accuracy
def predict(self, headers, features):
X = self.build(self.frame, features)
y = self.frame[self.outcome_key]
model = self.svm.fit(X, y)
data = [{'header': h, 'entity': '?'} for h in headers]
df = pd.DataFrame(data)
z = self.build(df, features)
self.df = df
self.z = z
predictions = model.predict(z)
return zip(headers, predictions)
def build(self, df, features):
result = pd.DataFrame()
for fn in features:
result[fn.func_name] = fn(df)
return result
# Default model trained on the combined multi-source sample set.
model = Model(data)
In [141]:
# Baseline: surface features only (header length, token count).
model.test(features=[length, word_count])
In [142]:
# Add the exact-match header/entity membership features.
model.test(features=[length, word_count, header_in_entity, entity_in_header])
In [143]:
# Full feature set: surface + membership + per-entity subsetness scores.
model.test(features=[length, word_count, header_in_entity, entity_in_header] + entity_features)
In [144]:
# Per-entity subsetness features on their own.
model.test(features=entity_features)
In [145]:
# In-sample, per-row breakdown for the UK source's headers.
model.test_sample(slices['UK'], features=[length, word_count, header_in_entity, entity_in_header] + entity_features)
In [146]:
model = Model(keywords)
predict_data = list(item for item in data if item['sample']=='UK')
results = model.predict(predict_data, features=[length, word_count, header_in_entity, entity_in_header] + entity_features)
for input, result in results:
print input['header'].ljust(50), result