In [ ]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

In [133]:
%load ../../ocdata_categorize/words.py

In [134]:
%load ../../ocdata_categorize/samples.py

In [135]:
raw_data = {
    'Keywords': '../../data/pycon-sprint/keywords.csv',
    'UK': '../../data/pycon-sprint/UK.csv',
    'Georgia': '../../data/pycon-sprint/Georgia.csv',
    'Mexico': '../../data/pycon-sprint/Mexico.csv',
    'EU': '../../data/pycon-sprint/EU.csv',
    'Canada': '../../data/pycon-sprint/Canada.csv',
    'Moldova':'../../data/pycon-sprint/Moldova.csv',
    'UNOPS': '../../data/pycon-sprint/UNOPS.csv',
}

original_key = 'English Name'
entity_key = 'Entity'



data = load_samples(raw_data, original_key, entity_key)
keywords = load_samples({'Keywords': raw_data['Keywords']}, original_key, entity_key)
entities = list( set(x['entity'] for x in keywords) )

In [137]:
slices = {}
for i, row in enumerate(data):
    slices.setdefault(row['sample'], []).append(i)

I. Define Raw Data


In [138]:
def organize_data(data):
    organized = []
    for k, headers in data.items():
        for header in headers:
            organized.append({'entity': k, 'header': header})
    return organized

II. Define Features


In [139]:
def length(df):
    return df['header'].apply(len)

def word_count(df):
    return df['header'].apply(lambda x: len(list(split_words(x))))

def header_in_entity(df):
    return df['header'].str.lower().isin(df['entity'].str.lower())

def entity_in_header(df):
    return df['entity'].str.lower().isin(df['header'].str.lower())

def entity_feature(name):
    entity_set = set(x['header'] for x in data if x['entity'] == name)
    
    def fn(x):
        #print name, x, words.subsetness(x, entity_set)
        try:
            return words.subsetness(x, entity_set)
        except:
            return 0
    
    def entity_feature(df):
        return df['header'].apply( fn )
    
    entity_feature.func_name = 'entity_%s' % name    
    return entity_feature

entity_features = [entity_feature(name) for name in entities]

III. Combine Features into Feature Matrix & Define Outcome

IV. Create Model

V. Split Data into Test and Training

Fit and Test Models


In [140]:
class Model(object):
    def __init__(self, samples, outcome_key='entity', svm=RandomForestClassifier(n_estimators=10)):
        self.samples = samples
        self.svm = svm
        self.frame = pd.DataFrame(self.samples)
        self.outcome_key = outcome_key
        self.features_built = set()
    
    def test(self, features, iterations=5, train_size=0.35, test_size=.25, seed=0): 
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]
        
        rs = cross_validation.ShuffleSplit(len(X), n_iter=iterations, train_size=train_size, test_size=test_size, random_state=seed)
        
        accuracies = []
        for train_index, test_index in rs:
            model = self.svm.fit(X.ix[train_index], y.ix[train_index])
            actual = y.ix[test_index].values
            predicted = model.predict(X.ix[test_index])
            accuracies.append( self.score_model(actual, predicted) )
        
        print "Avg Accuracy: %%%.2f" % np.mean(accuracies)
    
    def test_sample(self, slice, features):
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]
        
        model = self.svm.fit(X, y)
        actual = y.ix[slice].values
        predicted = model.predict(X.ix[slice])
        accuracy = self.score_model(actual, predicted)
        
        for i, a, p in zip(slice, actual, predicted):
            print self.samples[i]['header'].ljust(50), a.ljust(20), p
        
        print "Accuracy: %%%.2f" % accuracy
    
    def test_data(self, data, features):
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]
        
        model = self.svm.fit(X, y)
        
        df = pd.DataFrame(data)
        z = self.build(df, features)
        
        actual = df.entity
        predicted = model.predict(z)
        accuracy = self.score_model(actual, predicted)
        
        for dct, a, p in zip(data, actual, predicted):
            print dct['header'].ljust(50), a.ljust(20), p
        
        print "Accuracy: %%%.2f" % accuracy
    
    def score_model(self, actual, predicted):
        score_df = pd.DataFrame([actual, predicted], index=['actual', 'predicted']).T
        correct = sum(score_df.actual == score_df.predicted)
        incorrect = sum(score_df.actual != score_df.predicted)
        total = correct + incorrect
        accuracy = float(correct) / float(total) * 100
        return accuracy
    
    def predict(self, headers, features):
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]
        model = self.svm.fit(X, y)
        
        data = [{'header': h, 'entity': '?'} for h in headers]
        df = pd.DataFrame(data)
        z = self.build(df, features)
        
        self.df = df
        self.z = z
        
        predictions = model.predict(z)
        return zip(headers, predictions)
    
    def build(self, df, features):
        result = pd.DataFrame()
        for fn in features:
            result[fn.func_name] = fn(df)
        return result

model = Model(data)

In [141]:
model.test(features=[length, word_count])


Avg Accuracy: %34.46

In [142]:
model.test(features=[length, word_count, header_in_entity, entity_in_header])


Avg Accuracy: %50.30

In [143]:
model.test(features=[length, word_count, header_in_entity, entity_in_header] + entity_features)


Avg Accuracy: %49.50

In [144]:
model.test(features=entity_features)


Avg Accuracy: %34.46

Predict a dataset


In [145]:
model.test_sample(slices['UK'], features=[length, word_count, header_in_entity, entity_in_header] + entity_features)


NOTICEID                                           notice               notice
REFERENCENUMBER                                    ?                    ?
DATEPUBLISHED                                      solicitation         solicitation
VALUEMIN                                           solicitation         solicitation
VALUEMAX                                           solicitation         solicitation
STATUS                                             solicitation         solicitation
URL                                                notice               notice
ORG_NAME                                           buyer                buyer
ORG_CONTACTEMAIL                                   buyer                solicitation
TITLE                                              good                 contract
DESCRIPTION                                        good                 good
NOTICETYPE                                         notice               contract
REGION                                             buyer                buyer
NOTICE_STATE                                       notice               notice
NOTICE_STATE_CHANGE_DATE                           notice               notice
CLASSIFICATION                                     good                 notice
NUM_DOCS                                           notice               notice
Accuracy: %76.47

In [146]:
model = Model(keywords)
predict_data = list(item for item in data if item['sample']=='UK')
results = model.predict(predict_data, features=[length, word_count, header_in_entity, entity_in_header] + entity_features)

for input, result in results:
    print input['header'].ljust(50), result


NOTICEID                                           solicitation
REFERENCENUMBER                                    solicitation
DATEPUBLISHED                                      solicitation
VALUEMIN                                           solicitation
VALUEMAX                                           solicitation
STATUS                                             solicitation
URL                                                solicitation
ORG_NAME                                           solicitation
ORG_CONTACTEMAIL                                   solicitation
TITLE                                              solicitation
DESCRIPTION                                        solicitation
NOTICETYPE                                         solicitation
REGION                                             solicitation
NOTICE_STATE                                       solicitation
NOTICE_STATE_CHANGE_DATE                           solicitation
CLASSIFICATION                                     solicitation
NUM_DOCS                                           solicitation