In [1]:
cd ..


C:\Users\kevin.rose\Documents\GitHub\cjp-article-tagging\lib

In [2]:
import tagnews
import pandas as pd

In [3]:
# Download (and extract if needed) a saved glove data from
# https://github.com/stanfordnlp/GloVe
# and save it to tagnews/data/
glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')

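The load_glove helper presumably just parses the plain-text GloVe format (one token per line followed by its space-separated vector components). If you want to do it by hand, a minimal sketch with pandas, assuming the 50-dimensional 6B file, looks like this:

import csv

glove_manual = pd.read_csv('tagnews/data/glove.6B.50d.txt',
                           sep=' ', header=None, index_col=0,
                           # GloVe contains quote characters and tokens like
                           # 'nan' that must not be parsed specially.
                           quoting=csv.QUOTE_NONE, keep_default_na=False)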
In [4]:
# Download (and extract if needed) the NER data from
# https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data
# and save it to tagnews/data/
ner = tagnews.load_ner_data('tagnews/data/')


b'Skipping line 281837: expected 25 fields, saw 34\n'
C:\Users\kevin.rose\AppData\Local\Continuum\Anaconda2\envs\cjp\lib\site-packages\numpy\lib\arraysetops.py:463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

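For context, the Kaggle corpus annotates each word with a BIO entity tag such as 'B-geo' or 'I-geo', and load_ner_data presumably collapses those into the binary geolocation tag used below (the 'Skipping line' message above appears to come from its wider ner.csv file). A hedged sketch of that reduction on the simpler ner_dataset.csv file, with its file and column names assumed, would be:

# Illustrative only: collapse BIO entity tags to a binary geolocation label.
raw = pd.read_csv('tagnews/data/ner_dataset.csv', encoding='latin-1')
raw['is_geo'] = raw['Tag'].isin(['B-geo', 'I-geo'])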
In [5]:
# Look up the GloVe vector for each (lowercased) word and append its 50
# components as extra columns; using .values drops the GloVe index so the
# concat aligns positionally with the NER rows.
ner = pd.concat([ner,
                 pd.DataFrame(glove.loc[ner['word'].str.lower()].values)],
                axis='columns')

In [6]:
# Sanity check: for a random sample of rows whose word was found in the GloVe
# vocabulary (no NaN columns), the appended values should match the word's
# GloVe vector exactly.
num_asserts = 0
for i, row in ner.sample(1000).iterrows():
    if not any(row.iloc[2:].isnull()):
        assert (glove.loc[row['word'].lower()].values == row.iloc[3:].values).all()
        num_asserts += 1
print('Asserted correct vectorizations', num_asserts, 'times.')


Asserted correct vectorizations 998 times.

In [7]:
import sklearn.ensemble

In [8]:
clf = sklearn.ensemble.RandomForestClassifier()

In [9]:
# Words missing from the GloVe vocabulary produced NaN feature columns in the
# concat above; replace them with zero vectors. Be careful doing this if you
# are relying on sequential-ness!
ner.fillna(value=0.0, inplace=True)

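If you would rather leave the original NER columns untouched, a narrower variant (column positions assumed from the concat above) fills only the appended GloVe columns:

# Alternative sketch: zero-fill only the GloVe feature columns.
feature_cols = ner.columns[3:]
ner[feature_cols] = ner[feature_cols].fillna(0.0)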
In [10]:
# Train on the first 200,000 rows: the 50 appended GloVe columns are the
# features and the (binary) geolocation tag is the target.
clf.fit(ner.iloc[:200000, 3:], ner['tag'].iloc[:200000].values)


Out[10]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

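As a rough, purely illustrative check of generalization, you can score rows beyond the 200,000 used for training (the corpus has well over that many rows, judging by the 'Skipping line 281837' message above):

import sklearn.metrics

held_out = ner.iloc[200000:250000]
probs = clf.predict_proba(held_out.iloc[:, 3:])[:, 1]
# Column 1 of predict_proba corresponds to clf.classes_[1], so compare
# against that label to get a binary ground truth for the AUC.
y_true = held_out['tag'].values == clf.classes_[1]
print(sklearn.metrics.roc_auc_score(y_true, probs))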
In [11]:
# Each row of the output is [P(not a geolocation), P(geolocation)] for the
# corresponding word.
clf.predict_proba(glove.loc[['london', 'france', 'napkins']])


Out[11]:
array([[ 0.04864864,  0.95135136],
       [ 0.2663006 ,  0.7336994 ],
       [ 1.        ,  0.        ]])

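A small hypothetical wrapper (not part of tagnews) makes the same lookup-and-score pattern reusable, giving zero vectors to words missing from GloVe:

def geo_prob(words):
    # Probability that each word is tagged as a geolocation; out-of-vocabulary
    # words get all-zero vectors via reindex + fillna.
    vecs = glove.reindex(index=[w.lower() for w in words]).fillna(0)
    return clf.predict_proba(vecs)[:, 1]

geo_prob(['chicago', 'london', 'napkins'])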
In [12]:
# Go to https://geo-extract-tester.herokuapp.com/ and download
# the validation data (validation.txt).
with open('validation.txt', encoding='utf-8') as f:
    s = f.read()

In [13]:
# Score every non-empty line of the validation file and write the probability
# of the geolocation class, one probability per line.
with open('guesses.txt', 'w') as f:
    words = [w for w in s.split('\n') if w]
    for prob in clf.predict_proba(glove.loc[words].fillna(0))[:, 1]:
        f.write(str(prob) + '\n')

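Before uploading, a quick sanity check (purely illustrative) that guesses.txt contains exactly one probability per non-empty validation line:

with open('guesses.txt') as f:
    n_guesses = sum(1 for _ in f)
assert n_guesses == sum(1 for w in s.split('\n') if w)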
Now go to https://geo-extract-tester.herokuapp.com/ and upload guesses.txt to see how you did!

