In [1]:
# Run from the repository root so the tagnews/data/ paths below resolve.
cd ..
In [2]:
import tagnews
import pandas as pd
In [3]:
# Download (and extract if needed) the pre-trained GloVe vectors from
# https://github.com/stanfordnlp/GloVe
# and save them to tagnews/data/
glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')
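If you want a quick sanity check that the vectors loaded correctly, something like this should work (a sketch, assuming load_glove returns a pandas DataFrame indexed by lowercase word, which is what the lookups below rely on):
In [ ]:
# The 6B.50d file should give 400,000 rows of 50 dimensions each.
print(glove.shape)
print(glove.loc['chicago'])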
In [4]:
# Download (and extract if needed) the NER data from
# https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data
# and save it to tagnews/data/
ner = tagnews.load_ner_data('tagnews/data/')
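It can be worth peeking at what came back before bolting on the vectors (a quick sketch, assuming load_ner_data returns a DataFrame with the 'word' and 'tag' columns used below):
In [ ]:
print(ner.head())
# How balanced are the classes we are about to train on?
print(ner['tag'].value_counts())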
In [5]:
# Append the 50 glove dimensions to each token's row. reindex (rather
# than loc) leaves all-NaN rows for words glove doesn't know.
ner = pd.concat([ner, pd.DataFrame(glove.reindex(ner['word'].str.lower()).values)], axis='columns')
In [6]:
# Spot-check that the appended columns really are each word's glove vector.
num_asserts = 0
for i, row in ner.sample(1000).iterrows():
    if not any(row.iloc[3:].isnull()):
        assert (glove.loc[row['word'].lower()].values == row.iloc[3:].values).all()
        num_asserts += 1
print('Asserted correct vectorizations', num_asserts, 'times.')
In [7]:
import sklearn.ensemble
In [8]:
clf = sklearn.ensemble.RandomForestClassifier()
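The defaults are fine for a first pass. If training feels slow on 200,000 rows, settings along these lines can speed it up (hypothetical values, not tuned):
In [ ]:
clf = sklearn.ensemble.RandomForestClassifier(
    n_estimators=50,  # more trees, smoother probability estimates
    n_jobs=-1,        # use all available cores
    random_state=0,   # reproducible runs
)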
In [9]:
# Words missing from glove currently have all-NaN vectors; zero-fill them.
# Be careful doing this if you are relying on the data's sequential structure!
ner.fillna(value=0.0, inplace=True)
In [10]:
# Train on the first 200,000 tokens; columns 3 onward are the glove features.
clf.fit(ner.iloc[:200000, 3:], ner['tag'].iloc[:200000].values)
Out[10]:
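Before predicting on anything, it's worth a rough accuracy check on rows the model never saw. A minimal sketch (the rows are in corpus order, not shuffled, so treat the number as a ballpark, and it assumes the corpus has more than 250,000 rows):
In [ ]:
clf.score(ner.iloc[200000:250000, 3:], ner['tag'].iloc[200000:250000].values)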
In [11]:
# 'london' and 'france' should score much higher than 'napkins'.
clf.predict_proba(glove.loc[['london', 'france', 'napkins']])
Out[11]:
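The column order of predict_proba follows clf.classes_, which is why the [:, 1] indexing below picks out the positive (is-a-location) class, assuming the 'tag' column is the binary label that indexing implies:
In [ ]:
clf.classes_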
In [12]:
# Go to https://geo-extract-tester.herokuapp.com/ and download
# the validation data (validation.txt).
with open('validation.txt', encoding='utf-8') as f:
    s = f.read()
In [13]:
with open('guesses.txt', 'w') as f:
    # Lowercase to match how the training vectors were looked up.
    words = [w.lower() for w in s.split('\n') if w]
    for prob in clf.predict_proba(glove.reindex(words).fillna(0))[:, 1]:
        f.write(str(prob) + '\n')
Now go to https://geo-extract-tester.herokuapp.com/ and upload guesses.txt
to see how you did!