In [1]:
cd ..


C:\Users\kevin.rose\Documents\GitHub\cjp-article-tagging\lib

In [2]:
import tagnews
import pandas as pd

In [3]:
# Download (and extract if needed) a saved glove data from
# https://github.com/stanfordnlp/GloVe
# and save it to tagnews/data/
glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')

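The load_glove helper presumably just parses the plain-text GloVe format (one token per line followed by its space-separated vector components). If you want to do it by hand, a minimal sketch with pandas, assuming the 50-dimensional 6B file, looks like this:

import csv

glove_manual = pd.read_csv('tagnews/data/glove.6B.50d.txt',
                           sep=' ', header=None, index_col=0,
                           # GloVe contains quote characters and tokens like
                           # 'nan' that must not be parsed specially.
                           quoting=csv.QUOTE_NONE, keep_default_na=False)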
In [4]:
# Download (and extract if needed) the NER data from
# https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data
# and save it to tagnews/data/
ner = tagnews.load_ner_data('tagnews/data/')


b'Skipping line 281837: expected 25 fields, saw 34\n'
C:\Users\kevin.rose\AppData\Local\Continuum\Anaconda2\envs\cjp\lib\site-packages\numpy\lib\arraysetops.py:463: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

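For context, the Kaggle corpus annotates each word with a BIO entity tag such as 'B-geo' or 'I-geo', and load_ner_data presumably collapses those into the binary geolocation tag used below (the 'Skipping line' message above appears to come from its wider ner.csv file). A hedged sketch of that reduction on the simpler ner_dataset.csv file, with its file and column names assumed, would be:

# Illustrative only: collapse BIO entity tags to a binary geolocation label.
raw = pd.read_csv('tagnews/data/ner_dataset.csv', encoding='latin-1')
raw['is_geo'] = raw['Tag'].isin(['B-geo', 'I-geo'])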
In [5]:
# Look up the GloVe vector for each (lowercased) word and append its 50
# components as extra columns; using .values drops the GloVe index so the
# concat aligns positionally with the NER rows.
ner = pd.concat([ner,
                 pd.DataFrame(glove.loc[ner['word'].str.lower()].values)],
                axis='columns')

In [6]:
# Sanity check: for a random sample of rows whose word was found in the GloVe
# vocabulary (no NaN columns), the appended values should match the word's
# GloVe vector exactly.
num_asserts = 0
for i, row in ner.sample(1000).iterrows():
    if not any(row.iloc[2:].isnull()):
        assert (glove.loc[row['word'].lower()].values == row.iloc[3:].values).all()
        num_asserts += 1
print('Asserted correct vectorizations', num_asserts, 'times.')


Asserted correct vectorizations 998 times.

In [7]:
import sklearn.ensemble

In [8]:
clf = sklearn.ensemble.RandomForestClassifier()

In [9]:
# Words missing from the GloVe vocabulary produced NaN feature columns in the
# concat above; replace them with zero vectors. Be careful doing this if you
# are relying on sequential-ness!
ner.fillna(value=0.0, inplace=True)

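If you would rather leave the original NER columns untouched, a narrower variant (column positions assumed from the concat above) fills only the appended GloVe columns:

# Alternative sketch: zero-fill only the GloVe feature columns.
feature_cols = ner.columns[3:]
ner[feature_cols] = ner[feature_cols].fillna(0.0)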
In [10]:
# Train on the first 200,000 rows: the 50 appended GloVe columns are the
# features and the (binary) geolocation tag is the target.
clf.fit(ner.iloc[:200000, 3:], ner['tag'].iloc[:200000].values)


Out[10]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

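As a rough, purely illustrative check of generalization, you can score rows beyond the 200,000 used for training (the corpus has well over that many rows, judging by the 'Skipping line 281837' message above):

import sklearn.metrics

held_out = ner.iloc[200000:250000]
probs = clf.predict_proba(held_out.iloc[:, 3:])[:, 1]
# Column 1 of predict_proba corresponds to clf.classes_[1], so compare
# against that label to get a binary ground truth for the AUC.
y_true = held_out['tag'].values == clf.classes_[1]
print(sklearn.metrics.roc_auc_score(y_true, probs))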
In [11]:
# Each row of the output is [P(not a geolocation), P(geolocation)] for the
# corresponding word.
clf.predict_proba(glove.loc[['london', 'france', 'napkins']])


Out[11]:
array([[ 0.04864864,  0.95135136],
       [ 0.2663006 ,  0.7336994 ],
       [ 1.        ,  0.        ]])

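A small hypothetical wrapper (not part of tagnews) makes the same lookup-and-score pattern reusable, giving zero vectors to words missing from GloVe:

def geo_prob(words):
    # Probability that each word is tagged as a geolocation; out-of-vocabulary
    # words get all-zero vectors via reindex + fillna.
    vecs = glove.reindex(index=[w.lower() for w in words]).fillna(0)
    return clf.predict_proba(vecs)[:, 1]

geo_prob(['chicago', 'london', 'napkins'])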
In [12]:
# Go to https://geo-extract-tester.herokuapp.com/ and download
# the validation data (validation.txt).
with open('validation.txt', encoding='utf-8') as f:
    s = f.read()

In [13]:
# Score every non-empty line of the validation file and write the probability
# of the geolocation class, one probability per line.
with open('guesses.txt', 'w') as f:
    words = [w for w in s.split('\n') if w]
    for prob in clf.predict_proba(glove.loc[words].fillna(0))[:, 1]:
        f.write(str(prob) + '\n')

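Before uploading, a quick sanity check (purely illustrative) that guesses.txt contains exactly one probability per non-empty validation line:

with open('guesses.txt') as f:
    n_guesses = sum(1 for _ in f)
assert n_guesses == sum(1 for w in s.split('\n') if w)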
Now go to https://geo-extract-tester.herokuapp.com/ and upload guesses.txt to see how you did!

