Project: Mars Target Encyclopedia
This notebook contains little explanation; detailed walkthroughs are in the original Snorkel intro tutorials: https://github.com/HazyResearch/snorkel/tree/master/tutorials/intro
Launch Jupyter with ./run.sh as described in the Snorkel README.
In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from snorkel import SnorkelSession
import os
import numpy as np
import re
import codecs
os.environ['SNORKELDB'] = 'sqlite:///snorkel-mte.db'
In [3]:
# Open Session
session = SnorkelSession()
In [4]:
# Read input
base_dir = '/Users/thammegr/work/mte/data/newcorpus/MTE-corpus-open/'
def scan_docs(dir):
    txt_filter = lambda name: re.match(r"^[0-9]{4}\.txt$", name)
    for root, dirs, files in os.walk(dir):
        for f in filter(txt_filter, files):
            txt_path = os.path.join(root, f)
            ann_path = txt_path.replace('.txt', '.ann')
            parts = ann_path.split(os.path.sep)
            parts[-2] += "-reviewed-target"  # directory name
            new_ann_path = os.path.sep.join(parts)
            # prefer the reviewed annotations when they exist
            if os.path.exists(new_ann_path):
                ann_path = new_ann_path
            yield (txt_path, ann_path)
corpus_file = "mte-corpus.list"
with open(corpus_file, 'w') as f:
count = 0
for rec in scan_docs(base_dir):
f.write(",".join(rec))
f.write("\n")
count += 1
print("Wrote %d records to %s" %(count, corpus_file))
In [67]:
# sample the first 30 docs to set up the whole pipeline first
!head -30 mte-corpus.list > mte-corpus-head.list
corpus_file = "mte-corpus-head.list"
!wc -l *.list
In [ ]:
from snorkel.parser import CSVPathsPreprocessor
doc_preprocessor = CSVPathsPreprocessor(path=corpus_file, column=0, delim=',')
#doc_preprocessor = CSVPathsPreprocessor("paths-sample.list")
# Corpus parser to get features
from snorkel.parser import CorpusParser
corpus_parser = CorpusParser()
%time corpus_parser.apply(doc_preprocessor)
In [68]:
from snorkel.models import Document, Sentence
print "Documents:", session.query(Document).count()
print "Sentences:", session.query(Sentence).count()
In [6]:
# Schema for Minerals
from snorkel.models import candidate_subclass
Mineral = candidate_subclass('Mineral', ['name'])
In [7]:
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import RegexMatchEach
mineral_matcher = RegexMatchEach(attrib='pos_tags', rgx="NN.*")
ngrams = Ngrams(n_max=3)
cand_extractor = CandidateExtractor(Mineral,
                                    [ngrams], [mineral_matcher],
                                    symmetric_relations=False)
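RegexMatchEach accepts an n-gram only when every token's POS tag matches the regex. A minimal pure-Python sketch of that predicate (for intuition only, not Snorkel's actual implementation):
In [ ]:
# A span is a Mineral candidate only if all of its POS tags are NN*
def all_tags_match(tags, rgx=r"NN.*"):
    return all(re.match(rgx, t) for t in tags)
print all_tags_match(['NN', 'NNS']), all_tags_match(['JJ', 'NN'])  # True False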
In [8]:
# Counts contiguous noun (NN*) runs in a sentence => could be used for filtering
def number_of_nouns(sentence):
    active_sequence = False
    count = 0
    for tag in sentence.pos_tags:
        if tag.startswith('NN') and not active_sequence:
            active_sequence = True
            count += 1
        elif not tag.startswith('NN') and active_sequence:
            active_sequence = False
    return count
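A quick check with a hypothetical mock object (only the pos_tags attribute is needed) shows the function counts noun runs, not individual nouns:
In [ ]:
# MockSentence is illustrative only; real inputs are Snorkel Sentence objects
class MockSentence(object):
    pos_tags = ['DT', 'NN', 'NN', 'VBZ', 'JJ', 'NNS']
print number_of_nouns(MockSentence())  # 2: one run 'NN NN', one run 'NNS'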
In [9]:
from snorkel.models import Document
# load, filter, and split the sentences: 90% train / 5% dev / 5% test by document
docs = session.query(Document).order_by(Document.name).all()
ld = len(docs)
train_sents = set()
dev_sents = set()
test_sents = set()
splits = (0.9, 0.95)
for i, doc in enumerate(docs):
    for s in doc.sentences:
        if number_of_nouns(s) > 0:
            if i < splits[0] * ld:
                train_sents.add(s)
            elif i < splits[1] * ld:
                dev_sents.add(s)
            else:
                test_sents.add(s)
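A one-line check confirms the resulting split sizes:
In [ ]:
print "Train: %d, Dev: %d, Test: %d" % (len(train_sents), len(dev_sents), len(test_sents))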
In [10]:
s1 = session.query(Sentence).all()[26]
s1.pos_tags
Out[10]:
In [11]:
cand_extractor.apply(train_sents, split=0)
In [12]:
train_cands = session.query(Mineral).filter(Mineral.split == 0).all()
print "Number of candidates:", len(train_cands)
In [13]:
# inspect the candidates using this widget
from snorkel.viewer import SentenceNgramViewer
sv = SentenceNgramViewer(train_cands[:300], session)
sv
In [14]:
# Extract candidates for the dev and test splits
for i, sents in enumerate([dev_sents, test_sents]):
    cand_extractor.apply(sents, split=i+1)
    print "Number of candidates:", session.query(Mineral).filter(Mineral.split == i+1).count()
In [165]:
# Distant supervision
minerals_file = "/Users/thammegr/work/mte/git/ref/minerals.txt"
non_minerals_file = "/Users/thammegr/work/mte/git/ref/non-minerals.txt"

def load_set(path, lower=True):
    with codecs.open(path, 'r', 'utf-8') as f:
        lines = f.readlines()
    lines = map(lambda x: x.strip(), lines)
    lines = filter(lambda x: x and not x.startswith('#'), lines)
    if lower:
        lines = map(lambda x: x.lower(), lines)
    return set(lines)

mte_minerals = load_set(minerals_file)
non_minerals = load_set(non_minerals_file)

# dictionary-based LFs
def lf_dict_mte_minerals(c):
    return 1 if c.name.get_span().lower() in mte_minerals else 0

def lf_dict_nonminerals(c):
    return -1 if c.name.get_span().lower() in non_minerals else 0

# rule-based: surface form ends with 'ite'
def lf_rule_ite_minerals(c):
    return 1 if c.name.get_span().lower().endswith('ite') else 0

# rule-based 2: requires at least one vowel before the 'ite' suffix
ends_ite = re.compile("^[a-z]*[aeiou][a-z]*ite$")
def lf_rule_ite2_minerals(c):
    return 1 if ends_ite.match(c.name.get_span().lower()) is not None else 0
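The ends_ite pattern is easy to spot-check on a few surface forms before trusting it as a labeling function:
In [ ]:
# 'write' ends in 'ite' but has no vowel before the suffix, so it is rejected
for w in ['hematite', 'jarosite', 'olivine', 'write']:
    print w, ends_ite.match(w) is not None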
In [12]:
import requests
from lxml import etree

# lxml supports XPath 1.0, which doesn't have a regex match function, so extend it
ns = etree.FunctionNamespace(None)
def matches(dummy, val, pattern):
    if not val:
        return False
    return re.match(pattern, str(val[0])) is not None
ns['matches'] = matches

all_minerals_page = "https://en.wikipedia.org/wiki/List_of_minerals"
tree = etree.HTML(requests.get(all_minerals_page).text)
minerals = tree.xpath('//h2[matches(span/@id, "^[A-Z]$")]/following-sibling::*//li/a/@title')
minerals = set(map(lambda x: x.lower().strip(), minerals))  # remove duplicates
print("Found %d minerals in %s" % (len(minerals), all_minerals_page))

minerals_kb = "wikipedia-minerals.list"
with codecs.open(minerals_kb, 'w', 'utf-8') as out:
    out.write(u"\n".join(minerals))
print("Stored the mineral names at %s" % minerals_kb)
In [197]:
minerals_kb = "wikipedia-minerals.list"
minerals_set = load_set(minerals_kb)
def lf_dict_wikipedia_minerals(c):
return 1 if c.name.get_span().lower() in minerals_set else 0
# returning 0 instead of -1, because the wikipedia page may not be an exhaustive list.
# TODO: check with Kiri to confirm this
In [162]:
# Debugging labeling functions
from pprint import pprint
labeled = []
for c in session.query(Mineral).filter(Mineral.split == 0).all():
    if lf_rule_ite2_minerals(c) != 0:
        labeled.append(c)
print "Number labeled:", len(labeled)
In [139]:
labeled[0]
Out[139]:
In [198]:
# all labeling functions in a list
LFs = [
    lf_dict_mte_minerals, lf_dict_nonminerals,
    lf_dict_wikipedia_minerals,
    #lf_rule_ite_minerals,
    lf_rule_ite2_minerals,
]
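Before labeling in bulk, it can help to see each LF's raw vote on a single candidate (assuming train_cands from above is non-empty):
In [ ]:
c = train_cands[0]
for lf in LFs:
    print lf.__name__, lf(c)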
In [199]:
from snorkel.annotations import LabelAnnotator
import numpy as np
labeler = LabelAnnotator(f=LFs)
In [201]:
np.random.seed(1701)
%time L_train = labeler.apply(split=0)
L_train
Out[201]:
In [202]:
# Loading it again -- resume from here
L_train = labeler.load_matrix(session, split=0)
L_train
Out[202]:
In [170]:
L_train.get_candidate(session, 0)
Out[170]:
In [171]:
L_train.get_key(session, 0)
Out[171]:
In [203]:
L_train.lf_stats(session)
Out[203]:
In [204]:
from snorkel.learning import GenerativeModel
gen_model = GenerativeModel()
gen_model.train(L_train, epochs=500, decay=0.95, step_size=0.1/L_train.shape[0], reg_param=1e-6)
In [191]:
train_marginals = gen_model.marginals(L_train)
# visualize
import matplotlib.pyplot as plt
plt.hist(train_marginals, bins=20)
plt.show()
In [205]:
gen_model.weights.lf_accuracy()
Out[205]:
In [206]:
L_dev = labeler.apply_existing(split=1)
In [177]:
L_dev
Out[177]:
In [7]:
dev_cands = session.query(Mineral).filter(Mineral.split == 1).all()
len(dev_cands)
In [72]:
from snorkel.viewer import SentenceNgramViewer
sv = SentenceNgramViewer(dev_cands, session)
sv
In [ ]:
from snorkel.annotations import load_gold_labels
L_gold_dev = load_gold_labels(session, annotator_name=os.environ['USER'], split=1)
L_gold_dev
In [209]:
tp, fp, tn, fn = gen_model.score(session, L_dev, L_gold_dev)
In [1]:
fn
In [163]:
L_dev.lf_stats(session, L_gold_dev, gen_model.weights.lf_accuracy())
Out[163]:
In [211]:
# Save labels
from snorkel.annotations import save_marginals
%time save_marginals(session, L_train, train_marginals)
In [212]:
# generate features
from snorkel.annotations import FeatureAnnotator
featurizer = FeatureAnnotator()
%time F_train = featurizer.apply(split=0)
F_train
Out[212]:
In [213]:
%%time
F_dev = featurizer.apply_existing(split=1)
F_test = featurizer.apply_existing(split=2)
In [229]:
from snorkel.learning import SparseLogisticRegression
from snorkel.learning.utils import MentionScorer
from snorkel.learning import RandomSearch, ListParameter, RangeParameter

# our discriminative model
disc_model = SparseLogisticRegression()

# Hyperparameter search over learning rate and regularization penalties
rate_param = RangeParameter('lr', 1e-6, 1e-2, step=1, log_base=10)
l1_param = RangeParameter('l1_penalty', 1e-6, 1e-2, step=1, log_base=10)
l2_param = RangeParameter('l2_penalty', 1e-6, 1e-2, step=1, log_base=10)
searcher = RandomSearch(session, disc_model, F_train, train_marginals,
                        [rate_param, l1_param, l2_param], n=20)

from snorkel.annotations import load_gold_labels
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)

# fit
np.random.seed(1701)
searcher.fit(F_dev, L_gold_dev, n_epochs=50, rebalance=0.9, print_freq=25)
Out[229]:
In [228]:
# Score the discriminative model on dev; to score on the held-out test split instead:
#L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
#_, _, _, _ = disc_model.score(session, F_test, L_gold_test)
tp, fp, tn, fn = disc_model.score(session, F_dev, L_gold_dev)
In [226]:
vars(F_dev[0])
Out[226]:
In [ ]: