In [5]:
import argparse
import os
import shutil
import gzip

import torch
import torch.nn as nn
from torch.autograd import Variable

import spacy

In [4]:
train_path = '/Users/Max/data/beer_reviews/reviews.all.train.txt.gz'
val_path = '/Users/Max/data/beer_reviews/reviews.all.heldout.txt.gz'

nlp = spacy.load('en')

In [7]:
import gzip
check_path = '/Users/Max/data/full_beer/aspects/reviews.aspect1.train.txt.gz'

with gzip.open(check_path, 'rt') as f:
    lines = f.readlines()
print(len(lines))


70000

In [14]:
import tqdm

for line in tqdm.tqdm(lines[:1000]):
    target, _, review = line.partition("\t")
    doc = nlp(review)
    
    # parse with spaCy: single tokens, sentences, and the dependency subtree of each token
    words = [tuple([token.text]) for token in doc]
    sents = [tuple([token.text for token in sent]) for sent in doc.sents]
    chunks = [tuple([word.text for word in token.subtree if word.text != '\n' and word.text != '\t']) for token in doc]
    
    # create encodings: '\D' separates the target from the review, '\T' separates
    # units (tokens / sentences / subtrees) and '\W' separates words within a unit
    # (all three are literal backslash sequences, not regex escapes)
    enc_words  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in words])
    enc_sents  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in sents])
    enc_chunks = target + '\D' + '\T'.join(['\W'.join(tup) for tup in chunks]) + '\n'
    
    with gzip.open('/Users/Max/data/beer_reviews/reviews.short.train.words.txt.gz', 'at') as f:
        f.write(enc_words)
    
    with gzip.open('/Users/Max/data/beer_reviews/reviews.short.train.chunks.txt.gz', 'at') as f:
        f.write(enc_chunks)
        
    with gzip.open('/Users/Max/data/beer_reviews/reviews.short.train.sents.txt.gz', 'at') as f:
        f.write(enc_sents)


100%|██████████| 1000/1000 [00:22<00:00, 56.87it/s]
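
A quick round-trip check of the delimiter scheme on a single review (a sketch re-using lines and nlp from the cells above):

In [ ]:
# Encode one review with the literal '\D' / '\T' / '\W' delimiters used above,
# then decode it again and compare.
line = lines[0]
target, _, review = line.partition("\t")
doc = nlp(review)
chunks = [tuple(w.text for w in token.subtree if w.text not in ('\n', '\t'))
          for token in doc]
encoded = target + '\D' + '\T'.join('\W'.join(tup) for tup in chunks)
dec_target, _, dec_body = encoded.partition('\D')
decoded = [tuple(chunk.split('\W')) for chunk in dec_body.split('\T')]
print(dec_target == target, len(decoded) == len(chunks))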

In [13]:
len(lines)


Out[13]:
210000

In [ ]:
with gzip.open('/Users/Max/data/beer_reviews/reviews.all.train.chunks.txt.gz', 'rt') as f:
    my_lines = f.readlines()

In [ ]:
# Decode an encoded line back into (target, list of chunk tuples)
obj = my_lines[128383]
target, review = obj.split('\D')
[tuple(chunk.split('\W')) for chunk in review.split('\T')]

In [ ]:
my_lines[129322]

In [ ]:
# parsing in spacy style
words = [tuple([token.text]) for token in doc]
sents = [tuple([token.text for token in sent]) for sent in doc.sents]
chunks = [tuple([word.text for word in token.subtree]) for token in doc]

# creating encodings
enc_words  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in words])
enc_sents  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in sents])
enc_chunks = target + '\D' + '\T'.join(['\W'.join(tup) for tup in chunks])

In [ ]:
lines[1]

In [ ]:
target, review = enc_chunks.split('\D')

In [ ]:
target

In [ ]:
review.split('\T')[0].split('\W')

In [ ]:
enc_chunks[:200]  # preview the start of the encoded string

In [ ]:
import json
import gzip

def read_rationales(path):
    """
    Read the annotations file (plain or gzipped JSON-lines) and return a list
    of dictionaries, one for each of the 994 reviews that carry
    sentence-level annotations.
    """
    data = []
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path) as fin:
        for line in fin:
            item = json.loads(line)
            data.append(item)
    return data

In [ ]:
import spacy
import ast
nlp = spacy.load('en')
anno = '/Users/Max/data/beer_reviews/annotations.json'
annotations = read_rationales(anno)
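
A quick look at the fields the labelling code below relies on (a sketch; the field names 'x', 'raw', '0', '1', '2' are taken from their usage in the next cell):

In [ ]:
# Inspect the first annotation: 'x' is the tokenized review, 'raw' a string-encoded
# dict containing 'review/text', and '0'/'1'/'2' hold the annotated spans per aspect.
first = annotations[0]
print(sorted(first.keys()))
print(first['x'][:10])
print([first[label] for label in ['0', '1', '2']])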

In [ ]:
# For each annotated span, find the spaCy sentence in doc that overlaps it most
# and attach the span's aspect label to that sentence.

def map_sentence(sen, tup):
    """Fraction of the sentence's lowercased tokens that appear in the annotated span."""
    sen = set([str(token).lower() for token in sen])
    n = len(sen)  # + len(tup[0])
    s = len(sen & tup[0])
    score = s / n
    return score

# Which sentences deserve a label and what label?
ix = 2
review = annotations[ix]
doc = nlp(ast.literal_eval(annotations[ix]['raw'])['review/text'])
all_words = review['x']
label_sens = []
for label in ['0','1','2']:
    label_sens.extend([(set(all_words[s:e]), label) for s, e in review[label]])

# Label the sentences in doc
sentences = [(sen, set()) for sen in doc.sents]
for tup in label_sens:
    scores = ([map_sentence(sen, tup) for sen, _ in sentences])
    # print(scores)
    sentences[scores.index(max(scores))][1].add(tup[1])

# Process the sentences
words = []
chunks = []
# mode = 'chunks'

for tup in sentences:    
    words1 = [tuple([tuple([token.text]),tup[1]]) for token in tup[0]]
    chunks1 = [tuple([tuple([word.text for word in token.subtree if word.text != '\n' and word.text != '\t']),tup[1]]) for token in tup[0]]
    words.extend(words1)
    chunks.extend(chunks1)
sents = sentences
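
A toy check of the overlap score with made-up inputs (a sketch; map_sentence and nlp come from the cells above):

In [ ]:
# Hypothetical example: 2 of the 4 lowercased sentence tokens ('aroma', 'wonderful')
# appear in the annotated span, so the score is 2 / 4 = 0.5.
toy_sen = nlp("The aroma is wonderful")
toy_span = ({'aroma', 'wonderful', 'hops'}, '0')
map_sentence(toy_sen, toy_span)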

In [ ]:
words

In [ ]:
doc

In [ ]:
[review[label] for label in ['0','1','2']]

In [ ]:
from itertools import chain, combinations

import numpy as np

A = torch.randn(5, 5)
L = A.mm(A.t())  # symmetric PSD kernel L = A A^T

A = A.numpy()
L = L.numpy()

def computeMAP(L):
    """Greedy MAP inference: repeatedly add the item that most increases det(L_S)."""

    # initialization
    n = L.shape[0]
    no_choice = list(range(n))
    choice = []
    best_p = 0

    while True:

        # try extending the current selection by each remaining item
        candidates = [choice + [j] for j in no_choice]
        submats = [L[np.ix_(cand, cand)] for cand in candidates]
        probs = [np.linalg.det(submat) - best_p for submat in submats]

        # stop once no single addition improves the (unnormalized) probability
        if all(p <= 0 for p in probs):
            return choice
        else:
            which = np.argmax(np.array(probs))
            choice = candidates[which]
            which_elem = choice[-1]
            no_choice.remove(which_elem)
            best_p += probs[which]
            

def exactMAP(L):

    n = L.shape[0]
    
    # Generate powerset
    s = list(range(n))
    powerset = list(chain.from_iterable(combinations(s, r) for r in range(len(s)+1)))
    
    # Compute Probabilities 
    probs = np.array([np.linalg.det(L[np.ix_(choice, choice)]) for choice in powerset])
    which = np.argmax(probs)
    MAP = powerset[which], probs[which]
    
    return MAP
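
A quick comparison of the greedy and exact routines on the 5x5 kernel defined above (a sketch; the greedy subset can be checked against the exact maximizer):

In [ ]:
# Run both MAP routines on the small random kernel L and compare.
greedy_choice = computeMAP(L)
greedy_prob = np.linalg.det(L[np.ix_(greedy_choice, greedy_choice)])
exact_choice, exact_prob = exactMAP(L)
print(greedy_choice, greedy_prob)
print(exact_choice, exact_prob)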

In [ ]:
from dpp_nets.dpp.map import exactMAP, computeMAP