In [1]:
import xml.etree.ElementTree as ET
import numpy as np
The comments in the dataset are representes as XML
. To represent it, three classes are used - TaggedDocument
, TaggedSentence
and TaggedWord
.
TaggedWord
The class represents a work with associated metadata - index in sentence, lemma, stem, POS tag and so on.
TaggedSentence
A representation of a sentence with associated metadata. Each sentence has a list of TaggedWord
objects representing the words that make it up.
TaggedDocument
A representation of a document and its metadata, with a list of sentences that it contains as a list of TaggedSentence
objects.
In [6]:
class TaggedText(object):
"""Object representation of a review document tagged with sentence metadata, source and ratings."""
def __init__(self):
self.sentences = []
self.rating = 0
self.source = ''
self.text = ''
self.__pairs = None
def add_sentence(self, sentence):
self.sentences.append(sentence)
def get_sentences(self):
return self.sentences
def set_filename(self, fn):
self.filename = fn
def get_filename(self):
return self.filename
def set_source_url(self, url):
self.source = url
def get_source_url(self):
return self.source
def set_rating(self, rating):
self.rating = rating
def get_rating(self):
return self.rating
def set_text(self, text):
self.text = text
def get_text(self):
return self.text
def get_length(self):
return len(self.text)
def get_annotated_pairs(self):
if self.__pairs:
return self.__pairs
self.__pairs = []
for sentence in self.sentences:
# Get all positively marked sentiment-clue pairs
self.__pairs += [ (a,s) for (a,s,l,r) in sentence.get_annotated_pairs() if l == 1]
return self.__pairs
def get_clues(self):
return [a for (a, b) in self.get_annotated_pairs()]
def get_aspects(self):
return [b for (a, b) in self.get_annotated_pairs()]
class TaggedSentence(object):
def __init__(self):
self.words = []
self.dependencies = []
self.text = ''
self.start_position = 0
self.length = 0
self.annotated_pairs = []
def add_annotated_pair(self, aspect, anchor, link, sentiment):
r_link = transform_annotation(link)
r_sentiment = transform_annotation(sentiment)
self.annotated_pairs.append((aspect, anchor, r_link, r_sentiment))
def get_annotated_pairs(self):
return self.annotated_pairs
def get_annotated_pair(self, aspect, anchor):
aspect_word = aspect.get_word()
anchor_word = anchor.get_word()
for (r_asp, r_anch, link, sentiment) in self.get_annotated_pairs():
if aspect_word == r_asp and anchor_word == r_anch:
return np.array([link, sentiment])
return np.array([0, 0])
def add_word(self, word):
self.words.append(word)
def get_words(self):
return self.words
def get_word(self, i):
return self.words[i]
def add_dependency(self, dependency):
self.dependencies.append(dependency)
def get_dependencies(self):
return self.dependencies
def set_text(self, text):
self.text = text
def get_text(self):
return self.text
def set_start_position(self, pos):
self.start_position = pos
def get_start_position(self):
return self.start_position
def get_length(self):
return len(self.text)
class Dependency(object):
def __init__(self, governor, dependent, relation):
self.governor = governor
self.dependent = dependent
self.relation = relation
def get_governor_index(self):
return self.governor
def get_dependent_index(self):
return self.dependent
def get_relation(self):
return self.relation
def __str__(self):
return '({0} as {1} to {2})'.format(self.governor, self.relation, self.dependent)
class TaggedWord(object):
def __init__(self):
self.word = ''
self.lemma = ''
self.molex_lemmas = []
self.POS = None
self.stem = ''
self.MSDs = []
self.position = 0
self.index = 0
def set_word(self, word):
self.word = word
def get_word(self):
return self.word
def set_stem(self, stem):
self.stem = stem
def get_stem(self):
return self.stem
def set_lemma(self, lemma):
self.lemma = lemma
def get_lemma(self):
return self.lemma
def add_molex_lemma(self, m_lemma):
self.molex_lemmas.append(m_lemma)
def get_molex_lemmas(self):
return self.molex_lemmas
def set_POS_tag(self, POS):
self.POS = POS
def get_POS_tag(self):
return self.POS
def add_MSD(self, msd):
self.MSDs.append(msd)
def get_MSDs(self):
return self.MSDs
def set_position(self, pos):
self.position = pos
def get_position(self):
return self.position
def set_index(self, index):
self.index = index
def get_index(self):
return self.index
def __eq__(self, other):
'''
If the word is the same and the index is the same, assume it's also
the same sentence. Good enough for practical use, but not necessarily
correct.
'''
return self.word == other.word and self.index == other.index
def transform_annotation(annotation):
"""Transforms string-based orientation annotations into numbers"""
#if annotation == '+':
# return 1
#if annotation == '-':
# return -1
if annotation in "+-":
return 1
return 0
In [3]:
from os import listdir
from os.path import isfile, join
import re
def files_in(directory):
"""
List all files in directory. Excludes directories.
Args:
directory (str): String representation of path
Returns:
str[]: List of file names, not prefixed with directory name
"""
return [f for f in listdir(directory) if isfile(join(directory,f))]
def read_documents_in(directory):
"""
Parses all XML documents in a given directory to create a list of object
document representations.
Args:
directory (str) : The directory path
Returns:
TaggedDocument[] : List of documents as object
"""
files = files_in(directory)
documents = []
for f in files:
documents.append(read_document(join(directory, f)))
return documents
def read_document(doc):
"""
Parses an XML document to returns its tagged document representation
Args:
doc (str): The path to the document
Returns:
TaggedDocument: Parsed XML as object
"""
tree = ET.parse(doc)
root = tree.getroot()
filename = doc.split('/')[-1].split('.')[0]
comment = TaggedText()
comment.set_filename(filename) # Name the document
text = root.find('Text').text
rating = float(root.find('Rating').text)
source = root.find('Source').text
# Setting the basic properties
comment.set_text(text)
comment.set_rating(rating)
comment.set_source_url(source)
# Iterating through all sentences in the text
sentences = root.find('Sentences')
for sentence in sentences.findall('SentenceInfo'):
tagged_sentence = TaggedSentence()
text = sentence.find('Text').text
start = int(sentence.find('StartPosition').text)
tagged_sentence.set_text(text)
tagged_sentence.set_start_position(start)
words = sentence.find('TaggedWords')
for word in words.findall('CROTaggedWord'):
tagged_word = TaggedWord()
word_text = word.find('Word').text
word_lemma = word.find('Lemma').text
molex_lemmas = [molex.text for molex in word.find('MolexLemmas').findall('string')]
POS = word.find('POSTag').text
stem = word.find('BasicStem').text
MSDs = [ msd.text for msd in word.find('MSDs').findall('string')]
position = int(word.find('Position').text)
index = int(word.find('SentenceIndex').text)
tagged_word.set_word(word_text)
tagged_word.set_lemma(word_lemma)
for molex in molex_lemmas: tagged_word.add_molex_lemma(molex)
tagged_word.set_stem(stem)
for msd in MSDs: tagged_word.add_MSD(msd)
tagged_word.set_position(position)
tagged_word.set_index(index)
tagged_word.set_POS_tag(POS)
tagged_sentence.add_word(tagged_word)
dependencies = sentence.find('DependencyRelations')
for dependency in dependencies.findall('DependencyRelation'):
governor = int(dependency.find('Governor').find('SentenceIndex').text)
dependent = int(dependency.find('Dependent').find('SentenceIndex').text)
relation = dependency.find('Relation').text
tagged_sentence.add_dependency(Dependency(governor, dependent, relation))
comment.add_sentence(tagged_sentence)
return comment
In [88]:
from math import log
import codecs
import json
def get_all_lemmas(word):
"""Locates all lemmas and all molex lemmas"""
return word.get_molex_lemmas() + [word.get_lemma()]
def get_all_roots(word):
"""Locates all lemmas, molex lemmas and stems"""
return word.get_molex_lemmas() + [word.get_lemma(), word.get_stem()]
def read_word_roots(document):
"""
Read stems from a file formatted as
>>>word\tstem
This format is used for stemmer output
"""
stems = []
with codecs.open(document, 'r', 'utf-8') as fp:
for line in fp:
stems.append(line.split()[1]) # Stems/lemmas are in the second tab-delimited column
return set(stems)
lemma_aspects = read_word_roots('CropinionDataset/dictionary_lemmatised/aspects.txt')
lemma_pos_clues = read_word_roots('CropinionDataset/dictionary_lemmatised/positive_clues.txt')
lemma_neg_clues = read_word_roots('CropinionDataset/dictionary_lemmatised/negative_clues.txt')
stem_aspects = read_word_roots('CropinionDataset/dictionary_stemmed/aspects.txt')
stem_pos_clues = read_word_roots('CropinionDataset/dictionary_stemmed/positive_clues.txt')
stem_neg_clues = read_word_roots('CropinionDataset/dictionary_stemmed/negative_clues.txt')
aspects = lemma_aspects | stem_aspects
pos_clues = lemma_pos_clues | stem_pos_clues
neg_clues = lemma_neg_clues | stem_neg_clues
all_clues = pos_clues | neg_clues
# We use only aspects and clues as words (at this point)
all_words = aspects | all_clues
#TaggedWord.get_lemma
def get_word_list(docs, processor=get_all_roots):
word_list = []
# Record occurrences
for doc in docs:
doc_name = doc.get_filename()
for sentence in doc.get_sentences():
for word in sentence.get_words():
root = processor(word)
if isinstance(root, list):
word_list += root
else:
word_list.append(root)
return set(word_list)
def get_idf(docs, processor=get_all_roots, word_list=set()):
N = len(docs) # Document count
idfs = {}
# Record occurrences
for doc in docs:
doc_name = doc.get_filename()
for sentence in doc.get_sentences():
for word in sentence.get_words():
root = processor(word)
if isinstance(root, list):
if not any([w in word_list for w in root]):
continue
elif root not in word_list:
continue
if not isinstance(root, list):
if root in idfs:
idfs[root][doc_name] = 1
else:
idfs[root] = { doc_name : 1 }
else:
for r in root:
if r in idfs:
idfs[r][doc_name] = 1
else:
idfs[r] = { doc_name : 1 }
# Actual idfs:
for root in idfs:
root_doc_count = len(idfs[root]) + 1.0 # Smoothing
idfs[root] = log(N / root_doc_count)
return idfs
def get_tf(docs, processor=get_all_roots, word_list=set()):
tfs = {}
# Record occurrences
for doc in docs:
doc_name = doc.get_filename()
tfs[doc_name] = {}
for sentence in doc.get_sentences():
for word in sentence.get_words():
root = processor(word)
if isinstance(root, list):
if not any([w in word_list for w in root]):
continue
elif root not in word_list:
continue
if not isinstance(root, list):
if root in tfs:
tfs[doc_name][root] += 1
else:
tfs[doc_name][root] = 1
else:
for r in root:
if r in tfs:
tfs[doc_name][r] += 1
else:
tfs[doc_name][r] = 1
for doc_name in tfs:
# Skip documents without any recognised lemmas
if not tfs[doc_name].keys():
continue
max_freq = max(tfs[doc_name].values())
for root in tfs[doc_name]:
tfs[doc_name][root] = 0.5 + 0.5 * tfs[doc_name][root] / max_freq
return tfs
def tf_idf_vector(document_name, tfs, idfs, word_list, only_words=None):
'''
Given a document with path :document_name, loads it and calculates its tf-idf
vector. TFS are drawn from the :tfs dictionary, while the IDFS are drawn from
the :idfs dictionary. Words are drawn from the :word_list iterable. If the
:only_words variable is set to an iterable, all words NOT in the word list
are ignored.
'''
tf_idf = []
for word in word_list:
if only_words is not None and word not in only_words:
tf_idf.append(0.0)
elif document_name in tfs and word in tfs[document_name] and word in idfs:
tf_idf.append(tfs[document_name][word] * idfs[word])
else:
tf_idf.append(0.0)
return np.array(tf_idf)
train_docs = read_documents_in('CropinionDataset/reviews_new/train2/')
test_docs = read_documents_in('CropinionDataset/reviews_new/test2/')
# We must use train idfs in both cases to prevent information leakage
word_list = sorted(list(get_word_list(train_docs))) # all_words
all_idfs = get_idf(train_docs, word_list=word_list)
test_tfs = get_tf(train_docs, word_list=word_list) # all_words
train_tfs = get_tf(test_docs, word_list=word_list)
all_tfs = dict(test_tfs, **train_tfs)
In [14]:
from os import remove
from os.path import exists, join
def aspects_in(document, aspects=aspects, processor=get_all_roots):
found = set()
for sentence in document.get_sentences():
for word in sentence.get_words():
root = processor(word)
if isinstance(root, list):
if any([w in aspects for w in root]):
found |= set(root) # Add all roots
else:
if root in aspects:
found.add(root)
return found
def clues_in(document, clues=all_clues, processor=get_all_roots):
found = set()
for sentence in document.get_sentences():
for word in sentence.get_words():
root = processor(word)
if isinstance(root, list):
if any([w in clues for w in root]):
found |= set(root) # Add all roots
else:
if root in clues:
found.add(root)
return found
def find_pairs(sentence, aspects, clues, processor=get_all_roots):
"""
Finds all potential clue-aspect pairs in the sentence. Pairs are identified by comparing word stems
in the sentence to stems given in the aspect and clue dictionaries (sets).
Args:
sentence (TaggedSentence) : Object representation of a sentence
aspects (set(str)) : Set of aspect stems
clues (set(str)) : Set of clue stems
Returns:
([(TaggedWord, TaggedWord)]) : List of TaggedWord candidate pairs
"""
sent_aspects = []
sent_clues = []
for word in sentence.get_words():
root_word = processor(word)
if isinstance(root_word, list):
if any([w in aspects for w in root_word]):
sent_aspects.append(word)
if any([w in clues for w in root_word]):
sent_clues.append(word)
else:
if root_word in aspects:
sent_aspects.append(word)
if root_word in clues:
sent_clues.append(word)
return [(aspect, clue) for aspect in sent_aspects for clue in sent_clues]
def get_annotated_pairs_in_document(document):
tree = ET.parse(document)
root = tree.getroot()
pairs = []
for sentence in root.iter('sentence'):
sent_pairs = []
for pair in sentence.iter('pair'):
if pair.get('link') in "+-":
sent_pairs.append((pair.get('aspect'), pair.get('anchor')))
pairs.append(sent_pairs)
return pairs
def get_unlocated_sentence_pair_count(found_pairs, real_pairs):
unlocated = 0
for (aspect, clue) in real_pairs:
found = False
for (w_a, w_c) in found_pairs:
w_a, w_c = w_a.get_word(), w_c.get_word()
if (w_a == aspect and w_c == clue) or (w_c == aspect and w_a == clue) :
found = True
break
if not found:
unlocated += 1
return unlocated
def get_unlocated_document_pair_count(document, ann_document, aspects, clues):
try:
ann_pairs = get_annotated_pairs_in_document(ann_document)
except:
#print ann_document, 'does not exist'
return 0 #No file, missed nothing
parsed_doc = read_document(document)
count = 0
N = len(ann_pairs)
for i in xrange(N):
my_pairs = find_pairs(parsed_doc.get_sentences()[i], aspects, clues)
ap_pairs = ann_pairs[i]
count += get_unlocated_sentence_pair_count(my_pairs, ap_pairs)
return count
def get_dataset_unlocated_pair_count(dataset_dir, annotated_pairs_dir, aspects, clues):
count = 0
files = files_in(dataset_dir)
for doc in files:
my_doc = join(dataset_dir, doc)
ap_doc = join(annotated_pairs_dir, doc)
count += get_unlocated_document_pair_count(my_doc, ap_doc, aspects, clues)
return count
In [174]:
import os.path
from os import remove
from os.path import exists
def doc_load_annotated_pairs(document, src_dir):
"""
Given a TaggedDocument representation of an xml document, loads the manually annotated pairs and their
sentiments from a provided source directory. The pair are stored on the document - more precisely, on
each TaggedSentence within the document.
"""
filename = join(src_dir, document.get_filename() + '.xml')
if not os.path.isfile(filename):
return False # No pairs
doc_sentences = document.get_sentences()
ap_tree = ET.parse(filename)
ap_root = ap_tree.getroot()
ap_sentences = list(ap_root.iter('sentence'))
sent_count = len(ap_sentences)
for i in xrange(sent_count):
ap_sent = ap_sentences[i]
doc_sent = doc_sentences[i]
# Find all pairs and append them to the doc
for pair in ap_sent.iter('pair'):
doc_sent.add_annotated_pair(pair.get('aspect'), pair.get('anchor'), pair.get('link'), pair.get('sent'))
return True
def doc_annotate_located_pairs(document, aspects, clues):
marks = []
for sentence in document.get_sentences():
pairs = find_pairs(sentence, aspects, clues)
for pair in pairs:
marks.append(sentence.get_annotated_pair(pair[0], pair[1]))
return np.array(marks)
def annotate_set_pairs(src_dir, dest_file, annotations_dir, aspects, clues):
docs = files_in(src_dir)
fp = codecs.open(dest_file, 'a', 'utf-8')
status = False
for doc in docs:
document = read_document(join(src_dir, doc))
doc_load_annotated_pairs(document, annotations_dir)
np.savetxt(fp, doc_annotate_located_pairs(document, aspects, clues))
fp.close()
def annotate_regression(src_dir, dest_file):
docs = files_in(src_dir)
fp = codecs.open(dest_file, 'a', 'utf-8')
status = False
scores = []
for doc in docs:
document = read_document(join(src_dir, doc))
doc_score = document.get_rating()
positive = 1 if doc_score >= 4 else 0
score = np.array([doc_score, positive])
scores.append(score)
np.savetxt(fp, np.array(scores))
fp.close()
def annotate_sets():
#if(exists('features/train/y.txt')): remove('features/train/y.txt')
#if(exists('features/test/y.txt')): remove('features/test/y.txt')
if(exists('features/train/z.txt')): remove('features/train/z.txt')
if(exists('features/test/z.txt')): remove('features/test/z.txt')
#annotate_set_pairs('CropinionDataset/reviews_new/train2', 'features/train/y.txt', 'CropinionDataset/annotated_pairs/all', aspects, all_clues)
#annotate_set_pairs('CropinionDataset/reviews_new/test2', 'features/test/y.txt', 'CropinionDataset/annotated_pairs/all', aspects, all_clues)
annotate_regression('CropinionDataset/reviews_new/Train', 'features/train/z.txt')
annotate_regression('CropinionDataset/reviews_new/Test', 'features/test/z.txt')
Here goes nothing!
The features used in the initial classification (finding pairs) runs are:
The features (to be) used in the initial regression run:
In [194]:
def extract_set_features(src_dir, dest_file, aspects, pos_clues, neg_clues):
docs = files_in(src_dir)
fp = codecs.open(dest_file, 'a', 'utf-8')
for doc in docs:
features = extract_document_features(join(src_dir, doc), aspects, pos_clues, neg_clues)
np.savetxt(fp, features)
fp.close()
def extract_document_features(document_path, aspects, pos_clues, neg_clues):
document = read_document(document_path)
clues = pos_clues | neg_clues
features = []
for sentence in document.get_sentences():
pairs = find_pairs(sentence, aspects, clues)
for pair in pairs:
features.append(compute_feature_vector(pair, sentence, pos_clues, neg_clues))
return np.array(features)
def compute_feature_vector(pair, sentence, pos_clues, neg_clues):
features = []
features.append(pair_distance_index(pair))
features.append(pair_init_distances(pair))
features.append(np.array([len(sentence.get_text())]))
features.append(sentence_length(sentence))
features.append(govern_relations_exist(pair, sentence))
features.append(match(pair, extract_plurality))
features.append(match(pair, extract_genders))
features.append(POS_vector(pair[0]))
features.append(POS_vector(pair[1]))
features.append(clues_counts(sentence, pos_clues, neg_clues))
features.append(negation_present(pair[0], sentence))
features.append(negation_present(pair[1], sentence))
# BOW between the pair candidates
#features.append(get_tfidf_vec_between(pair, sentence))
return np.hstack(features)
def words_between(pair, sentence):
# Find which words to take
i, j = pair[0].get_index(), pair[1].get_index()
if i > j:
i, j = j, i
word_bag = []
# All words inbetween, if any
for index in xrange(i + 1, j):
word_bag.append(sentence.get_word(index))
return word_bag
def get_tf_between(pair, sentence):
word_bag = words_between(pair, sentence)
dummy_sent = TaggedSentence()
for word in word_bag:
dummy_sent.add_word(word)
dummy_doc = TaggedText()
dummy_doc.set_filename('dummy')
dummy_doc.add_sentence(dummy_sent)
return get_tf([dummy_doc], word_list=word_list)
def get_tfidf_vec_between(pair, sentence, idfs=all_idfs, words=all_words):
dummy_tfs = get_tf_between(pair, sentence)
return tf_idf_vector('dummy', dummy_tfs, all_idfs, words)
def clues_counts(sentence, pos_clues, neg_clues, processor = get_all_lemmas):
counts = [0, 0]
for word in sentence.get_words():
root = processor(word)
if isinstance(root, list):
if any([r in pos_clues for r in root]):
counts[0] += 1
if any([r in neg_clues for r in root]):
counts[1] += 1
else:
if root in pos_clues:
counts[0] += 1
if root in neg_clues:
counts[1] += 1
return np.array(counts)
def pair_distance_index(pair):
return np.array([abs(pair[0].get_index() - pair[1].get_index())])
def pair_init_distances(pair):
return np.array([abs(pair[0].get_position() - pair[1].get_position())])
def sentence_length(sentence):
count = len(sentence.get_words())
return np.array([count])
def govern_relations_exist(pair, sentence):
rels = [0, 0] # Aspect governs clue, clue governs aspect
index_pair = pair[0].get_index(), pair[1].get_index()
for dependency in sentence.get_dependencies():
(a, b) = dependency.get_governor_index(), dependency.get_dependent_index()
if (a,b) == index_pair:
rels[0] = 1
elif (b, a) == index_pair:
rels[1] = 1
return np.array(rels)
def match(pair, f):
options_a = f(pair[0])
options_b = f(pair[1])
for a_g in options_a:
for b_g in options_b:
if a_g == b_g:
return np.array([1.0])
return np.array([0.0])
def POS_vector(word):
tags = "ACIMNPQRSVYZ" # POS tags in set
one_hot = np.zeros(12)
#print tags, word.get_POS_tag()
index = tags.index(word.get_POS_tag())
if index != -1:
one_hot[index] = 1
return one_hot
def extract_plurality(word):
pluralities = set()
msds = word.get_MSDs()
for msd in msds:
if msd[0] == 'A':
pluralities.add(msd[4])
elif msd[0] == 'N':
pluralities.add(msd[3])
elif msd[0] == 'V':
pluralities.add(msds[-1])
return list(pluralities)
def extract_genders(word):
genders = set()
msds = word.get_MSDs()
for msd in msds:
if msd[0] == 'A':
genders.add(msd[3])
elif msd[0] == 'N':
genders.add(msd[2])
return list(genders)
def negation_present(word, sentence, k = 3):
negations = [u'ne', u'nije', u'nimalo', u'nipošto', u'nisam', u'nisu', u'nismo', u'nemojte', u'nikako', u'neće']
word_index = word.get_index()
negation_present = 0
words = sentence.get_words()
N = len(words)
start = max(0, word_index - k)
end = min(N - 1, word_index + k)
while start <= end:
if start == word_index:
start += 1
continue
if words[start].get_word().lower() in negations:
negation_present = 1
break # Just looking for presence, not count
start += 1
return np.array([negation_present])
def get_BOW(document):
return tf_idf_vector(document.get_filename(), all_tfs, all_idfs, all_words)
def get_token_count(document):
return np.array([sum([len(s.get_words()) for s in document.get_sentences()])])
def get_uppercase_percentage(document):
"""Computes the percentage of uppercase letters in document"""
text = document.get_text()
upper_length = float(len([c for c in text if c.isupper()]))
length = float(len(text))
return np.array([upper_length / length])
def count_substrings(document, substrings):
count = 0
text = document.get_text().lower()
for substring in substrings:
count += text.count(substring.lower())
return np.array([count])
def count_good_smileys(document):
"""Naive good emoticon count"""
return count_substrings(document, [':)', ':D', ': )', ':-)', ':-D'])
def count_bad_smileys(document):
"""Naive bad emoticon count"""
return count_substrings(document, [':(', ':-(', ': ('])
def count_exclamations(document):
return count_substrings(document, ['!'])
def count_questionmarks(document):
return count_substrings(document, ['?'])
def aspect_bow(document):
my_aspects = aspects_in(document)
return tf_idf_vector(document.get_filename(), all_tfs, all_idfs, aspects, my_aspects)
def clue_bow(document):
clues = clues_in(document)
return tf_idf_vector(document.get_filename(), all_tfs, all_idfs, all_clues, clues)
def count_aspects(document):
return len(aspects_in(document, processor=TaggedWord.get_lemma))
def count_pos_clues(document):
return len(clues_in(document, processor=TaggedWord.get_lemma, clues=pos_clues))
def count_neg_clues(document):
return len(clues_in(document, processor=TaggedWord.get_lemma, clues=neg_clues))
def count_plus(document):
return count_substrings(document, ['+', 'plus'])
def count_minus(document):
return count_substrings(document, ['-', 'minus'])
def extract_regression_features(src_dir, dest_file):
docs = files_in(src_dir)
fp = codecs.open(dest_file, 'a', 'utf-8')
all_features = []
for doc in docs:
document = read_document(join(src_dir, doc))
features = []
features.append(get_BOW(document))
features.append(np.array([document.get_length()]))
features.append(get_token_count(document))
# Does nothing
#features.append(get_uppercase_percentage(document)) Useless
# Symbol features (not very effective)
#features.append(count_good_smileys(document))
#features.append(count_bad_smileys(document))
#features.append(count_plus(document))
#features.append(count_minus(document))
# Do nothing
#features.append(count_exclamations(document))
#features.append(count_questionmarks(document))
features.append(count_aspects(document))
#features.append(count_pos_clues(document))
#features.append(count_neg_clues(document))
features.append(aspect_bow(document))
#features.append(clue_bow(document))
features = np.hstack(features)
all_features.append(features)
np.savetxt(fp, np.array(all_features))
fp.close()
In [195]:
from os import remove
from os.path import exists
def store_feature_vectors():
reg_name = 'regression+A.txt'
"""
Calculate the feature vectors for the entire train and test sets and store the resulting numpy vectors
into files for easy access. Should be run every time the way features are calculated has changed.
"""
#Remove old ones
#if(exists('features/train/classification.txt')): remove('features/train/classification.txt')
#if(exists('features/test/classification.txt')): remove('features/test/classification.txt')
if(exists('features/train/'+reg_name)): remove('features/train/'+reg_name)
if(exists('features/test/'+reg_name)): remove('features/test/'+reg_name)
#extract_set_features('CropinionDataset/reviews_new/train2', 'features/train/classification.txt', aspects, pos_clues, neg_clues)
#extract_set_features('CropinionDataset/reviews_new/test2', 'features/test/classification.txt', aspects, pos_clues, neg_clues)
extract_regression_features('CropinionDataset/reviews_new/Train', 'features/train/'+reg_name)
extract_regression_features('CropinionDataset/reviews_new/Test', 'features/test/'+reg_name)
# Prepare both features and true annotations
store_feature_vectors()
#annotate_sets()
A set of grid searches is performed, using k-fold validation (where k is 10) to locate the best parameters for precision, recall and F1-score maximisation. The code has been moved to grid.py.
Best parameters when using stems are:
While the best for lemmas are:
Best for lemmas and molex lemas combined:
Best for all possible roots (lemmas, stems and molex lemmas) combined (the one we're using):
Also with BoW for words in between:
This cell performs testing using the developed features and parameters gleaned by cross-validation. The micro and macro scores are computer (P, R and F1) and displayed. All data is scaled.
These are the initial results. Improvements using additional features, as well as analysis which features are most important, will be attempted.
The recorded results are:
P = 0.88
R = 0.61
F1 = 0.72
The recall is obviously terrible (stems failed to locate a lot of the pairs). The same experiment must be tried with lemmas, which may improve recall performance greatly.
P = 0.99
R = 0.33
F1 = 0.50
What the $&@#
P = 0.99
R = 0.47
F1 = 0.64
I have no idea what happened ~ Luka
P = 0.90
R = 0.75
F1 = 0.82
P = 0.90
R = 0.73
F1 = 0.81
The results achieved with a combination of all three are almost as good as those in the initial paper. Now we only have to improve them by introducing additional features. However, adding BoW information reduced the recall and F1. We will not use it in the report.
In [103]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import scale
X_train = np.loadtxt('features/train/classification.txt')
y_train = np.loadtxt('features/train/y.txt')[:,0]
X_test = np.loadtxt('features/test/classification.txt')
y_test = np.loadtxt('features/test/y.txt')[:,0]
X_train = scale(X_train)
X_test = scale(X_test)
N = X_train.shape[0]
model = SVC(C=750, gamma=0.001, kernel='rbf')
model.fit(X_train, y_train)
y_guess = model.predict(X_test)
# ------------- PADDING -------------- #
# Some values were probably not located. We must add them to the count.
# We do so by padding our vector with 0s and the true vector with 1s
cnt = get_dataset_unlocated_pair_count(
'CropinionDataset/reviews_new/test2',
'CropinionDataset/annotated_pairs/all',
aspects,
all_clues)
print "Missed all of", cnt, "pairs completely while I at at least tagged", y_guess.shape[0]
y_guess = np.lib.pad(y_guess, (cnt,), 'constant', constant_values=(0,))
y_test = np.lib.pad(y_test, (cnt,), 'constant', constant_values=(1,))
print "P = {0:.3f}".format(precision_score(y_test, y_guess))
print "R = {0:.3f}".format(recall_score(y_test, y_guess))
print "F1 = {0:.3f}".format(f1_score(y_test, y_guess))
Predicting whether the scores are positive or negative, where scores of 4 or more are positive and scores of 2.5 or less are negative. Scores inbetween are ambigious and not considered here.
A grid search (c_grid.py) is used to find optimal parameters (with 5-fold validation).
The best found parameters when using only BoW + document length are:
The scores are:
F1 = 0.84
F1 = 0.11
F1 = 0.48
F1 = 0.85
F1 = 0.36
F1 = 0.61
rbf, 5000, 0.1
F1 = 0.84, 0.11, 0.48
rbf, 100, 0.0001
F1 = 0.90, 0.65, 0.77
rbf, 750, 0.00001
F1 = 0.91, 0.74, 0.82
In [240]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import scale
X_train = np.loadtxt('features/train/regression+E.txt')
y_train = np.loadtxt('features/train/z.txt')[:,1]
X_test = np.loadtxt('features/test/regression+E.txt')
y_test = np.loadtxt('features/test/z.txt')[:,1]
X_train = scale(X_train)
X_test = scale(X_test)
model = SVC(C=10000, gamma=0.1, kernel='rbf')
model.fit(X_train, y_train)
y_guess = model.predict(X_test)
p_p = precision_score(y_test, y_guess, pos_label=1)
r_p = recall_score(y_test, y_guess, pos_label=1)
f1_p = f1_score(y_test, y_guess, pos_label=1)
p_n = precision_score(y_test, y_guess, pos_label=0)
r_n = recall_score(y_test, y_guess, pos_label=0)
f1_n = f1_score(y_test, y_guess, pos_label=0)
print "POSITIVE\n------------------------"
print "P = {0:.3f}".format(p_p),
print "R = {0:.3f}".format(r_p),
print "F1 = {0:.3f}".format(f1_p)
print "NEGATIVE\n------------------------"
print "P = {0:.3f}".format(p_n),
print "R = {0:.3f}".format(r_n),
print "F1 = {0:.3f}".format(f1_n)
print "AVERAGE\n------------------------"
print "P = {0:.3f}".format((p_p + p_n) / 2),
print "R = {0:.3f}".format((r_p + r_n) / 2),
print "F1 = {0:.3f}".format((f1_p + f1_n) / 2)
Trying to learn the mapping from review to score.
We are using Support Vector Regression (SVR) with parameteres tuned via grid search.
Best found parameters using only BoW and length are:
The corresponding scores are:
MAE = 1.513
r = 0.24 (Pearson)
Best params:
Scores:
MAE = 1.316
r = 0.37
Best params:
Scores:
MAE = 1.381
r = 0.25
Best params:
Scores:
MAE = 1.00
r = 0.68
Best params:
Scores:
MAE = 0.98
r = 0.70
In [246]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr
from sklearn.preprocessing import scale
def pearson(y_true, y_pred):
ret_score = pearsonr(y_true, y_pred)[0]
return ret_score if not np.isnan(ret_score) else 0.0
X_train = np.loadtxt('features/train/regression+A+C+E.txt')
y_train = np.loadtxt('features/train/z.txt')[:,0]
X_test = np.loadtxt('features/test/regression+A+C+E.txt')
y_test = np.loadtxt('features/test/z.txt')[:,0]
X_train = scale(X_train)
X_test = scale(X_test)
N = X_train.shape[0]
model = SVR(C=10, kernel='rbf', gamma=0.05)
model.fit(X_train, y_train)
y_guess = model.predict(X_test)
print 'MAE = {0:.3f}'.format(mean_absolute_error(y_test, y_guess))
print 'r = {0:.3f}'.format(pearson(y_test, y_guess))
In [25]:
def store_all_words(directory, word_file):
wf = codecs.open(word_file, 'w', 'utf-8')
files = files_in(directory)
i = 1
N = len(files)
for document in files:
if i % 50 == 0:
print '{}/{}'.format(i, N)
i += 1
tree = ET.parse(join(directory, document))
root = tree.getroot()
words = root.iter('Word')
for word in words:
wf.write(word.text + '\n')
wf.close()
def write_new_stems(lemma_file, src_dir, dest_dir):
docs = files_in(src_dir)
with codecs.open(lemma_file, 'r', 'utf-8') as lemmas_list:
i = 1
N = len(docs)
for doc in docs:
if i % 50 == 0:
print '{}/{}'.format(i, N)
i += 1
tree = ET.parse(join(src_dir, doc))
root = tree.getroot()
for word in root.iter('BasicStem'): #'Lemma'
word.text = lemmas_list.readline().split()[1].strip()
tree.write(join(dest_dir, doc), encoding='utf-8')
def print_comment(comment):
text = []
for sentence in comment.get_sentences():
for word in sentence.get_words():
text.append(word.get_word())
print '\n'.join(text)
def print_dependencies(comment):
dependencies = []
for sentence in comment.get_sentences():
subdep = []
for dependency in sentence.get_dependencies():
subdep.append(str(dependency))
dependencies.append(','.join(subdep))
print '\n'.join(dependencies)
#store_all_words('CropinionDataset/reviews_new/train', 'train-words.txt')
#write_new_stems('test-lemmas.txt', 'CropinionDataset/reviews_new/test', 'CropinionDataset/reviews_new/test2')
In [21]: