Construct Data Set for LTR

Phrases from Summaries

Phrases from Stroies


In [5]:
class Phrase:
    """Information of a phrase"""
    def __init__(self, word, word_before, word_after, chapter_id, sentence_id, negation):
        self.negation = negation
        self.word = word
        self.word_before = word_before
        self.word_after = word_after
        self.chapter_id = chapter_id
        self.sentence_id = sentence_id
        self.count = 0
        self.weight = 0
    def add_info(self):
        self.count += 1
    def output(self):
        return str(self.weight) + "\t" + str(self.chapter_id) + "\t" + str(self.sentence_id) + "\t" + self.word \
    + "\t" + self.word_before + "\t" + self.word_after + "\t" + str(self.count)
class PhraseSet:
    """Set to manage phrases"""
    def __init__(self, story_id, character_id):
        self.phrases = {}
        self.story_id = story_id
        self.character_id = character_id
    def add(self, word, chapter_id, sentence_id, negation, word_before, word_after):
        if not word in self.phrases:
            self.phrases[word] = Phrase(word, word_before, word_after, chapter_id, sentence_id, negation)
        self.phrases[word].add_info()
    def clear(self):
        self.phrases = {}
    def sort(self):
        return sorted(self.phrases.items(), lambda x, y: cmp(x[1].weight, y[1].weight), reverse=True)

In [9]:
import Syntax as sx
BOOK_ID = 0
CHAPTER_ID = 1
SENTENCE_ID = 2
CHARACTER_ID = 15
def sim(phrase1, phrase2):
    return 0
def cal_similarity(summarySet, storySet):
    for phrase1 in storySet.phrases.values():
        max_sim = 0
        for phrase2 in summarySet.phrases.values():
            similarity = sim(phrase1, phrase2)
            if max_sim < similarity:
                max_sim = similarity
        phrase1.weight = max_sim
def process(summary, story, story_id):
    #phrases and characters in summary
    characters = {}
    pos = 0
    for sentence in summary:
        for token in sentence:
            cid = int(token[CHARACTER_ID])
            if cid >= 0:
                if not cid in characters:
                    characters[cid] = [[], [], PhraseSet(story_id, cid), PhraseSet(story_id, cid)]
                characters[cid][0].append(pos)
        pos += 1
    for cid in characters.keys():
        for sid in characters[cid][0]:
            sentence = summary[sid]
            syn = sx.SyntaxTree()
            syn.creat(sentence)
            labels = syn.extract_label_with_info(cid)
            for label in labels:
                characters[cid][2].add(label[1], syn.chapterID, syn.sentenceID, label[0], label[2], label[3])
    for sentence in story:
        for token in sentence:
            cid = int(token[CHARACTER_ID])
            if cid in characters:
                syn = sx.SyntaxTree()
                syn.creat(sentence)
                labels = syn.extract_label_with_info(cid)
                for label in labels:
                    characters[cid][3].add(label[1], syn.chapterID, syn.sentenceID, label[0], label[2], label[3])
    for cid in characters:
        cal_similarity(characters[cid][2], characters[cid][3])
        sorted_phrases = characters[cid][3].sort()
        for phrase in characters[cid][2].phrases:
            print str(characters[cid][2].story_id) + "\t" + str(characters[cid][2].character_id) \
            + "\t" + phrase.output()
        for phrase in sorted_phrases:
            print str(characters[cid][3].story_id) + "\t" + str(characters[cid][3].character_id) \
            + "\t" + phrase[1].output()
    return 0

token_file = open("../../2.part.tokens.sample", 'rb')
story_id = -1
chapter_id = -1
sentence_id = -1
summary = []
story = []
sentence = []
for line in token_file:
    terms = line.rstrip().split('\t')
    if not int(terms[BOOK_ID]) == story_id:
        #process
        process(summary, story, story_id)
        #new story
        story_id = int(terms[BOOK_ID])
        chapter_id = int(terms[CHAPTER_ID])
        sentence_id = int(terms[SENTENCE_ID])
        summary = []
        story = []
        sentence.append(terms)
    if int(terms[CHAPTER_ID]) == chapter_id and int(terms[SENTENCE_ID]) == sentence_id:
        sentence.append(terms)
    else:
        if len(sentence):
            if chapter_id == 0:
                summary.append(sentence)
            else:
                story.append(sentence)
        chapter_id = int(terms[CHAPTER_ID])
        sentence_id = int(terms[SENTENCE_ID])
        sentence = []
        sentence.append(terms)
token_file.close()


1850	7	0	45	2736	busy too	be	,	6
1850	7	0	22	1459	so trustful	be	and	1
1850	7	0	48	2882	mine	be	.	1
1850	7	0	47	2778	only one	the	who	2
1850	7	0	47	2800	little confident too	a	,	2
1850	7	0	37	2304	fine	be	.	1
2300	2	0	13	1757	out cold	be	.	1
2300	2	0	11	1271	right	be	;	1
2300	2	0	10	922	day-dreaming	be	,	5
2300	2	0	11	1282	unfazed	remain	.	2
2300	2	0	12	1380	REALLY powerful	be	but	1
2300	2	0	14	1908	most experienced	the	,	2
2300	2	0	1	11	small	be	compare	3
2300	2	0	14	1907	true leader	the	.	1
2300	2	0	11	1162	more	lot	than	1
2300	13	0	10	924	old fart	a	.	3
2540	0	0	9	683	ex-boyfriend	my	.	1
2540	0	0	8	569	first love	my	.	1
2540	0	0	11	885	so muscular	be	.	2

In [ ]: