In [1]:
from episode_miner import EventText, EventSequences, Episode, Episodes
from estnltk.taggers import EventTagger
from IPython.display import HTML, FileLink
Find frequent episodes from the texts
In [2]:
# Vocabulary of the event terms handed to the tagger.
event_vocabulary = [{'term': term} for term in ('üks', 'kaks')]
event_tagger = EventTagger(
    event_vocabulary,
    case_sensitive=False,
    return_layer=True,
)

# Two sample texts, both tagged with the same event tagger.
event_text1 = EventText(
    'Üks kaks kolm neli kolm. Kaks üks kaks kolm neli kolm üks kaks.',
    event_tagger=event_tagger,
)
event_text2 = EventText(
    'Kaks üks kaks kolm neli kolm üks kaks.',
    event_tagger=event_tagger,
)
texts = [event_text1, event_text2]

# Build the event sequences from the tagged texts.
event_sequences = EventSequences(
    event_texts=texts,
    classificator='term',
    time_scale='start',
)

# Render the sequences as HTML; the last expression is the cell's output.
html = event_sequences.pretty_print()
HTML(html)
Out[2]:
In [3]:
# Mine the serial episodes with the given window width and minimum frequency.
frequent_episodes = event_sequences.find_serial_episodes(
    window_width=31,
    min_frequency=0.3,
    only_full_windows=False,
    allow_intermediate_events=True,
)

# Show every episode next to its absolute and relative support.
[
    (episode, absolute, relative)
    for episode, absolute, relative in zip(
        frequent_episodes,
        frequent_episodes.abs_support(),
        frequent_episodes.rel_support(),
    )
]
Out[3]:
It turns out that the episode `('kaks', 'üks', 'kaks')` appears in 58 Winepi windows. Since the lengths of the first and the second text are 63 and 38 characters, respectively, the total number of Winepi windows is 63 + 31 - 1 + 38 + 31 - 1 = 161. Therefore the relative frequency of this episode is 58 / 161 ≈ 36%.
Write the results to file.
In [4]:
# Write the frequent episodes to a file and display a link to that file.
episodes_path = 'data/episodes.txt'
frequent_episodes.to_json(file=episodes_path)
FileLink(episodes_path)
Out[4]:
If `file` is `None` (the default), then `to_json` returns the corresponding string.
Next, find examples of the frequent episodes in the texts.
In [5]:
# Look up examples for every frequent episode, using the same window width
# (31) as the episode search; 'ALL' requests all examples.
event_sequences.find_episode_examples(
    frequent_episodes,
    window_width=31,
    allow_intermediate_events=True,
    number_of_examples='ALL',
)
Write the examples to file.
In [6]:
# Write the episode examples to a file and display a link to that file.
examples_path = 'data/episode_examples.txt'
frequent_episodes.examples_to_json(file=examples_path)
FileLink(examples_path)
Out[6]:
The lines of the file of examples correspond to the lines of the file of episodes.
If `file` is `None` (the default), then `examples_to_json` returns the corresponding string.
Choose the frequent episode ('kaks', 'üks', 'kaks')
and pretty print the examples:
In [7]:
# Per the accompanying text, position 5 holds the episode ('kaks', 'üks', 'kaks').
# NOTE(review): the index depends on the order of the mining result — confirm after re-running.
chosen_episode = frequent_episodes[5]
HTML(chosen_episode.examples_pretty_print())
Out[7]:
In [8]:
# Build two hand-picked episodes and compute their support directly,
# without running the frequent-episode search.
episode1 = Episode(('üks', 'kaks', 'üks'))
episode2 = Episode(('kaks', 'kaks', 'üks'))
episodes = Episodes([episode1, episode2])

event_sequences.support(
    episodes=episodes,
    window_width=31,
    only_full_windows=False,
    allow_intermediate_events=True,
)

# Display the absolute and relative support as a pair.
(episodes.abs_support(), episodes.rel_support())
Out[8]: