In [1]:
%matplotlib inline
import sys
import pathlib
sys.path.insert(0, str(pathlib.Path('..').resolve()))
import itertools
import matplotlib.pyplot as pyplot
import random
import collections
random.seed(0)
from mining.retrieve_index import get_index
from mining.retrieve_10k import get_10k_items
from util.paragraphs import to_paragraphs, group_paragraphs
from util.stem import stem_list
from mining.parsing import ParseError
In [2]:
def get_range(year_range, form_type):
    """Yield index records for every quarter of every year in ``year_range``.

    Args:
        year_range: iterable of years; each is passed to ``get_index``.
        form_type: form type forwarded unchanged to ``get_index``.

    Yields:
        Each item produced by ``get_index(year, qtr, form_type)``, ordered
        by year and then by quarter (1 through 4).
    """
    for year in year_range:
        # The upper bound of range() is exclusive: range(1, 5) -> 1, 2, 3, 4.
        # The previous range(1, 4) silently dropped the fourth quarter.
        for qtr in range(1, 5):
            yield from get_index(year, qtr, form_type)
In [7]:
# Years handed to get_range; range() excludes its end, so this is 2006-2016.
year_range = range(2006, 2017)
# Form type forwarded to get_range when building the filing index.
form_type = '10-k'
# Key looked up in each parsed form below.
# NOTE(review): presumably the "Item 1A" section of the 10-K -- confirm.
item = '1a'
In [8]:
def sample_iterator(it, p, max=None):
    """Lazily pass through elements of ``it``, keeping each with probability ``p``.

    Args:
        it: iterable to sample from.
        p: keep-probability; an element is kept when ``random.random() < p``.
        max: optional cap on the number of elements yielded; ``None`` means
            no cap.

    Yields:
        The kept elements, in their original order.
    """
    if max is None:
        # One random draw per input element, in input order.
        yield from (candidate for candidate in it if random.random() < p)
        return
    # Capped case: sample without a cap, then stop after ``max`` results.
    uncapped = sample_iterator(it, p)
    yield from itertools.islice(uncapped, max)
In [9]:
def n_gram(it, n):
    """Yield every run of ``n`` consecutive items of ``it`` as a tuple.

    The window slides one item at a time, so consecutive tuples overlap:
    ``n_gram('abcd', 2)`` gives ``('a', 'b'), ('b', 'c'), ('c', 'd')``.
    The previous implementation consumed ``n`` fresh items per tuple and
    therefore produced disjoint chunks (plus a short trailing chunk),
    which undercounts n-grams.

    Args:
        it: any iterable; it is consumed lazily, one item per step.
        n: window length. ``0`` yields nothing.

    Yields:
        Tuples of exactly ``n`` items. Inputs shorter than ``n`` yield
        nothing.

    Raises:
        ValueError: if ``n`` is negative.
    """
    if n == 0:
        return
    # A bounded deque drops the oldest item automatically once it is full;
    # it also raises ValueError for a negative maxlen.
    window = collections.deque(maxlen=n)
    for token in it:
        window.append(token)
        if len(window) == n:
            yield tuple(window)
In [10]:
# Build a list of (unigram Counter, bigram Counter) pairs, one pair per
# non-empty paragraph group found under `item` in each sampled filing.
corpus = []
index = get_range(year_range, form_type)
# Keep each index record with probability 0.01 and stop after one record.
for record in sample_iterator(index, 0.01, 1):
    try:
        form = get_10k_items(record['Filename'], enable_cache=False)
    except ParseError:
        # Show which record failed to parse, then re-raise with the
        # original traceback intact.
        print(record)
        raise
    if item in form:
        groups = group_paragraphs(to_paragraphs(form[item]))
        # Fixed: the loop previously iterated the undefined name `group`
        # and bound `sentencecs`, while the body reads `sentences`.
        for _header, sentences in groups:
            if sentences:
                body = stem_list(' '.join(sentences))
                unigram = collections.Counter(body)
                bigram = collections.Counter(n_gram(body, 2))
                corpus.append((unigram, bigram))
        print(corpus)
    else:
        # The requested item is missing; show the keys that were parsed.
        print(form.keys())