In [1]:
%matplotlib inline
import sys
import pathlib
sys.path.insert(0, str(pathlib.Path('..').resolve()))

import itertools
import matplotlib.pyplot as pyplot
import random
import collections
random.seed(0)

from mining.retrieve_index import get_index
from mining.retrieve_10k import get_10k_items
from util.paragraphs import to_paragraphs, group_paragraphs
from util.stem import stem_list
from mining.parsing import ParseError

In [2]:
def get_range(year_range, form_type):
    for year in year_range:
        for qtr in range(1, 4):
            yield from get_index(year, qtr, form_type)

In [7]:
year_range = range(2006, 2017)
form_type = '10-k'
item = '1a'

In [8]:
def sample_iterator(it, p, max=None):
    if max is not None:
        yield from itertools.islice(sample_iterator(it, p), 0, max)
    else:
        for elem in it:
            if random.random() < p:
                yield elem

In [9]:
def n_gram(it, n):
    it = iter(it)
    while True:
        ret = tuple(itertools.islice(it, 0, n))
        if ret:
            yield ret
        else:
            break

In [10]:
corpus = []
index = get_range(year_range, form_type)
for record in sample_iterator(index, 0.01, 1):
    try:
        form = get_10k_items(record['Filename'], enable_cache=False)
    except ParseError as e:
        print(record)
        raise e
    if item in form:
        groups = group_paragraphs(to_paragraphs(form[item]))
        for header, sentencecs in group:
            if sentences:
                body = stem_list(' '.join(sentences))
                unigram = collections.Counter(body)
                bigram = collections.Counter(n_gram(body, 2))
            corpus.append((unigram, bigram))
        print(corpus)
    else:
        print(form.keys())


0.0007777561734396267 0.015306122448979591 0.00020214271275520516 5143 196 4 3
0.00038887808671981335 0.00510204081632653 0.00020214271275520516 5143 196 2 1
0.0033054637371184134 0.02040816326530612 0.0026278552658176675 5143 196 17 4
0.014388489208633094 0.00510204081632653 0.014756418031129977 5143 196 74 1
0.00038887808671981335 0.00510204081632653 0.00020214271275520516 5143 196 2 1
0.04316546762589928 0.07142857142857142 0.04204568425308268 5143 196 222 14
0.0009721952167995333 0.00510204081632653 0.0008085708510208206 5143 196 5 1
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.0013610733035193468 0.015306122448979591 0.0008085708510208206 5143 196 7 3
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.00058331713007972 0.00510204081632653 0.0004042854255104103 5143 196 3 1
0.0042776589539179465 0.025510204081632654 0.003436426116838488 5143 196 22 5
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.00174995139023916 0.01020408163265306 0.0014149989892864362 5143 196 9 2
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.0007777561734396267 0.00510204081632653 0.0006064281382656155 5143 196 4 1
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.00038887808671981335 0.00510204081632653 0.00020214271275520516 5143 196 2 1
0.0007777561734396267 0.015306122448979591 0.00020214271275520516 5143 196 4 3
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.0009721952167995333 0.01020408163265306 0.0006064281382656155 5143 196 5 2
0.0013610733035193468 0.00510204081632653 0.001212856276531231 5143 196 7 1
0.00038887808671981335 0.01020408163265306 0.0 5143 196 2 2
0.00116663426015944 0.00510204081632653 0.001010713563776026 5143 196 6 1
0.0013610733035193468 0.015306122448979591 0.0008085708510208206 5143 196 7 3
0.0007777561734396267 0.01020408163265306 0.0004042854255104103 5143 196 4 2
0.00058331713007972 0.00510204081632653 0.0004042854255104103 5143 196 3 1
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.02683258798366712 0.2193877551020408 0.019203557711744493 5143 196 138 43
0.00524985417071748 0.08163265306122448 0.002223569840307257 5143 196 27 16
0.02877697841726619 0.01020408163265306 0.029512836062259955 5143 196 148 2
0.00058331713007972 0.00510204081632653 0.0004042854255104103 5143 196 3 1
0.0042776589539179465 0.015306122448979591 0.003840711542348898 5143 196 22 3
0.0009721952167995333 0.025510204081632654 0.0 5143 196 5 5
0.00233326852031888 0.01020408163265306 0.002021427127552052 5143 196 12 2
0.0009721952167995333 0.02040816326530612 0.00020214271275520516 5143 196 5 4
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.035193466848143104 0.061224489795918366 0.03416211845562967 5143 196 181 12
0.022554929029749173 0.0663265306122449 0.020820699413786132 5143 196 116 13
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.0021388294769589733 0.01020408163265306 0.0018192844147968466 5143 196 11 2
0.0019443904335990666 0.015306122448979591 0.0014149989892864362 5143 196 10 3
0.0013610733035193468 0.00510204081632653 0.001212856276531231 5143 196 7 1
0.0007777561734396267 0.00510204081632653 0.0006064281382656155 5143 196 4 1
0.008944195994555707 0.03571428571428571 0.007883565797453002 5143 196 46 7
0.0027221466070386936 0.00510204081632653 0.0026278552658176675 5143 196 14 1
0.0007777561734396267 0.00510204081632653 0.0006064281382656155 5143 196 4 1
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
0.0038887808671981333 0.015306122448979591 0.003436426116838488 5143 196 20 3
0.00019443904335990667 0.00510204081632653 0.0 5143 196 1 1
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-2551be001f33> in <module>()
      9     if item in form:
     10         groups = group_paragraphs(to_paragraphs(form[item]))
---> 11         for header, sentencecs in group:
     12             if sentences:
     13                 body = stem_list(' '.join(sentences))

NameError: name 'group' is not defined
it = iter(get_range(year_range, form_type)) while True: next(it)