In [20]:
%matplotlib inline

import sys
import pathlib
sys.path.insert(0, str(pathlib.Path('..').resolve()))

import itertools
import matplotlib.pyplot as pyplot
import random
import collections
import re
random.seed(0)

from mining.retrieve_index import get_index
from mining.retrieve_10k import get_10k_items
from util.paragraphs import to_paragraphs, group_paragraphs
from util.stem import stem_list
from mining.parsing import ParseError
#from cluster_config import init_spark


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-20-7014e322d98d> in <module>()
     17 from util.stem import stem_list
     18 from mining.parsing import ParseError
---> 19 from cluster_config import init_spark

/home/sam/box/dev/EDGAR-research/cluster_config/init_spark.py in <module>()
      1 import glob
----> 2 import pyspark
      3 from haikunator import Haikunator
      4 
      5 my_sc = None

ImportError: No module named 'pyspark'

In [4]:
def sample_iterator(it, p, max=None):
    if max is not None:
        yield from itertools.islice(sample_iterator(it, p), 0, max)
    else:
        for elem in it:
            if random.random() < p:
                yield elem

In [5]:
def n_gram(it, n):
    it = iter(it)
    while True:
        ret = tuple(itertools.islice(it, 0, n))
        if ret:
            yield ret
        else:
            break

In [3]:
def get_index_range(year_range, form_type):
    for year in year_range:
        for qtr in range(1, 4):
            yield from get_index(year, qtr, form_type)

In [19]:
sc = init_spark('save rf')
sc.


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-a72dcfe7acd8> in <module>()
----> 1 init_spark('save rf')

NameError: name 'init_spark' is not defined

In [16]:
def get_items(records):
    for record in records:
        try:
            form = get_10k_items(record['Filename'], enable_cache=False)
        except ParseError as e:
            print(record)
            print(str(e))
        if item in form:
            yield form[item]

In [6]:
def vectorize(rfs):
    for rf in rfs:
        groups = group_paragraphs(to_paragraphs(form[item]))
        for header, paragraphs in groups:
            if paragraphs:
                body = ' '.join(' '.join(paragraph) for paragraph in paragraphs)
                body = re.sub('[^a-zA-Z ]', '', body)
                words = list(stem_list(body))
                unigram = collections.Counter(words)
                bigram = collections.Counter(n_gram(words, 2))
                corpus.append((unigram, bigram))
                print(', '.join(unigram for unigram, count in unigram.most_common(10)))
                print(', '.join(' '.join(bigram) for bigram, count in bigram.most_common(10)))
    else:
        print(form.keys())


0.0002701242571582928 0.00423728813559322 0.00013950892857142856 7404 236 2 1
0.000675310642895732 0.00847457627118644 0.0004185267857142857 7404 236 5 2
0.00675310642895732 0.09322033898305085 0.00390625 7404 236 50 22
0.0001350621285791464 0.00423728813559322 0.0 7404 236 1 1
0.0008103727714748784 0.00423728813559322 0.0006975446428571429 7404 236 6 1
0.0008103727714748784 0.00423728813559322 0.0006975446428571429 7404 236 6 1
0.0002701242571582928 0.00423728813559322 0.00013950892857142856 7404 236 2 1
0.0009454349000540249 0.00423728813559322 0.0008370535714285714 7404 236 7 1
0.0008103727714748784 0.00847457627118644 0.0005580357142857143 7404 236 6 2
0.009049162614802809 0.038135593220338986 0.008091517857142858 7404 236 67 9
0.0001350621285791464 0.00423728813559322 0.0 7404 236 1 1
0.0001350621285791464 0.00423728813559322 0.0 7404 236 1 1
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.0008103727714748784 0.012711864406779662 0.0004185267857142857 7404 236 6 3
0.0001350621285791464 0.00423728813559322 0.0 7404 236 1 1
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.009994597514856833 0.012711864406779662 0.009905133928571428 7404 236 74 3
0.0009454349000540249 0.00423728813559322 0.0008370535714285714 7404 236 7 1
0.0001350621285791464 0.00423728813559322 0.0 7404 236 1 1
0.000675310642895732 0.00423728813559322 0.0005580357142857143 7404 236 5 1
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.0002701242571582928 0.00847457627118644 0.0 7404 236 2 2
0.0002701242571582928 0.00423728813559322 0.00013950892857142856 7404 236 2 1
0.02188006482982172 0.0847457627118644 0.019810267857142856 7404 236 162 20
0.0002701242571582928 0.00423728813559322 0.00013950892857142856 7404 236 2 1
0.004862236628849271 0.0423728813559322 0.003627232142857143 7404 236 36 10
0.0002701242571582928 0.00847457627118644 0.0 7404 236 2 2
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.0017558076715289033 0.00423728813559322 0.0016741071428571428 7404 236 13 1
0.0002701242571582928 0.00423728813559322 0.00013950892857142856 7404 236 2 1
0.0014856834143706105 0.00423728813559322 0.0013950892857142857 7404 236 11 1
0.0004051863857374392 0.00847457627118644 0.00013950892857142856 7404 236 3 2
0.0017558076715289033 0.00423728813559322 0.0016741071428571428 7404 236 13 1
0.0001350621285791464 0.00423728813559322 0.0 7404 236 1 1
0.001350621285791464 0.00847457627118644 0.0011160714285714285 7404 236 10 2
0.0005402485143165856 0.00847457627118644 0.00027901785714285713 7404 236 4 2
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.0012155591572123178 0.00423728813559322 0.0011160714285714285 7404 236 9 1
0.0032414910858995136 0.00423728813559322 0.003208705357142857 7404 236 24 1
0.004321988114532685 0.01694915254237288 0.00390625 7404 236 32 4
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.021609940572663425 0.1906779661016949 0.016043526785714284 7404 236 160 45
0.0009454349000540249 0.00423728813559322 0.0008370535714285714 7404 236 7 1
0.048217179902755265 0.11440677966101695 0.04603794642857143 7404 236 357 27
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.000675310642895732 0.00423728813559322 0.0005580357142857143 7404 236 5 1
0.0014856834143706105 0.00423728813559322 0.0013950892857142857 7404 236 11 1
0.0004051863857374392 0.00423728813559322 0.00027901785714285713 7404 236 3 1
0.007698541329011345 0.00423728813559322 0.0078125 7404 236 57 1
0.0009454349000540249 0.00423728813559322 0.0008370535714285714 7404 236 7 1
0.0002701242571582928 0.00423728813559322 0.00013950892857142856 7404 236 2 1
0.027552674230145867 0.0635593220338983 0.0263671875 7404 236 204 15
0.0035116153430578066 0.029661016949152543 0.002650669642857143 7404 236 26 7
0.0009454349000540249 0.00423728813559322 0.0008370535714285714 7404 236 7 1
0.0002701242571582928 0.00423728813559322 0.00013950892857142856 7404 236 2 1
0.0005402485143165856 0.01694915254237288 0.0 7404 236 4 4
0.0004051863857374392 0.012711864406779662 0.0 7404 236 3 3
0.03862776877363587 0.00847457627118644 0.03962053571428571 7404 236 286 2
,, statement, forward-look, and, in, our, ., contain, such, may
contain in, forward-look statement, from those, other thing, such forward-look, made by, our futur, press releas, e may, risk describ
of, million, senior, ,, our, ., secur, $, to, 31
an addit, senior secur, are permit, at decemb, option of, credit facil, subject to, $ 69, debt under, s of
our, the, to, indebted, and, senior, ., in, ,, abl
contain in, indebted is, in the, If new, senior subordin, incur substanti, our senior, may be, ., the restrict
the, ,, our, to, and, ., of, secur, senior, credit
secur credit, the senior, credit facil, senior secur, our abil, under the, these coven, facil and, facil ., that we
to, and, our, ,, ., or, the, oblig, not, may
our debt, servic oblig, may not, dispos of, , we, or to, to meet, our indebted, us to, on or
our, to, ,, subsidiari, ., of, and, make, abil, or
our subsidiari, to make, of our, our indebted, to us, to pay, payment in, abil to, , is, on such
our, to, and, ,, could, ., custom, standard, of, be
, our, fail to, . If, , and, be damag, and market, to adopt, qualiti compon, gener on, of product
,, of, ., for, a, the, sale, our, and, net
net sale, boston scientif, number of, that we, the loss, our net, for a, of our, will reach, of custom
to, ., of, our, the, compani, and, ,, outsourc, medic
medic devic, by medic, devic compani, oper ., properti right, could caus, current assembl, busi ., of product, while industri
our, of, ,, in, ;, and, result, the, to, a
our customers’, of our, , which, time of, result in, our oper, revenu ;, our cost, materi ,, chang in
and, ,, our, to, we, competit, of, ., custom, compani
medic devic, and manufactur, and market, base ,, As more, veri competit, , personnel, , prototyp, our exist, , technic
and, our, facil, custom, ,, or, to, could, in, .
custom and, the servic, could damag, consolid and, facil ., shift product, outsourc vendor, , which, longer use, with our
,, our, and, or, ., could, to, technolog, product, result
for our, which could, busi ,, not result, that finish, , particularli, medic devic, viabl process, result ., or obsolet
,, and, ., our, product, in, of, materi, raw, to
raw materi, and product, , we, product feedstock, materi and, , tantalum, fluctuat in, of raw, feedstock use, stainless steel
our, and, in, ,, oper, of, the, intern, or, to
which is, in the, sale from, gener by, , could, intern oper, a particular, europ and, and difficulti, facil .
new, ,, and, market, our, into, ., to, in, expand
new market, expand into, in new, through the, improv manufactur, e may, , develop, busi ., other aspect, design and
,, ., our, to, and, the, or, of, have, a
in the, have a, a result, In addit, our product, hazard chemic, for violat, to futur, on the, on us
,, and, to, of, our, we, may, or, ., be
we may, intellectu properti, may be, properti right, of our, protect our, in the, or other, infring claim, , and
to, ,, or, the, of, ., and, our, product, in
respond to, or recal, liabil claim, of our, to product, be requir, we will, medic devic, . We, product recal
,, and, the, ., to, regul, industri, in, or, healthcar
our oper, In addit, feder and, the medic, devic industri, , ani, contain propos, chang frequent, healthcar industri, oper .
to, industri, ,, ., compani, and, healthcar, consolid, our, power
medic devic, industri particip, will becom, suffer ., in the, that incorpor, to provid, condit and, of consolid, industri consolid
,, to, our, and, the, of, for, ., custom, devic
our custom, the cost, discount price, parti payor, on third, insur compani, , in, amount we, design and, and other
,, our, ., to, could, and, in, or, result, accid
involv complex, or death, to satisfi, other resourc, oper or, and could, , which, employ safeti, and hazard, can be
,, of, the, our, goodwil, ., net, fair, could, is
fair valu, of the, our total, oper ., on the, , which, , of, s of, impair ,, is subject
,, or, and, capit, avail, ., to, addit, requir, be
. If, , we, cash gener, capit in, credit facil, and result, curtail our, of operations—liquid, intern is, under our
,, ., our, and, to, of, oper, the, custom, sale
power gener, and industri, oper ., lamp ,, e have, our net, , motion, impact our, market due, medic devic
the, of, ,, or, erp, system, ., our, and, in
erp system, the erp, of the, in the, . ani, the integr, busi ,, be no, system ,, a materi
,, our, of, ., and, the, We, senior, manag, ron
of our, senior manag, ani of, offic ., oper ., make up, do not, insur for, gari D., our senior
our, to, ., and, personnel, in, ,, engin, or, may
personnel ., our abil, engin with, abil to, other compani, compet with, and provid, in the, or in, , which
to, ,, and, compon, may, our, ., product, on, or
and compon, product and, , we, third parti, supplier may, ani of, abil to, compon from, on term, occur ,
,, of, and, the, to, we, our, oper, not, .
integr of, to the, busi ,, in the, success integr, manag of, , includ, , we, to identifi, we may
our, ,, in, of, the, to, and, sponsor, ., or
in the, conflict with, signific amount, refer to, as the, , control, ffiliat of, to influenc, in compani, busi of

In [3]:
year_range = range(2006, 2017)
form_type = '10-k'
item = '1a'

In [ ]:
items = (
    get_items(
        random_sample(
            get_index_range(year_range, form_type),
            0.01)))
it = iter(get_range(year_range, form_type)) while True: next(it)

In [14]:
from util.supress import supress
with supress():
    print('hi')


hi

In [ ]: