In [6]:
import requests
from bs4 import BeautifulSoup as BSoup
import pandas as pd

In [1]:
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine('sqlite:///edgar_idx.db')
conn = engine.connect()
conn.begin()


Out[1]:
<sqlalchemy.engine.base.RootTransaction at 0x1cbf6fcd978>

In [11]:
import sqlite3
conn3 = sqlite3.connect('edgar_idx.db')
cursor=conn3.cursor()

In [12]:
ticker = "DQ"
#cursor.execute('''SELECT * FROM idx WHERE Symbol=?;''', ("ABBV",))
cursor.execute('''SELECT * FROM cik_ticker_name WHERE ticker=?;''',(ticker,))
res=cursor.fetchall()
print(res)
cursor.execute('''SELECT * FROM idx WHERE cik=?;''', (res[0][0],))
recs = cursor.fetchall()
names = list(map(lambda x: x[0], cursor.description))
print(names)
df = pd.DataFrame(data=recs, columns=names)
df['date'] = pd.to_datetime(df['date'])
df.columns
df.size
df.dtypes
#print(type(recs))
print(recs)
#conn3.close()
#df[df.type == "10-Q"].to_csv("TestData\\"+ticker.lower()+"_all_10qs.csv", index=None)
#df[df.type == "10-K"].to_csv("TestData\\"+ticker.lower()+"_all_10ks.csv", index=None)
df[df.type == "20-F"].to_csv("TestData\\"+ticker.lower()+"_all_20fs.csv", index=None)


[('0001477641', 'DQ', 'DAQO NEW ENERGY CORP.')]
['cik', 'conm', 'type', 'date', 'path']
[]

In [10]:
ticker = "APTV"
req = requests.get("https://www.sec.gov/cgi-bin/browse-edgar?CIK={:s}&owner=exclude&action=getcompany&Find=Search".format(ticker.lower()))
soup = BSoup(req.content, "lxml")
conmTag = soup.find("span", {"class": "companyName"})
if conmTag:
    print(conmTag.text)
atags = soup.findAll("a")
for t in atags:
    if "see all company filings" in t.text:
        print(t.text)

In [5]:


In [9]:
class ATest(self):
    self.d = {}
    
    def getValFromDict(self, n):
        if not self.d:
            for i in range(0,20):
                self.d["Val-{:d}".format(i)]= i
        print(self.d)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-90bae293debf> in <module>()
----> 1 class ATest(self):
      2     self.d = {}
      3 
      4     def getValFromDict(self, n):
      5         if not self.d:

NameError: name 'self' is not defined

In [11]:


In [12]:
all_10Qs.to_csv("TestData\\appl_all_10qs.csv", index=None)
all_10Ks.to_csv("TestData\\appl_all_10ks.csv", index=None)

In [13]:
all_10Ks =pd.read_csv("TestData\\nvda_all_10ks.csv", parse_dates=['date'], dtype={'cik':str, 'conm':str, 'type':str,'path':str})
all_10Qs =pd.read_csv("TestData\\nvda_all_10qs.csv", parse_dates=['date'], dtype={'cik':str, 'conm':str, 'type':str,'path':str})


       cik         conm  type       date  \
0  1045810  NVIDIA CORP  10-K 2016-03-17   
1  1045810  NVIDIA CORP  10-K 2017-03-01   
2  1045810  NVIDIA CORP  10-K 2018-02-28   

                                          path  
0  edgar/data/1045810/0001045810-16-000205.txt  
1  edgar/data/1045810/0001045810-17-000027.txt  
2  edgar/data/1045810/0001045810-18-000010.txt   cik             object
conm            object
type            object
date    datetime64[ns]
path            object
dtype: object

In [9]:
import numpy as np
import matplotlib.pyplot as plt
## NVDA
#y= np.array([3123000000, 3207000000, 2911000000, 2636000000, 2230000000]) #sales
#y= np.array([1.81, 2.05, 1.86, 1.39, 0.98]) #eps
#x=np.array([0.0, -91, -182, -273, -364])
## AAPL
#se = np.array([114949, 126878, 140199]) * 1000.0
#ni = np.array([11519, 13822, 20065 ]) * 1000000.0
#sales = np.array([53265, 61137, 88293, 52579, 45408, 52896, 78351]) * 1000000.0
#eps = np.array([2.36, 2.75, 3.92, 2.18, 1.68, 2.11, 3.38])
## ACLS
se= np.array([397074, 385614, 371527])*1000.0
ni = np.array([8838, 14669, 13915])*1000.0
sales = np.array([95374, 119333, 122185])*1000.0
eps = np.array([0.27, 0.46, 0.43])
days = np.array([0.0, -91, -182, -273, -364, -455, -546])
y = sales[:3]
x = days[:3]

In [10]:
p = np.polyfit(x[:3], y[:3], 2)
yfit = np.polyval(p, x)
sigma = (y[:3] - yfit[:3]) / y[:3]
error = sigma * sigma
res = error.sum()
plt.plot(x,y)
plt.plot(x,yfit)
plt.show()



In [11]:
print(p)
print(error)
print(res)
print((sales[2] - sales[1])/(days[2] - days[1]))
print((eps[2] - eps[1])/(days[2] - days[1]))
print((y[2] - y[0])/(x[2] - x[0]))


[ -1.27442338e+03  -3.79258242e+05   9.53740000e+07]
[  1.56228479e-30   2.49481927e-31   3.71830018e-31]
2.18359673575e-30
-31340.6593407
0.00032967032967
-147313.186813

In [21]:
import datetime as dt
(dt.datetime(2016,12, 31) - dt.datetime(2018,6, 30)).days


Out[21]:
-546

In [32]:
9.35-1.68- 2.11- 3.38


Out[32]:
2.1800000000000006

In [13]:
229234 - 45408- 52896- 78351


Out[13]:
52579

In [44]:
import pandas as pd
all_10Qs =pd.read_csv("TestData\\nvda_all_10qs.csv", parse_dates=['date'], dtype={'cik':str, 'conm':str, 'type':str,'path':str})

In [50]:
all_10Qs[all_10Qs.date == "2017-05-23"].index[0]


Out[50]:
1

In [ ]:


In [ ]:


In [ ]:

Candidate keywords

Shareholder's equity

  • FinancialSupportCapitalContributionsMember
  • StatementOfStockholdersEquityAbstract
  • StockholdersEquity ### EPS/Earnings/Profits/Income
  • EarningsPerShareBasic
  • EarningsPerShareDiluted
  • IncomeLossFromContinuingOperationsPerDilutedShare(?)
  • IncomeLossFromContinuingOperationsPerBasicShare (?) ### Sales/Net sales/net revenue
  • SalesRevenueNetMember (?)
  • Search for 'Revenue' in income statement (?) ### Others that might be of interest
  • us-gaap:StockRepurchasedDuringPeriodShares

Data to extract from filing:

  • Earnings for current Q
  • Earnings for last 12-20 Q
  • Sales
  • Annual EPS for last 4 years
  • ROE (= net income / shareholder's equity) # Parameters to calculate for each company:
  • Earnings increase: earning for current Q/earnings for same Q prior year
  • Excellency of earnings increase: each of two most recent Q/same Q's prior year + growth rate over last three years
  • Earnings growth acceleration in last 10 Q (calculate 2nd derivative?)
    • two consecutive Qs of earnings deceleration could mean trouble
    • plot earnings on log-scale for last 12 months to see acceleration
  • Sales increase:
    • current Q/prior Q > 25%
    • or Sales growth is accelerating in the last three Q
  • Acceleration of sales growth and earnings growth in last three Q (don't sell if accelerating!)
  • Annual EPS should be increasing in each of the last three years.
  • ROE>17%
  • Stability of Q-EPS over last 3-5 years (plot EPS, fit line through data to determine growth trend, calculate deviation of EPS's from this growth trend (i.e. goodness of fit))

In [ ]:
import requests
from bs4 import BeautifulSoup as Bsoup
import pandas as pd

In [ ]:
ticker = "NVDA"
## Read in the file
f = open ("1045810_NVIDIA-CORP_10-Q_2017-08-23", "r")
lines = f.readlines ()
f.close ()
ser = pd.Series (lines)
#print (ser)
#this denotes where the actual 10-Q starts. I think the 10-Q part is enclosed by <document> tags
html_idxs = (ser[ser == '<TYPE>10-Q\n'])
#get the line number (== index number):
print ("html_idxs: {}".format(html_idxs.index.values))
#and the corresponding line
print (lines[html_idxs.index.values[0]])
#html_idxs_end = (ser['<\TYPE>' in ser])
#print ("html_idxs_end: {}".format (html_idxs_end.index.values))
#print (lines[html_idxs_end.index.values[0]])

#now I can parse the 10-Q with beautifulsoup:
## Find a way to determine which lines to parse automatically. Really, I only need the html-tag, i.e. lines 55-63 
## (see NVDA_finstat_test.hmtl)
soup = Bsoup ("".join (lines[55:63]), "lxml") #find the end of the section by searching for /Document?
#collect all div-tags for future use
all_divs = soup.find_all ("div")
#find the div-tag that contains 'ITEM 1.&#160;&#160;FINANCIAL STATEMENTS' which indicates the start of the
#income statement's table
#will this work on all filings???

print ("found {} occurrences". format (len(all_divs)))
tables = soup.find_all ("table")
print ("found {} tables".format (len(tables)))
#iterate over the div-tags, not sure if there is a better way. I think everything is is div tags.
flag = False
count = 0
for f in all_divs:
    #find beginning of financial statements
    if (f.get_text().startswith ("ITEM 1.") and "FINANCIAL STATEMENTS" in f.get_text()):
        flag = True
        #print (count)
    if (flag):
        #print (f.get_text())
        if ("three months ended" in f.text.lower()):
            ## At this point, we found the div-tag with the income-statement table
            ## Try to use pandas
            #print((f.find_all('div', text='$')))
            for ff in f.find_all('div',text='$'):
                ff.decompose()
            ## Almost! Just need to get rid of the '$', which are offsetting the affected rows by several columns
            ## This removes the '$' (use re.compile for other currency symbols?), now what about the paren's denoting negative numbers?
            tableDf=pd.read_html(str(f))
            print(tableDf)
            ## Manual method
            trs = f.find_all("tr")
            #print (len(trs))
            #print(trs[2].get_text())
            for t in trs:
                tlist = [item.get_text() for item in t.find_all("td")]
                print(" || ".join(tlist))
            #print (trs)
            break
        #print ("\n")
    count += 1

See these urls for downloading the appropriate files, and parsing XBRL with Python/BeautifulSoup

  • https://www.codeproject.com/Articles/1227765/Parsing-XBRL-with-Python
  • https://www.slideshare.net/afalk42/xbrl-us-altova-webinar ### Taxonomy:
  • http://www.xbrlsite.com/LinkedData/BrowseObjectsByType_HTML.aspx?Type=%5BConcept%5D&Submit=Submit
  • I think I want the instance xml file for each filing, though that *.txt file I've been downloading so far contains everything (I think). Could use that as well, the benefits are that the name of this file is standardized and I already have the machinery to download it in place, but the trade-off is that it's the biggest file for each filing.
    • The instance document can be found in the *.txt file; it has the <description> XBRL INSTANCE DOCUMENT. Inside the description-tag is the <XBRL>, which contains the XBRL-data for the instance document. Note that there are several XBRL- tags containing the various XBRL documents (schema, extension, linkbases, ...).
  • Search for the tags 'us-gaap:*', they contain the items for the financial statements.
  • Also, Wikipedia has a good reference on the components of XBRL:
  • Figure out which taxonomy items I want/need, and all their permutations
  • Determine the date on each item
    • Each us-gaap:* - tag has a context-ref (e.g. "FI2017Q4"). Find the corresponding context (in the instance document, below the us-gaap - tags):
      <xbrli:context id="FI2017Q4">
        <xbrli:entity>
            <xbrli:identifier scheme="http://www.sec.gov/CIK">0001045810</xbrli:identifier>
        </xbrli:entity>
        <xbrli:period>
            <xbrli:instant>2017-01-29</xbrli:instant>
        </xbrli:period>
      </xbrli:context>
      ### Interesting classifiers:
  • unitref=