In [38]:
import pandas as pd
import MySQLdb as mdb
%matplotlib inline
In [39]:
import MySQLdb as mdb
def executeSQL(statement):
if len(statement) <= 0:
return 'no statement is given'
conn = mdb.connect('localhost', 'usstock', 'usstock', 'usstock')
cur = conn.cursor()
try:
res = None
res = cur.execute(statement)
conn.commit()
except mdb.Error as e:
print 'MySQL exception has been caught.'
finally:
cur.close()
conn.close()
return res
In [72]:
#returns list of [market, stock_sybol] by querying database.
MAX_LIMIT = 100000
#limit means the number of results to be returned. 0 means everything
def getSymbols(limit=10):
if limit <= 0:
limit = MAX_LIMIT
conn = mdb.connect('localhost', 'usstock', 'usstock', 'usstock')
cur = conn.cursor()
sql = "SELECT market, symbol FROM symbols WHERE symbol REGEXP '^[A-Z]+$'"
sql = sql + ' limit %d;'%(limit)
#print sql
#and market=\'NYSE\'
cur.execute(sql)
stocks = []
for i in range(cur.rowcount):
row = cur.fetchone()
market = row[0]
stock = row[1]
stocks.append([market, stock])
# print cur.rowcount
conn.close()
return stocks
In [73]:
#returns db column names from the dataframe arg. names are a form of string without special characters.
import re
def getFields(df):
res = []
y = df.iloc[:,0:1] #extract 1st column
#y.iloc[1:10].values
for z in y.iloc[:].values:
x=re.sub('[ /,\.\&\-()\']','',z[0]) #remove special chars
if isinstance(x, unicode):
x = x.encode('UTF-8')
#print type(x)
res.append(x)
return res
In [74]:
#generate a statement for creating a table
def generateFinTableStatement(tablename, fields):
statement = 'CREATE TABLE IF NOT EXISTS ' + tablename + ' (\n'
is_first=True
#mandatory fields
statement = statement + 'Symbol VARCHAR(20) NOT NULL,\n'
statement = statement + 'Date DATE NOT NULL,\n'
statement = statement + 'Period VARCHAR(20) NOT NULL,\n'
#financial fields
for x in fields:
if not is_first:
statement = statement + ',\n'
is_first=False
tmp = '%s DOUBLE NULL DEFAULT NULL'%x
#print statement
#print type(statement)
statement = statement + tmp
statement = statement + ');\n\n'
#print statement
return statement
In [ ]:
In [75]:
#make tables
data_to_extract = {1:'IncomeStatement', #index in the dataframe, the name of financial data
3:'BalanceSheet',
5:'CashFlow'}
def makeFinTables():
stocks = getSymbols(1)
stock = stocks[0]
#print stocks
url = "https://www.google.com/finance?q="+stock[0]+"%3A"+stock[1]+"&fstype=ii"
#print url
df = pd.read_html(url, encoding=False)
#print len(df)
for k in data_to_extract.keys():
#print k
x = df[k]
y = x.iloc[:,0:]
fields = getFields(y)
statement = generateFinTableStatement(data_to_extract[k], fields)
executeSQL(statement)
makeFinTables()
In [76]:
def getTableName(data):
y = data.iloc[:,0:1]
val = y.iloc[0:1].values[0][0]
if isinstance(val, unicode):
val = val.encode('UTF-8')
if val.startswith('Revenue'):
return 'IncomeStatement'
elif val.startswith('Cash '):
return 'BalanceSheet'
elif val.startswith('Net '):
return 'CashFlow'
else:
return None
In [ ]:
#get values from fin tables for each stock
import re
def getPeriodAndDate(column_value):
if isinstance(column_value, unicode):
column_value = column_value.encode('UTF-8')
column_value = re.sub('[\n]','',column_value) #remove special chars
size = len(column_value)
period = column_value[:size-11]
date = column_value[size-10:]
return (period,date)
def gatherFinValues(limit=10):
if limit <= 0:
limit = MAX_LIMIT
stocks = getSymbols(limit) #get every symbol
for stock in stocks[:limit]:
market = stock[0]
stockname = stock[1]
url = "https://www.google.com/finance?q="+market+"%3A"+stockname+"&fstype=ii"
print url
df = None
try:
df = pd.read_html(url, encoding=False)
except Exception:
print "Exception has been caught while processing [%s:%s]. No financial data."%(market, stockname)
if df is None:
continue #skip the rest if there is no financial data for this stock
print 'num of data is %d'%(len(df))
if len(df) <= 1:
print "Exception has been caught while processing [%s:%s]. No financial data."%(market, stockname)
continue #skip the rest if there is no financial data for this stock
#data_to_extract : this is declared already
#columns_to_extract = [1,2,3,4] #from 2015 to 2012
for k in range(0, len(df)):
data = df[k]
columns_to_extract = range(1,len(data.columns.values)) # all data columns
for c in columns_to_extract:
#print getPeriodAndDate(data.columns.values[c])
tablename = getTableName(data)
(period, date) = getPeriodAndDate(data.columns.values[c])
statement = 'INSERT INTO %s \n'%(tablename)
statement = statement + 'VALUES (\n'
statement = statement + '\'%s\',\n'%(stockname)
statement = statement + '\'%s\',\n'%(date)
statement = statement + '\'%s\'\n'%(period)
y = data.iloc[:,c:c+1]
for z in y.iloc[:].values:
try:
val = z[0].encode('UTF-8')
except AttributeError:
print z[0]
val = z[0]
#print val
if val == '-':
statement = statement + ',NULL\n'
else:
statement = statement + ',%s\n'%(val)
statement = statement + ');\n'
#print statement
res = executeSQL(statement)
#print res
gatherFinValues(0)