In [1]:
%pylab inline
import pandas as pd
import matplotlib.pyplot as plt
import requests
import string
from lxml import html
import datetime
In [29]:
def QB_HEADER():
    """Return the CSV header row for quarterback game-log files.

    Note: the second column name is ' DATE' (with a leading space);
    downstream readers rely on that exact spelling.
    """
    columns = ("idx", " DATE", "OPPONENT", "RESULT", "CMP", "PATT", "PCT",
               "PYDS", "PAVG", "PTD", "INT", "SAC", "SYDS", "RATE",
               "RATT", "RYDS", "RAVG", "RLG", "RTD", "FUM")
    return ",".join(columns) + "\n"
def writeCSV(fileName, final, i=0):
    """Write the rows in `final` to `fileName` as a CSV file.

    Parameters:
        fileName: path of the output CSV file.
        final: iterable of row sequences, passed to generateCSVString().
        i: starting value for the leading index column (default 0).
    """
    strArr = generateCSVString(final, i)
    # `with` guarantees the handle is closed even if a write raises;
    # the original left the file open on exceptions.
    with open(fileName, 'w') as d:
        for item in strArr:
            print(item)  # echo each row for notebook visibility
            d.write(item)
def generateCSVString(final, i=0):
    """Build a list of CSV lines: the header first, then one line per row.

    Parameters:
        final: iterable of row sequences; cells may be datetime.date
            objects or strings (any other type is str()-converted,
            where the original raised TypeError on non-strings).
        i: starting value for the leading index column.

    Returns:
        List of newline-terminated strings, header included.
    """
    arrString = [QB_HEADER()]
    for offset, item in enumerate(final):
        # str() covers datetime.date and plain strings alike, and
        # ','.join avoids the quadratic += concatenation of the original.
        cells = [str(i + offset)] + [str(j) for j in item]
        arrString.append(",".join(cells) + "\n")
    return arrString
import dateutil.parser
def cbs_cleanQBArray(playerArray, year):
    """Normalize CBS game-log rows (seasons after 2011) for CSV output.

    For each row: drops the trailing cell, replaces the 'MM/DD' date with
    the Monday of the following week (a game already on a Monday keeps
    its own date), and recodes home/away as 'H'/'A' and win/loss as
    'W'/'L'.

    Parameters:
        playerArray: list of scraped rows; row[0] is an 'MM/DD' date,
            row[1] the opponent ('@'-prefixed for away games),
            row[2] the game result string.
        year: season year appended to the date before parsing.

    Returns:
        List of cleaned rows (rows are mutated in place).
    """
    ret = []
    for item in playerArray:
        del item[-1]  # drop the trailing (unused) cell
        if not item[0]:
            # Skip blank rows, consistent with cbs_2011_cleanQBArray;
            # the original crashed in the date parser on these.
            continue
        date1 = dateutil.parser.parse(item[0] + '/' + str(year))
        # Monday of the week after the game date.
        next_monday = date1 + datetime.timedelta(days=-date1.weekday(), weeks=1)
        item[0] = date1 if date1.weekday() == 0 else next_monday
        item[1] = "A" if '@' in item[1] else "H"
        item[2] = "W" if 'W' in item[2] else "L"
        ret.append(item)
    return ret
def cbs_2011_cleanQBArray(playerArray, year):
    """Normalize CBS game-log rows for seasons up to and including 2011.

    Older gamelog tables use a different column layout, so besides the
    date / home-away / win-loss recoding also done by cbs_cleanQBArray
    this: deletes column 3, inserts the ratio of column 6 to column 4 at
    position 7, moves the cell at position 10 to position 12, and drops
    the trailing cell.

    Parameters:
        playerArray: list of scraped rows (see cbs_cleanQBArray).
        year: season year appended to the date before parsing.

    Returns:
        List of cleaned rows; input rows are partially mutated
        (column 3 is deleted in place).
    """
    ret = []
    for item in playerArray:
        # Removed the leftover `print len(item)` debug line and the
        # unused `i = 0` counter from the original.
        del item[3]  # column not present in the modern layout
        tmp = list(item)
        if not item[0]:
            continue  # skip blank/summary rows
        date1 = dateutil.parser.parse(item[0] + '/' + str(year))
        # Monday of the week after the game date; Monday games keep theirs.
        next_monday = date1 + datetime.timedelta(days=-date1.weekday(), weeks=1)
        tmp[0] = date1 if date1.weekday() == 0 else next_monday
        tmp[1] = "A" if '@' in item[1] else "H"
        tmp[2] = "W" if 'W' in item[2] else "L"
        # Presumably PCT = completions / attempts — TODO confirm the
        # 2011 column layout against a scraped page.
        tmp.insert(7, str(float(item[6]) / float(item[4])))
        # Relocate the cell at index 10 to index 12 to match the
        # modern column order.
        tmp.insert(12, tmp.pop(10))
        del tmp[-1]  # drop the trailing (unused) cell
        ret.append(tmp)
    return ret
Data source — CBS QB player-search listing: http://www.cbssports.com/nfl/playersearch?POSITION=QB&print_rows=9999
In [30]:
# Scrape the CBS QB player-search page and write one (id, cbs_id,
# cbs_player) line per player link to players_cbs.csv.  The href path
# segments at positions 4 and 5 are presumably the numeric CBS id and
# the player slug — TODO confirm against a live page.
page = requests.get('http://www.cbssports.com/nfl/playersearch?POSITION=QB&print_rows=9999')
xpath1 = r'//*[@id="gridContainer"]/div/div[2]/div[1]/table[2]/*/td/a/@href'
print(xpath1)
tree = html.fromstring(page.text)
buyers = tree.xpath(xpath1)
ret = []
for item in buyers:
    s = item.split('/')
    ret.append((s[4], s[5]))
comma = ','
OUTPUT = 'players_cbs.csv'
# Open the file only once there is data to write, and use `with` so it
# is closed even if a write raises (the original opened it before the
# HTTP request and leaked the handle on errors).
with open(OUTPUT, 'w') as d:
    d.write("id,cbs_id,cbs_player\n")
    for i, item in enumerate(ret):
        d.write(str(i) + comma + item[0] + comma + item[1] + "\n")
page.close()
Game-log URL format (year / player id): http://www.cbssports.com/nfl/players/player/gamelogs/2015/405598/
In [32]:
# Player selection: uncomment one of the name/playerid pairs below to
# scrape a different quarterback (playerid is the CBS numeric player id).
#name = 'jameis-winston'
#playerid = 1998197
#year = 2015
#name = 'peyton-manning'
#playerid = 12531
name = "tom-brady"
playerid = 187741
# Seasons to fetch game logs for.
years = [2009, 2010, 2011, 2012, 2013, 2014, 2015]
# Output file name, e.g. 'tom-brady_187741.csv' (written under data/ below).
OUTPUT = r'{0}_{1}.csv'.format(name, playerid)
def getQB_CBS_CSV(name, playerid, years):
    """Scrape CBS game logs for one quarterback across several seasons.

    Parameters:
        name: player slug (unused here; the caller uses it for the
            output file name).
        playerid: CBS numeric player id used in the gamelog URL.
        years: iterable of season years to fetch.

    Returns:
        A flat list of cleaned stat rows across all requested seasons;
        seasons up to 2011 go through the legacy column cleaner.
    """
    ret = []
    for year in years:
        url = r'http://www.cbssports.com/nfl/players/player/gamelogs/{0}/{1}/'.format(year, playerid)
        page = requests.get(url)
        # Right-aligned table rows hold the per-game stat lines.
        xpath1 = r'//*[@id="layoutPlayersRight"]/div[1]/table/tr[@align="right"]'
        tree = html.fromstring(page.text)
        stats = tree.xpath(xpath1)
        final = []
        for item in stats:
            # Only full 20-cell rows are per-game stat lines; this one
            # check subsumes the original's separate ==18 and ==4 tests.
            if len(item) != 20:
                continue
            if 'Total' in item[0].text_content():
                continue  # skip the season-total summary row
            final.append([cell.text_content() for cell in item.xpath('./td')])
        if year > 2011:
            final = cbs_cleanQBArray(final, year)
        else:
            # Pre-2012 tables use a different column layout.
            final = cbs_2011_cleanQBArray(final, year)
        ret.append(final)
        page.close()
    # Flatten the per-season lists into one list of rows.
    return [item for sublist in ret for item in sublist]
# Scrape all configured seasons and write the combined game log to
# data/<name>_<playerid>.csv (the data/ directory must already exist;
# open() does not create it).
var1 = getQB_CBS_CSV(name, playerid, years)
#print var1
writeCSV("data/" + OUTPUT, var1, 0)
In [33]:
%pylab inline
import pandas as pd
import matplotlib.pyplot as plt
import os
def test_run():
dfSPY = pd.read_csv("tom-brady_187741.csv")
print dfSPY.head()
# Run the smoke test immediately (notebook-style top-level call).
test_run()
In [37]:
def get_PassingYards(symbols, dates):
    """Read passing yards (PYDS) for the given players, joined on date.

    Parameters:
        symbols: list of '<name>_<playerid>' strings; one CSV per symbol
            is expected at data/<symbol>.csv with columns ' DATE' and
            'PYDS' (note the leading space in ' DATE').
        dates: DatetimeIndex defining the candidate rows of the result.

    Returns:
        DataFrame indexed by date with one PYDS column per symbol;
        rows where the first symbol has no game are dropped before the
        later symbols are joined.
    """
    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv("data/{}.csv".format(symbol),
                              index_col=" DATE", parse_dates=True,
                              usecols=[" DATE", "PYDS"],
                              na_values=["nan"])
        df_temp = df_temp.rename(columns={"PYDS": symbol})
        # Once the first symbol is joined, drop dates on which that
        # player has no game so later joins align on its schedule.
        # Generalizes the original's hard-coded player names (a leftover
        # "drop dates SPY did not trade" pattern from a stock template).
        if symbol != symbols[0]:
            df = df.dropna(subset=[symbols[0]])
        df = df.join(df_temp)
    return df
def test_run():
    """Plot per-game passing yards for the selected quarterbacks."""
    # NOTE(review): this redefines the test_run() from the earlier cell;
    # consider giving the two smoke tests distinct names.
    season_dates = pd.date_range('2014-08-22', '2015-03-26')
    players = ['tom-brady_187741', 'peyton-manning_12531']
    yards = get_PassingYards(players, season_dates)
    print(yards)
    yards.plot()
test_run()
In [ ]: