In [1]:
# Scratch cell: sanity-check printing, string length and string indexing.
# Each print takes a single parenthesised argument so the cell runs, with
# identical output, on both Python 2 and Python 3 kernels.
dummy = 'funky tomato'
print('this is only a test')
print(len(dummy))  # number of characters in the string -> 12
print(dummy[1])    # second character (indexing is zero-based) -> u
In [47]:
#from bs4 import BeautifulSoup
#import requests
#import numpy as np
#import pandas as pd
#import unicodedata, re
#displays the webpage
#from IPython.display import HTML
#HTML('http://red-eye.baseball.cbssports.com/draft/results')
######################Code snippet for javascript pages
#from selenium import webdriver
#from selenium.common.exceptions import NoSuchElementException
#from selenium.webdriver.common.keys import Keys
#from bs4 import BeautifulSoup
#browser = webdriver.Firefox()
#browser.get('http://red-eye.baseball.cbssports.com/draft/results')
#html_source = browser.page_source
#browser.quit()
#soup = BeautifulSoup(html_source,'html.parser')
#####class "postText" is not defined in the source code
#comments = soup.findAll('div',{'class':'postText'})
#print comments
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# Load the Lahman batting table and keep only player-season rows from 2001
# onward with more than 300 at bats. Both conditions are evaluated on the
# freshly loaded frame and applied as a single mask.
batters = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Batting.csv')
recent_enough = batters['yearID'] >= 2001  # from 2001 on
enough_at_bats = batters['AB'] > 300       # minimum at bats
batters = batters[recent_enough & enough_at_bats]

# Quick look at what was kept. Each print takes one pre-formatted argument so
# the output is the same text on Python 2 and Python 3 kernels.
print(batters.keys())
print('batters size =  {0}'.format(batters.size))
print('batters shape =  {0}'.format(batters.shape))

# Exploration notes:
#   * batters is a pandas.core.frame.DataFrame.
#   * batters[batters['playerID']=='pujolal01'] gets the stats for each year
#     for Albert Pujols.
#   * Another way to reach an individual player is a MultiIndex, e.g.
#     batters.set_index(['yearID','playerID']); once multi-indexed, some
#     subset of the frame has to be selected when calling it.

# Salary table from the same Lahman release.
# Idea not pursued yet: payroll per team and season via
#   salaries.groupby(['yearID', 'teamID'])['salary'].sum()
salaries = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Salaries.csv')

# Columns of the batting frame, as printed by batters.keys():
#Index([u'playerID', u'yearID', u'stint', u'teamID', u'lgID', u'G', u'AB', u'R',
# u'H', u'2B', u'3B', u'HR', u'RBI', u'SB', u'CS', u'BB', u'SO', u'IBB',
# u'HBP', u'SH', u'SF', u'GIDP'],
# dtype='object')
In [165]:
#from selenium import webdriver
#from selenium.common.exceptions import NoSuchElementException
#from selenium.webdriver.common.keys import Keys
#from bs4 import BeautifulSoup
import pandas as pd
#import scipy as sp
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter
# League-average home runs per season among batters with more than 500 at bats.
batters = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Batting.csv')
batters = batters[batters['yearID'] >= 1950]  # from 1950 on
batters = batters[batters['AB'] > 500]        # set minimum at bats

# Mean of every stat per season; the result is indexed by yearID.
bat_mlb_avg = batters.groupby(['yearID']).aggregate(np.mean)

# Number of rows (batters) that passed the filters in each season.
numbat = batters.groupby('yearID')['playerID'].count().values

# Take the seasons from the groupby index rather than a hard-coded range, so
# the year axis always has exactly one entry per value in hrarr.
years = bat_mlb_avg.index.values
print(years)

hrarr = bat_mlb_avg['HR'].values
print(hrarr)

# One row per season with named columns, ready for a year-vs-HR plot.
dummydf = pd.DataFrame({'yr': years, 'hr': hrarr}, columns=['yr', 'hr'])
print(dummydf.values)
# next step: dummydf.plot(x='yr', y='hr')
In [61]:
import pandas as pd
#import scipy as sp
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter
# Look up one player's batting line for one season through a
# (yearID, playerID) MultiIndex.
batters = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Batting.csv')
start_yr = 2001
# this is the year the stats begin at (data set goes back to 1871)
min_ab = 5
# a row needs more than this many at bats to be included in stats

# Season totals per player with all stints combined. The earlier attempt,
# adding the stint > 1 rows to the stint == 1 rows with DataFrame.add, aligned
# the two selections on their row labels; those never overlap, so it could not
# combine anything. Grouping on (yearID, playerID) and summing does.
test = batters.groupby(['yearID', 'playerID']).sum()

# Row labels (not player IDs, despite the name) of every row with stint > 1 --
# presumably the later stints of players who changed teams during a season.
traded_player_list = np.array(batters[batters['stint'] > 1].index.values)
# tip: batters.axes gives the names of all columns and rows in the dataframe

batters = batters[batters['yearID'] >= start_yr]
batters = batters[batters['AB'] > min_ab]  # set minimum at bats

# Index by season and player. Stints are NOT combined here, so a player with
# several stints in one season has several rows under the same key.
newbat = batters.set_index(['yearID', 'playerID'])

# Lookup patterns that work on newbat:
#   newbat['RBI'][2013,'pujolal01']  -> RBI data for Pujols for 2013
#   newbat.loc[2013]                 -> all stats for 2013 for all players
# prints only stats for stephen drew in 2013
print(newbat.loc[2013, 'drewst01'])
In [ ]:
In [111]:
import pandas as pd
#import scipy as sp
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter
# Tunable cut-offs for this cell.
start_yr = 1900  # first season to keep (the data set goes back to 1871)
min_ab = 500     # a player-season must exceed this many at bats to be kept

# Load the batting table and drop everything before the start year.
batters_raw = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Batting.csv')
from_start_yr = batters_raw['yearID'] >= start_yr
batters_raw = batters_raw[from_start_yr]

# Collapse all of a player's stints within a season into one row of season
# totals, then keep only the rows above the at-bat minimum.
batters = batters_raw.groupby(['yearID', 'playerID']).agg(np.sum)
above_min_ab = batters['AB'] > min_ab
batters = batters[above_min_ab]

# yearID has to be an ordinary column again before grouping on it to get the
# per-season averages across the remaining batters.
batters_reset = batters.reset_index()
bat_mlb_avg = batters_reset.groupby(['yearID']).agg(np.mean)
### now calculate fantasy league stats using a function, and add them to the database
def add_redeye_bat_stats(batstat):
    """Add the derived fantasy-league batting categories to a stats frame.

    Parameters
    ----------
    batstat : pandas.DataFrame
        Must contain the columns H, AB, BB, HBP, SF, 2B, 3B, HR, SB and CS.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is modified in place), with
        these columns added or overwritten:

        AVG    H / AB
        OBP    (H + BB + HBP) / (AB + BB + HBP + SF)
        SLG    (H + 2B + 2*3B + 3*HR) / AB, i.e. total bases per at bat
        1B+    singles / 2 + BB + HBP
        SB-CS  SB - CS
        XBH    2B + 3B + HR

    Notes
    -----
    Missing HBP or SF values are counted as 0 so that rows lacking those
    columns still get an OBP and a 1B+ value instead of NaN.
    """
    # Work in floats so '/' is true division on Python 2 as well as Python 3,
    # whatever integer dtype the counting columns were loaded with.
    hits = batstat['H'].astype(float)
    at_bats = batstat['AB'].astype(float)
    walks = batstat['BB'].astype(float)
    hit_by_pitch = batstat['HBP'].fillna(0).astype(float)
    sac_flies = batstat['SF'].fillna(0).astype(float)

    doubles = batstat['2B']
    triples = batstat['3B']
    homers = batstat['HR']
    singles = hits - doubles - triples - homers

    # Add the extra categories to the frame. Every result is already aligned
    # on batstat.index, so plain column assignment is enough.
    batstat['AVG'] = hits / at_bats
    batstat['OBP'] = (hits + walks + hit_by_pitch) / (at_bats + walks + hit_by_pitch + sac_flies)
    batstat['SLG'] = (hits + doubles + 2 * triples + 3 * homers) / at_bats
    batstat['1B+'] = singles / 2.0 + walks + hit_by_pitch
    batstat['SB-CS'] = batstat['SB'] - batstat['CS']
    batstat['XBH'] = doubles + triples + homers
    return batstat
# Display options go first so they already apply to the frame printed below.
# 'display.width' is the current name of the deprecated 'display.line_width'.
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)
try:
    pd.set_option('display.mpl_style', 'default')
except KeyError:
    # Newer pandas releases no longer have this option (the OptionError they
    # raise is a KeyError subclass); matplotlib style sheets replace it.
    plt.style.use('ggplot')

# Add the derived fantasy categories to the per-player season totals and to
# the per-season league averages.
batters = add_redeye_bat_stats(batters)
print(batters.head())
bat_mlb_avg = add_redeye_bat_stats(bat_mlb_avg)

### Now, how did they do compared to league average?
# .plot has to be called -- the bare attribute reference drew nothing.
ax = bat_mlb_avg['HR'].plot()
ax.set_xlabel('yearID')
ax.set_ylabel('HR');
In [ ]: