notebook.community

Edit and run



In [1]:

    
dummy = 'funky tomato'
print 'this is only a test'
print len(dummy)
print dummy[1]









    



this is only a test
12
u



In [47]:

    
#from bs4 import BeautifulSoup
#import requests
#import numpy as np
#import pandas as pd
#import unicodedata, re

#displays the webpage
#from IPython.display import HTML
#HTML('http://red-eye.baseball.cbssports.com/draft/results')



######################Code snippet for javascript pages
#from selenium import webdriver  
#from selenium.common.exceptions import NoSuchElementException  
#from selenium.webdriver.common.keys import Keys  
#from bs4 import BeautifulSoup

#browser = webdriver.Firefox()  
#browser.get('http://red-eye.baseball.cbssports.com/draft/results')  
#html_source = browser.page_source  
#browser.quit()

#soup = BeautifulSoup(html_source,'html.parser')  
#####class "postText" is not defined in the source code
#comments = soup.findAll('div',{'class':'postText'})  
#print comments

import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

batters = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Batting.csv')

batters = batters[batters['yearID'] >= 2001]  #from 2001 on
batters = batters[batters['AB'] > 300]        #set minimum at bats


#test = batters.yearID['HR'].count
#print test
#bat_league_avg = batters.groupby(['yearID']).sum()/batters.yearID.shape(0)
#print bat_league_avg.head()
#batters = batters.set_index(['yearID','playerID'])

#print 'batters is', len(batters)
#print type(batters) #batters is a class of type pandas.core.frame.DataFrame
#print batters.ndim
print batters.keys()
print 'batters size = ', batters.size
print 'batters shape = ', batters.shape

#print batters['HR']['pujolal01'].size
#print batters[batters['playerID']=='pujolal01']
#gets stats for each year for albert pujols

#now can we index an individual player another way?

#if we multiindex the variable, we can call it, but we have to call some subset of the array
#mibat = batters.set_index(['yearID','playerID'])
#print mibat[]

#batters.plot(data[, x, y, kind, ax, subplots, ...])

salaries = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Salaries.csv')

#salaries_by_yearID_teamID = salaries.groupby(['yearID', 'teamID'])['salary'].sum()

#dummy = salaries.groupby(['yearID', 'teamID'])['salary']
#print salaries.head()

#print dummy.size
#print dummy.shape
#print batters.index
#batters.HR[2001,'abernbr01']
#print batters[:][:,'pujolal01']

#data type:
#Index([u'playerID', u'yearID', u'stint', u'teamID', u'lgID', u'G', u'AB', u'R',
#       u'H', u'2B', u'3B', u'HR', u'RBI', u'SB', u'CS', u'BB', u'SO', u'IBB',
#       u'HBP', u'SH', u'SF', u'GIDP'],
#      dtype='object')









    



Index([u'playerID', u'yearID', u'stint', u'teamID', u'lgID', u'G', u'AB', u'R',
       u'H', u'2B', u'3B', u'HR', u'RBI', u'SB', u'CS', u'BB', u'SO', u'IBB',
       u'HBP', u'SH', u'SF', u'GIDP'],
      dtype='object')
batters size =  74448
batters shape =  (3384, 22)
        playerID  yearID  stint teamID lgID    G   AB    R    H  2B  ...   \
81604  pujolal01    2001      1    SLN   NL  161  590  112  194  47  ...    
82934  pujolal01    2002      1    SLN   NL  157  590  118  185  40  ...    
84248  pujolal01    2003      1    SLN   NL  157  591  137  212  51  ...    
85642  pujolal01    2004      1    SLN   NL  154  592  133  196  51  ...    
86946  pujolal01    2005      1    SLN   NL  161  591  129  195  38  ...    
88306  pujolal01    2006      1    SLN   NL  143  535  119  177  33  ...    
89697  pujolal01    2007      1    SLN   NL  158  565   99  185  38  ...    
91083  pujolal01    2008      1    SLN   NL  148  524  100  187  44  ...    
92468  pujolal01    2009      1    SLN   NL  160  568  124  186  45  ...    
93829  pujolal01    2010      1    SLN   NL  159  587  115  183  39  ...    
95210  pujolal01    2011      1    SLN   NL  147  579  105  173  29  ...    
96627  pujolal01    2012      1    LAA   AL  154  607   85  173  50  ...    
98043  pujolal01    2013      1    LAA   AL   99  391   49  101  19  ...    
99457  pujolal01    2014      1    LAA   AL  159  633   89  172  37  ...    

       RBI  SB  CS   BB  SO  IBB  HBP  SH  SF  GIDP  
81604  130   1   3   69  93    6    9   1   7    21  
82934  127   2   4   72  69   13    9   0   4    20  
84248  124   5   1   79  65   12   10   0   5    13  
85642  123   5   5   84  52   12    7   0   9    21  
86946  117  16   2   97  65   27    9   0   3    19  
88306  137   7   2   92  50   28    4   0   3    20  
89697  103   2   6   99  58   22    7   0   8    27  
91083  116   7   3  104  54   34    5   0   8    16  
92468  135  16   4  115  64   44    9   0   8    23  
93829  118  14   4  103  76   38    4   0   6    23  
95210   99   9   1   61  58   15    4   0   7    29  
96627  105   8   1   52  76   16    5   0   6    19  
98043   64   1   1   40  55    8    5   0   7    18  
99457  105   5   1   48  71   11    5   0   9    28  

[14 rows x 22 columns]



In [165]:

    
#from selenium import webdriver  
#from selenium.common.exceptions import NoSuchElementException  
#from selenium.webdriver.common.keys import Keys  
#from bs4 import BeautifulSoup


import pandas as pd
#import scipy as sp
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter

batters = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Batting.csv')

batters = batters[batters['yearID'] >= 1950]  #from 2001 on
batters = batters[batters['AB'] > 500]        #set minimum at bats



bat_mlb_avg = batters.groupby(['yearID']).aggregate(np.mean)

numbat = batters.groupby('yearID')['playerID'].count()

numbat = numbat.values

print range(1960,2015)

#bat_mlb_avg = bat_mlb_avg.groupbydivide(numbat)
#bat_mlb_avg['HR'].hist()
hrarr = bat_mlb_avg['HR'].values
print hrarr
dummydf = pd.DataFrame([[hrarr],[range(1960,2015)]])

print dummydf.values

#dummydf.plot(x='yr', y='hr')









    



[1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]
[ 17.8245614   15.96153846  14.66666667  16.26315789  16.53846154
  18.43137255  19.55357143  17.69047619  17.5         19.2826087
  16.3877551   20.4         18.2875      15.3974359   17.88888889
  17.15789474  18.12328767  14.63934426  13.72307692  18.30769231
  18.23809524  14.89333333  13.88888889  16.34042553  13.          13.47058824
  11.68235294  17.21818182  14.03092784  16.29473684  14.02247191
  17.09615385  15.53333333  16.07692308  17.16129032  17.79775281
  21.34736842  14.97826087  14.03571429  15.1875      16.98780488
  14.70238095  18.02020202  21.01851852  22.40186916  20.87755102
  21.96694215  24.90740741  22.95495495  23.99029126  20.93137255
  21.93137255  21.31578947  20.79439252  21.51818182  20.10833333
  19.69811321  20.78431373  19.61904762  19.61290323  20.6728972
  18.2371134   16.92079208]
[[ array([ 17.8245614 ,  15.96153846,  14.66666667,  16.26315789,
        16.53846154,  18.43137255,  19.55357143,  17.69047619,
        17.5       ,  19.2826087 ,  16.3877551 ,  20.4       ,
        18.2875    ,  15.3974359 ,  17.88888889,  17.15789474,
        18.12328767,  14.63934426,  13.72307692,  18.30769231,
        18.23809524,  14.89333333,  13.88888889,  16.34042553,
        13.        ,  13.47058824,  11.68235294,  17.21818182,
        14.03092784,  16.29473684,  14.02247191,  17.09615385,
        15.53333333,  16.07692308,  17.16129032,  17.79775281,
        21.34736842,  14.97826087,  14.03571429,  15.1875    ,
        16.98780488,  14.70238095,  18.02020202,  21.01851852,
        22.40186916,  20.87755102,  21.96694215,  24.90740741,
        22.95495495,  23.99029126,  20.93137255,  21.93137255,
        21.31578947,  20.79439252,  21.51818182,  20.10833333,
        19.69811321,  20.78431373,  19.61904762,  19.61290323,
        20.6728972 ,  18.2371134 ,  16.92079208])]
 [ [1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]]]



In [61]:

    
import pandas as pd
#import scipy as sp
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter

batters = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Batting.csv')

start_yr = 2001  
#this is the year the stats begin at (data set goes back to 1871)

min_ab = 5
#minimum at bats to be included in stats

#print batters['HR'][:5]

test = batters[batters['stint'] > 1].add(batters[batters['stint'] == 1])

#print batters[(batters['playerID'] == 'drewst01') & (batters['yearID'] == 2014)].index.values

traded_player_list = np.array(batters[batters['stint'] > 1].index.values)

#print batters.axes
#prints names of all columns and rows in the datatframe


#print traded_player_list

#for tr_ind in traded_player_list:
    

#print test.head(100)

batters = batters[batters['yearID'] >= start_yr]  
batters = batters[batters['AB'] > min_ab]        #set minimum at bats

#newbat = batters
newbat = batters.set_index(['yearID','playerID'])
#bat_mlb_avg = batters.groupby(['yearID']).aggregate(np.mean)
#gets mlb averages for every stat

#numbat = batters.groupby('yearID')['playerID'].count()
#numbat = numbat.values
#intended to get number of batters per season

#print "batters"
#print batters.head()

#newbat = batters.groupby(['yearID','playerID']).sum() #aggregate(np.mean)
#newbat = batters.groupby(['yearID','playerID']).aggregate(np.mean)
#print "newbat"
#print newbat.head()

#print newbat['RBI'][2013,'pujolal01']
#successfully gets rbi data for pujols for 2013

#print newbat.loc[2013]
#prints all stats for 2013 for all players

print newbat.loc[2013,'drewst01']
#prints only stats for stephen drew in 2013

#print range(1960,2015)









    



                 stint teamID lgID    G   AB   R    H  2B  3B  HR  RBI  SB  \
yearID playerID                                                              
2013   drewst01      1    BOS   AL  124  442  57  112  29   8  13   67   6   

                 CS  BB   SO  IBB  HBP  SH  SF  GIDP  
yearID playerID                                       
2013   drewst01   0  54  124    3    1   0   4     9



In [ ]:



In [111]:

    
import pandas as pd
#import scipy as sp
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter

batters_raw = pd.read_csv('/Users/jasonday/Python/baseball-data/lahman-csv_2015-01-24/Batting.csv')

start_yr = 1900  
#this is the year the stats begin at (data set goes back to 1871)

min_ab = 500
#minimum at bats to be included in stats

batters_raw = batters_raw[batters_raw['yearID'] >= start_yr]  #filter out data before start year 

batters = batters_raw.groupby(['yearID','playerID']).aggregate(np.sum)
#combine all stints for a player for their season numbers

batters = batters[batters['AB'] > min_ab]        #set minimum at bats

batters_reset = batters.reset_index()  #need to reset index to calculate averages using groupby

bat_mlb_avg = batters_reset.groupby(['yearID']).aggregate(np.mean)

#print bat_mlb_avg.head()

### now calculate fantasy league stats using a function, and add them to the database
def add_redeye_bat_stats(batstat):
    avg = batstat.H/batstat.AB
    hr = batstat.HR
    rbi = batstat.RBI
    runs = batstat.R
    obp = (batstat.H + batstat.BB + batstat.HBP)/(batstat.AB + batstat.BB + batstat.HBP + batstat.SF)
    slg = (batstat.H + batstat['2B'] + 2*batstat['3B'] + 3*batstat.HR)/(batstat.AB)
    firstp = ((batstat.H - batstat['2B'] - batstat['3B'] - batstat.HR)/2) + batstat.BB + batstat.HBP
    netsb = batstat.SB - batstat.CS
    xbh = batstat['2B'] + batstat['3B'] + batstat.HR

#add the extra categories to the df
    batstat['AVG'] = pd.Series(avg, index=batstat.index)
    batstat['OBP'] = pd.Series(obp, index=batstat.index)
    batstat['SLG'] = pd.Series(slg, index=batstat.index)
    batstat['1B+'] = pd.Series(firstp, index=batstat.index)
    batstat['SB-CS'] = pd.Series(netsb, index=batstat.index)
    batstat['XBH'] = pd.Series(xbh, index=batstat.index)

    return batstat

batters = add_redeye_bat_stats(batters)

print batters.head()

bat_mlb_avg = add_redeye_bat_stats(bat_mlb_avg)

### Now, how did they do compared to league average?

pd.set_option('display.mpl_style', 'default') 
pd.set_option('display.line_width', 5000) 
pd.set_option('display.max_columns', 60) 


bat_mlb_avg['HR'].plot









    



                  stint    G   AB    R    H  2B  3B  HR  RBI  SB ...   HBP  \
yearID playerID                                                  ...         
1900   barreji01      1  137  545  114  172  11   7   5   42  44 ...     5   
       beaumgi01      1  138  567  105  158  14   9   5   50  27 ...     4   
       becklja01      1  141  558   98  190  26  10   2   94  23 ...     4   
       burkeje01      1  141  559   88  203  11  15   7   68  32 ...     3   
       childcu01      1  137  531   67  128  14   5   0   44  15 ...     7   

                  SH  SF  GIDP       AVG  OBP       SLG    1B+  SB-CS  XBH  
yearID playerID                                                             
1900   barreji01  10 NaN   NaN  0.315596  NaN  0.388991  151.5    NaN   23  
       beaumgi01  21 NaN   NaN  0.278660  NaN  0.361552  109.0    NaN   28  
       becklja01  12 NaN   NaN  0.340502  NaN  0.433692  120.0    NaN   38  
       burkeje01  19 NaN   NaN  0.363148  NaN  0.474061  150.0    NaN   33  
       childcu01  20 NaN   NaN  0.241055  NaN  0.286252  118.5    NaN   19  

[5 rows x 24 columns]
line_width has been deprecated, use display.width instead (currently both are
identical)







    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-111-cd32745dbc44> in <module>()
     61 pd.set_option('display.max_columns', 60)
     62 
---> 63 figsize(15, 5)
     64 
     65 bat_mlb_avg['HR'].plot

NameError: name 'figsize' is not defined



In [ ]: