Exploratory Data Analysis for Baseball Database

The main purpose is to gain an overview of the dataset. Identify problems in the dataset that need to be corrected. Identify outliers and consider if these are real or erros in the dataset.

The exploratory data analysis (EDA) process will help firm up which dependent variables should be used to investigate independent variables and how the former depends on the latter.

The theme of this investigation is to ask if geographical location has an affect, if where a person was born, where the college was located has an impact on a dependent variable (e.g. Salary of player).



In [1]:

    
from __future__ import print_function
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Move to the top level of the repository
# to access the script and ballbase module
os.chdir((os.path.join('..', '..')))

import Baseball_data_investigation
from ballbase import figures

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-ba1e365d6464> in <module>()
     11 
     12 import Baseball_data_investigation
---> 13 from ballbase import figures
     14 
     15 # Config the matplotlib backend as plotting inline in IPython

~/Documents/GIT/Investigating_Baseball_Data/ballbase/figures.py in <module>()
    561 
    562 
--> 563 def count_bar(data, name, color_set=custom, ax_size=(20, 6), funky=False, highlight=None, ax=None):
    564     """Make a univariate distribution
    565     of a variable.

NameError: name 'custom' is not defined



In [2]:

    
df = Baseball_data_investigation.main()
df.head()









    



Processed hall of fame data

Processed All Star data

Processed Player Awards data

Processed Salary data

Processed college locations
Birth Year Size 18973

Processed master file
master_merge is ready
data audit complete
final birth year size 6575






    Out[2]:







  
    
      
      birthYear
      birthMonth
      birthDay
      birthCountry
      birthState
      birthCity
      deathYear
      deathMonth
      deathDay
      deathCountry
      ...
      max_salary
      min_salary
      mean_salary_standardized_annually
      max_salary_standardized_annually
      min_salary_standardized_annually
      mode_schoolID
      college_name_full
      college_city
      college_state
      college_country
    
    
      playerID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      aardsda01
      1981.0
      12.0
      27.0
      USA
      CO
      Denver
      NaN
      NaN
      NaN
      NaN
      ...
      4500000.0
      300000.0
      -0.440097
      0.260102
      -0.670224
      rice
      Rice University
      Houston
      TX
      USA
    
    
      abadan01
      1972.0
      8.0
      25.0
      USA
      FL
      Palm Beach
      NaN
      NaN
      NaN
      NaN
      ...
      327000.0
      327000.0
      -0.663649
      -0.663649
      -0.663649
      gamiddl
      Middle Georgia College
      Cochran
      GA
      USA
    
    
      abbeybe01
      1869.0
      11.0
      11.0
      USA
      VT
      Essex
      1962.0
      6.0
      11.0
      USA
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      vermont
      University of Vermont
      Burlington
      VT
      USA
    
    
      abbotje01
      1972.0
      8.0
      17.0
      USA
      GA
      Atlanta
      NaN
      NaN
      NaN
      NaN
      ...
      300000.0
      175000.0
      -0.644164
      -0.598825
      -0.690417
      kentucky
      University of Kentucky
      Lexington
      KY
      USA
    
    
      abbotji01
      1967.0
      9.0
      19.0
      USA
      MI
      Flint
      NaN
      NaN
      NaN
      NaN
      ...
      2775000.0
      68000.0
      0.273098
      1.275547
      -0.814658
      michigan
      University of Michigan
      Ann Arbor
      MI
      USA
    
  

5 rows × 44 columns



In [3]:

    
custom = ['#192231','#3C3C3C','#CDCDCD', '#494E6B']
ToddTerje = ['#F24C4E', '#EAB126', '#1FB58F', '#1B7B34']

def frequency_polygon(x, name, categorical_v=None, color_set=custom, proportion=False, ax_size=(10, 5), funky=False, formatting_right=True, x_truncation_upper=None, x_truncation_lower=None, ax=None):
    """Returns a frequency polygon
    plot which can be used with catergorical
    data to show difference between categories.
    """
    
    if funky:
        color_set = ToddTerje
    
    figures.common_set_up(ax_size)
    
    x_max = x.max()
    x_min = x.min()
    bin_n = int(x_max)-int(x_min)

    # Get height and a position form a histogram
    # to be turned into point for frequency polygon
    y, bin_bounds=np.histogram(x, bins=bin_n)
    bin_edge = bin_bounds[:-1]

    if proportion:
        x_size = len(x)
        y = y/x_size
        y_str = "Proportion"
    else:
        y_str = "Frequency"
    
    fig = sns.pointplot(bin_edge, y, color='#192231', scale=0.3, marker='.')
    
    title_color = '#192231'
    font_colour = '#9099A2'

    # Do not add a title in a multi-figure plot.
    #
    # Title will be added to figure with all sub-plots
    # instead in this case
    if ax is None: 
        fig.set_title(('Frequency polygon of {0}'.format(name)),
                    fontsize=20, color=title_color)
    fig.set_xlabel('{0}'.format(name),
                   color=font_colour)
    fig.set_ylabel(y_str.format(name),
                   color=font_colour)

    # Limit the x axis by truncating
    if x_truncation_upper or x_truncation_lower:
        axes = fig.axes
        fig.set_xlim(x_truncation_lower, x_truncation_upper)
        # To be communicated back in Formatting notes
        x_truncation_upper_str = 'x axis truncated by {0}\n'.format(x_truncation_upper)
        x_truncation_lower_str = 'x axis truncated after {0}\n'.format(x_truncation_lower)

        parameters = ('Formatting:\n'
                    + x_truncation_lower_str
                    + x_truncation_upper_str)

        fig = figures.formatting_text_box(fig, parameters, formatting_right)
    
    # Will not work on multiple subplots within a figure
    if ax is None:
        # Seaborn despine to remove boundaries around plot
        sns.despine(offset=2, trim=True, left=True, bottom=True)

admc = frequency_polygon(df['allstar_count'].dropna(), "Allstar count", proportion=True)



In [15]:

    
df2 = df[["allstar_count", "birthState"]].dropna().copy()
dfs = {value: rows for value, rows in df2.groupby('birthState')}
dfs









    Out[15]:





{'AK':            allstar_count birthState
 playerID                           
 schilcu01            6.0         AK, 'AL':            allstar_count birthState
 playerID                           
 ageeto01             2.0         AL
 benedbr01            2.0         AL
 bollifr01            4.0         AL
 brantje01            1.0         AL
 davenji01            2.0         AL
 fostege01            5.0         AL
 jacksbo01            1.0         AL
 keyji01              4.0         AL
 klubeco01            1.0         AL
 laryfr01             3.0         AL
 roberda08            1.0         AL
 sewellu01            1.0         AL
 sewelri01            4.0         AL
 smithoz01           15.0         AL
 suttodo01            4.0         AL
 vealebo01            2.0         AL, 'AR':            allstar_count birthState
 playerID                           
 brocklo01            6.0         AR
 frankry01            1.0         AR
 jacksra01            2.0         AR
 kessido01            6.0         AR
 leecl02              4.0         AR
 mondari01            2.0         AR
 moonwa01             3.0         AR
 roepr01              5.0         AR, 'AZ':            allstar_count birthState
 playerID                           
 ethiean01            2.0         AZ
 hillesh02            2.0         AZ
 kinslia01            4.0         AZ
 leibeha01            3.0         AZ
 pagnoto01            1.0         AZ, 'BC':            allstar_count birthState
 playerID                           
 bayja01              3.0         BC
 zimmeje02            1.0         BC, 'CA':            allstar_count birthState
 playerID                           
 aguilri01            3.0         CA
 appieke01            1.0         CA
 bartlja01            1.0         CA
 bellhe01             3.0         CA
 blackew01            6.0         CA
 blausje01            2.0         CA
 bochtbr01            1.0         CA
 bondsba01           14.0         CA
 booneaa01            1.0         CA
 boonebo01            4.0         CA
 boonebr01            3.0         CA
 bowala01             5.0         CA
 boxbebr01            1.0         CA
 braunry02            6.0         CA
 brookhu01            2.0         CA
 burnije01            1.0         CA
 busbyst01            2.0         CA
 butlebr01            1.0         CA
 caminke01            3.0         CA
 castrja01            1.0         CA
 chapmsa01            1.0         CA
 cimolgi01            1.0         CA
 cirilje01            2.0         CA
 colboji01            1.0         CA
 colege01             1.0         CA
 cookry01             1.0         CA
 cordech01            1.0         CA
 correke01            1.0         CA
 craigal01            1.0         CA
 crawfbr01            1.0         CA
 ...                  ...        ...
 smallro02            1.0         CA
 smithda02            2.0         CA
 speiech01            3.0         CA
 spraged02            1.0         CA
 stiebda01            7.0         CA
 strasst01            2.0         CA
 thompja01            3.0         CA
 trachst01            1.0         CA
 tulowtr01            5.0         CA
 utleych01            6.0         CA
 vaughgr01            4.0         CA
 venturo01            2.0         CA
 vinafe01             1.0         CA
 walkbo01             1.0         CA
 wallati01            5.0         CA
 wallsle01            1.0         CA
 watsobo01            2.0         CA
 weaveje02            3.0         CA
 wettejo01            3.0         CA
 wiggity01            1.0         CA
 willima04            5.0         CA
 wilsocj01            2.0         CA
 wilsoja02            1.0         CA
 wilsoji02            3.0         CA
 winnra01             1.0         CA
 wolfra02             1.0         CA
 worreto01            3.0         CA
 wrighst01            1.0         CA
 youngma01            1.0         CA
 youngmi02            7.0         CA
 
 [135 rows x 2 columns], 'CO':            allstar_count birthState
 playerID                           
 martiti01            1.0         CO
 melanma01            3.0         CO
 stearjo01            4.0         CO, 'CT':            allstar_count birthState
 playerID                           
 bottari01            1.0         CT
 dibblro01            2.0         CT
 dropowa01            1.0         CT
 harvema01            1.0         CT
 nagych01             3.0         CT
 polloaj01            1.0         CT
 vaughmo01            3.0         CT, 'DC':            allstar_count birthState
 playerID                           
 donnebr01            1.0         DC, 'DE':            allstar_count birthState
 playerID                           
 goldspa01            4.0         DE
 shortch02            2.0         DE, 'Distrito Nacional':            allstar_count         birthState
 playerID                                   
 alvarpe01            1.0  Distrito Nacional
 bautijo02            6.0  Distrito Nacional
 penaca01             1.0  Distrito Nacional
 polanpl01            2.0  Distrito Nacional
 pujolal01           10.0  Distrito Nacional, 'FL':            allstar_count birthState
 playerID                           
 avilaal01            1.0         FL
 bicheda01            4.0         FL
 byrdma01             1.0         FL
 carltst01           10.0         FL
 cartela02            1.0         FL
 colemvi01            2.0         FL
 corbedo01            1.0         FL
 davisgl01            2.0         FL
 dawsoan01            8.0         FL
 donaljo02            3.0         FL
 drewjd01             1.0         FL
 eckstda01            2.0         FL
 garvest01           10.0         FL
 gonzalu01            5.0         FL
 howelja01            3.0         FL
 howsedi01            2.0         FL
 hudekjo01            1.0         FL
 johnsch04            2.0         FL
 johnsda02            4.0         FL
 joycema01            1.0         FL
 kendrho01            1.0         FL
 lucrojo01            2.0         FL
 ludwiry01            1.0         FL
 martijd02            1.0         FL
 martiti02            2.0         FL
 mcraeha01            3.0         FL
 millean01            1.0         FL
 murphda08            2.0         FL
 odayda01             1.0         FL
 parrila01            2.0         FL
 perezch01            2.0         FL
 rivermi01            1.0         FL
 salech01             5.0         FL
 sanchga01            1.0         FL
 stanlmi02            1.0         FL
 thigpbo01            1.0         FL
 thompro01            2.0         FL
 wakefti01            1.0         FL
 weeksri01            1.0         FL
 whitebi03            8.0         FL, 'GA':            allstar_count birthState
 playerID                           
 aloumo01             6.0         GA
 brownke01            6.0         GA
 chandsp01            4.0         GA
 davisjo02            2.0         GA
 dentbu01             3.0         GA
 fairlro01            2.0         GA
 grissma02            2.0         GA
 hearnji01            1.0         GA
 hudsoti01            4.0         GA
 jonesto02            1.0         GA
 joynewa01            1.0         GA
 poseybu01            4.0         GA
 robinja02            6.0         GA
 thomafr04            5.0         GA, 'HI':            allstar_count birthState
 playerID                           
 darliro01            1.0         HI
 suzukku01            1.0         HI, 'IA':            allstar_count birthState
 playerID                           
 boddimi01            1.0         IA
 brownma01            1.0         IA
 josepdu01            1.0         IA
 laudnti01            1.0         IA
 liebejo01            1.0         IA
 wachami01            1.0         IA
 watsoto01            1.0         IA, 'ID':            allstar_count birthState
 playerID                           
 jacksla01            5.0         ID
 lawva01              1.0         ID
 schroke01            1.0         ID, 'IL':            allstar_count birthState
 playerID                           
 boudrlo01            8.0         IL
 caprabu01            1.0         IL
 coomero01            1.0         IL
 finigji01            2.0         IL
 fletcda01            1.0         IL
 gaettga01            2.0         IL
 girarjo01            1.0         IL
 grandcu01            3.0         IL
 gurala01             1.0         IL
 halleto01            3.0         IL
 isrinja01            2.0         IL
 johnsdo02            2.0         IL
 kipnija01            2.0         IL
 kluszte01            4.0         IL
 kolbda01             1.0         IL
 lynnfr01             9.0         IL
 muldema01            2.0         IL
 otoolji01            1.0         IL
 pattima01            1.0         IL
 peterfr01            1.0         IL
 puckeki01           10.0         IL
 reuscri01            3.0         IL
 roberro01            7.0         IL
 seitzke01            2.0         IL
 skowrbi01            8.0         IL
 stonebi01            1.0         IL
 sundbji01            3.0         IL
 thomeji01            5.0         IL
 wakefdi01            1.0         IL
 wilsoda01            1.0         IL
 zobribe01            3.0         IL, 'IN':            allstar_count birthState
 playerID                           
 benesan01            1.0         IN
 bradlph01            1.0         IN
 crowege01            1.0         IN
 hildeor01            1.0         IN
 hodgegi01            8.0         IN
 loftoke01            6.0         IN
 lynnla01             1.0         IN
 maypi01              1.0         IN
 plesada01            3.0         IN
 reedro01             1.0         IN
 samarje01            1.0         IN, 'KS':            allstar_count birthState
 playerID                           
 crowaa01             1.0         KS
 grimsro02            1.0         KS
 hornebo01            1.0         KS, 'KY':            allstar_count birthState
 playerID                           
 buhneja01            1.0         KY
 byrdpa01             1.0         KY
 ugglada01            3.0         KY
 webbbr01             3.0         KY, 'Kingston':            allstar_count birthState
 playerID                           
 masteju01            1.0   Kingston, 'LA':            allstar_count birthState
 playerID                           
 adcocjo01            2.0         LA
 belleal01            5.0         LA
 charlno01            1.0         LA
 clarkwi02            6.0         LA
 finlech01            5.0         LA
 fisheed02            1.0         LA
 garrra01             1.0         LA
 guidrro01            4.0         LA
 harpeto01            1.0         LA
 lyonste01            1.0         LA
 mileywa01            1.0         LA
 papeljo01            6.0         LA
 pettian01            3.0         LA
 reynosh01            1.0         LA
 ryanbj01             2.0         LA
 ryanco01             1.0         LA
 sheetbe01            4.0         LA
 willisc01            1.0         LA
 wilsodo01            1.0         LA, 'La Habana':            allstar_count birthState
 playerID                           
 grandya01            1.0  La Habana
 palmera01            4.0  La Habana, 'MA':            allstar_count birthState
 playerID                           
 bagweje01            4.0         MA
 bedrost01            1.0         MA
 bereja01             1.0         MA
 capuach01            1.0         MA
 cochrmi01            2.0         MA
 disarga01            1.0         MA
 hillke01             1.0         MA
 lahaibr01            1.0         MA
 raschvi01            4.0         MA
 reardje01            4.0         MA
 siebedi01            1.0         MA, 'MD':            allstar_count birthState
 playerID                           
 anderbr01            3.0         MD
 byrneto01            1.0         MD
 cecilbr01            1.0         MD
 jordabr01            1.0         MD
 kellech01            5.0         MD
 neaglde01            2.0         MD
 nichobi01            5.0         MD
 teixema01            3.0         MD, 'MI':            allstar_count birthState
 playerID                           
 bordimi01            1.0         MI
 freehbi01           11.0         MI
 grillja01            1.0         MI
 howest01             1.0         MI
 kaatji01             3.0         MI
 maxwech01            2.0         MI
 maysjo01             1.0         MI
 pacioto01            1.0         MI
 putzjj01             1.0         MI
 radatdi01            2.0         MI
 saboch01             3.0         MI
 sandesc01            1.0         MI
 sorenla01            1.0         MI
 thornma01            1.0         MI
 treshto01            3.0         MI
 varitja01            3.0         MI
 welchbo01            2.0         MI
 whitter01            1.0         MI, 'MN':            allstar_count birthState
 playerID                           
 molitpa01            7.0         MN
 morrija02            5.0         MN
 olsongr02            1.0         MN
 perkigl01            3.0         MN
 repulri01            1.0         MN
 seleaa01             2.0         MN
 steinte01            3.0         MN
 winfida01           12.0         MN, 'MO':            allstar_count birthState
 playerID                           
 allisbo01            3.0         MO
 arrieja01            1.0         MO
 ashbyan01            2.0         MO
 buehrma01            5.0         MO
 eversho01            2.0         MO
 fettelo01            1.0         MO
 henketo01            2.0         MO
 hennemi01            1.0         MO
 holtzke01            2.0         MO
 howarry01            3.0         MO
 mcbriba01            1.0         MO
 odonojo01            1.0         MO
 rogerst01            5.0         MO
 scherma01            4.0         MO
 siebeso01            2.0         MO
 stottme01            5.0         MO, 'MS':            allstar_count birthState
 playerID                           
 burksel01            2.0         MS
 doziebr01            1.0         MS
 ferrida01            2.0         MS
 lawtoma02            2.0         MS
 meltobi01            1.0         MS
 myerbu01             2.0         MS
 oswalro01            3.0         MS
 passecl01            5.0         MS
 walkege02            1.0         MS, 'NC':            allstar_count birthState
 playerID                           
 altmage01            3.0         NC
 andrena01            1.0         NC
 bibbyji01            1.0         NC
 breweto01            1.0         NC
 brownji03            1.0         NC
 ferreri01            8.0         NC
 gracema01            3.0         NC
 hintoch01            1.0         NC
 hollagr01            2.0         NC
 martist01            1.0         NC
 perryji01            3.0         NC
 roberbr01            2.0         NC
 scarbra01            1.0         NC
 seageky01            1.0         NC
 vogelry01            1.0         NC
 whitebu01            2.0         NC
 zimmery01            1.0         NC, 'ND':            allstar_count birthState
 playerID                           
 erstada01            2.0         ND, 'NE':            allstar_count birthState
 playerID                           
 burketi01            1.0         NE
 cervbo01             1.0         NE
 gibsobo01            9.0         NE
 gordoal01            3.0         NE
 olsongr01            1.0         NE, 'NH':            allstar_count birthState
 playerID                           
 flanami01            1.0         NH
 rolfere01            4.0         NH
 tewksbo01            1.0         NH
 wilsobr01            3.0         NH, 'NJ':            allstar_count birthState
 playerID                           
 armstja01            1.0         NJ
 bailean01            2.0         NJ
 borowha01            2.0         NJ
 brachbr01            1.0         NJ
 caseyse01            3.0         NJ
 frazito01            2.0         NJ
 hammoje01            1.0         NJ
 hansoer01            1.0         NJ
 messean01            4.0         NJ
 montejo01            1.0         NJ
 rosadjo01            2.0         NJ
 wagneha01            2.0         NJ
 younger01            1.0         NJ, 'NM':            allstar_count birthState
 playerID                           
 ontivst02            1.0         NM
 stephve01            8.0         NM, 'NV':            allstar_count birthState
 playerID                           
 macdomi01            1.0         NV
 zitoba01             3.0         NV, 'NY':            allstar_count birthState
 playerID                           
 aurilri01            1.0         NY
 biggicr01            7.0         NY
 brancra01            3.0         NY
 easleda01            1.0         NY
 francjo01            4.0         NY
 friscfr01            3.0         NY
 gehrilo01            7.0         NY
 giustda01            1.0         NY
 harnipe01            1.0         NY
 heatone01            1.0         NY
 hershor01            3.0         NY
 hollida01            1.0         NY
 ibanera01            1.0         NY
 keegabo01            1.0         NY
 konstji01            1.0         NY
 koufasa01            7.0         NY
 lemanda01            1.0         NY
 loducpa01            4.0         NY
 morrima01            2.0         NY
 murphjo04            3.0         NY
 norenir01            1.0         NY
 remlimi01            1.0         NY
 russoma01            1.0         NY
 scheiri01            1.0         NY
 schumha02            2.0         NY
 shawbo01             1.0         NY
 singlke01            3.0         NY
 stirnsn01            2.0         NY
 surhobj01            1.0         NY
 violafr01            3.0         NY
 weisswa01            1.0         NY
 yosted01             1.0         NY, 'OH':            allstar_count birthState
 playerID                           
 bandosa01            4.0         OH
 brenlbo01            1.0         OH
 chambch01            1.0         OH
 clemero02           11.0         OH
 draveda01            1.0         OH
 edwarjo01            3.0         OH
 ellissa01            1.0         OH
 greenty01            1.0         OH
 harrijo05            1.0         OH
 howarfr01            4.0         OH
 humeto01             1.0         OH
 johnsla03            1.0         OH
 justida01            3.0         OH
 kennete02            4.0         OH
 larkiba01           12.0         OH
 lawrebr01            1.0         OH
 montgje01            3.0         OH
 munsoth01            7.0         OH
 niekrjo01            1.0         OH
 russeje01            2.0         OH
 schmimi01           12.0         OH
 shawje01             2.0         OH
 stonest01            1.0         OH
 swishni01            1.0         OH
 tekulke01            1.0         OH
 wynnji01             3.0         OH
 youklke01            3.0         OH, 'OK':            allstar_count birthState
 playerID                           
 cartejo01            5.0         OK
 conlege01            4.0         OK
 darkal01             3.0         OK
 keuchda01            1.0         OK
 mitchda01            2.0         OK
 mooremi01            1.0         OK
 rayjo01              1.0         OK
 reynoal01            6.0         OK
 spiveju01            1.0         OK
 tettlmi01            2.0         OK
 wanerpa01            4.0         OK, 'ON':            allstar_count birthState
 playerID                           
 crainje01            1.0         ON
 dicksja01            1.0         ON
 martiru01            4.0         ON
 quantpa01            1.0         ON, 'OR':            allstar_count birthState
 playerID                           
 brosisc01            1.0         OR
 ellsbja01            1.0         OR
 kingmda01            3.0         OR
 reynoha01            2.0         OR
 rowanaa01            1.0         OR, 'PA':            allstar_count birthState
 playerID                           
 groatdi01            8.0         PA
 huntebi03            1.0         PA
 jacksre01           14.0         PA
 litwhda01            1.0         PA
 miltoer01            1.0         PA
 moranmi01            1.0         PA
 moyerja01            1.0         PA
 mulhote01            1.0         PA
 mussimi01            5.0         PA
 piazzmi01           12.0         PA
 rolliri01            2.0         PA
 schwado01            1.0         PA
 vernomi01            7.0         PA, 'QC':            allstar_count birthState
 playerID                           
 gagneer01            3.0         QC, 'RI':            allstar_count birthState
 playerID                           
 hartnga01            6.0         RI
 lopesda01            4.0         RI
 stenhda01            2.0         RI, 'SC':            allstar_count birthState
 playerID                           
 brisslo01            1.0         SC
 gardnbr01            1.0         SC
 hudsoor01            2.0         SC
 odellbi01            2.0         SC
 rosenal01            4.0         SC
 sandere02            1.0         SC
 sharpmi01            1.0         SC
 wietema01            4.0         SC, 'SD':            allstar_count birthState
 playerID                           
 bannifl01            1.0         SD
 doolise01            1.0         SD
 foulkke01            1.0         SD, 'Sonora':            allstar_count birthState
 playerID                           
 estrama01            1.0     Sonora, 'TN':            allstar_count birthState
 playerID                           
 baileed01            6.0         TN
 bridgto01            6.0         TN
 dickera01            1.0         TN
 finlest01            2.0         TN
 garneph01            3.0         TN
 grayso01             1.0         TN
 harvebr01            2.0         TN
 heltoto01            5.0         TN
 honeyri01            2.0         TN
 madlobi01            3.0         TN
 pomerdr01            1.0         TN
 priceda01            5.0         TN
 sherrge01            1.0         TN
 wagnele01            3.0         TN
 wrighcl01            1.0         TN, 'TX':            allstar_count birthState
 playerID                           
 alvisma01            2.0         TX
 bellbe01             1.0         TX
 beltbr01             1.0         TX
 berkmla01            6.0         TX
 blackch02            1.0         TX
 bournmi01            2.0         TX
 buchhcl01            2.0         TX
 bufordo01            1.0         TX
 busbyji01            1.0         TX
 carpema01            3.0         TX
 cashno01             5.0         TX
 chalkda01            2.0         TX
 davisch02            1.0         TX
 davisro02            1.0         TX
 drabedo01            1.0         TX
 freesda01            1.0         TX
 grababi01            1.0         TX
 groteje01            2.0         TX
 hargrmi01            1.0         TX
 harriwi02            1.0         TX
 hattogr01            1.0         TX
 hawpebr01            1.0         TX
 higgipi01            3.0         TX
 holtbr01             1.0         TX
 hootobu01            1.0         TX
 horlejo01            1.0         TX
 hughste01            3.0         TX
 jacksda02            2.0         TX
 kempst01             1.0         TX
 knoblch01            4.0         TX
 lackejo01            1.0         TX
 mintogr01            1.0         TX
 mooredo01            1.0         TX
 morgajo02           10.0         TX
 nathajo01            6.0         TX
 newmaje01            1.0         TX
 pencehu01            3.0         TX
 ramosaj01            1.0         TX
 robined01            4.0         TX
 stantmi02            1.0         TX
 streehu01            2.0         TX
 swindgr01            1.0         TX
 williwo02            1.0         TX
 wilsogl01            1.0         TX
 youngch03            1.0         TX, 'VA':            allstar_count birthState
 playerID                           
 atwelto01            1.0         VA
 bradlja02            1.0         VA
 bumbral01            1.0         VA
 grubbjo01            1.0         VA
 hollaal01            1.0         VA
 ingebr01             1.0         VA
 saundjo01            1.0         VA
 verlaju01            6.0         VA
 wagnebi02            7.0         VA
 willimi03            2.0         VA
 womacto01            1.0         VA, 'VT':            allstar_count birthState
 playerID                           
 fiskca01            11.0         VT
 tebbebi01            4.0         VT, 'WA':            allstar_count birthState
 playerID                           
 ceyro01              6.0         WA
 coninje01            2.0         WA
 jenkige01            1.0         WA
 linceti01            4.0         WA
 meekev01             1.0         WA
 myersra01            4.0         WA
 olerujo01            2.0         WA
 whitesa02            1.0         WA, 'WI':            allstar_count birthState
 playerID                           
 grudzma01            1.0         WI
 kuennha01           10.0         WI
 milleda02            1.0         WI
 neshepa01            1.0         WI
 rawlesh01            1.0         WI
 wickmbo01            2.0         WI
 zimmejo02            2.0         WI, 'WV':            allstar_count birthState
 playerID                           
 burdele01            3.0         WV
 reedri01             2.0         WV
 swishst01            1.0         WV, 'WY':            allstar_count birthState
 playerID                           
 brownto05            1.0         WY}



In [ ]:

    
for key, value in dfs:
    df_temp = pd.DataFrame(value, index="playerID", columns=[["allstar_count", "birthState"]])
    frequency_polygon(df_temp['allstar_count'].dropna(), "Allstar count", proportion=True)



In [24]:

    
df.isnull().values.any()









    Out[24]:





True

Univariate exploratory data analysis



In [6]:

    
fig_7 = figures.univariate(df['birthYear'].dropna(), 'Birth Year', bin_n=None, formatting_right=False)
df['birthYear'].dropna().describe()









    Out[6]:





count    6575.000000
mean     1947.320152
std        33.161191
min      1845.000000
25%      1924.000000
50%      1958.000000
75%      1973.000000
max      1993.000000
Name: birthYear, dtype: float64



In [3]:

    
fig_7 = figures.univariate(df['birthYear'].dropna(), 'Birth Year', bin_n=None, formatting_right=False)
df['birthYear'].dropna().describe()









    Out[3]:





count    18973.000000
mean      1931.435356
std         41.555514
min       1820.000000
25%       1895.000000
50%       1937.000000
75%       1969.000000
max       1996.000000
Name: birthYear, dtype: float64



In [16]:

    
fig_1 = figures.univariate(df['award_count'].dropna(), 'Player\'s Awards')
df['award_count'].dropna().describe()









    Out[16]:





count    473.000000
mean       4.209302
std        5.350487
min        1.000000
25%        1.000000
50%        2.000000
75%        5.000000
max       47.000000
Name: award_count, dtype: float64



In [8]:

    
figures.univariate_overdispersed(df['award_count'].dropna(), 'Player\'s Awards', bin_n=None)









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x12321ebe0>



In [17]:

    
fig_2 = figures.univariate(df['allstar_count'].dropna(), 'Allstar Appearence')
df['allstar_count'].dropna().describe()









    Out[17]:





count    642.000000
mean       2.565421
std        2.379630
min        1.000000
25%        1.000000
50%        2.000000
75%        3.000000
max       15.000000
Name: allstar_count, dtype: float64



In [18]:

    
fig_3 = figures.univariate(df['max_salary_standardized_annually'].dropna(), 'Stanardised Max Career Salary', bin_n=None)
df['max_salary_standardized_annually'].dropna().describe()









    Out[18]:





count    2677.000000
mean        0.009855
std         1.036265
min        -1.212567
25%        -0.637441
50%        -0.513423
75%         0.284740
max         6.047580
Name: max_salary_standardized_annually, dtype: float64



In [5]:

    
fig_4 = figures.univariate(df['min_salary_standardized_annually'].dropna(), 'Stanardised Min Career Salary', bin_n=None)
df['min_salary_standardized_annually'].dropna().describe()









    Out[5]:





count    5155.000000
mean       -0.650137
std         0.271276
min        -1.212567
25%        -0.712534
50%        -0.671235
75%        -0.644166
max         3.925495
Name: min_salary_standardized_annually, dtype: float64



In [9]:

    
fig_5 = figures.univariate(df['mean_salary_standardized_annually'].dropna(), 'Stanardised Mean Career Salary', bin_n=None)
df['mean_salary_standardized_annually'].dropna().describe()









    Out[9]:





count    5155.000000
mean       -0.319813
std         0.595574
min        -1.212567
25%        -0.656584
50%        -0.590689
75%        -0.223479
max         4.268085
Name: mean_salary_standardized_annually, dtype: float64



In [19]:

    
fig_a = figures.dist_transform_plot(df['mean_salary'].dropna(), 'Mean Salary', bin_n=None)



In [10]:

    
# Unable to turn this into a function at the moment
figsize = (15, 15)
# Needed to set up figure style
figures.common_set_up(figsize)

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=figsize)

fig.suptitle("Distributions of Weight", fontsize=16)
fig.subplots_adjust(hspace=0.18, top=0.95)

figures.univariate(df['weight'].dropna(), 'Weight', rug=False, bin_n=None, ax=ax1)
figures.univariate(df['weight'].dropna(), 'Weight', rug=False, ax=ax2)
figures.univariate(df['weight'].dropna(), 'Weight', rug=False, 
                   x_truncation_upper=200, x_truncation_lower=150, 
                   formatting_right=False, ax=ax3)

sns.despine(offset=2, trim=True, left=True, bottom=True)



In [9]:

    
fig_8 = figures.univariate(df['weight'].dropna(), 'Weight', rug=False, x_truncation_upper=200, x_truncation_lower=150, formatting_right=False)

annot = "Spikes suggest entries\n  rounded to every 5"
fig_8 = figures.annotation_text(fig_8, annot, 0.4, 0.2, strong_colour=True, font_size=14)

df['weight'].dropna().describe()









    Out[9]:





count    18251.000000
mean       186.375596
std         21.524765
min         65.000000
25%        170.000000
50%        185.000000
75%        200.000000
max        320.000000
Name: weight, dtype: float64



In [6]:

    
fig_8 = figures.univariate(df['weight'].dropna(), 'Weight', rug=True, bin_n= 20)

annot = "Extreme outlier"
fig_8 = figures.annotation_text(fig_8, annot, 0.1, 0.05, strong_colour=False, font_size=12)

df['weight'].dropna().describe()









    Out[6]:





count    18251.000000
mean       186.375596
std         21.524765
min         65.000000
25%        170.000000
50%        185.000000
75%        200.000000
max        320.000000
Name: weight, dtype: float64



In [3]:

    
fig_9 = figures.univariate(df['height'].dropna(), 'Height', rug=False)
df['height'].dropna().describe()









    Out[3]:





count    18320.000000
mean        72.273799
std          2.603904
min         43.000000
25%         71.000000
50%         72.000000
75%         74.000000
max         83.000000
Name: height, dtype: float64

Bivariate distributions



In [13]:

    
sns.jointplot(x=df['weight'].dropna(), y=df['height'].dropna())









    Out[13]:





<seaborn.axisgrid.JointGrid at 0x1ff2d9da978>

Binary plots



In [5]:

    
fig_c2 = figures.boolean_bar(df['birthCountry'].dropna()=='USA', 'USA as birth country')



In [13]:

    
fig_c2 = figures.boolean_bar(df['birthYear'].dropna() >= 1975, 'Born in, or after 1975')



In [6]:

    
fig_c3 = figures.boolean_bar(df['college_country'].dropna()=='USA', 'College in USA', annotate=False)
(df['college_country'].dropna()=='USA').describe()









    Out[6]:





count      6575
unique        2
top       False
freq       6570
Name: college_country, dtype: object

Categorical data



In [20]:

    
columns = list(df.columns.values)
print(columns)









    



['birthYear', 'birthMonth', 'birthDay', 'birthCountry', 'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast', 'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame', 'retroID', 'bbrefID', 'allstar_count', 'award_count', 'yearid', 'votedBy', 'ballots', 'needed', 'votes', 'inducted', 'category', 'needed_note', 'mean_salary', 'max_salary', 'min_salary', 'mean_salary_standardized_annually', 'max_salary_standardized_annually', 'min_salary_standardized_annually', 'mode_schoolID', 'college_name_full', 'college_city', 'college_state', 'college_country']



In [7]:

    
# df where birthCountry == USA, sort on birthState then display birthState
fig_c4 = figures.count_bar((
                            df[                              # From DataFrame
                                df['birthCountry'] == 'USA'  # Select only USA as birthCountry
                               ].sort_values(['birthState']) # Sort by birthState
                          ['birthState']),                   # Display birthState
                          'Birth State of Players',
                          highlight=4
                          );



In [10]:

    
fig_c3 = figures.count_bar((df.sort_values(['college_state'])      # Sort by birthState
                           ['college_state']),                     # Display state of college
                           r"Player´s College State",
                            highlight=3);



In [16]:

    
print("Unique birth cities", len(df["birthCity"].unique()))
print("Unique college cities", len(df["college_city"].unique()))









    



Unique birth cities 2208
Unique college cities 721



In [8]:

    
fig_c3 = figures.count_bar((
                            df[                                  # From DataFrame
                                df['birthCountry'] != 'USA'      # Select all that are not USA as birthCountry
                               ].sort_values(['birthCountry'])   # Sort by birthCountry
                            ['birthCountry']),                   # Display state of college
                           "Country of Birth Excluding the USA",
                            highlight=None);



In [20]:

    
series_x = df.sort_values(['state'])['state'].dropna()
series_y = df[df['birthCountry'] == 'USA'].sort_values(['birthState'])['birthState']



In [ ]:

	birthYear	birthMonth	birthDay	birthCountry	birthState	birthCity	deathYear	deathMonth	deathDay	deathCountry	...	max_salary	min_salary	mean_salary_standardized_annually	max_salary_standardized_annually	min_salary_standardized_annually	mode_schoolID	college_name_full	college_city	college_state	college_country
playerID
aardsda01	1981.0	12.0	27.0	USA	CO	Denver	NaN	NaN	NaN	NaN	...	4500000.0	300000.0	-0.440097	0.260102	-0.670224	rice	Rice University	Houston	TX	USA
abadan01	1972.0	8.0	25.0	USA	FL	Palm Beach	NaN	NaN	NaN	NaN	...	327000.0	327000.0	-0.663649	-0.663649	-0.663649	gamiddl	Middle Georgia College	Cochran	GA	USA
abbeybe01	1869.0	11.0	11.0	USA	VT	Essex	1962.0	6.0	11.0	USA	...	NaN	NaN	NaN	NaN	NaN	vermont	University of Vermont	Burlington	VT	USA
abbotje01	1972.0	8.0	17.0	USA	GA	Atlanta	NaN	NaN	NaN	NaN	...	300000.0	175000.0	-0.644164	-0.598825	-0.690417	kentucky	University of Kentucky	Lexington	KY	USA
abbotji01	1967.0	9.0	19.0	USA	MI	Flint	NaN	NaN	NaN	NaN	...	2775000.0	68000.0	0.273098	1.275547	-0.814658	michigan	University of Michigan	Ann Arbor	MI	USA