In [36]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import itertools, powerlaw
from collections import Counter
import seaborn as sns
sns.set(style="white",rc={"figure.figsize": (6, 6)})
# fdir = 'E:/Dropbox/Hacking/Baseball/'
fdir = '/Users/brianckeegan/Dropbox/Hacking/Baseball/'
figpath = '/Users/brianckeegan/Dropbox/Papers/Network Science/Baseball/'
In [2]:
batting_df = pd.read_csv(fdir + 'Lahman/Batting.csv')
batting_df.tail()
Out[2]:
In [3]:
teams_df = pd.read_csv(fdir + 'Lahman/Teams.csv')
teams_df.tail()
Out[3]:
In [4]:
gl_2013 = pd.read_csv(fdir + 'GameLogs/GL2013.TXT',header=None)
gl_2013.head()
Out[4]:
In [5]:
home_col = [3] + np.arange(105,132,3).tolist()
away_col = [6] + np.arange(132,159,3).tolist()
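As a quick sanity check on these column offsets (assumed here to follow the Retrosheet game log layout, in which each starter occupies three consecutive fields), we can peek at the selected columns for the first game; each slice should contain one team code followed by nine starting player IDs.

In [ ]:
# Sanity check (assumes the column layout used above): one team code plus
# nine starting player IDs per slice for the first game of 2013
gl_2013[home_col].iloc[0].tolist(), gl_2013[away_col].iloc[0].tolist()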
In [6]:
# Get all of the home team lineups
home_df = gl_2013[home_col].copy()
# Rename the columns so the two frames concatenate and melt cleanly
home_df.columns = ['Team'] + np.arange(1,10).tolist()
# Get all of the away team lineups
away_df = gl_2013[away_col].copy()
# Rename the columns so the two frames concatenate and melt cleanly
away_df.columns = ['Team'] + np.arange(1,10).tolist()
# Combine the home and away lineups
all_games_df = pd.concat([home_df,away_df])
# Melt the wide-format lineups into long format, drop duplicate player-team
# pairs, group by playerID, aggregate each player's teams into a list, and
# convert to a dictionary keyed by playerID that returns every team the
# player appeared for. The second dictionary inverts the mapping: keyed by
# team, it returns the list of players who appeared for that team.
affiliations_by_player = pd.melt(all_games_df,id_vars='Team',value_vars=np.arange(1,10).tolist()).drop('variable',1).drop_duplicates().groupby('value').agg({'Team':lambda x:list(x)}).to_dict()['Team']
affiliations_by_team = pd.melt(all_games_df,id_vars='Team',value_vars=np.arange(1,10).tolist()).drop('variable',1).drop_duplicates().groupby('Team').agg({'value':lambda x:list(x)}).to_dict()['value']
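For reference, a minimal sketch of how these dictionaries can be queried; the team and player below are taken from whatever keys exist rather than hard-coded.

In [ ]:
# Inspect an arbitrary team's roster and an arbitrary player's affiliations
example_team = sorted(affiliations_by_team.keys())[0]
example_player = affiliations_by_team[example_team][0]
example_team, affiliations_by_team[example_team][:5], affiliations_by_player[example_player]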
In [7]:
# Get all the home and away lineups, create a list of combinations of these names
# reflecting teammates who've played together, then count how often these edges occur
home = Counter([j for i in gl_2013[np.arange(105,132,3)].values.tolist() for j in list(itertools.combinations(i,2))])
away = Counter([j for i in gl_2013[np.arange(132,159,3)].values.tolist() for j in list(itertools.combinations(i,2))])
# Add the home and away Counter objects together to get all games played together
games = home + away
# Create a network object
g2013 = nx.Graph()
# Iterate over the edges in the games Counter object adding edges,
# also add node properties from affiliations
for (p1,p2),c in games.iteritems():
    g2013.add_edge(p1,p2,weight=c)
    g2013.add_node(p1,teams=affiliations_by_player[p1])
    g2013.add_node(p2,teams=affiliations_by_player[p2])
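A quick check of the 2013 co-play network's size and connectedness before computing team-level statistics.

In [ ]:
# Basic sanity check on the 2013 teammate network
g2013.number_of_nodes(), g2013.number_of_edges(), nx.number_connected_components(g2013)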
In [8]:
# http://orange.biolab.si/blog/2012/06/15/joint-entropy-in-python/
def entropy2(*X):
    # Joint entropy (in bits) of the empirical distribution over the
    # combinations of values observed across the input arrays
    return np.sum(-p * np.log2(p) if p > 0 else 0 for p in
                  (np.mean(reduce(np.logical_and, (predictions == c for predictions, c in zip(X, classes))))
                   for classes in itertools.product(*[set(x) for x in X])))
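A minimal worked example of what entropy2 returns: for an array in which two values each occur half the time, the empirical distribution is uniform over two outcomes and the entropy is 1 bit, while a constant array has 0 bits.

In [ ]:
# Worked example: two equally likely values -> 1 bit; a single value -> 0 bits
entropy2(np.array([0,0,1,1])), entropy2(np.array([1,1,1,1]))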
In [9]:
def ei_index(g,affiliations,team):
    # Krackhardt E-I index: (external - internal) / (external + internal) edges
    team_graph = g.subgraph(affiliations[team])
    non_team_edges = [(node,neighbor,d) for node in team_graph for neighbor,d in g[node].items() if neighbor not in team_graph.nodes()]
    return (len(non_team_edges) - len(team_graph.edges()))/float(len(non_team_edges) + len(team_graph.edges()))

def weight_entropy(g,affiliations,team):
    # Entropy of the distribution of normalized edge weights within a team's subgraph
    weights = [float(d['weight']) for n1,n2,d in g.subgraph(affiliations[team]).edges(data=True)]
    return entropy2(np.array(weights)/float(sum(weights)))

def strength_entropy(g,affiliations,team):
    # Entropy of the distribution of normalized node strengths (summed edge weights) within a team's subgraph
    subgraph = g.subgraph(affiliations[team])
    strengths = [sum(subgraph[node][neighbor]['weight'] for neighbor in subgraph[node]) for node in subgraph.nodes()]
    return entropy2(np.array(strengths)/float(sum(strengths)))
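A usage sketch of these team-level measures on a single club; the team code is again taken from whatever keys are present rather than assumed.

In [ ]:
# Example: compute the three team-level measures for one arbitrary team
t = sorted(affiliations_by_team.keys())[0]
t, ei_index(g2013,affiliations_by_team,t), weight_entropy(g2013,affiliations_by_team,t), strength_entropy(g2013,affiliations_by_team,t)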
In [10]:
team_network_statistics = pd.DataFrame(index=affiliations_by_team.keys())
team_network_statistics['Nodes'] = pd.Series({team:g2013.subgraph(affiliations_by_team[team]).number_of_nodes() for team in affiliations_by_team.keys()})
team_network_statistics['Edges'] = pd.Series({team:g2013.subgraph(affiliations_by_team[team]).number_of_edges() for team in affiliations_by_team.keys()})
team_network_statistics['Density'] = pd.Series({team:nx.density(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Clustering'] = pd.Series({team:nx.average_clustering(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Weighted_Clustering'] = pd.Series({team:nx.average_clustering(g2013.subgraph(affiliations_by_team[team]),weight='weight') for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Connectivity'] = pd.Series({team:np.mean(nx.degree_centrality(g2013.subgraph(affiliations_by_team[team])).values()) for team in affiliations_by_team.keys()})
team_network_statistics['Diameter'] = pd.Series({team:nx.diameter(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Radius'] = pd.Series({team:nx.radius(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Shortest_path'] = pd.Series({team:nx.average_shortest_path_length(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['EI_index'] = pd.Series({team:ei_index(g2013,affiliations_by_team,team) for team in affiliations_by_team.keys()})
team_network_statistics['Weight_entropy'] = pd.Series({team:weight_entropy(g2013,affiliations_by_team,team) for team in affiliations_by_team.keys()})
team_network_statistics['Strength_entropy'] = pd.Series({team:strength_entropy(g2013,affiliations_by_team,team) for team in affiliations_by_team.keys()})
In [11]:
team_joined_df = team_network_statistics.join(teams_df[teams_df['yearID'] == 2013].set_index('teamID'))
team_joined_df['BA'] = team_joined_df['H'] / team_joined_df['AB']
# SLG = total bases / at-bats; H already counts each hit once, so add one
# extra base per double, two per triple, and three per home run
team_joined_df['SLG'] = (team_joined_df['H'] + team_joined_df['2B'] + 2*team_joined_df['3B'] + 3*team_joined_df['HR']) / team_joined_df['AB']
team_joined_df
Out[11]:
In [108]:
team_joined_df.columns
Out[108]:
In [175]:
sns.lmplot('Avg_Clustering','SLG',team_joined_df,hue='lgID')
plt.xlabel('Average team clustering')
plt.ylabel('Slugging percentage')
plt.tight_layout()
plt.savefig('cluster-SLG.png')
In [176]:
sns.lmplot('Strength_entropy','W',team_joined_df,hue='lgID')
plt.xlabel('Within team strength entropy')
plt.ylabel('Wins')
plt.tight_layout()
plt.savefig('strength_entropy-wins.png')
In [12]:
total_edgelist = Counter()
yearly_graphlist = list()
for year in np.arange(1914,2014):
    gl = pd.read_csv(fdir + 'GameLogs/GL{0}.TXT'.format(str(year)),header=None)
    home = Counter([j for i in gl[np.arange(105,132,3)].values.tolist() for j in list(itertools.combinations(i,2))])
    away = Counter([j for i in gl[np.arange(132,159,3)].values.tolist() for j in list(itertools.combinations(i,2))])
    games = home + away
    total_edgelist += games
    g = nx.Graph()
    for (p1,p2),c in games.iteritems():
        g.add_edge(p1,p2,weight=c)
    yearly_graphlist.append(g)
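A quick summary of the first and last seasons' graphs as a check that the loop built one network per year.

In [ ]:
# (year, players, teammate edges) for the first and last seasons in the range
[(year, g.number_of_nodes(), g.number_of_edges())
 for year, g in zip([1914, 2013], [yearly_graphlist[0], yearly_graphlist[-1]])]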
In [97]:
nx.write_gexf(yearly_graphlist[-2],'2012.gexf')
In [80]:
annual_network_stats = pd.DataFrame(index=np.arange(1914,2014))
annual_network_stats['Nodes'] = [g.number_of_nodes() for g in yearly_graphlist]
annual_network_stats['Edges'] = [g.number_of_edges() for g in yearly_graphlist]
annual_network_stats['Density'] = [nx.density(g) for g in yearly_graphlist]
annual_network_stats['Avg_Clustering'] = [nx.average_clustering(g) for g in yearly_graphlist]
annual_network_stats['Avg_Connectivity'] = [np.mean(nx.degree_centrality(g).values())*(len(g)-1) for g in yearly_graphlist]
#annual_network_stats['Rich_Club'] = [nx.rich_club_coefficient(g) for g in yearly_graphlist]
annual_network_stats['Components'] = [nx.number_connected_components(g) for g in yearly_graphlist]
annual_network_stats['Exponent_Centrality'] = [powerlaw.Fit([int(i*(len(g)-1)) for i in nx.degree_centrality(g).values()],xmin=8,discrete=True).power_law.alpha for g in yearly_graphlist]
annual_network_stats['Exponent_Weight'] = [powerlaw.Fit([d['weight'] for n1,n2,d in g.edges_iter(data=True)],discrete=True).power_law.alpha for g in yearly_graphlist]
diameters = list()
radiuses = list()
avg_shortest_path = list()
frac_lcc = list()
for g in yearly_graphlist:
    lcc = nx.connected_component_subgraphs(g)[0]
    diameters.append(nx.diameter(lcc))
    radiuses.append(nx.radius(lcc))
    avg_shortest_path.append(nx.average_shortest_path_length(lcc))
    frac_lcc.append(len(lcc)/float(len(g)))
annual_network_stats['Diameter'] = diameters
annual_network_stats['Radius'] = radiuses
annual_network_stats['Avg_Shortest_Path'] = avg_shortest_path
annual_network_stats['Frac_LCC'] = frac_lcc
annual_network_stats.to_csv('1914-2014.csv')
annual_network_stats.head()
Out[80]:
In [88]:
ax = annual_network_stats['Nodes'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Players')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'nodes.pdf')
In [89]:
ax = annual_network_stats['Density'].plot(lw=4)
ax.set_ylabel('Density')
ax.set_xlabel('Year')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'density.pdf')
In [90]:
ax = annual_network_stats['Avg_Clustering'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Average Clustering')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'clustering.pdf')
In [91]:
ax = annual_network_stats['Avg_Shortest_Path'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Average shortest path')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'shortest_path.pdf')
In [92]:
ax = annual_network_stats['Frac_LCC'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('LCC node fraction')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'frac_lcc.pdf')
In [93]:
ax = annual_network_stats['Avg_Connectivity'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Average degree')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'degree.pdf')
In [94]:
ax = annual_network_stats['Exponent_Centrality'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Power law exponent, centrality')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.ylim((1.6,2.4))
plt.savefig(figpath+'exponent_centrality.pdf')
In [95]:
ax = annual_network_stats['Exponent_Weight'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Power law exponent, weight')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'exponent_weight.pdf')
In [16]:
g_all = nx.Graph()
for (p1,p2),c in total_edgelist.iteritems():
    g_all.add_edge(p1,p2,weight=c)
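A quick look at the size of the pooled 1914-2013 network before plotting its distributions.

In [ ]:
# Size of the aggregate co-play network across all seasons
g_all.number_of_nodes(), g_all.number_of_edges()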
In [49]:
centralities = [int(i*(len(g_all)-1)) for i in nx.degree_centrality(g_all).values()]
degree_counter = Counter([i for i in centralities if i >= 8])
plt.scatter(degree_counter.keys(),degree_counter.values(),c='r',label='Degree')
#plt.ylabel('Count')
plt.yscale('log')
plt.ylim((.9,1e3))
#plt.xlabel('Observation')
plt.xscale('log')
plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig('degree_distribution.png')
plt.savefig(figpath+'degree_distribution.pdf')
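The powerlaw package can also draw the complementary CDF with the fitted power law overlaid, which is often easier to read than the raw counts above. This is a sketch using powerlaw's plotting helpers (plot_ccdf) rather than the manual scatter; the styling choices are arbitrary.

In [ ]:
# Alternative view (sketch): empirical degree CCDF with the fitted power law
fit = powerlaw.Fit(centralities, xmin=8, discrete=True)
ax = fit.plot_ccdf(color='r', marker='o', linestyle='none')
fit.power_law.plot_ccdf(ax=ax, color='k', linestyle='--')
ax.set_xlabel('Degree')
ax.set_ylabel('P(X >= x)')
plt.tight_layout()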
In [51]:
weights = [d['weight'] for n1,n2,d in g_all.edges_iter(data=True)]
weight_counter = Counter(weights)
plt.scatter(weight_counter.keys(),weight_counter.values(),c='b',label='Edge weight')
#plt.ylabel('Count')
plt.yscale('log')
#plt.ylim((.9,1e3))
#plt.xlabel('Observation')
plt.xscale('log')
plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig('weight_distribution.png')
plt.savefig(figpath+'weight_distribution.pdf')
In [56]:
strengths = [sum(g_all[node][neighbor]['weight'] for neighbor in g_all[node]) for node in g_all.nodes()]
strength_counter = Counter(strengths)
plt.scatter(strength_counter.keys(),strength_counter.values(),c='g',label='Strength')
#plt.ylabel('Count')
plt.yscale('log')
#plt.ylim((.9,1e3))
#plt.xlabel('Observation')
plt.xscale('log')
plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig('strength_distribution.png')
plt.savefig(figpath+'strength_distribution.pdf')
In [35]:
centralities_pl_alpha = powerlaw.Fit(centralities,xmin=8,discrete=True).power_law.alpha
weights_pl_alpha = powerlaw.Fit(weights,discrete=True).power_law.alpha
print centralities_pl_alpha, weights_pl_alpha
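powerlaw's distribution_compare can check how well the power law does against an alternative such as the lognormal: positive R favors the power law, and p gives the significance of the comparison. This is a sketch of that check, not part of the original analysis.

In [ ]:
# Sketch: likelihood-ratio test of power law vs. lognormal for the degree data
fit = powerlaw.Fit(centralities, xmin=8, discrete=True)
R, p = fit.distribution_compare('power_law', 'lognormal')
R, p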
In [61]:
def extract_backbone(g, alpha):
    # Disparity filter backbone: keep only edges whose weight is significant
    # relative to the local weight distribution at either endpoint
    backbone_graph = nx.Graph()
    for node in g:
        k_n = len(g[node])
        if k_n > 1:
            sum_w = sum( g[node][neighbor]['weight'] for neighbor in g[node] )
            for neighbor in g[node]:
                edgeWeight = g[node][neighbor]['weight']
                pij = float(edgeWeight)/sum_w
                if (1-pij)**(k_n-1) < alpha: # equation 2
                    backbone_graph.add_edge( node,neighbor, weight = edgeWeight)
    return backbone_graph
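A brief sketch of how the retained edge count shrinks as the significance threshold alpha is tightened; the alpha values below are arbitrary illustration choices applied to the aggregate network.

In [ ]:
# How aggressively different alpha thresholds prune the aggregate network
[(a, extract_backbone(g_all, a).number_of_edges()) for a in [0.5, 0.25, 0.125, 0.05]]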
In [65]:
g_backbone = extract_backbone(g,.125)
nx.write_gexf(g_backbone,'1995-2013_backbone.gexf')
g.number_of_edges(), g_backbone.number_of_edges()
Out[65]:
In [ ]: