In [36]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import itertools, powerlaw
from collections import Counter
import seaborn as sns
sns.set(style="white",rc={"figure.figsize": (6, 6)})
# fdir = 'E:/Dropbox/Hacking/Baseball/'
fdir = '/Users/brianckeegan/Dropbox/Hacking/Baseball/'
figpath = '/Users/brianckeegan/Dropbox/Papers/Network Science/Baseball/'
In [2]:
batting_df = pd.read_csv(fdir + 'Lahman/Batting.csv')
batting_df.tail()
Out[2]:
In [3]:
teams_df = pd.read_csv(fdir + 'Lahman/Teams.csv')
teams_df.tail()
Out[3]:
In [4]:
gl_2013 = pd.read_csv(fdir + 'GameLogs/GL2013.TXT',header=None)
gl_2013.head()
Out[4]:
In [5]:
home_col = [3] + np.arange(105,132,3).tolist()
away_col = [6] + np.arange(132,159,3).tolist()
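As a quick sanity check on these column offsets (assumed here to follow the Retrosheet game log layout, in which each starter occupies three consecutive fields), we can peek at the selected columns for the first game; each slice should contain one team code followed by nine starting player IDs.

In [ ]:
# Sanity check (assumes the column layout used above): one team code plus
# nine starting player IDs per slice for the first game of 2013
gl_2013[home_col].iloc[0].tolist(), gl_2013[away_col].iloc[0].tolist()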
In [6]:
# Get all of the home team lineups
home_df = gl_2013[home_col].copy()
# Rename the columns so the two frames concatenate and melt cleanly
home_df.columns = ['Team'] + np.arange(1,10).tolist()
# Get all of the away team lineups
away_df = gl_2013[away_col].copy()
# Rename the columns so the two frames concatenate and melt cleanly
away_df.columns = ['Team'] + np.arange(1,10).tolist()
# Combine the home and away lineups
all_games_df = pd.concat([home_df,away_df])
# Melt the wide-format lineups into long format, drop duplicate player-team
# pairs, group by playerID, aggregate each player's teams into a list, and
# convert to a dictionary keyed by playerID that returns every team the
# player appeared for. The second dictionary inverts the mapping: keyed by
# team, it returns the list of players who appeared for that team.
affiliations_by_player = pd.melt(all_games_df,id_vars='Team',value_vars=np.arange(1,10).tolist()).drop('variable',1).drop_duplicates().groupby('value').agg({'Team':lambda x:list(x)}).to_dict()['Team']
affiliations_by_team = pd.melt(all_games_df,id_vars='Team',value_vars=np.arange(1,10).tolist()).drop('variable',1).drop_duplicates().groupby('Team').agg({'value':lambda x:list(x)}).to_dict()['value']
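For reference, a minimal sketch of how these dictionaries can be queried; the team and player below are taken from whatever keys exist rather than hard-coded.

In [ ]:
# Inspect an arbitrary team's roster and an arbitrary player's affiliations
example_team = sorted(affiliations_by_team.keys())[0]
example_player = affiliations_by_team[example_team][0]
example_team, affiliations_by_team[example_team][:5], affiliations_by_player[example_player]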
In [7]:
# Get all the home and away lineups, create a list of combinations of these names
# reflecting teammates who've played together, then count how often these edges occur
home = Counter([j for i in gl_2013[np.arange(105,132,3)].values.tolist() for j in list(itertools.combinations(i,2))])
away = Counter([j for i in gl_2013[np.arange(132,159,3)].values.tolist() for j in list(itertools.combinations(i,2))])
# Add the home and away Counter objects together to get all games played together
games = home + away
# Create a network object
g2013 = nx.Graph()
# Iterate over the edges in the games Counter object adding edges,
# also add node properties from affiliations
for (p1,p2),c in games.iteritems():
    g2013.add_edge(p1,p2,weight=c)
    g2013.add_node(p1,teams=affiliations_by_player[p1])
    g2013.add_node(p2,teams=affiliations_by_player[p2])
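A quick check of the 2013 co-play network's size and connectedness before computing team-level statistics.

In [ ]:
# Basic sanity check on the 2013 teammate network
g2013.number_of_nodes(), g2013.number_of_edges(), nx.number_connected_components(g2013)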
In [8]:
# http://orange.biolab.si/blog/2012/06/15/joint-entropy-in-python/
def entropy2(*X):
    # Joint entropy (in bits) of the empirical distribution over the
    # combinations of values observed across the input arrays
    return np.sum(-p * np.log2(p) if p > 0 else 0 for p in
                  (np.mean(reduce(np.logical_and, (predictions == c for predictions, c in zip(X, classes))))
                   for classes in itertools.product(*[set(x) for x in X])))
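A minimal worked example of what entropy2 returns: for an array in which two values each occur half the time, the empirical distribution is uniform over two outcomes and the entropy is 1 bit, while a constant array has 0 bits.

In [ ]:
# Worked example: two equally likely values -> 1 bit; a single value -> 0 bits
entropy2(np.array([0,0,1,1])), entropy2(np.array([1,1,1,1]))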
In [9]:
def ei_index(g,affiliations,team):
    # Krackhardt E-I index: (external - internal) / (external + internal) edges
    team_graph = g.subgraph(affiliations[team])
    non_team_edges = [(node,neighbor,d) for node in team_graph for neighbor,d in g[node].items() if neighbor not in team_graph.nodes()]
    return (len(non_team_edges) - len(team_graph.edges()))/float(len(non_team_edges) + len(team_graph.edges()))

def weight_entropy(g,affiliations,team):
    # Entropy of the distribution of normalized edge weights within a team's subgraph
    weights = [float(d['weight']) for n1,n2,d in g.subgraph(affiliations[team]).edges(data=True)]
    return entropy2(np.array(weights)/float(sum(weights)))

def strength_entropy(g,affiliations,team):
    # Entropy of the distribution of normalized node strengths (summed edge weights) within a team's subgraph
    subgraph = g.subgraph(affiliations[team])
    strengths = [sum(subgraph[node][neighbor]['weight'] for neighbor in subgraph[node]) for node in subgraph.nodes()]
    return entropy2(np.array(strengths)/float(sum(strengths)))
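A usage sketch of these team-level measures on a single club; the team code is again taken from whatever keys are present rather than assumed.

In [ ]:
# Example: compute the three team-level measures for one arbitrary team
t = sorted(affiliations_by_team.keys())[0]
t, ei_index(g2013,affiliations_by_team,t), weight_entropy(g2013,affiliations_by_team,t), strength_entropy(g2013,affiliations_by_team,t)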
In [10]:
team_network_statistics = pd.DataFrame(index=affiliations_by_team.keys())
team_network_statistics['Nodes'] = pd.Series({team:g2013.subgraph(affiliations_by_team[team]).number_of_nodes() for team in affiliations_by_team.keys()})
team_network_statistics['Edges'] = pd.Series({team:g2013.subgraph(affiliations_by_team[team]).number_of_edges() for team in affiliations_by_team.keys()})
team_network_statistics['Density'] = pd.Series({team:nx.density(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Clustering'] = pd.Series({team:nx.average_clustering(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Weighted_Clustering'] = pd.Series({team:nx.average_clustering(g2013.subgraph(affiliations_by_team[team]),weight='weight') for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Connectivity'] = pd.Series({team:np.mean(nx.degree_centrality(g2013.subgraph(affiliations_by_team[team])).values()) for team in affiliations_by_team.keys()})
team_network_statistics['Diameter'] = pd.Series({team:nx.diameter(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Radius'] = pd.Series({team:nx.radius(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Shortest_path'] = pd.Series({team:nx.average_shortest_path_length(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['EI_index'] = pd.Series({team:ei_index(g2013,affiliations_by_team,team) for team in affiliations_by_team.keys()})
team_network_statistics['Weight_entropy'] = pd.Series({team:weight_entropy(g2013,affiliations_by_team,team) for team in affiliations_by_team.keys()})
team_network_statistics['Strength_entropy'] = pd.Series({team:strength_entropy(g2013,affiliations_by_team,team) for team in affiliations_by_team.keys()})
In [11]:
team_joined_df = team_network_statistics.join(teams_df[teams_df['yearID'] == 2013].set_index('teamID'))
team_joined_df['BA'] = team_joined_df['H'] / team_joined_df['AB']
# SLG = total bases / at-bats; H already counts each hit once, so add one
# extra base per double, two per triple, and three per home run
team_joined_df['SLG'] = (team_joined_df['H'] + team_joined_df['2B'] + 2*team_joined_df['3B'] + 3*team_joined_df['HR']) / team_joined_df['AB']
team_joined_df
Out[11]:
In [108]:
team_joined_df.columns
Out[108]:
In [175]:
sns.lmplot('Avg_Clustering','SLG',team_joined_df,hue='lgID')
plt.xlabel('Average team clustering')
plt.ylabel('Slugging percentage')
plt.tight_layout()
plt.savefig('cluster-SLG.png')
In [176]:
sns.lmplot('Strength_entropy','W',team_joined_df,hue='lgID')
plt.xlabel('Within team strength entropy')
plt.ylabel('Wins')
plt.tight_layout()
plt.savefig('strength_entropy-wins.png')
In [12]:
total_edgelist = Counter()
yearly_graphlist = list()
for year in np.arange(1914,2014):
    gl = pd.read_csv(fdir + 'GameLogs/GL{0}.TXT'.format(str(year)),header=None)
    home = Counter([j for i in gl[np.arange(105,132,3)].values.tolist() for j in list(itertools.combinations(i,2))])
    away = Counter([j for i in gl[np.arange(132,159,3)].values.tolist() for j in list(itertools.combinations(i,2))])
    games = home + away
    total_edgelist += games
    g = nx.Graph()
    for (p1,p2),c in games.iteritems():
        g.add_edge(p1,p2,weight=c)
    yearly_graphlist.append(g)
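A quick summary of the first and last seasons' graphs as a check that the loop built one network per year.

In [ ]:
# (year, players, teammate edges) for the first and last seasons in the range
[(year, g.number_of_nodes(), g.number_of_edges())
 for year, g in zip([1914, 2013], [yearly_graphlist[0], yearly_graphlist[-1]])]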
In [97]:
nx.write_gexf(yearly_graphlist[-2],'2012.gexf')
In [80]:
annual_network_stats = pd.DataFrame(index=np.arange(1914,2014))
annual_network_stats['Nodes'] = [g.number_of_nodes() for g in yearly_graphlist]
annual_network_stats['Edges'] = [g.number_of_edges() for g in yearly_graphlist]
annual_network_stats['Density'] = [nx.density(g) for g in yearly_graphlist]
annual_network_stats['Avg_Clustering'] = [nx.average_clustering(g) for g in yearly_graphlist]
annual_network_stats['Avg_Connectivity'] = [np.mean(nx.degree_centrality(g).values())*(len(g)-1) for g in yearly_graphlist]
#annual_network_stats['Rich_Club'] = [nx.rich_club_coefficient(g) for g in yearly_graphlist]
annual_network_stats['Components'] = [nx.number_connected_components(g) for g in yearly_graphlist]
annual_network_stats['Exponent_Centrality'] = [powerlaw.Fit([int(i*(len(g)-1)) for i in nx.degree_centrality(g).values()],xmin=8,discrete=True).power_law.alpha for g in yearly_graphlist]
annual_network_stats['Exponent_Weight'] = [powerlaw.Fit([d['weight'] for n1,n2,d in g.edges_iter(data=True)],discrete=True).power_law.alpha for g in yearly_graphlist]
diameters = list()
radiuses = list()
avg_shortest_path = list()
frac_lcc = list()
for g in yearly_graphlist:
    lcc = nx.connected_component_subgraphs(g)[0]
    diameters.append(nx.diameter(lcc))
    radiuses.append(nx.radius(lcc))
    avg_shortest_path.append(nx.average_shortest_path_length(lcc))
    frac_lcc.append(len(lcc)/float(len(g)))
annual_network_stats['Diameter'] = diameters
annual_network_stats['Radius'] = radiuses
annual_network_stats['Avg_Shortest_Path'] = avg_shortest_path
annual_network_stats['Frac_LCC'] = frac_lcc
annual_network_stats.to_csv('1914-2014.csv')
annual_network_stats.head()
Out[80]:
In [88]:
ax = annual_network_stats['Nodes'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Players')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'nodes.pdf')
In [89]:
ax = annual_network_stats['Density'].plot(lw=4)
ax.set_ylabel('Density')
ax.set_xlabel('Year')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'density.pdf')
In [90]:
ax = annual_network_stats['Avg_Clustering'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Average Clustering')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'clustering.pdf')
In [91]:
ax = annual_network_stats['Avg_Shortest_Path'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Average shortest path')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'shortest_path.pdf')
In [92]:
ax = annual_network_stats['Frac_LCC'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('LCC node fraction')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'frac_lcc.pdf')
In [93]:
ax = annual_network_stats['Avg_Connectivity'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Average degree')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'degree.pdf')
In [94]:
ax = annual_network_stats['Exponent_Centrality'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Power law exponent, centrality')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.ylim((1.6,2.4))
plt.savefig(figpath+'exponent_centrality.pdf')
In [95]:
ax = annual_network_stats['Exponent_Weight'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Power law exponent, weight')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'exponent_weight.pdf')
In [16]:
g_all = nx.Graph()
for (p1,p2),c in total_edgelist.iteritems():
    g_all.add_edge(p1,p2,weight=c)
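A quick look at the size of the pooled 1914-2013 network before plotting its distributions.

In [ ]:
# Size of the aggregate co-play network across all seasons
g_all.number_of_nodes(), g_all.number_of_edges()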
In [49]:
centralities = [int(i*(len(g_all)-1)) for i in nx.degree_centrality(g_all).values()]
degree_counter = Counter([i for i in centralities if i >= 8])
plt.scatter(degree_counter.keys(),degree_counter.values(),c='r',label='Degree')
#plt.ylabel('Count')
plt.yscale('log')
plt.ylim((.9,1e3))
#plt.xlabel('Observation')
plt.xscale('log')
plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig('degree_distribution.png')
plt.savefig(figpath+'degree_distribution.pdf')
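The powerlaw package can also draw the complementary CDF with the fitted power law overlaid, which is often easier to read than the raw counts above. This is a sketch using powerlaw's plotting helpers (plot_ccdf) rather than the manual scatter; the styling choices are arbitrary.

In [ ]:
# Alternative view (sketch): empirical degree CCDF with the fitted power law
fit = powerlaw.Fit(centralities, xmin=8, discrete=True)
ax = fit.plot_ccdf(color='r', marker='o', linestyle='none')
fit.power_law.plot_ccdf(ax=ax, color='k', linestyle='--')
ax.set_xlabel('Degree')
ax.set_ylabel('P(X >= x)')
plt.tight_layout()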
In [51]:
weights = [d['weight'] for n1,n2,d in g_all.edges_iter(data=True)]
weight_counter = Counter(weights)
plt.scatter(weight_counter.keys(),weight_counter.values(),c='b',label='Edge weight')
#plt.ylabel('Count')
plt.yscale('log')
#plt.ylim((.9,1e3))
#plt.xlabel('Observation')
plt.xscale('log')
plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig('weight_distribution.png')
plt.savefig(figpath+'weight_distribution.pdf')
In [56]:
strengths = [sum(g_all[node][neighbor]['weight'] for neighbor in g_all[node]) for node in g_all.nodes()]
strength_counter = Counter(strengths)
plt.scatter(strength_counter.keys(),strength_counter.values(),c='g',label='Strength')
#plt.ylabel('Count')
plt.yscale('log')
#plt.ylim((.9,1e3))
#plt.xlabel('Observation')
plt.xscale('log')
plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig('strength_distribution.png')
plt.savefig(figpath+'strength_distribution.pdf')
In [35]:
centralities_pl_alpha = powerlaw.Fit(centralities,xmin=8,discrete=True).power_law.alpha
weights_pl_alpha = powerlaw.Fit(weights,discrete=True).power_law.alpha
print centralities_pl_alpha, weights_pl_alpha
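powerlaw's distribution_compare can check how well the power law does against an alternative such as the lognormal: positive R favors the power law, and p gives the significance of the comparison. This is a sketch of that check, not part of the original analysis.

In [ ]:
# Sketch: likelihood-ratio test of power law vs. lognormal for the degree data
fit = powerlaw.Fit(centralities, xmin=8, discrete=True)
R, p = fit.distribution_compare('power_law', 'lognormal')
R, p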
In [61]:
def extract_backbone(g, alpha):
    # Disparity filter backbone: keep only edges whose weight is significant
    # relative to the local weight distribution at either endpoint
    backbone_graph = nx.Graph()
    for node in g:
        k_n = len(g[node])
        if k_n > 1:
            sum_w = sum( g[node][neighbor]['weight'] for neighbor in g[node] )
            for neighbor in g[node]:
                edgeWeight = g[node][neighbor]['weight']
                pij = float(edgeWeight)/sum_w
                if (1-pij)**(k_n-1) < alpha: # equation 2
                    backbone_graph.add_edge( node,neighbor, weight = edgeWeight)
    return backbone_graph
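A brief sketch of how the retained edge count shrinks as the significance threshold alpha is tightened; the alpha values below are arbitrary illustration choices applied to the aggregate network.

In [ ]:
# How aggressively different alpha thresholds prune the aggregate network
[(a, extract_backbone(g_all, a).number_of_edges()) for a in [0.5, 0.25, 0.125, 0.05]]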
In [65]:
g_backbone = extract_backbone(g,.125)
nx.write_gexf(g_backbone,'1995-2013_backbone.gexf')
g.number_of_edges(), g_backbone.number_of_edges()
Out[65]:
In [ ]: