In :import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.metrics import pairwise_distances from sklearn import cluster
In :events_df = pd.read_csv('GA/Data/events.csv')
In :ginf = pd.read_csv('GA/Data/ginf.csv')
In :events_df.groupby('id_odsp').id_event.count().plot('hist') plt.title('Distribution of Events by Match') plt.xlabel('Events per Match') plt.show()
In :## Depending on level of comparison, calculate either across leagues or entire dataset print events_df.groupby('id_odsp').id_event.count().mean() print events_df.groupby('id_odsp').id_event.count().median() ## Little difference between mean and median
In :## events by time ## spikes at minutes 45 and 90 (ends of halves) events_df.groupby('time').id_event.count().plot('bar') plt.show()
In :events_df.groupby('time').is_goal.sum().plot('bar') plt.show()
In :## For loop to assign current home goals at that point in a given match ## Function w apply not working... going to use loop goals = 0 events_df['prev_id'] = events_df.id_odsp.shift() for i, row in events_df.iterrows(): if row.id_odsp != row.prev_id: goals = 0 if (row.is_goal == 1) and (row.side == 1): goals += 1 events_df.loc[i, 'home_goals'] = goals
In :## away goals goals = 0 events_df['prev_id'] = events_df.id_odsp.shift() for i, row in events_df.iterrows(): if row.id_odsp != row.prev_id: goals = 0 if (row.is_goal == 1) and (row.side == 2): goals += 1 events_df.loc[i, 'away_goals'] = goals
In :## Data is saved in csv - no more for loops! #events_df.to_csv('soccer_events.csv', encoding='utf-8')
The steps below calculate the average goal difference at each minute of a match to measure how close a given match is. First, I create a new column in the existing dataframe to determine the goal difference at that point in the match. I create a separate dataframe that shows the goal difference at each minute for each match, and then write a function to fill in all minutes for a match in which no events occurred using the preceding goal difference. The average of all values across the entire match is then calculated.
In [ ]:## Dataframe was previously saved - read in data df = pd.read_csv('soccer_events.csv')
In [ ]:df['score_diff'] = abs(df.home_goals - df.away_goals) ## Pivot on score_diff for each game at each point in time ## aggfunc is 'last' to take last occurring difference - i.e., if two goals scored in same minute, take last event time_diff = df.pivot_table(values = 'score_diff', index = 'id_odsp', columns = 'time', aggfunc = 'last') ## clean up column names a little bit to indicate minutes time_diff.columns = [str(col) + '_min' for col in time_diff.columns] ## if no events in 0th minute, fill goal difference as 0 time_diff['0_min'].fillna(0, inplace = True) ## for minutes with no events in a match, fill with previous minutes' difference ## ffill each row up until last listed event def fill_row(match): match = match.copy() non_nans = match[~match.apply(np.isnan)] start, end = non_nans.index, non_nans.index[-1] match.ix[start:end] = match.ix[start:end].fillna(method='ffill') return match time_diff = time_diff.apply(fill_row, axis = 1) ## Fill up to 90th minute if no events time_diff.ix[:,'78_min':'90_min'].ffill(axis = 1, inplace = True) ## avg goal differential in match avg_time_diff = time_diff.transpose().mean().reset_index() avg_time_diff.columns = [['id_odsp','avg_goal_diff']] avg_time_diff.head()
The steps below calculate the total number of lead changes by match to indicate a higher amount of scoring, and also more excitement.
In [ ]:for i, row in goal_df.iterrows(): if row.home_goals > row.away_goals: leader = 'home' if row.away_goals > row.home_goals: leader = 'away' if row.home_goals == row.away_goals: leader = 'tied' goal_df.loc[i, 'leader'] = leader goal_df['prev_leader'] = goal_df.groupby('id_odsp').leader.shift().fillna('tied') ## Another for loop to flag if score is lead change for i, row in goal_df.iterrows(): if row.id_odsp == row.prev_id: if row.leader != row.prev_leader: ld_chg = 1 else: ld_chg = 0 goal_df.loc[i, 'lead_change'] = ld_chg ## Total lead changes by match goal_df.groupby('id_odsp').lead_change.sum().sort_values().reset_index()
Now that all fields have been calculated, the steps below will join to the original dataframe with individual match data.
In [ ]:## Adding lead changes into overall match data match_df = pd.merge(ginf, goal_df.groupby('id_odsp').lead_change.sum().reset_index(), on = 'id_odsp', how = 'left') ## Will need to fillna for new column w 0 value to indicate no lead changes match_df.lead_change.fillna(0, inplace = True) ## Match events match_df = pd.merge(match_df, df.groupby('id_odsp').id_event.count().reset_index(), on = 'id_odsp', how = 'inner') ## Average goal differential match_df = pd.merge(match_df, avg_time_diff, on = 'id_odsp', how = 'inner')
The next step is to use unsupervised clustering to create groups of matches according to the metrics calculated above.
In [ ]:X = match_df[['lead_change','id_event','avg_goal_diff']] sns.pairplot(X) plt.show()
In [ ]:X_scaled = preprocessing.StandardScaler().fit_transform(X) X_scaled_df = pd.DataFrame(X_scaled, columns = X.columns) k = 4 kmeans = cluster.KMeans(n_clusters=k, random_state = 43) kmeans.fit(X_scaled) labels = kmeans.labels_ centroids = kmeans.cluster_centers_ inertia = kmeans.inertia_ print labels print centroids print inertia print metrics.silhouette_score(X_scaled, labels, metric='euclidean') match_df['cluster'] = labels
In [ ]:X['cluster'] = labels cols = X.columns[:-1] sns.pairplot(X, x_vars=cols, y_vars= cols, hue='cluster') plt.show()
In [ ]:match_df.drop([u'odd_h', u'odd_d',u'odd_a', u'odd_over', u'odd_under', u'odd_bts', u'odd_bts_n'], axis = 1, inplace = True) match_df['date'] = pd.to_datetime(match_df['date']) match_df['month'] = match_df.date.apply(lambda x: x.month) match_df['year'] = match_df.date.apply(lambda x: x.year) centroid_df = pd.DataFrame(centroids, columns = X_scaled_df.columns) centroid_df
In [ ]:match_df = pd.read_csv('match_cluster.csv') match_df = match_df.drop(['Unnamed: 0','link_odsp','adv_stats'], axis = 1) england = match_df[match_df.country == 'england'] spain = match_df[match_df.country == 'spain'] germany = match_df[match_df.country == 'germany'] france = match_df[match_df.country == 'france'] italy = match_df[match_df.country == 'italy']