The main purpose is to gain an overview of the dataset: identify problems in the dataset that need to be corrected, and identify outliers and consider whether these are real or errors in the dataset.
The exploratory data analysis (EDA) process will help firm up which dependent variables should be investigated against which independent variables, and how the former depend on the latter.
The theme of this investigation is to ask whether geographical location has an effect: whether where a person was born, or where their college was located, has an impact on a dependent variable (e.g. the salary of a player).
In [1]:
from __future__ import print_function
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Move to the top level of the repository
# to access the script and ballbase module
os.chdir((os.path.join('..', '..')))
import Baseball_data_investigation
from ballbase import figures
# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline
In [2]:
# Run the investigation script's entry point and keep its result as `df`;
# the rest of this notebook indexes it like a pandas DataFrame.
df = Baseball_data_investigation.main()
# Preview the first rows as the cell output
df.head()
Out[2]:
In [3]:
# Default colour set for frequency_polygon (the default of its `color_set` argument)
custom = ['#192231','#3C3C3C','#CDCDCD', '#494E6B']
# Alternative colour set, assigned to `color_set` when frequency_polygon is called with funky=True
ToddTerje = ['#F24C4E', '#EAB126', '#1FB58F', '#1B7B34']
def frequency_polygon(x, name, categorical_v=None, color_set=custom,
                      proportion=False, ax_size=(10, 5), funky=False,
                      formatting_right=True, x_truncation_upper=None,
                      x_truncation_lower=None, ax=None):
    """Draw a frequency polygon of ``x`` and return the axes it is drawn on.

    The data is binned with ``numpy.histogram`` (one bin per whole unit of
    the data range) and the bin heights are drawn as connected points with
    ``seaborn.pointplot``.

    Parameters
    ----------
    x : pandas Series (or similar) of numeric values
        Must be non-empty and support ``.max()``, ``.min()`` and ``len()``.
        Callers in this notebook drop missing values before passing it in.
    name : str
        Used in the plot title and as the x axis label.
    categorical_v : optional
        Currently unused; kept so existing calls keep working.
    color_set : sequence of colour strings
        The first entry is used as the line colour.
    proportion : bool
        If True, plot each bin's share of all observations instead of
        the raw count.
    ax_size : tuple
        Passed to ``figures.common_set_up``.
    funky : bool
        If True, use the ``ToddTerje`` colour set instead of ``color_set``.
    formatting_right : bool
        Passed to ``figures.formatting_text_box`` when a truncation note
        is added.
    x_truncation_upper, x_truncation_lower : number or None
        Optional x axis limits. ``None`` leaves that side unlimited.
    ax : matplotlib axes or None
        Axes to draw on (for multi-panel figures). When given, no title is
        set and no despine is applied, so the caller can do both once for
        the whole figure.

    Returns
    -------
    The axes object returned by ``seaborn.pointplot`` (or whatever
    ``figures.formatting_text_box`` returns when a truncation note is added).

    Raises
    ------
    ValueError
        If ``x`` is empty.
    """
    if funky:
        color_set = ToddTerje

    if len(x) == 0:
        raise ValueError("Cannot draw a frequency polygon of empty data")

    figures.common_set_up(ax_size)

    # One bin per whole unit of the data range. np.histogram rejects
    # bins=0, which would happen when all values fall within one unit.
    bin_n = max(1, int(x.max()) - int(x.min()))

    # Get height and a position from a histogram
    # to be turned into points for the frequency polygon
    y, bin_bounds = np.histogram(x, bins=bin_n)
    bin_edge = bin_bounds[:-1]

    if proportion:
        # float() forces true division; under Python 2 an int array divided
        # by an int would floor every proportion to 0.
        y = y / float(len(x))
        y_str = "Proportion"
    else:
        y_str = "Frequency"

    # NOTE(review): pointplot appears to place x values at categorical
    # positions, so the set_xlim call below may act on positions rather
    # than data values - confirm against the seaborn documentation.
    fig = sns.pointplot(x=bin_edge, y=y, color=color_set[0],
                        scale=0.3, marker='.', ax=ax)

    title_color = '#192231'
    font_colour = '#9099A2'

    # Do not add a title in a multi-figure plot.
    #
    # Title will be added to figure with all sub-plots
    # instead in this case
    if ax is None:
        fig.set_title(('Frequency polygon of {0}'.format(name)),
                      fontsize=20, color=title_color)
    fig.set_xlabel('{0}'.format(name),
                   color=font_colour)
    fig.set_ylabel(y_str,
                   color=font_colour)

    # Limit the x axis by truncating. Compare against None so that a
    # limit of 0 is still honoured.
    if x_truncation_upper is not None or x_truncation_lower is not None:
        fig.set_xlim(x_truncation_lower, x_truncation_upper)

        # To be communicated back in Formatting notes; only mention the
        # limits that were actually supplied.
        parameters = 'Formatting:\n'
        if x_truncation_lower is not None:
            parameters += 'x axis truncated after {0}\n'.format(x_truncation_lower)
        if x_truncation_upper is not None:
            parameters += 'x axis truncated by {0}\n'.format(x_truncation_upper)
        fig = figures.formatting_text_box(fig, parameters, formatting_right)

    # Will not work on multiple subplots within a figure
    if ax is None:
        # Seaborn despine to remove boundaries around plot
        sns.despine(offset=2, trim=True, left=True, bottom=True)

    return fig
# Frequency polygon of 'allstar_count' over all rows with a value, plotted as proportions
admc = frequency_polygon(df['allstar_count'].dropna(), "Allstar count", proportion=True)
In [15]:
# Keep only rows where both the All-Star count and the birth state are present
df2 = df[["allstar_count", "birthState"]].dropna().copy()
# Map each birth state to the sub-frame of its rows;
# iterating a groupby yields (group key, group frame) pairs
dfs = dict(tuple(df2.groupby('birthState')))
dfs
Out[15]:
In [ ]:
# Draw one frequency polygon of All-Star counts per birth state.
# Iterate over .items(): looping over the dict itself yields only the keys,
# which cannot be unpacked into (key, value).
for key, value in dfs.items():
    # Each value is already the per-state DataFrame produced by groupby,
    # so select the columns directly instead of rebuilding a DataFrame.
    df_temp = value[["allstar_count", "birthState"]]
    frequency_polygon(df_temp['allstar_count'].dropna(), "Allstar count", proportion=True)
In [24]:
# True if at least one cell anywhere in df is null
df.isnull().values.any()
Out[24]:
In [6]:
# Univariate plot and summary statistics of birth year, missing values excluded
birth_years = df['birthYear'].dropna()
fig_7 = figures.univariate(birth_years, 'Birth Year', bin_n=None, formatting_right=False)
birth_years.describe()
Out[6]:
In [3]:
# Univariate plot and summary statistics of birth year, missing values excluded
birth_years = df['birthYear'].dropna()
fig_7 = figures.univariate(birth_years, 'Birth Year', bin_n=None, formatting_right=False)
birth_years.describe()
Out[3]:
In [16]:
# Univariate plot and summary statistics of the per-player award count
award_counts = df['award_count'].dropna()
fig_1 = figures.univariate(award_counts, 'Player\'s Awards')
award_counts.describe()
Out[16]:
In [8]:
# Same award-count data, drawn with the project's plot variant for overdispersed data
figures.univariate_overdispersed(df['award_count'].dropna(), 'Player\'s Awards', bin_n=None)
Out[8]:
In [17]:
# Univariate plot and summary statistics of the All-Star count.
# Label spelling corrected ("Appearance") since it is shown on the figure.
fig_2 = figures.univariate(df['allstar_count'].dropna(), 'Allstar Appearance')
df['allstar_count'].dropna().describe()
Out[17]:
In [18]:
# Univariate plot and summary statistics of the standardised maximum career salary.
# Label spelling corrected ("Standardised") since it is shown on the figure.
fig_3 = figures.univariate(df['max_salary_standardized_annually'].dropna(), 'Standardised Max Career Salary', bin_n=None)
df['max_salary_standardized_annually'].dropna().describe()
Out[18]:
In [5]:
# Univariate plot and summary statistics of the standardised minimum career salary.
# Label spelling corrected ("Standardised") since it is shown on the figure.
fig_4 = figures.univariate(df['min_salary_standardized_annually'].dropna(), 'Standardised Min Career Salary', bin_n=None)
df['min_salary_standardized_annually'].dropna().describe()
Out[5]:
In [9]:
# Univariate plot and summary statistics of the standardised mean career salary.
# Label spelling corrected ("Standardised") since it is shown on the figure.
fig_5 = figures.univariate(df['mean_salary_standardized_annually'].dropna(), 'Standardised Mean Career Salary', bin_n=None)
df['mean_salary_standardized_annually'].dropna().describe()
Out[9]:
In [19]:
# Project helper plot for the (non-standardised) mean salary, missing values excluded
fig_a = figures.dist_transform_plot(df['mean_salary'].dropna(), 'Mean Salary', bin_n=None)
In [10]:
# Three stacked views of the weight distribution in a single figure.
# Unable to turn this into a function at the moment
figsize = (15, 15)
figures.common_set_up(figsize)  # Needed to set up figure style
fig, axes = plt.subplots(3, 1, figsize=figsize)
ax1, ax2, ax3 = axes
fig.suptitle("Distributions of Weight", fontsize=16)
fig.subplots_adjust(hspace=0.18, top=0.95)

weights = df['weight'].dropna()
# Per-panel keyword arguments; everything else is shared between panels
panel_options = [
    (ax1, {'bin_n': None}),
    (ax2, {}),
    (ax3, {'x_truncation_upper': 200, 'x_truncation_lower': 150,
           'formatting_right': False}),
]
for panel_ax, options in panel_options:
    figures.univariate(weights, 'Weight', rug=False, ax=panel_ax, **options)

# Despine once for the whole figure, after all panels are drawn
sns.despine(offset=2, trim=True, left=True, bottom=True)
In [9]:
# Weight distribution restricted to the 150-200 range, with a note on the spikes
weights = df['weight'].dropna()
fig_8 = figures.univariate(
    weights, 'Weight', rug=False,
    x_truncation_upper=200, x_truncation_lower=150,
    formatting_right=False)
annot = "Spikes suggest entries\n rounded to every 5"
fig_8 = figures.annotation_text(fig_8, annot, 0.4, 0.2,
                                strong_colour=True, font_size=14)
weights.describe()
Out[9]:
In [6]:
# Weight distribution in 20 bins with a rug, annotated to point out the outlier
weights = df['weight'].dropna()
fig_8 = figures.univariate(weights, 'Weight', rug=True, bin_n=20)
annot = "Extreme outlier"
fig_8 = figures.annotation_text(fig_8, annot, 0.1, 0.05,
                                strong_colour=False, font_size=12)
weights.describe()
Out[6]:
In [3]:
# Univariate plot and summary statistics of height, missing values excluded
heights = df['height'].dropna()
fig_9 = figures.univariate(heights, 'Height', rug=False)
heights.describe()
Out[3]:
In [13]:
# Drop rows missing either measurement *together*, so every plotted point
# pairs a weight with the height from the same row. Dropping NaNs from each
# column separately can leave the two series with different lengths.
weight_height = df[['weight', 'height']].dropna()
sns.jointplot(x='weight', y='height', data=weight_height)
Out[13]:
In [5]:
# Bar plot of the boolean "birth country equals 'USA'" over rows with a known birth country
fig_c2 = figures.boolean_bar(df['birthCountry'].dropna()=='USA', 'USA as birth country')
In [13]:
# Bar plot of the boolean "birth year >= 1975" over rows with a known birth year
# (reuses the name fig_c2 from the birth-country cell)
fig_c2 = figures.boolean_bar(df['birthYear'].dropna() >= 1975, 'Born in, or after 1975')
In [6]:
# Boolean "college country equals 'USA'" over rows with a known college country:
# bar plot without annotation, then a summary of the same boolean series
college_in_usa = df['college_country'].dropna() == 'USA'
fig_c3 = figures.boolean_bar(college_in_usa, 'College in USA', annotate=False)
college_in_usa.describe()
Out[6]:
In [20]:
# List every column name in df, for reference when choosing variables
columns = list(df.columns.values)
print(columns)
In [7]:
# Count of players per birth state, restricted to players born in the USA
usa_born = df[df['birthCountry'] == 'USA']                      # Select only USA as birthCountry
birth_states = usa_born.sort_values(['birthState'])['birthState']  # Sort by, then keep, birthState
fig_c4 = figures.count_bar(birth_states, 'Birth State of Players', highlight=4);