The main purpose is to gain an overview of the dataset: to identify problems in the dataset that need to be corrected, and to identify outliers and consider whether these are real or errors in the dataset.
The data examination process will help firm up which dependent variables should be used to investigate the independent variables, and how the former depend on the latter.
The theme of this investigation is to ask whether geographical location has an effect: whether where a person was born, or where their college was located, has an impact on a dependent variable (e.g. the salary of a player).
In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os
# Use the top level of the repository
os.chdir(os.path.join("../.."))
# Helper functions made to create polished plots
from ballbase import figures
# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [2]:
# Project-local module; its main() supplies the DataFrame `df` that every
# later cell in this notebook reads from.
import Baseball_data_investigation
df = Baseball_data_investigation.main()
# Preview the first rows as the cell output
df.head()
Out[2]:
In [6]:
# Univariate plot of the `birthYear` column with missing values dropped.
birth_year = df['birthYear'].dropna()
fig_7 = figures.univariate(
    birth_year,
    'Birth Year',
    bin_n=None,
    formatting_right=False,
)
# Summary statistics, shown as the cell output.
birth_year.describe()
Out[6]:
In [3]:
# Univariate plot of the `birthYear` column with missing values dropped
# (this cell repeats the earlier birth-year cell).
birth_year = df['birthYear'].dropna()
fig_7 = figures.univariate(
    birth_year,
    'Birth Year',
    bin_n=None,
    formatting_right=False,
)
# Summary statistics, shown as the cell output.
birth_year.describe()
Out[3]:
In [16]:
# Univariate plot of the `award_count` column with missing values dropped.
award_count = df['award_count'].dropna()
fig_1 = figures.univariate(award_count, "Player's Awards")
# Summary statistics, shown as the cell output.
award_count.describe()
Out[16]:
In [3]:
# Frequency polygon of the `allstar_count` column (missing values dropped),
# requested with proportion=True.
allstar_count = df['allstar_count'].dropna()
figures.frequency_polygon(
    allstar_count,
    "Allstar count",
    proportion=True,
)
In [8]:
# Overdispersed-variant univariate plot of the `award_count` column
# (missing values dropped).
award_count = df['award_count'].dropna()
figures.univariate_overdispersed(
    award_count,
    "Player's Awards",
    bin_n=None,
)
Out[8]:
In [17]:
# Univariate plot of the `allstar_count` column with missing values dropped.
allstar_count = df['allstar_count'].dropna()
fig_2 = figures.univariate(allstar_count, 'Allstar Appearance')
# Summary statistics, shown as the cell output.
allstar_count.describe()
Out[17]:
In [18]:
# Univariate plot of the `max_salary_standardized_annually` column with
# missing values dropped.
max_salary = df['max_salary_standardized_annually'].dropna()
fig_3 = figures.univariate(
    max_salary,
    'Standardised Max Career Salary',
    bin_n=None,
)
# Summary statistics, shown as the cell output.
max_salary.describe()
Out[18]:
In [5]:
# Univariate plot of the `min_salary_standardized_annually` column with
# missing values dropped.
min_salary = df['min_salary_standardized_annually'].dropna()
fig_4 = figures.univariate(
    min_salary,
    'Standardised Min Career Salary',
    bin_n=None,
)
# Summary statistics, shown as the cell output.
min_salary.describe()
Out[5]:
In [9]:
# Univariate plot of the `mean_salary_standardized_annually` column with
# missing values dropped.
mean_salary_std = df['mean_salary_standardized_annually'].dropna()
fig_5 = figures.univariate(
    mean_salary_std,
    'Standardised Mean Career Salary',
    bin_n=None,
)
# Summary statistics, shown as the cell output.
mean_salary_std.describe()
Out[9]:
In [19]:
# Distribution-transform plot of the `mean_salary` column (missing values
# dropped).
mean_salary = df['mean_salary'].dropna()
fig_a = figures.dist_transform_plot(
    mean_salary,
    'Mean Salary',
    bin_n=None,
)
In [10]:
# Three stacked views of the `weight` column on a single figure.
# Unable to turn this into a function at the moment
weight = df['weight'].dropna()
figsize = (15, 15)
# Needed to set up figure style
figures.common_set_up(figsize)
fig, axes = plt.subplots(3, 1, figsize=figsize)
ax1, ax2, ax3 = axes
fig.suptitle("Distributions of Weight", fontsize=16)
fig.subplots_adjust(hspace=0.18, top=0.95)
# One entry per panel: the target axis and the options specific to it.
panels = [
    (ax1, {'bin_n': None}),
    (ax2, {}),
    (ax3, {'x_truncation_upper': 200,
           'x_truncation_lower': 150,
           'formatting_right': False}),
]
for axis, extra in panels:
    figures.univariate(weight, 'Weight', rug=False, ax=axis, **extra)
sns.despine(offset=2, trim=True, left=True, bottom=True)
In [9]:
# Univariate plot of `weight` with the x range limited to 150-200, plus an
# annotation about the spikes.
weight = df['weight'].dropna()
fig_8 = figures.univariate(
    weight,
    'Weight',
    rug=False,
    x_truncation_upper=200,
    x_truncation_lower=150,
    formatting_right=False,
)
annot = "Spikes suggest entries\n rounded to every 5"
fig_8 = figures.annotation_text(
    fig_8,
    annot,
    0.4,
    0.2,
    strong_colour=True,
    font_size=14,
)
# Summary statistics, shown as the cell output.
weight.describe()
Out[9]:
In [6]:
# Univariate plot of `weight` with a rug and 20 bins, plus an annotation
# pointing out the extreme outlier.
weight = df['weight'].dropna()
fig_8 = figures.univariate(weight, 'Weight', rug=True, bin_n=20)
annot = "Extreme outlier"
fig_8 = figures.annotation_text(
    fig_8,
    annot,
    0.1,
    0.05,
    strong_colour=False,
    font_size=12,
)
# Summary statistics, shown as the cell output.
weight.describe()
Out[6]:
In [3]:
# Univariate plot of the `height` column with missing values dropped.
height = df['height'].dropna()
fig_9 = figures.univariate(height, 'Height', rug=False)
# Summary statistics, shown as the cell output.
height.describe()
Out[3]:
In [5]:
# Boolean series: True where the (non-missing) birth country is 'USA'.
born_in_usa = df['birthCountry'].dropna() == 'USA'
fig_c2 = figures.boolean_bar(born_in_usa, 'USA as birth country')
In [13]:
# Boolean series: True where the (non-missing) birth year is 1975 or later.
born_1975_or_later = df['birthYear'].dropna() >= 1975
fig_c2 = figures.boolean_bar(born_1975_or_later, 'Born in, or after 1975')
In [6]:
# Boolean series: True where the (non-missing) college country is 'USA'.
college_in_usa = df['college_country'].dropna() == 'USA'
fig_c3 = figures.boolean_bar(
    college_in_usa,
    'College in USA',
    annotate=False,
)
# Summary of the boolean series, shown as the cell output.
college_in_usa.describe()
Out[6]:
In [20]:
# List every column name available in the DataFrame.
column_values = df.columns.values
columns = list(column_values)
print(columns)
In [7]:
# Count bar of birth states, restricted to players whose birthCountry is USA.
usa_born = df[df['birthCountry'] == 'USA']
# Sort the rows by birthState, then keep only that column for plotting.
birth_states = usa_born.sort_values(['birthState'])['birthState']
fig_c4 = figures.count_bar(birth_states, 'Birth State of Players', highlight=4);
In [10]:
# Count bar of the state in which each player's college is located.
# Sort the rows by college_state, then keep only that column for plotting.
college_states = df.sort_values(['college_state'])['college_state']
fig_c3 = figures.count_bar(
    college_states,
    "Player's College State",
    highlight=3);