The main purpose is to gain an overview of the dataset: to identify problems in the dataset that need to be corrected, and to identify outliers and consider whether these are real or errors in the dataset.
The data analysis process will look for relationships between independent and dependent variables. This is a preliminary data analysis, so any inferences should be treated as tentative.
The theme of this investigation is whether geographical location has an effect: whether where a person was born, or where their college was located, has an impact on a dependent variable (e.g. the salary of a player).
In [4]:
from __future__ import print_function
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os
# Use the top level of the repository as the working directory (presumably so
# the project-local modules imported below can be found).
# The path is relative, so re-running this cell moves up two more levels.
# It is built from os.pardir components rather than a hard-coded "../.."
# string so that os.path.join supplies the platform's separator.
os.chdir(os.path.join(os.pardir, os.pardir))
# Helper functions made to create polished plots
from ballbase import figures
# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [5]:
# Build the DataFrame used by every later cell by calling the project's
# investigation module; what main() does internally is not visible here.
import Baseball_data_investigation
df = Baseball_data_investigation.main()
# Show the first rows as this cell's output.
df.head()
Out[5]:
In [6]:
# Plot the award counts (rows with a missing award count are dropped) using
# the project's count_bar helper on a wide axis.
award_counts = df['award_count'].dropna()
figures.count_bar(award_counts, "Award count frequency", ax_size=(28, 6))
Out[6]:
In [171]:
# Award counts with missing values dropped (rows from every birth country).
award_counts = df['award_count'].dropna()
# Birth state of the USA-born rows only, sorted by state.
usa_born = df[df['birthCountry'] == 'USA']
birth_states = usa_born.sort_values(['birthState'])['birthState']
# NOTE(review): the values and x_v are taken from different row subsets;
# presumably figures.bar pairs them by index -- confirm in ballbase.figures.
figures.bar(award_counts,
            "Mean of award count by birth State",
            x_v=birth_states,
            ax_size=(28, 6),
            highlight=0)
Out[171]:
In [33]:
# Award counts with missing values dropped (rows from every birth country).
award_counts = df['award_count'].dropna()
# College state of the USA-born rows only, sorted by college state.
usa_born = df[df['birthCountry'] == 'USA']
college_states = usa_born.sort_values(['college_state'])['college_state']
# NOTE(review): the values and x_v are taken from different row subsets;
# presumably figures.bar pairs them by index -- confirm in ballbase.figures.
figures.bar(award_counts,
            "Mean of award count by College State",
            x_v=college_states,
            ax_size=(28, 6))
Out[33]:
In [170]:
# Box plot of award count per birth state for USA-born rows.
# Both columns come from the same filtered frame and are passed by name, so
# each x value is paired with the y value from its own row instead of relying
# on index alignment between two differently sized Series.
# Sorting by birthState fixes the order in which the states appear on the axis.
usa_born = df[df['birthCountry'] == 'USA'].sort_values(['birthState'])
sns.boxplot(x='birthState', y='award_count', data=usa_born, color="grey")
Out[170]:
In [169]:
# Box plot of award count per college state for USA-born rows.
# Both columns come from the same filtered frame and are passed by name, so
# each x value is paired with the y value from its own row instead of relying
# on index alignment between two differently sized Series.
# Sorting by college_state fixes the order in which the states appear on the
# axis.
usa_born = df[df['birthCountry'] == 'USA'].sort_values(['college_state'])
sns.boxplot(x='college_state', y='award_count', data=usa_born, color="grey")
Out[169]:
In [44]:
# All-star counts with missing values dropped, plotted against the birth state
# of USA-born rows. The title names the all-star count because that is the
# column being plotted here (not the award count).
# NOTE(review): the values and x_v are taken from different row subsets;
# presumably figures.bar pairs them by index -- confirm in ballbase.figures.
figures.bar(df['allstar_count'].dropna(),
            "Mean of allstar count by birth State",
            x_v=(
                df[  # From DataFrame
                    df['birthCountry'] == 'USA'  # Select only USA as birthCountry
                ].sort_values(['birthState'])  # Sort by birthState
                ['birthState']),
            ax_size=(28, 6),
            highlight=0
            )
Out[44]:
In [46]:
# All-star counts with missing values dropped, plotted against the college
# state of USA-born rows. The title names the all-star count because that is
# the column being plotted here (not the award count).
# NOTE(review): the values and x_v are taken from different row subsets;
# presumably figures.bar pairs them by index -- confirm in ballbase.figures.
figures.bar(df['allstar_count'].dropna(),
            "Mean of allstar count by College State",
            x_v=(
                df[  # From DataFrame
                    df['birthCountry'] == 'USA'  # Select only USA as birthCountry
                ].sort_values(['college_state'])  # Sort by college_state
                ['college_state']),
            ax_size=(28, 6)
            )
Out[46]:
In [165]:
# Joint plot of all-star count against award count, using only rows where
# both values are present.
# NOTE(review): later seaborn releases renamed jointplot's `size` to `height`;
# kept as-is for the seaborn version this notebook was written against.
counts = df[['award_count', 'allstar_count']].dropna()
marker_style = dict(s=40, alpha=0.1, color="grey", edgecolor="w", linewidth=1)
sns.jointplot(x='allstar_count', y='award_count', data=counts, size=8, **marker_style);
# Strip the left and bottom spines and trim/offset the remaining ones.
sns.despine(offset=2, trim=True, left=True, bottom=True)
In [164]:
# Joint plot of weight against height, using only rows where both values are
# present.
# NOTE(review): later seaborn releases renamed jointplot's `size` to `height`;
# kept as-is for the seaborn version this notebook was written against.
body_measures = df[['weight', 'height']].dropna()
marker_style = dict(s=40, alpha=0.1, color="grey", edgecolor="w", linewidth=1)
sns.jointplot(x='weight', y='height', data=body_measures, size=8, **marker_style);
# Strip the left and bottom spines and trim/offset the remaining ones.
sns.despine(offset=2, trim=True, left=True, bottom=True)
In [163]:
# Joint plot of weight against mean salary, using only rows where both values
# are present.
# NOTE(review): later seaborn releases renamed jointplot's `size` to `height`;
# kept as-is for the seaborn version this notebook was written against.
weight_salary = df[['weight', 'mean_salary']].dropna()
marker_style = dict(s=40, alpha=0.1, color="grey", edgecolor="w", linewidth=1)
sns.jointplot(x='weight', y='mean_salary', data=weight_salary, size=8, **marker_style);
# Strip the left and bottom spines and trim/offset the remaining ones.
sns.despine(offset=2, trim=True, left=True, bottom=True)