Data Analysis of Baseball Database

The main purpose is to gain an overview of the dataset. Identify problems in the dataset that need to be corrected. Identify outliers and consider if these are real or erros in the dataset.

The data analysis process will look for relationships between independent and dependent variables. This is a preliminary data analysis and any inferences should not be treated as tentative.

The theme of this investigation is to ask if geographical location has an affect, if where a person was born, where the college was located has an impact on a dependent variable (e.g. Salary of player).


In [4]:
from __future__ import print_function
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os
# Use the top level of the repository
os.chdir(os.path.join("../.."))
# Helper functions made to create polished plots
from ballbase import figures

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [5]:
import Baseball_data_investigation
df = Baseball_data_investigation.main()
df.head()


Processed Hall of Fame data

Processed All Star data

Processed Player Awards data

Processed Salary data

Processed College Locations

Processed master file

Master_Merge is ready

Data Audit complete
Out[5]:
birthYear birthMonth birthDay birthCountry birthState birthCity deathYear deathMonth deathDay deathCountry ... max_salary min_salary mean_salary_standardized_annually max_salary_standardized_annually min_salary_standardized_annually mode_schoolID college_name_full college_city college_state college_country
playerID
aardsda01 1981.0 12.0 27.0 USA CO Denver NaN NaN NaN NaN ... 4500000.0 300000.0 -0.440097 0.260102 -0.670224 rice Rice University Houston TX USA
abadan01 1972.0 8.0 25.0 USA FL Palm Beach NaN NaN NaN NaN ... 327000.0 327000.0 -0.663649 -0.663649 -0.663649 gamiddl Middle Georgia College Cochran GA USA
abbeybe01 1869.0 11.0 11.0 USA VT Essex 1962.0 6.0 11.0 USA ... NaN NaN NaN NaN NaN vermont University of Vermont Burlington VT USA
abbotje01 1972.0 8.0 17.0 USA GA Atlanta NaN NaN NaN NaN ... 300000.0 175000.0 -0.644164 -0.598825 -0.690417 kentucky University of Kentucky Lexington KY USA
abbotji01 1967.0 9.0 19.0 USA MI Flint NaN NaN NaN NaN ... 2775000.0 68000.0 0.273098 1.275547 -0.814658 michigan University of Michigan Ann Arbor MI USA

5 rows × 44 columns

Awards Count


In [6]:
figures.count_bar(df['award_count'].dropna(),
            "Award count frequency",
            ax_size=(28, 6)
            )


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x16ae4d45b38>

In [171]:
figures.bar(df['award_count'].dropna(),
            "Mean of award count by birth State", 
            x_v=(
                 df[                             # From DataFrame
                    df['birthCountry'] == 'USA'  # Select only USA as birthCountry
                   ].sort_values(['birthState']) # Sort by birthState
                   ['birthState']),
            ax_size=(28, 6),
            highlight=0
            )


Out[171]:
<matplotlib.axes._subplots.AxesSubplot at 0x182682be0>

In [33]:
figures.bar(df['award_count'].dropna(),
            "Mean of award count by College State", 
            x_v=(
                 df[                                # From DataFrame
                    df['birthCountry'] == 'USA'     # Select only USA as birthCountry
                   ].sort_values(['college_state']) # Sort by CollegeState
                   ['college_state']),
            ax_size=(28, 6)
            )


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ae9a9b0>

In [170]:
sns.boxplot(x= (df[df['birthCountry'] == 'USA'].sort_values(['birthState'])['birthState']), 
            y=df['award_count'].dropna(), data=df, color="grey")


Out[170]:
<matplotlib.axes._subplots.AxesSubplot at 0x1838dd3c8>

In [169]:
sns.boxplot(x= (df[df['birthCountry'] == 'USA'].sort_values(['college_state'])['college_state']), 
            y=df['award_count'].dropna(), data=df, color="grey")


Out[169]:
<matplotlib.axes._subplots.AxesSubplot at 0x1986e1b00>

All star count


In [44]:
figures.bar(df['allstar_count'].dropna(),
            "Mean of award count by birth State", 
            x_v=(
                 df[                             # From DataFrame
                    df['birthCountry'] == 'USA'  # Select only USA as birthCountry
                   ].sort_values(['birthState']) # Sort by birthState
                   ['birthState']),
            ax_size=(28, 6),
            highlight=0
            )


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x139c8b6a0>

In [46]:
figures.bar(df['allstar_count'].dropna(),
            "Mean of award count by College State", 
            x_v=(
                 df[                                # From DataFrame
                    df['birthCountry'] == 'USA'     # Select only USA as birthCountry
                   ].sort_values(['college_state']) # Sort by CollegeState
                   ['college_state']),
            ax_size=(28, 6)
            )


Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a82aef0>

In [165]:
sns.jointplot(x='allstar_count', y='award_count', data=df[['award_count', 'allstar_count']].dropna(), 
              s=40, alpha=0.1, color="grey", edgecolor="w", linewidth=1, size=8);
sns.despine(offset=2, trim=True, left=True, bottom=True)


Height and weight


In [164]:
sns.jointplot(x='weight', y='height', data=df[['weight', 'height']].dropna(), 
              s=40, alpha=0.1, color="grey", edgecolor="w", linewidth=1, size=8);
sns.despine(offset=2, trim=True, left=True, bottom=True)



In [163]:
sns.jointplot(x='weight', y='mean_salary', data=df[['weight', 'mean_salary']].dropna(), 
              s=40, alpha=0.1, color="grey", edgecolor="w", linewidth=1, size=8);
sns.despine(offset=2, trim=True, left=True, bottom=True)



In [162]:
sns.jointplot(x='height', y='mean_salary', data=df[['height', 'mean_salary']].dropna(), 
              s=40, alpha=0.1, color="grey", edgecolor="w", linewidth=1, size=8);
sns.despine(offset=2, trim=True, left=True, bottom=True)


Salary


In [161]:
sns.jointplot(x='mean_salary', y='max_salary', data=df[['mean_salary', 'max_salary']].dropna(), 
              s=40, alpha=0.1, color="grey", edgecolor="w", linewidth=1, size=8);
sns.despine(offset=2, trim=True, left=True, bottom=True)



In [166]:
sns.jointplot(x='mean_salary_standardized_annually', y='max_salary_standardized_annually', data=df[['mean_salary_standardized_annually', 'max_salary_standardized_annually']].dropna(), 
              s=40, alpha=0.1, color="grey", edgecolor="w", linewidth=1, size=8);
sns.despine(offset=2, trim=True, left=True, bottom=True)



In [34]:
figures.bar(df['mean_salary'].dropna(),
            "Mean of mean salary by birth State",
            x_v=(
                 df[                             # From DataFrame
                    df['birthCountry'] == 'USA'  # Select only USA as birthCountry
                   ].sort_values(['birthState']) # Sort by birthState
                   ['birthState']),
            ax_size=(28, 6),
            highlight=0
            )


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x120a89b00>

In [35]:
figures.bar(df['mean_salary'].dropna(),
            "Mean of mean salary by College State", 
            x_v=(
                 df[                                # From DataFrame
                    df['birthCountry'] == 'USA'     # Select only USA as birthCountry
                   ].sort_values(['college_state']) # Sort by CollegeState
                   ['college_state']),
            ax_size=(28, 6)
            )


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1328488d0>

In [36]:
figures.bar(df['max_salary'].dropna(),
            "Mean of Max salary by birth State", 
            x_v=(
                 df[                             # From DataFrame
                    df['birthCountry'] == 'USA'  # Select only USA as birthCountry
                   ].sort_values(['birthState']) # Sort by birthState
                   ['birthState']),
            ax_size=(28, 6),
            highlight=0
            )


Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x132848d30>

In [37]:
figures.bar(df['max_salary'].dropna(),
            "Mean of Max salary by College State", 
            x_v=(
                 df[                                # From DataFrame
                    df['birthCountry'] == 'USA'     # Select only USA as birthCountry
                   ].sort_values(['college_state']) # Sort by CollegeState
                   ['college_state']),
            ax_size=(28, 6)
            )


Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x1344075c0>

In [38]:
figures.bar(df['mean_salary_standardized_annually'].dropna(),
            "Mean of mean standardized salary birth State", 
            x_v=(
                 df[                             # From DataFrame
                    df['birthCountry'] == 'USA'  # Select only USA as birthCountry
                   ].sort_values(['birthState']) # Sort by birthState
                   ['birthState']),
            ax_size=(28, 6),
            highlight=0
            )


Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x1351a0780>

In [39]:
figures.bar(df['mean_salary_standardized_annually'].dropna(),
            "Mean of mean standardized salary College State", 
            x_v=(
                 df[                                # From DataFrame
                    df['birthCountry'] == 'USA'     # Select only USA as birthCountry
                   ].sort_values(['college_state']) # Sort by CollegeState
                   ['college_state']),
            ax_size=(28, 6)
            )


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ad540b8>

In [43]:
figures.bar(df['max_salary_standardized_annually'].dropna(),
            "Mean of max standardized salary birth State", 
            x_v=(
                 df[                             # From DataFrame
                    df['birthCountry'] == 'USA'  # Select only USA as birthCountry
                   ].sort_values(['birthState']) # Sort by birthState
                   ['birthState']),
            ax_size=(28, 6),
            highlight=0
            )


Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x138ecf3c8>

In [41]:
figures.bar(df['max_salary_standardized_annually'].dropna(),
            "Mean of mean standardized salary College State", 
            x_v=(
                 df[                                # From DataFrame
                    df['birthCountry'] == 'USA'     # Select only USA as birthCountry
                   ].sort_values(['college_state']) # Sort by CollegeState
                   ['college_state']),
            ax_size=(28, 6)
            )


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d0e15f8>

In [173]:
sns.boxplot(x= (df[df['birthCountry'] == 'USA'].sort_values(['college_state'])['college_state']), 
            y=df['max_salary_standardized_annually'].dropna(), data=df, color="grey")
sns.despine(offset=2, trim=True, left=True, bottom=True)



In [172]:
sns.boxplot(x= (df[df['birthCountry'] == 'USA'].sort_values(['birthState'])['birthState']), 
            y=df['max_salary_standardized_annually'].dropna(), data=df, color="grey")
sns.despine(offset=2, trim=True, left=True, bottom=True)



In [96]:
df_2d = df[df['birthCountry'] == 'USA'].groupby(['college_state', 'birthState']).sum().unstack('birthState')
df_2d = df_2d.fillna(0)
df_2d


Out[96]:
birthYear ... min_salary_standardized_annually
birthState AK AL AR AZ CA CO CT DC DE FL ... SD TN TX UT VA VT WA WI WV WY
college_state
AL 0.0 157231.0 0.0 0.0 0.0 0.0 1913.0 1907.0 0.0 17764.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
AR 0.0 0.0 59992.0 0.0 3923.0 3930.0 0.0 0.0 0.0 0.0 ... 0.000000 -0.673233 -1.570851 0.000000 0.000000 0.000000 0.000000 -0.714088 0.000000 0.000000
AZ 3952.0 1987.0 1945.0 80592.0 133465.0 11797.0 0.0 1952.0 0.0 9857.0 ... -2.132125 -0.701528 -1.313996 0.000000 0.000000 -0.668256 -2.826426 0.000000 0.000000 -0.792355
CA 3931.0 7802.0 5783.0 9893.0 1761614.0 13709.0 3955.0 1958.0 0.0 13740.0 ... 0.000000 0.000000 -1.290497 -0.661002 0.000000 0.000000 -7.452911 -0.637821 0.000000 -0.688622
CO 0.0 0.0 0.0 0.0 7829.0 27238.0 0.0 0.0 0.0 1955.0 ... 0.000000 0.000000 -1.110621 0.000000 0.000000 0.000000 0.000000 -0.600456 0.000000 0.000000
CT 0.0 0.0 0.0 0.0 1987.0 0.0 62059.0 0.0 0.0 1979.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
DC 0.0 0.0 0.0 0.0 0.0 0.0 3813.0 15203.0 0.0 0.0 ... 0.000000 0.000000 -0.672192 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
DE 1979.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9795.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
FL 0.0 17578.0 5804.0 0.0 25588.0 1970.0 5881.0 1980.0 3913.0 356290.0 ... -0.650778 -1.517051 -1.403834 0.000000 -2.505630 0.000000 0.000000 -0.587018 0.000000 0.000000
GA 0.0 0.0 0.0 0.0 5913.0 0.0 1969.0 3922.0 1947.0 21700.0 ... 0.000000 -1.336575 -1.432921 0.000000 -0.694650 0.000000 0.000000 -0.704321 0.000000 -0.649978
HI 0.0 0.0 0.0 0.0 13783.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -1.715388 0.000000 0.000000 0.000000
IA 0.0 0.0 0.0 0.0 1969.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.963098 0.000000 0.000000 0.000000 0.000000 0.000000 -0.293197 0.000000 0.000000
ID 0.0 0.0 0.0 0.0 13825.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 -0.702689 0.000000 0.000000 -0.360763 0.000000 0.000000 0.000000
IL 0.0 1951.0 1930.0 1981.0 7854.0 1893.0 0.0 0.0 0.0 0.0 ... 0.000000 -0.814658 0.000000 0.000000 0.000000 0.000000 0.000000 -0.673265 0.000000 0.000000
IN 0.0 1892.0 0.0 0.0 1976.0 0.0 3892.0 0.0 0.0 0.0 ... 0.000000 0.000000 -0.638570 0.000000 -0.659255 0.000000 0.000000 -1.883014 0.000000 0.000000
KS 0.0 1935.0 1979.0 1970.0 9821.0 1975.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 -0.687278 0.000000 0.000000 0.000000 0.000000 -0.783062 0.000000 -0.675600
KY 0.0 0.0 0.0 0.0 1968.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 -1.377956 0.000000 0.000000 -0.675115 0.000000 0.000000 0.000000 0.000000 0.000000
LA 0.0 11691.0 7717.0 0.0 9903.0 3967.0 1987.0 0.0 1969.0 9885.0 ... 0.000000 -0.906424 -6.735045 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
MA 0.0 1905.0 0.0 0.0 5834.0 1958.0 19158.0 0.0 0.0 1988.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
MD 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1893.0 3853.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 -0.706687 0.000000 0.000000 0.000000 0.000000 0.000000
ME 0.0 0.0 0.0 0.0 1894.0 0.0 0.0 0.0 0.0 1969.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
MI 0.0 3910.0 0.0 0.0 3754.0 0.0 1904.0 0.0 1907.0 3885.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
MN 0.0 0.0 0.0 0.0 1923.0 0.0 0.0 0.0 0.0 0.0 ... -0.690417 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.666260 0.000000 0.000000
MO 0.0 1972.0 1965.0 0.0 3936.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 -0.640804 -1.289504 0.000000 0.000000 0.000000 0.000000 -0.862108 0.000000 0.000000
MS 0.0 17669.0 5887.0 0.0 0.0 0.0 0.0 0.0 0.0 9871.0 ... 0.000000 -1.995119 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
NAN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 -0.885947 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
NC 0.0 1987.0 0.0 1977.0 3955.0 0.0 7938.0 3931.0 1927.0 25728.0 ... 0.000000 -2.311024 -1.273328 0.000000 -5.216808 0.000000 -0.652431 0.000000 0.000000 0.000000
ND 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
NE 0.0 1955.0 0.0 0.0 7882.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 -0.667735 0.000000 0.000000 0.000000 0.000000 -0.785604 0.000000 0.000000
NH 0.0 0.0 0.0 0.0 0.0 1881.0 1899.0 0.0 0.0 1950.0 ... 0.000000 0.000000 0.000000 0.000000 -0.400222 -0.040250 0.000000 0.000000 0.000000 0.000000
NJ 0.0 0.0 0.0 0.0 3924.0 0.0 5906.0 0.0 3902.0 1962.0 ... 0.000000 0.000000 -1.313741 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
NM 0.0 0.0 0.0 1982.0 3954.0 0.0 0.0 1971.0 0.0 0.0 ... 0.000000 0.000000 -2.080594 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
NV 0.0 0.0 0.0 0.0 27620.0 0.0 0.0 0.0 0.0 1978.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -1.435994 0.000000 0.000000 0.000000
NY 0.0 1924.0 0.0 0.0 0.0 0.0 11354.0 1889.0 0.0 0.0 ... 0.000000 0.000000 -0.702335 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -1.212567
OH 0.0 0.0 0.0 0.0 3963.0 0.0 0.0 3903.0 0.0 3904.0 ... 0.000000 -0.657102 0.000000 0.000000 -0.687218 0.000000 0.000000 0.000000 0.000000 0.000000
OK 1956.0 0.0 3937.0 0.0 43283.0 1969.0 1956.0 0.0 0.0 3953.0 ... 0.000000 0.000000 -3.377447 0.000000 -0.906424 0.000000 0.000000 -1.371468 -0.617961 0.000000
OR 0.0 3884.0 1953.0 0.0 15742.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -1.359584 0.000000 0.000000 -0.635756
PA 0.0 0.0 0.0 0.0 0.0 0.0 9532.0 1938.0 3857.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
RI 0.0 0.0 0.0 0.0 0.0 0.0 3763.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
SC 0.0 1961.0 0.0 0.0 3965.0 0.0 3949.0 0.0 0.0 15789.0 ... 0.000000 -0.594130 -2.057533 0.000000 -2.142667 0.000000 0.000000 -0.645287 0.000000 0.000000
SD 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
TN 0.0 3827.0 3942.0 0.0 9830.0 3967.0 0.0 0.0 0.0 7814.0 ... 0.000000 -14.898301 0.000000 0.000000 0.000000 0.000000 0.000000 -0.715285 0.000000 0.000000
TX 0.0 3929.0 21305.0 1965.0 23605.0 1981.0 0.0 0.0 1987.0 3922.0 ... -1.295016 -0.654505 -104.205274 0.000000 0.000000 0.000000 0.000000 -1.244326 0.000000 0.000000
UT 0.0 0.0 0.0 1970.0 19636.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 -2.071997 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
VA 0.0 0.0 0.0 0.0 1961.0 0.0 0.0 0.0 3926.0 5942.0 ... -0.650885 -1.442337 -0.638570 0.000000 -21.055780 0.000000 0.000000 0.000000 0.000000 0.000000
VT 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
WA 1986.0 0.0 0.0 0.0 9788.0 1992.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -10.139718 0.000000 0.000000 0.000000
WI 0.0 0.0 0.0 1973.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -6.811684 0.000000 0.000000
WV 0.0 0.0 0.0 0.0 1988.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 -1.385032 0.000000 0.000000 0.000000 -2.828661 0.000000
WY 0.0 0.0 0.0 1964.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

50 rows × 1020 columns


In [125]:
plt.figure(figsize=(20,15))
fig = sns.heatmap(df_2d['max_salary_standardized_annually'], center=0, linewidths=.5, square=True, robust=True)



In [156]:
plt.figure(figsize=(20,15))
sns.heatmap(df_2d['mean_salary'], linewidths=.5, square=True, robust=True)


Out[156]:
<matplotlib.axes._subplots.AxesSubplot at 0x1837cb5f8>

In [128]:
plt.figure(figsize=(20,15))
sns.heatmap(df_2d['allstar_count'], linewidths=.5, square=True, robust=True)


Out[128]:
<matplotlib.axes._subplots.AxesSubplot at 0x18a436588>

In [129]:
plt.figure(figsize=(20,15))
sns.heatmap(df_2d['award_count'], linewidths=.5, square=True, robust=True)


Out[129]:
<matplotlib.axes._subplots.AxesSubplot at 0x17e51e438>

In [174]:
fig_3 = figures.univariate(df[                             
                    df['college_state'] == 'NC' 
                   ]['max_salary_standardized_annually'].dropna(), 'Stanardised Max Career Salary', bin_n=None)



In [175]:
fig_3 = figures.univariate(df[                             
                    df['college_state'] == 'CA' 
                   ]['max_salary_standardized_annually'].dropna(), 'Stanardised Max Career Salary', bin_n=None)



In [ ]: