In [2]:
# RESEARCH IN PYTHON: DESCRIPTIVE STATISTICS AND EXPLORATORY DATA ANALYSIS
# by J. NATHAN MATIAS March 10, 2015

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# THINGS TO IMPORT
# This is a good baseline set of libraries to import by default if you're rushed for time.

import codecs                     # load UTF-8 Content
import json                       # load JSON files
import pandas as pd               # Pandas handles dataframes
import numpy as np                # Numpy handles lots of basic maths operations
import matplotlib.pyplot as plt   # Matplotlib for plotting
import seaborn as sns             # Seaborn for beautiful plots
from dateutil import *            # I prefer dateutil for parsing dates
import math                       # transformations
import statsmodels.formula.api as smf  # for doing statistical regression
import statsmodels.api as sm      # access to the wider statsmodels library, including R datasets
from collections import Counter   # Counter is useful for grouping and counting

Acquire a Dataset


In [21]:
# Datasets from the Rdatasets collection are accessible via Statsmodels
# http://vincentarelbundock.github.io/Rdatasets/

# U. S. State Public-School Expenditures
# code book: http://vincentarelbundock.github.io/Rdatasets/doc/carData/Anscombe.html
# The observations are the U. S. states plus Washington, D. C. in 1970.
# education = Per-capita education expenditures, dollars.
# income = Per-capita income, dollars.
# young = Proportion under 18, per 1000.
# urban = Proportion urban, per 1000.

# Rdatasets files this dataset under the "carData" package (the datasets that
# used to ship with "car" were split out), so the package name must be
# "carData" for the lookup to resolve. Requires network access.
expenditures = sm.datasets.get_rdataset("Anscombe", "carData")
# the fetched dataset object exposes the table itself as a Pandas dataframe on .data
expenditures_df = expenditures.data

Summary Statistics


In [37]:
# Per-column summary table: count, mean, std, min, quartiles (25%/50%/75%) and max.
# Left as the last expression of the cell so the notebook renders the resulting table.
expenditures_df.describe()


Out[37]:
education income young urban
count 51.000000 51.000000 51.000000 51.000000
mean 196.313725 3225.294118 358.886275 664.509804
std 46.454490 560.025974 23.959975 151.344821
min 112.000000 2081.000000 326.200000 322.000000
25% 165.000000 2785.500000 342.050000 552.500000
50% 192.000000 3257.000000 354.100000 664.000000
75% 228.500000 3612.000000 369.150000 790.500000
max 372.000000 4425.000000 439.700000 1000.000000

Plot the Distribution of the Variables


In [36]:
# Draw one histogram per column so every variable's distribution can be
# inspected on its own figure.
for col in expenditures_df.columns:
    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so each iteration is guaranteed to draw on a fresh canvas.
    fig, ax = plt.subplots()
    ax.hist(expenditures_df[col], bins=20)
    ax.set_title("Distribution of %(col)s" % {"col": col}, fontsize=20)
    # Label both axes so the figure can be read without the surrounding code.
    ax.set_xlabel(col)
    ax.set_ylabel("Number of observations")
    plt.show()
    # Release the figure once shown; avoids accumulating open figures in the loop.
    plt.close(fig)


Check for Correlations


In [40]:
# sns.corrplot was deprecated and later removed from seaborn, so the pairwise
# correlation matrix is computed with pandas and drawn as an annotated heatmap.
correlations = expenditures_df.corr()
fig, ax = plt.subplots()
sns.heatmap(
    correlations,
    annot=True,        # print the coefficient inside each cell
    fmt=".2f",
    vmin=-1, vmax=1,   # fix the colour scale to the full correlation range
    cmap="coolwarm",   # diverging palette: sign of the correlation is visible at a glance
    square=True,
    ax=ax,
)
ax.set_title("Correlation Plot", fontsize=20)
# Finish with plt.show() like the other plotting cells, rather than leaving
# the title's Text object as the cell's output value.
plt.show()


Out[40]:
<matplotlib.text.Text at 0x113043550>

Scatterplot the predictors against the outcome


In [50]:
# Plot the outcome variable against every other column, one joint
# scatterplot (with marginal distributions) per predictor.
outcome = "education"
preds = [name for name in expenditures_df.columns if name != outcome]
for pred in preds:
    # x and y are passed by keyword: current seaborn releases no longer accept
    # them positionally, and the keyword form also works on older releases.
    sns.jointplot(x=pred, y=outcome, data=expenditures_df)
    # print() as a function call is valid on both Python 2 and Python 3.
    print("Scattering %(o)s on %(p)s" % {"o": outcome, "p": pred})
    plt.show()


Scattering education on income
Scattering education on young
Scattering education on urban

In [ ]: