In [2]:

    
# RESEARCH IN PYTHON: DESCRIPTIVE STATISTICS AND EXPLORATORY DATA ANALYSIS
# by J. NATHAN MATIAS March 10, 2015

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# THINGS TO IMPORT
# This is a good baseline set of libraries to import by default if you're rushed for time.

import codecs                     # load UTF-8 Content
import json                       # load JSON files
import pandas as pd               # Pandas handles dataframes
import numpy as np                # Numpy handles lots of basic maths operations
import matplotlib.pyplot as plt   # Matplotlib for plotting
import seaborn as sns             # Seaborn for beautiful plots
from dateutil import *            # I prefer dateutil for parsing dates
import math                       # transformations
import statsmodels.formula.api as smf  # for doing statistical regression
import statsmodels.api as sm      # access to the wider statsmodels library, including R datasets
from collections import Counter   # Counter is useful for grouping and counting

Acquire a Dataset



In [21]:

    
# Datasets from the Rdatasets collection are accessible via Statsmodels
# (fetched over the network on first use).
# http://vincentarelbundock.github.io/Rdatasets/

# U. S. State Public-School Expenditures
# code book: http://vincentarelbundock.github.io/Rdatasets/doc/carData/Anscombe.html
# The observations are the U. S. states plus Washington, D. C. in 1970.
# education = Per-capita education expenditures, dollars.
# income = Per-capita income, dollars.
# young = Proportion under 18, per 1000.
# urban = Proportion urban, per 1000.

# NOTE: the datasets that used to ship with the R package "car" now live in
# the companion package "carData", so that is the package name Rdatasets
# expects here.
expenditures = sm.datasets.get_rdataset("Anscombe", "carData")
# assign a variable to the Pandas dataframe for this dataset
expenditures_df = expenditures.data

Summary Statistics



In [37]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# Left as the cell's last expression so the notebook renders it as a table.
expenditures_df.describe()









    Out[37]:






  
    
      
      education
      income
      young
      urban
    
  
  
    
      count
        51.000000
         51.000000
        51.000000
         51.000000
    
    
      mean
       196.313725
       3225.294118
       358.886275
        664.509804
    
    
      std
        46.454490
        560.025974
        23.959975
        151.344821
    
    
      min
       112.000000
       2081.000000
       326.200000
        322.000000
    
    
      25%
       165.000000
       2785.500000
       342.050000
        552.500000
    
    
      50%
       192.000000
       3257.000000
       354.100000
        664.000000
    
    
      75%
       228.500000
       3612.000000
       369.150000
        790.500000
    
    
      max
       372.000000
       4425.000000
       439.700000
       1000.000000

Plot the Distribution of the Variables



In [36]:

    
# Draw one histogram per column so that each variable's distribution can be
# inspected on its own scale. Every figure gets its own explicit Figure/Axes
# pair plus axis labels, so a plot still makes sense when viewed in isolation.
for col in expenditures_df.columns:
    fig, ax = plt.subplots()
    ax.hist(expenditures_df[col], bins=20)
    ax.set_title("Distribution of %(col)s" % {"col": col}, fontsize=20)
    ax.set_xlabel(col)
    ax.set_ylabel("Number of observations")
    plt.show()

Check for Correlations



In [40]:

    
# sns.corrplot was deprecated and later removed from seaborn; the supported
# equivalent is a heatmap of the pairwise (Pearson) correlation matrix.
correlations = expenditures_df.corr()

fig, ax = plt.subplots()
sns.heatmap(
    correlations,
    annot=True,          # print the coefficient inside each cell
    fmt=".2f",
    vmin=-1, vmax=1,     # pin the colour scale to the full correlation range
    center=0,            # so that zero correlation maps to the neutral colour
    cmap="coolwarm",
    square=True,
    ax=ax,
)
ax.set_title("Correlation Plot", fontsize=20)
# Ending on plt.show() keeps the Text repr of the title out of the cell output.
plt.show()









    Out[40]:





<matplotlib.text.Text at 0x113043550>

Scatterplot the Predictors Against the Outcome



In [50]:

    
# Plot the outcome variable against every other column, one joint plot
# (scatter plus marginal distributions) per predictor.
outcome = "education"
preds = [x for x in expenditures_df.columns if x != outcome]
for pred in preds:
    # x/y are passed by keyword: current seaborn releases only accept `data`
    # positionally, and the keyword form also works on older releases.
    sns.jointplot(x=pred, y=outcome, data=expenditures_df)
    # print() with a single argument behaves the same on Python 2 and 3,
    # unlike the Python 2-only print statement.
    print("Scattering %(o)s on %(p)s" % {"o": outcome, "p": pred})
    plt.show()









    



Scattering education on income






    












    



Scattering education on young






    












    



Scattering education on urban



In [ ]:

	education	income	young	urban
count	51.000000	51.000000	51.000000	51.000000
mean	196.313725	3225.294118	358.886275	664.509804
std	46.454490	560.025974	23.959975	151.344821
min	112.000000	2081.000000	326.200000	322.000000
25%	165.000000	2785.500000	342.050000	552.500000
50%	192.000000	3257.000000	354.100000	664.000000
75%	228.500000	3612.000000	369.150000	790.500000
max	372.000000	4425.000000	439.700000	1000.000000