In [2]:
# RESEARCH IN PYTHON: DESCRIPTIVE STATISTICS AND EXPLORATORY DATA ANALYSIS
# by J. NATHAN MATIAS March 10, 2015
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# THINGS TO IMPORT
# This is a good baseline set of libraries to import by default if you're rushed for time.
import codecs # load UTF-8 Content
import json # load JSON files
import pandas as pd # Pandas handles dataframes
import numpy as np # Numpy handles lots of basic maths operations
import matplotlib.pyplot as plt # Matplotlib for plotting
import seaborn as sns # Seaborn for beautiful plots
from dateutil import * # I prefer dateutil for parsing dates
import math # transformations
import statsmodels.formula.api as smf # for doing statistical regression
import statsmodels.api as sm # access to the wider statsmodels library, including R datasets
from collections import Counter # Counter is useful for grouping and counting
In [21]:
# Datasets from the R Dataset are accessible via Statsmodels
# http://vincentarelbundock.github.io/Rdatasets/
# U. S. State Public-School Expenditures
# code book: http://vincentarelbundock.github.io/Rdatasets/doc/car/Anscombe.html
# The observations are the U. S. states plus Washington, D. C. in 1970.
# education = Per-capita education expenditures, dollars.
# income = Per-capita income, dollars.
# young = Proportion under 18, per 1000.
# urban = Proportion urban, per 1000.
expenditures = sm.datasets.get_rdataset("Anscombe", "car")
# assign a variable to the Pandas dataframe for this dataset
expenditures_df = expenditures.data
In [37]:
expenditures_df.describe()
Out[37]:
In [36]:
for col in expenditures_df.columns:
plt.hist(expenditures_df[col], bins=20)
plt.title("Distribution of %(col)s" % {"col":col}, fontsize="20")
plt.show()
In [40]:
sns.corrplot(expenditures_df)
plt.title("Correlation Plot", fontsize="20")
Out[40]:
In [50]:
outcome = "education"
preds = [x for x in expenditures_df.columns if x!=outcome]
for pred in preds:
sns.jointplot(pred, outcome, data=expenditures_df)
print "Scattering %(o)s on %(p)s" %{"o":outcome, "p":pred}
plt.show()
In [ ]: