DataScience2


Adapted from Computational Statistics in Python @ Duke

Note: Because the pandas API is a moving target, on-line tutorials go out of date. Adapting a tutorial to fit the new realities is good exercise in itself.


In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import pandas as pd
# import pandas.tseries as ts
from pandas import Series, DataFrame, Panel
from string import ascii_lowercase as letters
from scipy.stats import chisqprob


/Users/mac/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
  This is separate from the ipykernel package so we can avoid doing imports until
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-2-7d8cf4edbe10> in <module>
      3 from pandas import Series, DataFrame, Panel
      4 from string import ascii_lowercase as letters
----> 5 from scipy.stats import chisqprob

ImportError: cannot import name 'chisqprob' from 'scipy.stats' (/Users/mac/opt/anaconda3/lib/python3.7/site-packages/scipy/stats/__init__.py)

In [ ]:
xs = Series(np.arange(10), index=tuple(letters[:10]))
print(xs[:3],'\n')
print(xs[7:], '\n')
print(xs[::3], '\n')
print(xs[['d', 'f', 'h']], '\n')
print(xs.d, xs.f, xs.h)

In [ ]:
# All the numpy functions wiill work with Series objects, and return another Series

y1, y2 = np.mean(xs), np.var(xs)
y1, y2

In [ ]:
# Matplotlib will work on Series objects too
plt.plot(xs, np.sin(xs), 'r-o', xs, np.cos(xs), 'b-x');

In [ ]:
print(xs.values)

numpy.random.normal takes mean and standard deviation as inputs, also size (how many).


In [ ]:
# The Series datatype can also be used to represent time series

import datetime as dt
from pandas import date_range

# today = dt.date.today()
today = dt.datetime.strptime('May 16 2018', '%b %d %Y') # plain Python
print(today, '\n')
days = date_range(today, periods=35, freq='D')  # for indexing
ts = Series(np.random.normal(10, 1, len(days)), index=days)

# Extracting elements
print (ts[0:4], '\n')
print (ts['2018-05-21':'2018-05-28'], '\n') # Note - includes end time

In [ ]:
# df = ts.resample(rule='W', how=('mean', 'std', lambda x: sum(x*x))) -- OLD

df = ts.resample(rule='W').apply([np.mean, np.std, lambda x: sum(x*x)])
df

In [ ]:
# Renaming columns
# The use of mean and std are problmeatic because there are also methods in dataframe with those names
# Also, <lambda> is unifnormative
# So we would like to give better names to the columns of df

df.columns = ('mu', 'sigma', 'sum_of_sq')
print(df)

In [ ]:
print (df.mu, '\n') # by attribute
print (df['sigma'], '\n') # by column name

In [ ]:
# Extracting rows from a DataFrame

print( df[1:3], '\n')
print( df['2018-05-16'::2])

In [ ]:
# Extracting blocks and scalars

print (df.iat[2, 2], '\n') # extract an element with iat()
print (df.loc['2018-05-16':'2018-06-20', 'sum_of_sq'], '\n') # indexing by label
print (df.iloc[:3, 2], '\n')  # indexing by position
print (df.iloc[:3,:].loc[:,'sum_of_sq'], '\n') # combining the two

In [ ]:
# Using Boolean conditions for selecting eleements

print (df[(df.sigma < 1) & (df.sum_of_sq < 700)], '\n') # need parenthesis because of operator precedence
print (df.query('sigma < 1 and sum_of_sq < 700')) # the query() method allows more readable query strings

In [ ]:
df= np.random.binomial(100, 0.95, (9,2))
dm = np.random.binomial(100, 0.9, [12,2])
dff = DataFrame(df, columns = ['Physics', 'Math'])
dfm = DataFrame(dm, columns = ['Physics', 'Math'])
score_panel = Panel({'Girls': dff, 'Boys': dfm})
print(score_panel, '\n')

In [ ]:
score_panel['Girls'].transpose()

In [ ]:
# find physics and math scores of girls who scored >= 93 in math
# a DataFrame is returned
score_panel.loc['Girls', score_panel.Girls.Math >= 93, :]

In [ ]:
# import a DataFrame to play with
try:
    tips = pd.read_pickle('tips.pic')
except:
    tips = pd.read_csv('https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/reshape2/tips.csv', )
    tips.to_pickle('tips.pic')

In [ ]:
tips.head(n=4)

In [ ]:
# We have an extra set of indices in the first column
# Let's get rid of it

tips = tips.iloc[:, 1:]
tips.head(n=4)

In [ ]:
# For an example of the split-apply-combine pattern, we want to see counts by sex and smoker status.
# In other words, we split by sex and smoker status to get 2x2 groups,
# then apply the size function to count the number of entries per group
# and finally combine the results into a new multi-index Series

grouped = tips.groupby(['sex', 'smoker'])
grouped.size()

In [ ]:
# If you need the margins, use the crosstab function

pd.crosstab(tips.sex, tips.smoker, margins=True)

In [ ]:
# If more than 1 column of resutls is generated, a DataFrame is returned

grouped.mean()

In [ ]:
# The returned results can be further manipulated via apply()
# For example, suppose the bill and tips are in USD but we want EUR

import json
import urllib

# get current conversion rate
converter = json.loads(urllib.request.urlopen('http://free.currencyconverterapi.com/api/v3/convert?q=USD_EUR&compact=ultra ').read())
print (converter)
grouped['total_bill', 'tip'].mean().apply(lambda x: x*converter['USD_EUR'])

In [ ]:
# We can also transform the original data for more convenient analysis
# For example, suppose we want standardized units for total bill and tips

zscore = lambda x: (x - x.mean())/x.std()

std_grouped = grouped['total_bill', 'tip'].transform(zscore)
std_grouped.head(n=4)

In [ ]:
# Suppose we want to apply a set of functions to only some columns
grouped['total_bill', 'tip'].agg(['mean', 'min', 'max'])

In [ ]:
# We can also apply specific functions to specific columns
df = grouped.agg({'total_bill': (min, max), 'tip': sum})
df