# DataScience2

Adapted from Computational Statistics in Python @ Duke

Note: Because the pandas API is a moving target, on-line tutorials go out of date. Adapting a tutorial to fit the new realities is good exercise in itself.

``````

In :

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

``````
``````

In :

import pandas as pd
# import pandas.tseries as ts
from pandas import Series, DataFrame, Panel
from string import ascii_lowercase as letters
from scipy.stats import chisqprob

``````
``````

/Users/mac/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
This is separate from the ipykernel package so we can avoid doing imports until

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-2-7d8cf4edbe10> in <module>
3 from pandas import Series, DataFrame, Panel
4 from string import ascii_lowercase as letters
----> 5 from scipy.stats import chisqprob

ImportError: cannot import name 'chisqprob' from 'scipy.stats' (/Users/mac/opt/anaconda3/lib/python3.7/site-packages/scipy/stats/__init__.py)

``````
``````

In [ ]:

xs = Series(np.arange(10), index=tuple(letters[:10]))
print(xs[:3],'\n')
print(xs[7:], '\n')
print(xs[::3], '\n')
print(xs[['d', 'f', 'h']], '\n')
print(xs.d, xs.f, xs.h)

``````
``````

In [ ]:

# All the numpy functions wiill work with Series objects, and return another Series

y1, y2 = np.mean(xs), np.var(xs)
y1, y2

``````
``````

In [ ]:

# Matplotlib will work on Series objects too
plt.plot(xs, np.sin(xs), 'r-o', xs, np.cos(xs), 'b-x');

``````
``````

In [ ]:

print(xs.values)

``````

numpy.random.normal takes mean and standard deviation as inputs, also size (how many).

``````

In [ ]:

# The Series datatype can also be used to represent time series

import datetime as dt
from pandas import date_range

# today = dt.date.today()
today = dt.datetime.strptime('May 16 2018', '%b %d %Y') # plain Python
print(today, '\n')
days = date_range(today, periods=35, freq='D')  # for indexing
ts = Series(np.random.normal(10, 1, len(days)), index=days)

# Extracting elements
print (ts[0:4], '\n')
print (ts['2018-05-21':'2018-05-28'], '\n') # Note - includes end time

``````
``````

In [ ]:

# df = ts.resample(rule='W', how=('mean', 'std', lambda x: sum(x*x))) -- OLD

df = ts.resample(rule='W').apply([np.mean, np.std, lambda x: sum(x*x)])
df

``````
``````

In [ ]:

# Renaming columns
# The use of mean and std are problmeatic because there are also methods in dataframe with those names
# Also, <lambda> is unifnormative
# So we would like to give better names to the columns of df

df.columns = ('mu', 'sigma', 'sum_of_sq')
print(df)

``````
``````

In [ ]:

print (df.mu, '\n') # by attribute
print (df['sigma'], '\n') # by column name

``````
``````

In [ ]:

# Extracting rows from a DataFrame

print( df[1:3], '\n')
print( df['2018-05-16'::2])

``````
``````

In [ ]:

# Extracting blocks and scalars

print (df.iat[2, 2], '\n') # extract an element with iat()
print (df.loc['2018-05-16':'2018-06-20', 'sum_of_sq'], '\n') # indexing by label
print (df.iloc[:3, 2], '\n')  # indexing by position
print (df.iloc[:3,:].loc[:,'sum_of_sq'], '\n') # combining the two

``````
``````

In [ ]:

# Using Boolean conditions for selecting eleements

print (df[(df.sigma < 1) & (df.sum_of_sq < 700)], '\n') # need parenthesis because of operator precedence
print (df.query('sigma < 1 and sum_of_sq < 700')) # the query() method allows more readable query strings

``````
``````

In [ ]:

df= np.random.binomial(100, 0.95, (9,2))
dm = np.random.binomial(100, 0.9, [12,2])
dff = DataFrame(df, columns = ['Physics', 'Math'])
dfm = DataFrame(dm, columns = ['Physics', 'Math'])
score_panel = Panel({'Girls': dff, 'Boys': dfm})
print(score_panel, '\n')

``````
``````

In [ ]:

score_panel['Girls'].transpose()

``````
``````

In [ ]:

# find physics and math scores of girls who scored >= 93 in math
# a DataFrame is returned
score_panel.loc['Girls', score_panel.Girls.Math >= 93, :]

``````
``````

In [ ]:

# import a DataFrame to play with
try:
except:
tips.to_pickle('tips.pic')

``````
``````

In [ ]:

``````
``````

In [ ]:

# We have an extra set of indices in the first column
# Let's get rid of it

tips = tips.iloc[:, 1:]

``````
``````

In [ ]:

# For an example of the split-apply-combine pattern, we want to see counts by sex and smoker status.
# In other words, we split by sex and smoker status to get 2x2 groups,
# then apply the size function to count the number of entries per group
# and finally combine the results into a new multi-index Series

grouped = tips.groupby(['sex', 'smoker'])
grouped.size()

``````
``````

In [ ]:

# If you need the margins, use the crosstab function

pd.crosstab(tips.sex, tips.smoker, margins=True)

``````
``````

In [ ]:

# If more than 1 column of resutls is generated, a DataFrame is returned

grouped.mean()

``````
``````

In [ ]:

# The returned results can be further manipulated via apply()
# For example, suppose the bill and tips are in USD but we want EUR

import json
import urllib

# get current conversion rate
print (converter)
grouped['total_bill', 'tip'].mean().apply(lambda x: x*converter['USD_EUR'])

``````
``````

In [ ]:

# We can also transform the original data for more convenient analysis
# For example, suppose we want standardized units for total bill and tips

zscore = lambda x: (x - x.mean())/x.std()

std_grouped = grouped['total_bill', 'tip'].transform(zscore)

``````
``````

In [ ]:

# Suppose we want to apply a set of functions to only some columns
grouped['total_bill', 'tip'].agg(['mean', 'min', 'max'])

``````
``````

In [ ]:

# We can also apply specific functions to specific columns
df = grouped.agg({'total_bill': (min, max), 'tip': sum})
df

``````