In [1]:
# import pandas and numpy
import numpy as np
import pandas as pd
# Set some pandas options for controlling output
# (plain-text reprs and truncated rows/columns keep printed output compact)
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
import matplotlib.pyplot as plt
# inline graphics
%matplotlib inline
In [2]:
# load the sensors data
# NOTE(review): path is relative to the notebook's working directory
sensors = pd.read_csv("data/sensors.csv")
# last expression of the cell displays the frame
sensors
Out[2]:
In [3]:
# group this data by the sensor column / variable
# returns a DataFrameGroupBy object (lazy - no grouping happens yet)
grouped = sensors.groupby('sensor')
grouped
Out[3]:
In [4]:
# get the number of groups that this will create
grouped.ngroups
Out[4]:
In [5]:
# what are the groups that were found?
# .groups maps each group key to the row labels belonging to that group
grouped.groups
Out[5]:
In [6]:
# a helper function to print the contents of the groups
def print_groups(groupobject):
    """Print each group's key followed by the rows in that group.

    Parameters
    ----------
    groupobject : a pandas GroupBy object (iterating yields (name, group) pairs)
    """
    # loop over all groups, printing the group name
    # and group details
    for name, group in groupobject:
        print(name)
        print(group)
In [7]:
# examine the content of the groups we created
print_groups(grouped)
In [8]:
# get how many items are in each group
grouped.size()
Out[8]:
In [9]:
# get the count of items in each column of each group
# (count excludes NaN, so it can differ per column)
grouped.count()
Out[9]:
In [10]:
# get the data in one specific group
grouped.get_group('accel')
Out[10]:
In [11]:
# get the first three items in each group
grouped.head(3)
Out[11]:
In [12]:
# get the first item in each group
grouped.nth(0)
Out[12]:
In [13]:
# get the 2nd item in each group
grouped.nth(1)
Out[13]:
In [14]:
# and so on...
grouped.nth(2)
Out[14]:
In [15]:
# group by both sensor and axis values
# group keys become (sensor, axis) tuples
mcg = sensors.groupby(['sensor', 'axis'])
print_groups(mcg)
In [16]:
# get descriptive statistics for each
mcg.describe()
Out[16]:
In [17]:
# make a copy of the data and reindex the copy
# (so the original sensors frame is untouched)
mi = sensors.copy()
mi = mi.set_index(['sensor', 'axis'])
mi
Out[17]:
In [18]:
# group by the first level of the index (the sensor level)
mig_l1 = mi.groupby(level=0)
print_groups(mig_l1)
In [19]:
# group by multiple levels of the index, referenced by name
mig_l12 = mi.groupby(level=['sensor', 'axis'])
print_groups(mig_l12)
In [20]:
# calculate the mean for each sensor/axis
mig_l12.agg(np.mean)
Out[20]:
In [21]:
# do not create an index matching the original object;
# the group keys come back as ordinary columns instead
sensors.groupby(['sensor', 'axis'],
as_index=False).agg(np.mean)
Out[21]:
In [22]:
# can simply apply the agg function to the group by object
mig_l12.mean()
Out[22]:
In [23]:
# apply multiple aggregation functions at once
# result has one column level per function
mig_l12.agg([np.sum, np.std])
Out[23]:
In [24]:
# apply a different function to each column
mig_l12.agg({'interval' : len,
'reading': np.mean})
Out[24]:
In [25]:
# calculate the mean of the reading column only
mig_l12['reading'].mean()
Out[25]:
In [26]:
# a DataFrame to use for examples
df = pd.DataFrame({ 'Label': ['A', 'C', 'B', 'A', 'C'],
'Values': [0, 1, 2, 3, 4],
'Values2': [5, 6, 7, 8, 9],
'Noise': ['foo', 'bar', 'baz',
'foobar', 'barbaz']})
df
Out[26]:
In [27]:
# group by label
grouped = df.groupby('Label')
print_groups(grouped)
In [28]:
# add ten to all values in all columns
# NOTE(review): presumably the non-numeric Noise column is excluded
# by transform here - verify against the recorded output
grouped.transform(lambda x: x + 10)
Out[28]:
In [29]:
# a function to print the input before we are adding 10 to it
def xplus10(x):
    """Return x + 10, printing the input first (shows what transform passes in)."""
    print(x)
    return x + 10
In [30]:
# transform using xplus10
grouped.transform(xplus10)
Out[30]:
In [31]:
# transform broadcasts each group's sum back to every row of that group,
# so the result has the same shape as the original data
grouped.transform(lambda x: x.sum())
Out[31]:
In [32]:
# data to demonstrate replacement on NaN
df = pd.DataFrame({ 'Label': list("ABABAB"),
'Values': [10, 20, 11, np.nan, 12, 22]},
index=['i1', 'i2', 'i3', 'i4', 'i5', 'i6'])
df
Out[32]:
In [33]:
# show the groups in the data based upon Label
grouped = df.groupby('Label')
print_groups(grouped)
In [34]:
# calculate the mean of the two groups (NaN is skipped)
grouped.mean()
Out[34]:
In [35]:
# use transform to fill the NaNs with the mean of the group
filled_NaNs = grouped.transform(lambda x: x.fillna(x.mean()))
filled_NaNs
Out[35]:
In [36]:
# overwrite old values with the new ones
# NOTE(review): attribute assignment of a one-column frame relies on
# pandas index alignment; df['Values'] = filled_NaNs['Values'] would be
# the more explicit idiom - confirm against the recorded output
df.Values = filled_NaNs
df
Out[36]:
In [37]:
# generate a rolling mean time series: 3 years of daily draws
# from N(0.5, 2), seeded for reproducibility
np.random.seed(123456)
data = pd.Series(np.random.normal(0.5, 2, 365*3),
                 pd.date_range('2011-01-01', periods=365*3))
# pd.rolling_mean(data, window, min_periods) was deprecated in pandas 0.18
# and later removed; the .rolling() method computes the identical result.
# min_periods=100 leaves the first 99 entries NaN, which dropna() removes.
rolling = data.rolling(window=100, min_periods=100).mean().dropna()
rolling
Out[37]:
In [38]:
# visualize the series (trailing ; suppresses the axes repr)
rolling.plot();
In [39]:
# calculate mean and std by year
groupkey = lambda x: x.year
groups = rolling.groupby(groupkey)
groups.agg([np.mean, np.std])
Out[39]:
In [40]:
# normalize to the z-score within each year
zscore = lambda x: (x - x.mean()) / x.std()
normed = rolling.groupby(groupkey).transform(zscore)
# each year should now have mean ~0 and std ~1
normed.groupby(groupkey).agg([np.mean, np.std])
Out[40]:
In [41]:
# plot original vs normalized
compared = pd.DataFrame({ 'Original': rolling,
'Normed': normed })
compared.plot();
In [42]:
# check the distribution % within one std
# should be roughly 68.3% for normally distributed data
normed_in1std = normed[np.abs(normed) <= 1.0].count()
float(normed_in1std) / len(normed)
Out[42]:
In [43]:
# data for our examples
df = pd.DataFrame({'Label': list('AABCCC'),
'Values': [1, 2, 3, 4, np.nan, 8]})
df
Out[43]:
In [44]:
# drop groups with one or fewer non-NaN values
f = lambda x: x.Values.count() > 1
df.groupby('Label').filter(f)
Out[44]:
In [45]:
# drop any groups with NaN values
f = lambda x: x.Values.isnull().sum() == 0
df.groupby('Label').filter(f)
Out[45]:
In [46]:
# keep groups whose mean differs from the overall mean by more than 2.0
grouped = df.groupby('Label')
# overall mean, computed as the mean of the per-group means
mean = grouped.mean().mean()
f = lambda x: abs(x.Values.mean() - mean) > 2.0
df.groupby('Label').filter(f)
Out[46]:
In [47]:
# dropna=False keeps every row, but rows of groups failing the
# predicate (here: groups with <= 1 item) are replaced with NaN
f = lambda x: x.Values.count() > 1
df.groupby('Label').filter(f, dropna=False)
Out[47]:
In [48]:
# generate 10000 normal random #'s
np.random.seed(123456)
dist = np.random.normal(size = 10000)
# show the mean and std
"{0} {1}".format(dist.mean(), dist.std())
Out[48]:
In [49]:
# and the actual data
dist
Out[49]:
In [50]:
# split the data into 5 equal-width bins
bins = pd.cut(dist, 5)
bins
Out[50]:
In [51]:
# show the categories (interval edges) in the bins
bins.categories
Out[51]:
In [52]:
# demonstrate the math used to calculate the equal-width bin edges
# (renamed from min/max to avoid shadowing the Python builtins)
dist_min = dist.min()
dist_max = dist.max()
delta = dist_max - dist_min
iwidth = delta/5
# pad the top edge slightly so arange includes the maximum value
extra = delta*0.001
intervals = np.arange(dist_min, dist_max + extra, iwidth)
# widen the bottom edge by 0.1% so the minimum value falls inside the first bin,
# matching how pd.cut extends the range
intervals[0] -= delta*0.001
intervals
Out[52]:
In [53]:
# codes tells us which bin each item is in (0-based bin index)
bins.codes
Out[53]:
In [54]:
# move the closed side of the interval to the left: bins become [a, b)
pd.cut(dist, 5, right=False).categories
Out[54]:
In [55]:
# generate 50 random ages; randint's upper bound is exclusive,
# so values fall in the range 6..44
np.random.seed(123456)
ages = np.random.randint(6, 45, 50)
ages
Out[55]:
In [56]:
# cut into explicit ranges and then get descriptive stats
ranges = [6, 12, 18, 35, 50]
agebins = pd.cut(ages, ranges)
agebins.describe()
Out[56]:
In [57]:
# add names for the bins (one label per interval in ranges)
ranges = [6, 12, 18, 35, 50]
labels = ['Youth', 'Young Adult', 'Adult', 'Middle Aged']
agebins = pd.cut(ages, ranges, labels=labels)
agebins.describe()
Out[57]:
In [58]:
# cut into quantiles
# 5 bins with an (approximately) equal quantity of items
qbin = pd.qcut(dist, 5)
# this will tell us the range of values in each quantile
qbin.describe()
Out[58]:
In [59]:
# make the quantiles at the +/- 3, 2 and 1 std deviations:
# 0.001 ~ P(Z < -3), 0.021 ~ P(Z < -2), 0.5-0.341 ~ P(Z < -1),
# mirrored on the upper side
quantiles = [0,0.001,
0.021,
0.5-0.341,
0.5,
0.5+0.341,
1.0-0.021,
1.0-0.001,
1.0]
qbin = pd.qcut(dist, quantiles)
# the bin counts should match the expected normal-tail proportions
qbin.describe()
Out[59]: