Setting up the notebook


In [1]:
# import pandas and numpy
import numpy as np
import pandas as pd

# Set some pandas options for controlling output
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

import matplotlib.pyplot as plt
# inline graphics
%matplotlib inline

Splitting a pandas object

Data for the examples


In [2]:
# load the sensors data
sensors = pd.read_csv("data/sensors.csv")
sensors


Out[2]:
    interval       sensor axis  reading
0          0        accel    Z      0.0
1          0        accel    Y      0.5
2          0        accel    X      1.0
3          1        accel    Z      0.1
4          1        accel    Y      0.4
..       ...          ...  ...      ...
19         2  orientation    Y      0.3
20         2  orientation    X      0.2
21         3  orientation    Z      0.0
22         3  orientation    Y      0.4
23         3  orientation    X      0.3

[24 rows x 4 columns]

Grouping by a column values


In [3]:
# group this data by the sensor column / variable
# returns a DataFrameGroupBy object
grouped = sensors.groupby('sensor')
grouped


Out[3]:
<pandas.core.groupby.DataFrameGroupBy object at 0x7fa6d54a8990>

In [4]:
# get the number of groups that this will create
grouped.ngroups


Out[4]:
2

In [5]:
# what are the groups that were found?
grouped.groups


Out[5]:
{'accel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
 'orientation': [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]}

Accessing the results of a grouping


In [6]:
# a helper function to print the contents of the groups
def print_groups (groupobject):
    # loop over all groups, printing the group name 
    # and group details
    for name, group in groupobject:
        print (name)
        print (group)

In [7]:
# examine the content of the groups we created
print_groups(grouped)


accel
    interval sensor axis  reading
0          0  accel    Z      0.0
1          0  accel    Y      0.5
2          0  accel    X      1.0
3          1  accel    Z      0.1
4          1  accel    Y      0.4
..       ...    ...  ...      ...
7          2  accel    Y      0.3
8          2  accel    X      0.8
9          3  accel    Z      0.3
10         3  accel    Y      0.2
11         3  accel    X      0.7

[12 rows x 4 columns]
orientation
    interval       sensor axis  reading
12         0  orientation    Z      0.0
13         0  orientation    Y      0.1
14         0  orientation    X      0.0
15         1  orientation    Z      0.0
16         1  orientation    Y      0.2
..       ...          ...  ...      ...
19         2  orientation    Y      0.3
20         2  orientation    X      0.2
21         3  orientation    Z      0.0
22         3  orientation    Y      0.4
23         3  orientation    X      0.3

[12 rows x 4 columns]

In [8]:
# get how many items are in each group
grouped.size()


Out[8]:
sensor
accel          12
orientation    12
dtype: int64

In [9]:
# get the count of items in each column of each group
grouped.count()


Out[9]:
             interval  axis  reading
sensor                              
accel              12    12       12
orientation        12    12       12

In [10]:
# get the data in one specific group
grouped.get_group('accel')


Out[10]:
    interval sensor axis  reading
0          0  accel    Z      0.0
1          0  accel    Y      0.5
2          0  accel    X      1.0
3          1  accel    Z      0.1
4          1  accel    Y      0.4
..       ...    ...  ...      ...
7          2  accel    Y      0.3
8          2  accel    X      0.8
9          3  accel    Z      0.3
10         3  accel    Y      0.2
11         3  accel    X      0.7

[12 rows x 4 columns]

In [11]:
# get the first three items in each group
grouped.head(3)


Out[11]:
    interval       sensor axis  reading
0          0        accel    Z      0.0
1          0        accel    Y      0.5
2          0        accel    X      1.0
12         0  orientation    Z      0.0
13         0  orientation    Y      0.1
14         0  orientation    X      0.0

In [12]:
# get the first item in each group
grouped.nth(0)


Out[12]:
             interval       sensor axis  reading
sensor                                          
accel               0        accel    Z        0
orientation         0  orientation    Z        0

In [13]:
# get the 2nd item in each group
grouped.nth(1)


Out[13]:
             interval       sensor axis  reading
sensor                                          
accel               0        accel    Y      0.5
orientation         0  orientation    Y      0.1

In [14]:
# and so on...
grouped.nth(2)


Out[14]:
             interval       sensor axis  reading
sensor                                          
accel               0        accel    X        1
orientation         0  orientation    X        0

In [15]:
# group by both sensor and axis values
mcg = sensors.groupby(['sensor', 'axis'])
print_groups(mcg)


('accel', 'X')
    interval sensor axis  reading
2          0  accel    X      1.0
5          1  accel    X      0.9
8          2  accel    X      0.8
11         3  accel    X      0.7
('accel', 'Y')
    interval sensor axis  reading
1          0  accel    Y      0.5
4          1  accel    Y      0.4
7          2  accel    Y      0.3
10         3  accel    Y      0.2
('accel', 'Z')
   interval sensor axis  reading
0         0  accel    Z      0.0
3         1  accel    Z      0.1
6         2  accel    Z      0.2
9         3  accel    Z      0.3
('orientation', 'X')
    interval       sensor axis  reading
14         0  orientation    X      0.0
17         1  orientation    X      0.1
20         2  orientation    X      0.2
23         3  orientation    X      0.3
('orientation', 'Y')
    interval       sensor axis  reading
13         0  orientation    Y      0.1
16         1  orientation    Y      0.2
19         2  orientation    Y      0.3
22         3  orientation    Y      0.4
('orientation', 'Z')
    interval       sensor axis  reading
12         0  orientation    Z        0
15         1  orientation    Z        0
18         2  orientation    Z        0
21         3  orientation    Z        0

In [16]:
# get descriptive statistics for each
mcg.describe()


Out[16]:
                        interval   reading
sensor      axis                          
accel       X    count  4.000000  4.000000
                 mean   1.500000  0.850000
                 std    1.290994  0.129099
                 min    0.000000  0.700000
                 25%    0.750000  0.775000
...                          ...       ...
orientation Z    min    0.000000  0.000000
                 25%    0.750000  0.000000
                 50%    1.500000  0.000000
                 75%    2.250000  0.000000
                 max    3.000000  0.000000

[48 rows x 2 columns]

Grouping using index levels


In [17]:
# make a copy of the data and reindex the copy
mi = sensors.copy()
mi = mi.set_index(['sensor', 'axis'])
mi


Out[17]:
                  interval  reading
sensor      axis                   
accel       Z            0      0.0
            Y            0      0.5
            X            0      1.0
            Z            1      0.1
            Y            1      0.4
...                    ...      ...
orientation Y            2      0.3
            X            2      0.2
            Z            3      0.0
            Y            3      0.4
            X            3      0.3

[24 rows x 2 columns]

In [18]:
# group by the first level of the index 
mig_l1 = mi.groupby(level=0)
print_groups(mig_l1)


accel
             interval  reading
sensor axis                   
accel  Z            0      0.0
       Y            0      0.5
       X            0      1.0
       Z            1      0.1
       Y            1      0.4
...               ...      ...
       Y            2      0.3
       X            2      0.8
       Z            3      0.3
       Y            3      0.2
       X            3      0.7

[12 rows x 2 columns]
orientation
                  interval  reading
sensor      axis                   
orientation Z            0      0.0
            Y            0      0.1
            X            0      0.0
            Z            1      0.0
            Y            1      0.2
...                    ...      ...
            Y            2      0.3
            X            2      0.2
            Z            3      0.0
            Y            3      0.4
            X            3      0.3

[12 rows x 2 columns]

In [19]:
# group by multiple levels of the index
mig_l12 = mi.groupby(level=['sensor', 'axis'])
print_groups(mig_l12)


('accel', 'X')
             interval  reading
sensor axis                   
accel  X            0      1.0
       X            1      0.9
       X            2      0.8
       X            3      0.7
('accel', 'Y')
             interval  reading
sensor axis                   
accel  Y            0      0.5
       Y            1      0.4
       Y            2      0.3
       Y            3      0.2
('accel', 'Z')
             interval  reading
sensor axis                   
accel  Z            0      0.0
       Z            1      0.1
       Z            2      0.2
       Z            3      0.3
('orientation', 'X')
                  interval  reading
sensor      axis                   
orientation X            0      0.0
            X            1      0.1
            X            2      0.2
            X            3      0.3
('orientation', 'Y')
                  interval  reading
sensor      axis                   
orientation Y            0      0.1
            Y            1      0.2
            Y            2      0.3
            Y            3      0.4
('orientation', 'Z')
                  interval  reading
sensor      axis                   
orientation Z            0        0
            Z            1        0
            Z            2        0
            Z            3        0

Apply (aggregation)

Applying aggregation functions to groups


In [20]:
# calculate the mean for each sensor/axis
mig_l12.agg(np.mean)


Out[20]:
                  interval  reading
sensor      axis                   
accel       X          1.5     0.85
            Y          1.5     0.35
            Z          1.5     0.15
orientation X          1.5     0.15
            Y          1.5     0.25
            Z          1.5     0.00

In [21]:
# do not create an index matching the original object
sensors.groupby(['sensor', 'axis'], 
                as_index=False).agg(np.mean)


Out[21]:
        sensor axis  interval  reading
0        accel    X       1.5     0.85
1        accel    Y       1.5     0.35
2        accel    Z       1.5     0.15
3  orientation    X       1.5     0.15
4  orientation    Y       1.5     0.25
5  orientation    Z       1.5     0.00

In [22]:
# can simply apply the agg function to the group by object
mig_l12.mean()


Out[22]:
                  interval  reading
sensor      axis                   
accel       X          1.5     0.85
            Y          1.5     0.35
            Z          1.5     0.15
orientation X          1.5     0.15
            Y          1.5     0.25
            Z          1.5     0.00

In [23]:
# apply multiple aggregation functions at once
mig_l12.agg([np.sum, np.std])


Out[23]:
                 interval           reading          
                      sum       std     sum       std
sensor      axis                                     
accel       X           6  1.290994     3.4  0.129099
            Y           6  1.290994     1.4  0.129099
            Z           6  1.290994     0.6  0.129099
orientation X           6  1.290994     0.6  0.129099
            Y           6  1.290994     1.0  0.129099
            Z           6  1.290994     0.0  0.000000

In [24]:
# apply a different function to each column
mig_l12.agg({'interval' : len,
             'reading': np.mean})


Out[24]:
                  interval  reading
sensor      axis                   
accel       X            4     0.85
            Y            4     0.35
            Z            4     0.15
orientation X            4     0.15
            Y            4     0.25
            Z            4     0.00

In [25]:
# calculate the mean of the reading column
mig_l12['reading'].mean()


Out[25]:
sensor       axis
accel        X       0.85
             Y       0.35
             Z       0.15
orientation  X       0.15
             Y       0.25
             Z       0.00
Name: reading, dtype: float64

Transformation of group data

Overview of transformations


In [26]:
# a DataFrame to use for examples
df = pd.DataFrame({ 'Label': ['A', 'C', 'B', 'A', 'C'],
                    'Values': [0, 1, 2, 3, 4],
                    'Values2': [5, 6, 7, 8, 9],
                    'Noise': ['foo', 'bar', 'baz', 
                              'foobar', 'barbaz']})
df


Out[26]:
  Label   Noise  Values  Values2
0     A     foo       0        5
1     C     bar       1        6
2     B     baz       2        7
3     A  foobar       3        8
4     C  barbaz       4        9

In [27]:
# group by label
grouped = df.groupby('Label')
print_groups(grouped)


A
  Label   Noise  Values  Values2
0     A     foo       0        5
3     A  foobar       3        8
B
  Label Noise  Values  Values2
2     B   baz       2        7
C
  Label   Noise  Values  Values2
1     C     bar       1        6
4     C  barbaz       4        9

In [28]:
# add ten to all values in all columns
grouped.transform(lambda x: x + 10)


Out[28]:
   Values  Values2
0      10       15
1      11       16
2      12       17
3      13       18
4      14       19

In [29]:
# a function to print the input before we are adding 10 to it
def xplus10(x):
    print (x)
    return x + 10

In [30]:
# transform using xplus10
grouped.transform(xplus10)


0       foo
3    foobar
Name: Noise, dtype: object
0       foo
3    foobar
Name: Noise, dtype: object
0       foo
3    foobar
Name: A, dtype: object
0    0
3    3
Name: A, dtype: int64
2    2
Name: B, dtype: int64
1    1
4    4
Name: C, dtype: int64
0    5
3    8
Name: A, dtype: int64
2    7
Name: B, dtype: int64
1    6
4    9
Name: C, dtype: int64
Out[30]:
   Values  Values2
0      10       15
1      11       16
2      12       17
3      13       18
4      14       19

In [31]:
# sum returns existing as it is applied to each individual item
grouped.transform(lambda x: x.sum())


Out[31]:
    Noise  Values  Values2
0     foo       0        5
1     bar       1        6
2     baz       2        7
3  foobar       3        8
4  barbaz       4        9

Practical examples of transformation


In [32]:
# data to demonstrate replacement on NaN
df = pd.DataFrame({ 'Label': list("ABABAB"),
                    'Values': [10, 20, 11, np.nan, 12, 22]},
                  index=['i1', 'i2', 'i3', 'i4', 'i5', 'i6'])
df


Out[32]:
   Label  Values
i1     A      10
i2     B      20
i3     A      11
i4     B     NaN
i5     A      12
i6     B      22

In [33]:
# show the groups in the data based upon Label
grouped = df.groupby('Label')
print_groups(grouped)


A
   Label  Values
i1     A      10
i3     A      11
i5     A      12
B
   Label  Values
i2     B      20
i4     B     NaN
i6     B      22

In [34]:
# calculate the mean of the two groups
grouped.mean()


Out[34]:
       Values
Label        
A          11
B          21

In [35]:
# use transform to fill the NaNs with the mean of the group
filled_NaNs = grouped.transform(lambda x: x.fillna(x.mean()))
filled_NaNs


Out[35]:
    Values
i1      10
i2      20
i3      11
i4      21
i5      12
i6      22

In [36]:
# overwrite old values with the new ones
df.Values = filled_NaNs
df


Out[36]:
   Label  Values
i1     A      10
i2     B      20
i3     A      11
i4     B      21
i5     A      12
i6     B      22

In [37]:
# generate a rolling mean time series
np.random.seed(123456)
data = pd.Series(np.random.normal(0.5, 2, 365*3), 
                 pd.date_range('2011-01-01', periods=365*3))
rolling = pd.rolling_mean(data, 100, 100).dropna()
rolling


Out[37]:
2011-04-10    0.073603
2011-04-11    0.057313
2011-04-12    0.089255
2011-04-13    0.133248
2011-04-14    0.175876
                ...   
2013-12-26    0.811402
2013-12-27    0.835961
2013-12-28    0.850998
2013-12-29    0.842293
2013-12-30    0.848419
Freq: D, dtype: float64

In [38]:
# visualize the series
rolling.plot();



In [39]:
# calculate mean and std by year
groupkey = lambda x: x.year
groups = rolling.groupby(groupkey)
groups.agg([np.mean, np.std])


Out[39]:
          mean       std
2011  0.454233  0.171988
2012  0.286874  0.181930
2013  0.599933  0.276009

In [40]:
# normalize to the z-score
zscore = lambda x: (x - x.mean()) / x.std()
normed = rolling.groupby(groupkey).transform(zscore)
normed.groupby(groupkey).agg([np.mean, np.std])


Out[40]:
              mean  std
2011 -3.172066e-17    1
2012  4.246755e-17    1
2013 -3.388620e-16    1

In [41]:
# plot original vs normalize
compared = pd.DataFrame({ 'Original': rolling,
                          'Normed': normed })
compared.plot();



In [42]:
# check the distribution % within one std
# should be roughly 64.2%
normed_in1std = normed[np.abs(normed) <= 1.0].count()
float(normed_in1std) / len(normed)


Out[42]:
0.6485943775100401

Filtering groups


In [43]:
# data for our examples
df = pd.DataFrame({'Label': list('AABCCC'),
                   'Values': [1, 2, 3, 4, np.nan, 8]})
df


Out[43]:
  Label  Values
0     A       1
1     A       2
2     B       3
3     C       4
4     C     NaN
5     C       8

In [44]:
# drop groups with one or fewer non-NaN values
f = lambda x: x.Values.count() > 1
df.groupby('Label').filter(f)


Out[44]:
  Label  Values
0     A       1
1     A       2
3     C       4
4     C     NaN
5     C       8

In [45]:
# drop any groups with NaN values
f = lambda x: x.Values.isnull().sum() == 0
df.groupby('Label').filter(f)


Out[45]:
  Label  Values
0     A       1
1     A       2
2     B       3

In [46]:
# select groups with a mean of 2.0 or greater 
grouped = df.groupby('Label')
mean = grouped.mean().mean()
f = lambda x: abs(x.Values.mean() - mean) > 2.0
df.groupby('Label').filter(f)


Out[46]:
  Label  Values
3     C       4
4     C     NaN
5     C       8

In [47]:
# replace values in a group where the # of items is <= 1
f = lambda x: x.Values.count() > 1
df.groupby('Label').filter(f, dropna=False)


Out[47]:
  Label  Values
0     A       1
1     A       2
2   NaN     NaN
3     C       4
4     C     NaN
5     C       8

Discretization


In [48]:
# generate 10000 normal random #'s
np.random.seed(123456)
dist = np.random.normal(size = 10000)

# show the mean and std
"{0} {1}".format(dist.mean(), dist.std())


Out[48]:
'-0.00286332404091 1.0087162032'

In [49]:
# and the actual data
dist


Out[49]:
array([ 0.4691123 , -0.28286334, -1.5090585 , ...,  0.26296448,
       -0.83377412, -0.10418135])

In [50]:
# split the data into 5 bins
bins = pd.cut(dist, 5)
bins


Out[50]:
[(-0.633, 0.81], (-0.633, 0.81], (-2.0771, -0.633], (-2.0771, -0.633], (0.81, 2.254], ..., (-2.0771, -0.633], (-0.633, 0.81], (-0.633, 0.81], (-2.0771, -0.633], (-0.633, 0.81]]
Length: 10000
Categories (5, object): [(-3.528, -2.0771] < (-2.0771, -0.633] < (-0.633, 0.81] < (0.81, 2.254] < (2.254, 3.698]]

In [51]:
# show the categories in the bins
bins.categories


Out[51]:
Index([u'(-3.528, -2.0771]', u'(-2.0771, -0.633]', u'(-0.633, 0.81]', u'(0.81, 2.254]', u'(2.254, 3.698]'], dtype='object')

In [52]:
# demonstrate the math to calculate the bins
min = dist.min()
max = dist.max()
delta = max - min
iwidth = delta/5
extra = delta*0.001
intervals = np.arange(min, max + extra, iwidth)
intervals[0] -= delta*0.001
intervals


Out[52]:
array([-3.52809473, -2.07714421, -0.63341235,  0.81031951,  2.25405137,
        3.69778323])

In [53]:
# codes tells us which bin each item is in
bins.codes


Out[53]:
array([2, 2, 1, ..., 2, 1, 2], dtype=int8)

In [54]:
# move the closed side of the interval to the left
pd.cut(dist, 5, right=False).categories


Out[54]:
Index([u'[-3.521, -2.0771)', u'[-2.0771, -0.633)', u'[-0.633, 0.81)', u'[0.81, 2.254)', u'[2.254, 3.705)'], dtype='object')

In [55]:
# generate 50 ages between 6 and 45
np.random.seed(123456)
ages = np.random.randint(6, 45, 50)
ages


Out[55]:
array([ 7, 33, 38, 29, 42, 14, 16, 16, 18, 17, 26, 28, 44, 40, 20, 12,  8,
       10, 36, 29, 26, 26, 11, 29, 42, 17, 41, 35, 22, 40, 24, 21, 38, 33,
       26, 23, 16, 34, 26, 20, 18, 42, 27, 13, 37, 37, 10,  7, 10, 23])

In [56]:
# cut into ranges and then get descriptive stats
ranges = [6, 12, 18, 35, 50]
agebins = pd.cut(ages, ranges)
agebins.describe()


Out[56]:
            counts  freqs
categories               
(6, 12]          8   0.16
(12, 18]         9   0.18
(18, 35]        21   0.42
(35, 50]        12   0.24

In [57]:
# add names for the bins
ranges = [6, 12, 18, 35, 50]
labels = ['Youth', 'Young Adult', 'Adult', 'Middle Aged']
agebins = pd.cut(ages, ranges, labels=labels)
agebins.describe()


Out[57]:
             counts  freqs
categories                
Youth             8   0.16
Young Adult       9   0.18
Adult            21   0.42
Middle Aged      12   0.24

In [58]:
# cut into quantiles
# 5 bins with an equal quantity of items
qbin = pd.qcut(dist, 5)
# this will tell us the range of values in each quantile
qbin.describe()


Out[58]:
                  counts  freqs
categories                     
[-3.521, -0.861]    2000    0.2
(-0.861, -0.241]    2000    0.2
(-0.241, 0.261]     2000    0.2
(0.261, 0.866]      2000    0.2
(0.866, 3.698]      2000    0.2

In [59]:
# make the quantiles at the +/- 3, 2 and 1 std deviations
quantiles = [0,0.001,
             0.021,
             0.5-0.341,
             0.5,
             0.5+0.341,
             1.0-0.021,
             1.0-0.001,
             1.0]
qbin = pd.qcut(dist, quantiles)
# this data should be a perfect normal distribution
qbin.describe()


Out[59]:
                     counts  freqs
categories                        
[-3.521, -3.131]         10  0.001
(-3.131, -2.0562]       200  0.020
(-2.0562, -1.0332]     1380  0.138
(-1.0332, -0.00363]    3410  0.341
(-0.00363, 1.0114]     3410  0.341
(1.0114, 2.0428]       1380  0.138
(2.0428, 3.0619]        200  0.020
(3.0619, 3.698]          10  0.001