In [31]:
import pandas as pd
import numpy as np

In [1]:
# Sample records of student test scores from two classes.
# Each record holds the student's name, their class label and their score.
class_data = [
    {'student': 'AJ', 'class': 'A', 'score': 9},
    {'student': 'Paul', 'class': 'A', 'score': 8},
    {'student': 'Raymond', 'class': 'A', 'score': 7},
    {'student': 'Jenny', 'class': 'B', 'score': 5},
    {'student': 'Pete', 'class': 'B', 'score': 4},
    {'student': 'Colin', 'class': 'B', 'score': 6},
    {'student': 'Sarah', 'class': 'B', 'score': 4},
]

In [3]:
# Build a dataframe from the list of records: one row per student dict
df = pd.DataFrame(data=class_data)

In [67]:
# What does the dataframe look like?
# Ending the cell with the bare name displays the whole table.
df


Out[67]:
class score student
0 A 9 AJ
1 A 8 Paul
2 A 7 Raymond
3 B 5 Jenny
4 B 4 Pete
5 B 6 Colin
6 B 4 Sarah

7 rows × 3 columns


In [68]:
# Average score over every student, regardless of class
df.loc[:, 'score'].mean()


Out[68]:
6.1428571428571432

In [8]:
# Build a boolean mask that is True for the rows of class A.
# A mask like this can be used to index the dataframe and keep only those rows.
df['class'].eq('A')


Out[8]:
0     True
1     True
2     True
3    False
4    False
5    False
6    False
Name: class, dtype: bool

In [21]:
# Keep only the rows whose class is 'A'
df.loc[df['class'] == 'A']


Out[21]:
class score student
0 A 9 AJ
1 A 8 Paul
2 A 7 Raymond

3 rows × 3 columns


In [14]:
# A similar split can be done for every class at once by grouping on 'class'
group = df.groupby('class')

In [26]:
# groupby returns a DataFrameGroupBy object rather than a dataframe
type(group)


Out[26]:
pandas.core.groupby.DataFrameGroupBy

In [69]:
# We can get the per-class mean from this object.
# Only the numeric 'score' column is selected: a mean of the text column
# 'student' is meaningless, and pandas 2.x raises a TypeError for it
# instead of silently dropping the column as older versions did.
class_mean = group[['score']].mean()

# Selecting with a list of columns keeps the result a dataframe
type(class_mean)


Out[69]:
pandas.core.frame.DataFrame

In [70]:
# Mean score of each class, indexed by the class label
class_mean


Out[70]:
score
class
A 8.00
B 4.75

2 rows × 1 columns


In [71]:
# Next, we can specify a column and use .aggregate to perform
# multiple calculations on one column.
# Passing a list of function names gives one output column per function,
# named after that function. 'std' is the sample standard deviation.
# A dict of {new_name: function} is avoided on purpose: on a single column
# that form was deprecated in pandas 0.20 and raises SpecificationError
# from pandas 1.0 onwards.
class_info = group['score'].aggregate(['std', 'sum', 'mean'])

# this returns a dataframe
type(class_info)


Out[71]:
pandas.core.frame.DataFrame

In [72]:
# Standard deviation, sum and mean of the scores for each class
class_info


Out[72]:
std sum mean
class
A 1.000000 24 8.00
B 0.957427 19 4.75

2 rows × 3 columns