In [31]:
import pandas as pd
import numpy as np
In [1]:
# Test scores for seven students split across two classes ('A' and 'B').
# Each record stores the student's name, their class and their score.
class_data = [
    {'student': 'AJ', 'class': 'A', 'score': 9},
    {'student': 'Paul', 'class': 'A', 'score': 8},
    {'student': 'Raymond', 'class': 'A', 'score': 7},
    {'student': 'Jenny', 'class': 'B', 'score': 5},
    {'student': 'Pete', 'class': 'B', 'score': 4},
    {'student': 'Colin', 'class': 'B', 'score': 6},
    {'student': 'Sarah', 'class': 'B', 'score': 4},
]
In [3]:
# Build a DataFrame from the list of records: one row per dict,
# one column per dict key
df = pd.DataFrame(data=class_data)
In [67]:
# Display the full DataFrame: one row per student, with the
# 'student', 'class' and 'score' columns taken from the record keys
df
Out[67]:
In [68]:
# Mean score across all students, regardless of class
df['score'].mean()
Out[68]:
In [8]:
# Build a boolean mask that is True for every row belonging to class A;
# it can be used as an index to keep only those students
df['class'].eq('A')
Out[8]:
In [21]:
# Keep only the rows whose 'class' value is 'A'
df.loc[df['class'].eq('A')]
Out[21]:
In [14]:
# Rather than filtering one class at a time, split the rows
# into one group per distinct 'class' value
group = df.groupby('class')
In [26]:
# groupby returns a DataFrameGroupBy object rather than a DataFrame;
# an aggregation such as .mean() still has to be applied to it
type(group)
Out[26]:
In [69]:
# We can get the per-class mean from this object.
# numeric_only=True restricts the calculation to numeric columns: on
# pandas >= 2.0 the text column 'student' would otherwise make .mean()
# raise a TypeError instead of being dropped silently.
class_mean = group.mean(numeric_only=True)
# And this returns a dataframe (indexed by 'class')
type(class_mean)
Out[69]:
In [70]:
# Show the per-class result held in class_mean
class_mean
Out[70]:
In [71]:
# Next, we can specify a column and use .aggregate to perform
# multiple calculations on one column.
# Named aggregation (output_column='function') is used here because
# passing a {name: func} dict to a single-column groupby was removed in
# pandas 1.0 and raises SpecificationError ("nested renamer is not supported").
class_info = group['score'].aggregate(
    sum='sum',
    mean='mean',
    std='std',  # pandas' std is the sample standard deviation (ddof=1)
)
# this returns a dataframe with one column per named aggregation
type(class_info)
Out[71]:
In [72]:
# Show the per-class summary held in class_info
class_info
Out[72]: