In [1]:
    
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib
from dsutil.plotting import add_grid,add_value_labels
pd.set_option('display.max_columns',1000)
import os
import matplotlib.pyplot as plt
%matplotlib inline
    
In [2]:
    
# Report the library versions this notebook was executed with.
library_versions = (pd.__version__, np.__version__, matplotlib.__version__)
print('pandas: {}, numpy: {}, matplotlib: {}'.format(*library_versions))
    
    
In [3]:
    
# Example tagging data, one (user_id, content_id, tag) record per row.
tag_records = [
    (1, 1, 'cool'),
    (2, 1, 'nice'),
    (1, 2, 'clever'),
    (3, 2, 'clever'),
    (3, 2, 'not-bad'),
]
df = pd.DataFrame(tag_records, columns=['user_id', 'content_id', 'tag'])
df
    
    Out[3]:
In [4]:
    
# Collapse the tags of each content item into one comma-separated string.
(
    df.groupby("content_id")['tag']
    .apply(','.join)
    .to_frame()
    .reset_index()
)
    
    Out[4]:
In [5]:
    
# Count the distinct users seen for each content item.
users_per_content = df.groupby("content_id")["user_id"].nunique()
# reset_index(name=...) turns the Series into a frame and names the count column.
users_per_content.reset_index(name='unique_users')
    
    Out[5]:
In [6]:
    
# Example data: one numeric value per row, labelled with a product name.
value_records = [
    (20.45, 'table'),
    (22.89, 'chair'),
    (32.12, 'chair'),
    (111.22, 'mobile phone'),
    (33.22, 'table'),
    (100.00, 'mobile phone'),
    (99.99, 'table'),
]
df = pd.DataFrame(value_records, columns=['value', 'product'])
df
    
    Out[6]:
In [7]:
    
# Total value per product, shown as a flat two-column frame.
value_by_product = df.groupby('product')['value'].sum()
value_by_product.to_frame().reset_index()
    
    Out[7]:
In [8]:
    
# Per-product totals, ordered from the smallest total to the largest.
product_totals = df.groupby('product')['value'].sum().to_frame().reset_index()
product_totals.sort_values(by='value')
    
    Out[8]:
In [9]:
    
# Inspect what kind of object a column selected from a groupby is.
value_groups = df.groupby('product')['value']
type(value_groups)
    
    Out[9]:
In [10]:
    
plt.clf()
# One bar per product; the bar height is the number of rows in that group.
rows_per_product = df.groupby('product').size()
rows_per_product.plot.bar()
plt.xticks(rotation=0)
plt.show()
    
    
In [11]:
    
# DataFrame.plot opens its own figure unless it is handed an axes, so the
# former plt.clf() call only left an extra, empty figure behind. Create the
# axes explicitly and draw the per-product sums on them.
fig, ax = plt.subplots()
df.groupby('product').sum().plot(kind='bar', ax=ax)
plt.xticks(rotation=0)
plt.show()
    
    
    
In [12]:
    
# Example purchases: three purchase prices for each of three products.
product_names = ['table'] * 3 + ['mobile phone'] * 3 + ['chair'] * 3
purchase_prices = [28.45, 25.89, 32.12, 99.99, 120.00, 170.00, 12.22, 28.22, 5.00]
df = pd.DataFrame({'product': product_names, 'purchase_price': purchase_prices})
# Display with an explicit column order.
df.loc[:, ['product', 'purchase_price']]
    
    Out[12]:
In [13]:
    
# Mean and sample standard deviation of every non-grouping column, per product.
# The aggregations are named by string: passing np.mean / np.std to a groupby
# .agg() is deprecated in recent pandas, and a later version will call the
# numpy functions directly (np.std defaults to the population deviation).
df.groupby('product').agg(['mean', 'std'])
    
    Out[13]:
In [14]:
    
# Per-product mean and sample standard deviation, formatted for display.
# String aggregation names replace np.mean / np.std (deprecated as groupby
# .agg() arguments in recent pandas); the resulting column labels are the same.
v = df.groupby('product').agg(['mean', 'std'])
# Flatten the two-level column labels, keeping only the statistic name.
v.columns = [col[-1].strip() for col in v.columns.values]
# Render both statistics as strings with two decimals.
v['mean'] = v['mean'].map('{:.2f}'.format)
v['std'] = v['std'].map('{:.2f}'.format)
v.reset_index()
    
    Out[14]:
In [15]:
    
# Bar chart of the mean purchase price per product, with a vertical line
# spanning mean +/- one standard deviation drawn over every bar.
fig, ax = plt.subplots()
# generate a dataframe with means and standard deviations
# (string names instead of np.mean / np.std, which are deprecated as
# groupby .agg() arguments in recent pandas)
grouped_df = df.groupby('product').agg(['mean', 'std'])
# flatten column names; they are taken from grouped_df itself so this cell
# does not depend on a variable left behind by another cell
grouped_df.columns = [col[-1].strip() for col in grouped_df.columns.values]
# plot the means; plotting a Series adds no legend, so none has to be removed
grouped_df['mean'].plot(kind='bar', color='lightblue', ax=ax)
# the bars sit at x = 0, 1, 2, ... in row order, so every vertical line can
# be drawn in a single vectorized call
ax.vlines(
    x=np.arange(len(grouped_df)),
    ymin=(grouped_df['mean'] - grouped_df['std']).values,
    ymax=(grouped_df['mean'] + grouped_df['std']).values,
)
plt.xticks(rotation=0)
add_grid()
plt.ylabel('Average purchase price')
# an explicit empty string hides the x label without relying on how the
# installed matplotlib treats None
plt.xlabel('')
plt.show()
    
    
In [16]:
    
# Example data: one numeric value per row, labelled with a product name.
row_values = [20.45, 22.89, 32.12, 111.22, 33.22, 100.00, 99.99]
row_products = ['table', 'chair', 'chair', 'mobile phone', 'table', 'mobile phone', 'table']
df = pd.DataFrame({'value': row_values, 'product': row_products})
df
    
    Out[16]:
In [17]:
    
# Minimum, maximum and mean of `value` within each product.
value_statistics = ['min', 'max', 'mean']
grouped_df = df.groupby('product').agg({'value': value_statistics})
    
In [18]:
    
# Show the aggregated frame as produced by .agg(), before its column labels are flattened.
grouped_df
    
    Out[18]:
In [19]:
    
# Flatten the two-level column labels into single strings such as 'value_min'.
# The guard makes the cell safe to re-run: once the labels are plain strings,
# joining them again would insert '_' between every character.
if isinstance(grouped_df.columns, pd.MultiIndex):
    grouped_df.columns = ['_'.join(col).strip() for col in grouped_df.columns.values]
grouped_df.reset_index()
    
    Out[19]:
In [20]:
    
# Show the input frame again as the starting point for the next example.
df
    
    Out[20]:
In [21]:
    
# Iterating over a groupby yields (group key, sub-frame) pairs.
for product_name, product_rows in df.groupby('product'):
    row_count = len(product_rows)
    print("the group for product '{}' has {} rows".format(product_name, row_count))
    
    
In [22]:
    
# Show the input frame again as the starting point for the next example.
df
    
    Out[22]:
In [23]:
    
(
    df.groupby('product')['value']
    .sum()
    .to_frame()      # Series -> single-column DataFrame
    .reset_index()   # move `product` out of the index into a column
)
    
    Out[23]:
In [24]:
    
# reset_index(name=...) converts the Series to a frame and names the value
# column in one step.
value_totals = df.groupby('product')['value'].sum()
value_totals.reset_index(name='value_sum')
    
    Out[24]:
In [25]:
    
# Show the input frame again as the starting point for the next example.
df
    
    Out[25]:
In [26]:
    
# grouped_df is a DataFrameGroupBy containing each individual group as a dataframe
# NOTE: this rebinds grouped_df, which earlier cells used for an aggregated DataFrame
grouped_df = df.groupby('product')
# you can get a dataframe containing the values for a single group
# using .get_group('group_key')
grouped_df.get_group('chair')
    
    Out[26]:
In [27]:
    
def _sorted_values(group_series):
    """Return the values of one group as an ascending Python list."""
    return sorted(group_series.tolist())

# One row per product, holding the sorted list of that product's values.
df.groupby('product')['value'].apply(_sorted_values).reset_index(name='values')
    
    Out[27]:
In [28]:
    
# Same layout as the earlier value/product frames, but with integer values.
int_value_records = [
    (20, 'table'),
    (22, 'chair'),
    (32, 'chair'),
    (111, 'mobile phone'),
    (33, 'table'),
    (100, 'mobile phone'),
    (99, 'table'),
]
df = pd.DataFrame(int_value_records, columns=['value', 'product'])
df
    
    Out[28]:
In [29]:
    
def count_even_numbers(series):
    """Return how many elements of ``series`` are divisible by two.

    ``series`` may be any iterable of numbers (a pandas Series, a list, ...).
    """
    return sum(1 for elem in series if elem % 2 == 0)
# Apply the custom counter to the values of every product group.
(
    df.groupby('product')['value']
    .apply(count_even_numbers)
    .reset_index(name='num_even_numbers')
)
    
    Out[29]:
In [30]:
    
# Example data: integer prices for two products.
prices = [20, 22, 32, 111, 33, 100, 99]
price_products = ['table', 'chair', 'chair', 'table', 'table', 'chair', 'table']
df = pd.DataFrame({'price': prices, 'product': price_products})
# Display the rows ordered by product; df itself keeps its original order.
df.sort_values('product')
    
    Out[30]:
In [31]:
    
# Draw two rows at random from every product group.
# - random_state pins the draw, so re-running the notebook reproduces the
#   same sample.
# - Each group frame is sampled directly instead of through groupby.apply,
#   whose handling of the grouping column is deprecated in recent pandas;
#   the `product` column therefore always stays in the result.
sampled_groups = [
    group_df.sample(2, random_state=42)
    for _product, group_df in df.groupby("product")
]
pd.concat(sampled_groups).reset_index(drop=True)
    
    Out[31]: