notebook.community

Edit and run



In [1]:

    
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib

from dsutil.plotting import add_grid,add_value_labels

pd.set_option('display.max_columns',1000)

import os

import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

    
print('pandas: {}, numpy: {}, matplotlib: {}'.format(
    pd.__version__,
    np.__version__,
    matplotlib.__version__))









    



pandas: 0.25.1, numpy: 1.16.3, matplotlib: 3.0.3



In [3]:

    
df = pd.DataFrame({
    'user_id':[1,2,1,3,3,],
    'content_id':[1,1,2,2,2],
    'tag':['cool','nice','clever','clever','not-bad']
})
df



In [4]:

    
df.groupby("content_id")['tag'].apply(lambda tags: ','.join(tags)).to_frame().reset_index()









    Out[4]:







  
    
      
      content_id
      tag
    
  
  
    
      0
      1
      cool,nice
    
    
      1
      2
      clever,clever,not-bad



In [5]:

    
df.groupby("content_id")["user_id"].nunique().to_frame().reset_index().rename(columns={"user_id":'unique_users'})









    Out[5]:







  
    
      
      content_id
      unique_users
    
  
  
    
      0
      1
      2
    
    
      1
      2
      2

sort groupby groups



In [6]:

    
df = pd.DataFrame({
    'value':[20.45,22.89,32.12,111.22,33.22,100.00,99.99],
    'product':['table','chair','chair','mobile phone','table','mobile phone','table']
})
df









    Out[6]:







  
    
      
      value
      product
    
  
  
    
      0
      20.45
      table
    
    
      1
      22.89
      chair
    
    
      2
      32.12
      chair
    
    
      3
      111.22
      mobile phone
    
    
      4
      33.22
      table
    
    
      5
      100.00
      mobile phone
    
    
      6
      99.99
      table



In [7]:

    
df.groupby('product')['value'].sum().to_frame().reset_index()









    Out[7]:







  
    
      
      product
      value
    
  
  
    
      0
      chair
      55.01
    
    
      1
      mobile phone
      211.22
    
    
      2
      table
      153.66



In [8]:

    
df.groupby('product')['value'].sum().to_frame().reset_index().sort_values(by='value')









    Out[8]:







  
    
      
      product
      value
    
  
  
    
      0
      chair
      55.01
    
    
      2
      table
      153.66
    
    
      1
      mobile phone
      211.22



In [9]:

    
type(df.groupby('product')['value'])









    Out[9]:





pandas.core.groupby.generic.SeriesGroupBy

plot group size



In [10]:

    
plt.clf()
df.groupby('product').size().plot(kind='bar')
plt.xticks(rotation=0)
plt.show()

plot sum by group



In [11]:

    
plt.clf()
df.groupby('product').sum().plot(kind='bar')
plt.xticks(rotation=0)
plt.show()









    





<Figure size 432x288 with 0 Axes>

plot group average with error bars



In [12]:

    
df = pd.DataFrame({
    'product':['table','table','table','mobile phone','mobile phone','mobile phone','chair','chair','chair'],
    'purchase_price':[28.45, 25.89,32.12,99.99,120.00,170.00,12.22,28.22,5.00]
})
df[['product','purchase_price']]









    Out[12]:







  
    
      
      product
      purchase_price
    
  
  
    
      0
      table
      28.45
    
    
      1
      table
      25.89
    
    
      2
      table
      32.12
    
    
      3
      mobile phone
      99.99
    
    
      4
      mobile phone
      120.00
    
    
      5
      mobile phone
      170.00
    
    
      6
      chair
      12.22
    
    
      7
      chair
      28.22
    
    
      8
      chair
      5.00



In [13]:

    
df.groupby('product').agg([np.mean,np.std])









    Out[13]:







  
    
      
      purchase_price
    
    
      
      mean
      std
    
    
      product
      
      
    
  
  
    
      chair
      15.146667
      11.883439
    
    
      mobile phone
      129.996667
      36.059673
    
    
      table
      28.820000
      3.131437



In [14]:

    
v=df.groupby('product').agg([np.mean,np.std])

v.columns = [col[-1].strip() for col in v.columns.values]


for index,row in v.iterrows():
    name = row.name
    mean = row['mean']
    stddev = row['std']    
    
v['mean'] = v['mean'].apply(lambda v: '{:.2f}'.format(v)) 
v['std'] = v['std'].apply(lambda v: '{:.2f}'.format(v)) 

v.reset_index()









    Out[14]:







  
    
      
      product
      mean
      std
    
  
  
    
      0
      chair
      15.15
      11.88
    
    
      1
      mobile phone
      130.00
      36.06
    
    
      2
      table
      28.82
      3.13



In [15]:

    
plt.clf()

ax = plt.gca()

# plot the means
df.groupby('product').mean().plot(kind='bar',color='lightblue',ax=ax)

# generate a dataframe with means and standard deviations
grouped_df=df.groupby('product').agg([np.mean,np.std])

# flatten column names
grouped_df.columns = [col.strip() for col in v.columns.values]

# iterrows is usually very slow but since this is a grouped
# dataframe, there wonly be many rows
for i,(index,row) in enumerate(grouped_df.iterrows()):
    name = row.name
    mean = row['mean']
    stddev = row['std']
    
    # plot the vertical line
    ax.vlines(x=i,ymin=mean-stddev,ymax=mean+stddev)   
        
plt.xticks(rotation=0)
add_grid()
plt.ylabel('Average purchase price')
plt.xlabel(None)
plt.gca().legend_.remove()
plt.show()

flattening



In [16]:

    
df = pd.DataFrame({
    'value':[20.45,22.89,32.12,111.22,33.22,100.00,99.99],
    'product':['table','chair','chair','mobile phone','table','mobile phone','table']
})
df









    Out[16]:







  
    
      
      value
      product
    
  
  
    
      0
      20.45
      table
    
    
      1
      22.89
      chair
    
    
      2
      32.12
      chair
    
    
      3
      111.22
      mobile phone
    
    
      4
      33.22
      table
    
    
      5
      100.00
      mobile phone
    
    
      6
      99.99
      table



In [17]:

    
grouped_df = df.groupby('product').agg({'value':['min','max','mean']})



In [18]:

    
grouped_df









    Out[18]:







  
    
      
      value
    
    
      
      min
      max
      mean
    
    
      product
      
      
      
    
  
  
    
      chair
      22.89
      32.12
      27.505
    
    
      mobile phone
      100.00
      111.22
      105.610
    
    
      table
      20.45
      99.99
      51.220



In [19]:

    
grouped_df.columns = ['_'.join(col).strip() for col in grouped_df.columns.values]
grouped_df.reset_index()









    Out[19]:







  
    
      
      product
      value_min
      value_max
      value_mean
    
  
  
    
      0
      chair
      22.89
      32.12
      27.505
    
    
      1
      mobile phone
      100.00
      111.22
      105.610
    
    
      2
      table
      20.45
      99.99
      51.220

iterate over groups



In [20]:

    
df









    Out[20]:







  
    
      
      value
      product
    
  
  
    
      0
      20.45
      table
    
    
      1
      22.89
      chair
    
    
      2
      32.12
      chair
    
    
      3
      111.22
      mobile phone
    
    
      4
      33.22
      table
    
    
      5
      100.00
      mobile phone
    
    
      6
      99.99
      table



In [21]:

    
for key,group_df in df.groupby('product'):
    print("the group for product '{}' has {} rows".format(key,len(group_df)))









    



the group for product 'chair' has 2 rows
the group for product 'mobile phone' has 2 rows
the group for product 'table' has 3 rows

group by and change aggregation column name



In [22]:

    
df









    Out[22]:







  
    
      
      value
      product
    
  
  
    
      0
      20.45
      table
    
    
      1
      22.89
      chair
    
    
      2
      32.12
      chair
    
    
      3
      111.22
      mobile phone
    
    
      4
      33.22
      table
    
    
      5
      100.00
      mobile phone
    
    
      6
      99.99
      table



In [23]:

    
df.groupby('product')['value'].sum().to_frame().reset_index()









    Out[23]:







  
    
      
      product
      value
    
  
  
    
      0
      chair
      55.01
    
    
      1
      mobile phone
      211.22
    
    
      2
      table
      153.66



In [24]:

    
df.groupby('product')['value'].sum().reset_index(name='value_sum')









    Out[24]:







  
    
      
      product
      value_sum
    
  
  
    
      0
      chair
      55.01
    
    
      1
      mobile phone
      211.22
    
    
      2
      table
      153.66

get group by key



In [25]:

    
df









    Out[25]:







  
    
      
      value
      product
    
  
  
    
      0
      20.45
      table
    
    
      1
      22.89
      chair
    
    
      2
      32.12
      chair
    
    
      3
      111.22
      mobile phone
    
    
      4
      33.22
      table
    
    
      5
      100.00
      mobile phone
    
    
      6
      99.99
      table



In [26]:

    
# grouped_df is a DataFrameGroupBy containing each individual group as a dataframe
grouped_df = df.groupby('product')

# you get can a dataframe containing the values for a single group
# using .get_group('group_key')
grouped_df.get_group('chair')

group into list



In [27]:

    
df.groupby('product')['value'].apply(lambda group_series: sorted(group_series.tolist())).reset_index(name='values')









    Out[27]:







  
    
      
      product
      values
    
  
  
    
      0
      chair
      [22.89, 32.12]
    
    
      1
      mobile phone
      [100.0, 111.22]
    
    
      2
      table
      [20.45, 33.22, 99.99]

custom aggregation function



In [28]:

    
df = pd.DataFrame({
    'value':[20,22,32,111,33,100,99],
    'product':['table','chair','chair','mobile phone','table','mobile phone','table']
})

df









    Out[28]:







  
    
      
      value
      product
    
  
  
    
      0
      20
      table
    
    
      1
      22
      chair
    
    
      2
      32
      chair
    
    
      3
      111
      mobile phone
    
    
      4
      33
      table
    
    
      5
      100
      mobile phone
    
    
      6
      99
      table



In [29]:

    
def count_even_numbers(series):
    return len([elem for elem in series if elem % 2 == 0 ])

df.groupby('product')['value'].apply(count_even_numbers).reset_index(name='num_even_numbers')









    Out[29]:







  
    
      
      product
      num_even_numbers
    
  
  
    
      0
      chair
      2
    
    
      1
      mobile phone
      1
    
    
      2
      table
      1

stratified sampling



In [30]:

    
df = pd.DataFrame({
    'price':[20,22,32,111,33,100,99],
    'product':['table','chair','chair','table','table','chair','table']
})

df.sort_values(by='product')



In [31]:

    
df.groupby("product").apply(
    lambda group_df: group_df.sample(2)
).reset_index(drop=True)

	value	product
0	20.45	table
1	22.89	chair
2	32.12	chair
3	111.22	mobile phone
4	33.22	table
5	100.00	mobile phone
6	99.99	table

	product	purchase_price
0	table	28.45
1	table	25.89
2	table	32.12
3	mobile phone	99.99
4	mobile phone	120.00
5	mobile phone	170.00
6	chair	12.22
7	chair	28.22
8	chair	5.00

	purchase_price
	mean	std
product
chair	15.146667	11.883439
mobile phone	129.996667	36.059673
table	28.820000	3.131437

	product	values
0	chair	[22.89, 32.12]
1	mobile phone	[100.0, 111.22]
2	table	[20.45, 33.22, 99.99]