In [1]:
# Notebook setup: autoreload, imports and display options.
# (Re)load the autoreload extension; mode 2 re-imports modules before each cell executes.
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib
# plotting helpers from the dsutil package (add_grid is used by the error-bar plot further down)
from dsutil.plotting import add_grid,add_value_labels
# allow up to 1000 columns when a dataframe is displayed
pd.set_option('display.max_columns',1000)
import os
import matplotlib.pyplot as plt
# render matplotlib figures inline in the notebook output
%matplotlib inline
In [2]:
# Report the library versions this notebook is running with.
library_versions = (pd.__version__, np.__version__, matplotlib.__version__)
print('pandas: {}, numpy: {}, matplotlib: {}'.format(*library_versions))
In [3]:
# Toy tagging data: one row per (user_id, content_id, tag) combination.
tag_events = {
    'user_id': [1, 2, 1, 3, 3],
    'content_id': [1, 1, 2, 2, 2],
    'tag': ['cool', 'nice', 'clever', 'clever', 'not-bad'],
}
df = pd.DataFrame(tag_events)
df
Out[3]:
In [4]:
# Collapse the tags of each content item into one comma-separated string.
tags_by_content = df.groupby("content_id")['tag']
tags_by_content.apply(','.join).to_frame().reset_index()
Out[4]:
In [5]:
# Count the distinct users seen for each content item and give the
# resulting column a descriptive name.
users_per_content = df.groupby("content_id")["user_id"].nunique().to_frame()
users_per_content.reset_index().rename(columns={"user_id": 'unique_users'})
Out[5]:
In [6]:
# Sample data: one row per recorded value, labelled with its product.
product_values = {
    'value': [20.45, 22.89, 32.12, 111.22, 33.22, 100.00, 99.99],
    'product': ['table', 'chair', 'chair', 'mobile phone', 'table', 'mobile phone', 'table'],
}
df = pd.DataFrame(product_values)
df
Out[6]:
In [7]:
# Total value per product, returned as a regular dataframe.
value_per_product = df.groupby('product')['value'].sum()
value_per_product.to_frame().reset_index()
Out[7]:
In [8]:
# Total value per product, ordered by that total (ascending).
totals = df.groupby('product')['value'].sum().to_frame().reset_index()
totals.sort_values(by='value')
Out[8]:
In [9]:
# Inspect the type of object produced by selecting one column of a groupby.
value_groups = df.groupby('product')['value']
type(value_groups)
Out[9]:
In [10]:
# Bar chart of the number of rows recorded for each product.
plt.clf()
rows_per_product = df.groupby('product').size()
rows_per_product.plot.bar()
plt.xticks(rotation=0)  # keep the product names horizontal
plt.show()
In [11]:
# Bar chart of the summed values for each product.
plt.clf()
sum_per_product = df.groupby('product').sum()
sum_per_product.plot.bar()
plt.xticks(rotation=0)  # keep the product names horizontal
plt.show()
In [12]:
# Sample data: three purchase prices for each of three products.
purchases = {
    'product': [
        'table', 'table', 'table',
        'mobile phone', 'mobile phone', 'mobile phone',
        'chair', 'chair', 'chair',
    ],
    'purchase_price': [28.45, 25.89, 32.12, 99.99, 120.00, 170.00, 12.22, 28.22, 5.00],
}
df = pd.DataFrame(purchases)
# show the columns in a fixed order
df.loc[:, ['product', 'purchase_price']]
Out[12]:
In [13]:
# Mean and standard deviation of every numeric column, per product.
# The aggregations are requested by name: handing NumPy callables such as
# np.mean / np.std to .agg() is deprecated in recent pandas releases, and the
# names yield the same column labels ('mean', 'std').
df.groupby('product').agg(['mean', 'std'])
Out[13]:
In [14]:
# Per-product mean and standard deviation, formatted for display.
# Aggregations are given by name; NumPy callables in .agg() are deprecated.
v = df.groupby('product').agg(['mean', 'std'])
# keep only the innermost level of each column label ('mean' / 'std')
v.columns = [col[-1].strip() for col in v.columns.values]
# render both statistics as strings with two decimal places
for stat in ('mean', 'std'):
    v[stat] = v[stat].map('{:.2f}'.format)
v.reset_index()
Out[14]:
In [15]:
# Bar chart of the mean purchase price per product, with a vertical line on
# each bar spanning mean +/- one standard deviation.
plt.clf()
ax = plt.gca()
# plot the means
df.groupby('product').mean().plot(kind='bar', color='lightblue', ax=ax)
# generate a dataframe with means and standard deviations
# (aggregations given by name; NumPy callables in .agg() are deprecated)
grouped_df = df.groupby('product').agg(['mean', 'std'])
# flatten column names using this frame's own labels, so the cell does not
# depend on a variable left behind by another cell
grouped_df.columns = [col[-1].strip() for col in grouped_df.columns.values]
# the grouped dataframe has one row per bar, in the same order as the bars;
# bar i is drawn at x = i, so all vertical lines can be drawn in one call
ax.vlines(
    x=np.arange(len(grouped_df)),
    ymin=grouped_df['mean'] - grouped_df['std'],
    ymax=grouped_df['mean'] + grouped_df['std'],
)
plt.xticks(rotation=0)
add_grid()
plt.ylabel('Average purchase price')
plt.xlabel(None)
# the single plotted column needs no legend
plt.gca().legend_.remove()
plt.show()
In [16]:
# Sample data: one row per recorded value, labelled with its product.
product_values = {
    'value': [20.45, 22.89, 32.12, 111.22, 33.22, 100.00, 99.99],
    'product': ['table', 'chair', 'chair', 'mobile phone', 'table', 'mobile phone', 'table'],
}
df = pd.DataFrame(product_values)
df
Out[16]:
In [17]:
# Minimum, maximum and mean of 'value' for each product.
value_stats = {'value': ['min', 'max', 'mean']}
grouped_df = df.groupby('product').agg(value_stats)
In [18]:
# display the aggregated frame as it is before its column labels are flattened
grouped_df
Out[18]:
In [19]:
# Flatten the two-level column labels into single strings by joining the
# levels with '_'. The guard keeps the cell safe to re-run: joining labels
# that are already flat strings would put '_' between every character.
if isinstance(grouped_df.columns, pd.MultiIndex):
    grouped_df.columns = ['_'.join(col).strip() for col in grouped_df.columns.values]
grouped_df.reset_index()
Out[19]:
In [20]:
# display the sample frame again before iterating over its groups
df
Out[20]:
In [21]:
# Iterating over a groupby yields (group key, sub-dataframe) pairs.
for product_name, product_rows in df.groupby('product'):
    n_rows = len(product_rows)
    print("the group for product '{}' has {} rows".format(product_name, n_rows))
In [22]:
# display the sample frame again before the aggregation examples
df
Out[22]:
In [23]:
# Sum per product; the resulting series is turned into a dataframe and the
# group key is moved from the index back into a column.
summed = df.groupby('product')['value'].sum()
summed.to_frame().reset_index()
Out[23]:
In [24]:
# Same sum, but reset_index(name=...) converts the series to a dataframe and
# names the aggregated column in one step.
product_totals = df.groupby('product')['value'].sum()
product_totals.reset_index(name='value_sum')
Out[24]:
In [25]:
# display the sample frame again before selecting a single group
df
Out[25]:
In [26]:
# Grouping by 'product' gives a DataFrameGroupBy that holds one
# sub-dataframe per product.
grouped_df = df.groupby('product')
# .get_group('group_key') returns the dataframe holding the rows of a
# single group
chair_rows = grouped_df.get_group('chair')
chair_rows
Out[26]:
In [27]:
def _sorted_values(group_series):
    """Return the values of one group as an ascending Python list."""
    return sorted(group_series.tolist())


# Collect the values of every product into one sorted list per product.
df.groupby('product')['value'].apply(_sorted_values).reset_index(name='values')
Out[27]:
In [28]:
# Sample data with integer values, used for the custom aggregation example.
integer_values = {
    'value': [20, 22, 32, 111, 33, 100, 99],
    'product': ['table', 'chair', 'chair', 'mobile phone', 'table', 'mobile phone', 'table'],
}
df = pd.DataFrame(integer_values)
df
Out[28]:
In [29]:
def count_even_numbers(series):
    """Return how many elements of ``series`` are divisible by two.

    ``series`` may be any iterable of numbers (for example the values of one
    group handed over by ``groupby(...).apply``).
    """
    return sum(1 for elem in series if elem % 2 == 0)
# Apply the custom aggregation to the values of each product.
even_counts = df.groupby('product')['value'].apply(count_even_numbers)
even_counts.reset_index(name='num_even_numbers')
Out[29]:
In [30]:
# Sample data for the per-group sampling example.
price_table = {
    'price': [20, 22, 32, 111, 33, 100, 99],
    'product': ['table', 'chair', 'chair', 'table', 'table', 'chair', 'table'],
}
df = pd.DataFrame(price_table)
# show the rows ordered by product
df.sort_values(by='product')
Out[30]:
In [31]:
# Draw two random rows from every product group and stack them into one
# frame with a fresh 0..n-1 index.
# A fixed random_state makes the draw reproducible, so re-running the
# notebook shows the same rows every time.
df.groupby("product").apply(
    lambda group_df: group_df.sample(2, random_state=42)
).reset_index(drop=True)
Out[31]: