In [1]:
    
import pandas as pd
import numpy as np
import sys
%matplotlib inline
    
In [2]:
    
print 'Python version ' + sys.version
print 'Pandas version ' + pd.__version__
    
    
In [3]:
    
# Initial dataframe
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'value'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df
    
    Out[3]:
In [4]:
    
# We don't simply want to select random rows from df
# We want to first group df by (group1 & group2) then select random rows
from random import choice
# First create the group
grouped = df.groupby(['group1','group2'])
grouped.size()
#Notice that group (a,1) has two posibilities
#Notice that group (a,2) has one posibilities
#This means that if we select a random sample from group (a,1) we will get either "apple" or "peach"
#This means that if we select a random sample from group (a,2) we will always get "nectarine"
    
    Out[4]:
In [5]:
    
#df.loc[select a random record from each group]
df.loc[(choice(x) for x in grouped.groups.itervalues())]
    
    Out[5]:
In [6]:
    
df = pd.DataFrame(data=['abcdef']*10, columns=['text'])
df
    
    Out[6]:
In [7]:
    
# Select the first 2 characters of each row
df['text'].apply(lambda x: x[:2])
    
    Out[7]:
In [8]:
    
d = {'Dates':[pd.Timestamp('2013-01-02'),
              pd.Timestamp('2013-01-03'),
              pd.Timestamp('2013-01-04')],
     'Num1':[1,2,3],
     'Num2':[-1,-2,-3]}
                 
df = pd.DataFrame(data=d)
df
    
    Out[8]:
In [9]:
    
# where all values in column "Num1" are positive
positive = df['Num1'] > 0
# where values in column "Num2" is equal to -1
negativeOne = df['Num2'] == -1
# where values in the column "Dates" are in (1/2/2013 or 1/20/2013)
Dates = df['Dates'].isin(['2013-01-02','2013-01-20'])
df[positive & negativeOne & Dates]
    
    Out[9]:
In [10]:
    
df = pd.DataFrame({'col1':['minus','minus','positive','nan'],
                'col2':[10,20,30,40],
                'col3':[-10,-20,30,np.nan]
                })
df
    
    Out[10]:
In [11]:
    
# Method 1
df.groupby('col1').apply(lambda x: x.max())
    
    Out[11]:
In [12]:
    
# Method 2
df.groupby('col1').agg('max')
    
    Out[12]:
In [13]:
    
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'group2'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df = df.set_index(['group1','group2'])
df
    
    Out[13]:
In [14]:
    
df.xs('a', level='group1')
    
    Out[14]:
In [15]:
    
df = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] , 
                "City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]}
               )
df
    
    Out[15]:
In [16]:
    
group = df.groupby(['City','Name'])
s = group.agg('size')
s.add_suffix('_size').reset_index()
    
    Out[16]:
Author: David Rojas