In [1]:
import pandas as pd
import numpy as np
import sys
%matplotlib inline
In [2]:
print 'Python version ' + sys.version
print 'Pandas version ' + pd.__version__
In [3]:
# Initial dataframe
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
"c","a","a","a","b","b","b","b"],
'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
'value' : ["apple","pear","orange","apple",
"banana","durian","lemon","lime",
"raspberry","durian","peach","nectarine",
"banana","lemon","guava","blackberry","grape"]})
df
Out[3]:
In [4]:
# We don't simply want to select random rows from df
# We want to first group df by (group1 & group2) then select random rows
from random import choice
# First create the group
grouped = df.groupby(['group1','group2'])
grouped.size()
#Notice that group (a,1) has two posibilities
#Notice that group (a,2) has one posibilities
#This means that if we select a random sample from group (a,1) we will get either "apple" or "peach"
#This means that if we select a random sample from group (a,2) we will always get "nectarine"
Out[4]:
In [5]:
#df.loc[select a random record from each group]
df.loc[(choice(x) for x in grouped.groups.itervalues())]
Out[5]:
In [6]:
df = pd.DataFrame(data=['abcdef']*10, columns=['text'])
df
Out[6]:
In [7]:
# Select the first 2 characters of each row
df['text'].apply(lambda x: x[:2])
Out[7]:
In [8]:
d = {'Dates':[pd.Timestamp('2013-01-02'),
pd.Timestamp('2013-01-03'),
pd.Timestamp('2013-01-04')],
'Num1':[1,2,3],
'Num2':[-1,-2,-3]}
df = pd.DataFrame(data=d)
df
Out[8]:
In [9]:
# where all values in column "Num1" are positive
positive = df['Num1'] > 0
# where values in column "Num2" is equal to -1
negativeOne = df['Num2'] == -1
# where values in the column "Dates" are in (1/2/2013 or 1/20/2013)
Dates = df['Dates'].isin(['2013-01-02','2013-01-20'])
df[positive & negativeOne & Dates]
Out[9]:
In [10]:
df = pd.DataFrame({'col1':['minus','minus','positive','nan'],
'col2':[10,20,30,40],
'col3':[-10,-20,30,np.nan]
})
df
Out[10]:
In [11]:
# Method 1
df.groupby('col1').apply(lambda x: x.max())
Out[11]:
In [12]:
# Method 2
df.groupby('col1').agg('max')
Out[12]:
In [13]:
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
"c","a","a","a","b","b","b","b"],
'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
'group2' : ["apple","pear","orange","apple",
"banana","durian","lemon","lime",
"raspberry","durian","peach","nectarine",
"banana","lemon","guava","blackberry","grape"]})
df = df.set_index(['group1','group2'])
df
Out[13]:
In [14]:
df.xs('a', level='group1')
Out[14]:
In [15]:
df = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] ,
"City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]}
)
df
Out[15]:
In [16]:
group = df.groupby(['City','Name'])
s = group.agg('size')
s.add_suffix('_size').reset_index()
Out[16]:
Author: David Rojas