These are the follow-up code samples for the following blog post: Pandas DataFrame by Example



In [38]:

    
import pandas as pd
import numpy as np



In [2]:

    
pd.__version__









    Out[2]:





'0.23.1'



In [41]:

    
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df









    Out[41]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      0
      john
      23
      iowa
      2
      0
    
    
      1
      mary
      78
      dc
      2
      4
    
    
      2
      peter
      22
      california
      0
      0
    
    
      3
      jeff
      19
      texas
      1
      5
    
    
      4
      bill
      45
      washington
      2
      0
    
    
      5
      lisa
      33
      dc
      1
      0

Select rows by position



In [42]:

    
# select the first 2 rows
df.iloc[:2]









    Out[42]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      0
      john
      23
      iowa
      2
      0
    
    
      1
      mary
      78
      dc
      2
      4



In [43]:

    
# select the last 2 rows
df.iloc[-2:]









    Out[43]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      4
      bill
      45
      washington
      2
      0
    
    
      5
      lisa
      33
      dc
      1
      0

Select rows by index value

compare this with iloc above



In [44]:

    
# select rows up to and including the one
# with index=2
df.loc[:2]









    Out[44]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      0
      john
      23
      iowa
      2
      0
    
    
      1
      mary
      78
      dc
      2
      4
    
    
      2
      peter
      22
      california
      0
      0

Select rows based upon the value of columns



In [45]:

    
# by a simple numeric condition
df[df["age"] > 30]









    Out[45]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      1
      mary
      78
      dc
      2
      4
    
    
      4
      bill
      45
      washington
      2
      0
    
    
      5
      lisa
      33
      dc
      1
      0



In [46]:

    
# comparing the value of two columns
df[ df["num_pets"] > df[ "num_children"] ]









    Out[46]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      1
      mary
      78
      dc
      2
      4
    
    
      3
      jeff
      19
      texas
      1
      5



In [47]:

    
# using boolean AND
df[ (df["age"] > 40) & (df["num_pets"] > 0) ]









    Out[47]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      1
      mary
      78
      dc
      2
      4

select columns starting with



In [48]:

    
df[[colname for colname in df.columns if colname.startswith('n')]]









    Out[48]:







  
    
      
      name
      num_children
      num_pets
    
  
  
    
      0
      john
      2
      0
    
    
      1
      mary
      2
      4
    
    
      2
      peter
      0
      0
    
    
      3
      jeff
      1
      5
    
    
      4
      bill
      2
      0
    
    
      5
      lisa
      1
      0

select all columns but one



In [49]:

    
df.drop(['age'],axis=1)









    Out[49]:







  
    
      
      name
      state
      num_children
      num_pets
    
  
  
    
      0
      john
      iowa
      2
      0
    
    
      1
      mary
      dc
      2
      4
    
    
      2
      peter
      california
      0
      0
    
    
      3
      jeff
      texas
      1
      5
    
    
      4
      bill
      washington
      2
      0
    
    
      5
      lisa
      dc
      1
      0

Drop a column



In [50]:

    
df.drop(["age","num_children"],axis=1)









    Out[50]:







  
    
      
      name
      state
      num_pets
    
  
  
    
      0
      john
      iowa
      0
    
    
      1
      mary
      dc
      4
    
    
      2
      peter
      california
      0
    
    
      3
      jeff
      texas
      5
    
    
      4
      bill
      washington
      0
    
    
      5
      lisa
      dc
      0

Apply a function to every column (as aggregates)

Using numpy vectorized functions for numerical values



In [52]:

    
# get the mean for each of the selected columns
df[["age","num_pets","num_children"]].apply(lambda row: np.mean(row),axis=0).to_frame()









    Out[52]:







  
    
      
      0
    
  
  
    
      age
      36.666667
    
    
      num_pets
      1.500000
    
    
      num_children
      1.333333

Apply a function to every row (as aggregates)

Using numpy vectorized functions for numerical values



In [54]:

    
# sum columns age, num_pets and num_children for each row
df[["age","num_pets","num_children"]].apply(lambda row: np.sum(row),axis=1).to_frame()

Apply a function elementwise using apply



In [55]:

    
df[["age"]].apply(lambda value: value*2)



In [56]:

    
# certain numerical functions can also be used:
df[["age"]] * 2



In [57]:

    
# also works for string values
df[["name"]].apply(lambda value: value.str.upper())

Apply a function elementwise using map

use apply for DataFrame objects and map for Series objects



In [58]:

    
df['name'].map(lambda name: name.upper()).to_frame()

Add new columns based on old ones



In [59]:

    
# simple sum of two columns
df["pets_and_children"] = df["num_pets"] + df["num_children"]
df









    Out[59]:







  
    
      
      name
      age
      state
      num_children
      num_pets
      pets_and_children
    
  
  
    
      0
      john
      23
      iowa
      2
      0
      2
    
    
      1
      mary
      78
      dc
      2
      4
      6
    
    
      2
      peter
      22
      california
      0
      0
      0
    
    
      3
      jeff
      19
      texas
      1
      5
      6
    
    
      4
      bill
      45
      washington
      2
      0
      2
    
    
      5
      lisa
      33
      dc
      1
      0
      1



In [60]:

    
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df

# you can also use custom functions we used on "elementwise application"
df["name_uppercase"] = df[["name"]].apply(lambda name: name.str.upper())
df









    Out[60]:







  
    
      
      name
      age
      state
      num_children
      num_pets
      name_uppercase
    
  
  
    
      0
      john
      23
      iowa
      2
      0
      JOHN
    
    
      1
      mary
      78
      dc
      2
      4
      MARY
    
    
      2
      peter
      22
      california
      0
      0
      PETER
    
    
      3
      jeff
      19
      texas
      1
      5
      JEFF
    
    
      4
      bill
      45
      washington
      2
      0
      BILL
    
    
      5
      lisa
      33
      dc
      1
      0
      LISA

Shuffle rows



In [82]:

    
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df

df.reindex(np.random.permutation(df.index))









    Out[82]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      4
      bill
      45
      washington
      2
      0
    
    
      3
      jeff
      19
      texas
      1
      5
    
    
      2
      peter
      22
      california
      0
      0
    
    
      1
      mary
      78
      dc
      2
      4
    
    
      5
      lisa
      33
      dc
      1
      0
    
    
      0
      john
      23
      iowa
      2
      0

Iterate over all rows



In [83]:

    
for index,row in df.iterrows():
    print("{0} has name: {1}".format(index,row["name"]))









    



0 has name: john
1 has name: mary
2 has name: peter
3 has name: jeff
4 has name: bill
5 has name: lisa

Randomly sample rows



In [84]:

    
# sample 10 rows from df
random_indices = np.random.choice(df.index.values, 4, replace=False)

# iloc allows you to retrieve rows by their numeric indices
sampled_df = df.iloc[random_indices]
sampled_df









    Out[84]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      2
      peter
      22
      california
      0
      0
    
    
      3
      jeff
      19
      texas
      1
      5
    
    
      1
      mary
      78
      dc
      2
      4
    
    
      5
      lisa
      33
      dc
      1
      0

Sort a dataframe



In [85]:

    
# sort by age, largest first
df.sort_values("age",ascending=False )









    Out[85]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      1
      mary
      78
      dc
      2
      4
    
    
      4
      bill
      45
      washington
      2
      0
    
    
      5
      lisa
      33
      dc
      1
      0
    
    
      0
      john
      23
      iowa
      2
      0
    
    
      2
      peter
      22
      california
      0
      0
    
    
      3
      jeff
      19
      texas
      1
      5



In [86]:

    
# sort by num_pets descending then sort by age ascending
df.sort_values( ["num_pets","age"], ascending=[False,True] )









    Out[86]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      3
      jeff
      19
      texas
      1
      5
    
    
      1
      mary
      78
      dc
      2
      4
    
    
      2
      peter
      22
      california
      0
      0
    
    
      0
      john
      23
      iowa
      2
      0
    
    
      5
      lisa
      33
      dc
      1
      0
    
    
      4
      bill
      45
      washington
      2
      0

custom sort



In [100]:

    
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,12,33],
    'state':['N/A','dc','california','texas','N/A','dc']
})

# sorting columns
df=df[['name','age','state']]
df



In [104]:

    
def state_to_rank(state):
    if state=="N/A":
        return 1
    else:
        return 0
    
df['rank'] = df['state'].map(lambda x: state_to_rank(x))

df.sort_values(by=['rank','age']).drop(['rank'],axis=1).reset_index(drop=True)

Perform complex selections using lambdas



In [66]:

    
df[df.apply(lambda row: row['name'].startswith('j'),axis=1)]









    Out[66]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      0
      john
      23
      iowa
      2
      0
    
    
      3
      jeff
      19
      texas
      1
      5

Change column names



In [68]:

    
# use inplace=True if you want to mutate the current dataframe
df.rename(columns={"age":"age_years"} )









    Out[68]:







  
    
      
      name
      age_years
      state
      num_children
      num_pets
    
  
  
    
      0
      john
      23
      iowa
      2
      0
    
    
      1
      mary
      78
      dc
      2
      4
    
    
      2
      peter
      22
      california
      0
      0
    
    
      3
      jeff
      19
      texas
      1
      5
    
    
      4
      bill
      45
      washington
      2
      0
    
    
      5
      lisa
      33
      dc
      1
      0

Change column dtype



In [69]:

    
df['num_children'].dtype









    Out[69]:





dtype('int64')



In [72]:

    
# we don't need 64 bits for num_children
df['num_children'] = df['num_children'].astype('int32')
df['num_children'].dtype

# it looks the same but takes less space
df









    Out[72]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      0
      john
      23
      iowa
      2
      0
    
    
      1
      mary
      78
      dc
      2
      4
    
    
      2
      peter
      22
      california
      0
      0
    
    
      3
      jeff
      19
      texas
      1
      5
    
    
      4
      bill
      45
      washington
      2
      0
    
    
      5
      lisa
      33
      dc
      1
      0

Veryfing that the dataframe includes specific values

This is done using the .isin() method, which returns a boolean dataframe to indicate where the passed values are.



In [73]:

    
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]

# if the method is passed a simple list, it matches
# those values anywhere in the dataframe 
df.isin([2,4])









    Out[73]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      0
      False
      False
      False
      True
      False
    
    
      1
      False
      False
      False
      True
      True
    
    
      2
      False
      False
      False
      False
      False
    
    
      3
      False
      False
      False
      False
      False
    
    
      4
      False
      False
      False
      True
      False
    
    
      5
      False
      False
      False
      False
      False



In [74]:

    
# you can also pass a dict or another dataframe
# as argument
df.isin({'num_pets':[4,5]})









    Out[74]:







  
    
      
      name
      age
      state
      num_children
      num_pets
    
  
  
    
      0
      False
      False
      False
      False
      False
    
    
      1
      False
      False
      False
      False
      True
    
    
      2
      False
      False
      False
      False
      False
    
    
      3
      False
      False
      False
      False
      True
    
    
      4
      False
      False
      False
      False
      False
    
    
      5
      False
      False
      False
      False
      False

Create an empty Dataframe and append rows one by one



In [75]:

    
# set column names and dtypes
new_df = pd.DataFrame(columns=['col_a','col_b']).astype({'col_a':'float32', 'col_b':'int8'})


# must reassign since the append method does not work in place
new_df = new_df.append({'col_a':5,'col_b':10}, ignore_index=True)
new_df = new_df.append({'col_a':1,'col_b':100}, ignore_index=True)

new_df

Create from list of dicts



In [76]:

    
new_df = pd.DataFrame(columns=['id','name'])

data_dict = [
    {'id':1,'name':"john"},
    {'id':2,'name':"mary"},
    {'id':3,'name':"peter"}
]

# must reassign since the append method does not work in place

new_df = new_df.from_records(data_dict)
new_df

converting types



In [77]:

    
df = pd.DataFrame({
    'name':['john','mary','peter'],
    "date_of_birth": ['27/05/2002','10/10/1999','01/04/1985']
})
df









    Out[77]:







  
    
      
      date_of_birth
      name
    
  
  
    
      0
      27/05/2002
      john
    
    
      1
      10/10/1999
      mary
    
    
      2
      01/04/1985
      peter



In [79]:

    
df['date_of_birth']=pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y')
df









    Out[79]:







  
    
      
      date_of_birth
      name
    
  
  
    
      0
      2002-05-27
      john
    
    
      1
      1999-10-10
      mary
    
    
      2
      1985-04-01
      peter



In [80]:

    
df.dtypes









    Out[80]:





date_of_birth    datetime64[ns]
name                     object
dtype: object



In [ ]:

	name	age	state	num_children	num_pets
0	john	23	iowa	2	0
1	mary	78	dc	2	4
2	peter	22	california	0	0
3	jeff	19	texas	1	5
4	bill	45	washington	2	0
5	lisa	33	dc	1	0

	name	age	state	num_children	num_pets
0	False	False	False	True	False
1	False	False	False	True	True
2	False	False	False	False	False
3	False	False	False	False	False
4	False	False	False	True	False
5	False	False	False	False	False