In [38]:
import pandas as pd
import numpy as np
In [2]:
pd.__version__
Out[2]:
In [41]:
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,45,33],
'state':['iowa','dc','california','texas','washington','dc'],
'num_children':[2,2,0,1,2,1],
'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df
Out[41]:
In [42]:
# select the first 2 rows
df.iloc[:2]
Out[42]:
In [43]:
# select the last 2 rows
df.iloc[-2:]
Out[43]:
In [44]:
# select rows up to and including the one
# with index=2
df.loc[:2]
Out[44]:
In [45]:
# by a simple numeric condition
df[df["age"] > 30]
Out[45]:
In [46]:
# comparing the value of two columns
df[ df["num_pets"] > df[ "num_children"] ]
Out[46]:
In [47]:
# using boolean AND
df[ (df["age"] > 40) & (df["num_pets"] > 0) ]
Out[47]:
In [48]:
df[[colname for colname in df.columns if colname.startswith('n')]]
Out[48]:
In [49]:
df.drop(['age'],axis=1)
Out[49]:
In [50]:
df.drop(["age","num_children"],axis=1)
Out[50]:
In [52]:
# get the mean for each of the selected columns
df[["age","num_pets","num_children"]].apply(lambda row: np.mean(row),axis=0).to_frame()
Out[52]:
In [54]:
# sum columns age, num_pets and num_children for each row
df[["age","num_pets","num_children"]].apply(lambda row: np.sum(row),axis=1).to_frame()
Out[54]:
In [55]:
df[["age"]].apply(lambda value: value*2)
Out[55]:
In [56]:
# certain numerical functions can also be used:
df[["age"]] * 2
Out[56]:
In [57]:
# also works for string values
df[["name"]].apply(lambda value: value.str.upper())
Out[57]:
In [58]:
df['name'].map(lambda name: name.upper()).to_frame()
Out[58]:
In [59]:
# simple sum of two columns
df["pets_and_children"] = df["num_pets"] + df["num_children"]
df
Out[59]:
In [60]:
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,45,33],
'state':['iowa','dc','california','texas','washington','dc'],
'num_children':[2,2,0,1,2,1],
'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df
# you can also use custom functions we used on "elementwise application"
df["name_uppercase"] = df[["name"]].apply(lambda name: name.str.upper())
df
Out[60]:
In [82]:
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,45,33],
'state':['iowa','dc','california','texas','washington','dc'],
'num_children':[2,2,0,1,2,1],
'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df
df.reindex(np.random.permutation(df.index))
Out[82]:
In [83]:
for index,row in df.iterrows():
print("{0} has name: {1}".format(index,row["name"]))
In [84]:
# sample 10 rows from df
random_indices = np.random.choice(df.index.values, 4, replace=False)
# iloc allows you to retrieve rows by their numeric indices
sampled_df = df.iloc[random_indices]
sampled_df
Out[84]:
In [85]:
# sort by age, largest first
df.sort_values("age",ascending=False )
Out[85]:
In [86]:
# sort by num_pets descending then sort by age ascending
df.sort_values( ["num_pets","age"], ascending=[False,True] )
Out[86]:
In [100]:
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,12,33],
'state':['N/A','dc','california','texas','N/A','dc']
})
# sorting columns
df=df[['name','age','state']]
df
Out[100]:
In [104]:
def state_to_rank(state):
if state=="N/A":
return 1
else:
return 0
df['rank'] = df['state'].map(lambda x: state_to_rank(x))
df.sort_values(by=['rank','age']).drop(['rank'],axis=1).reset_index(drop=True)
Out[104]:
In [66]:
df[df.apply(lambda row: row['name'].startswith('j'),axis=1)]
Out[66]:
In [68]:
# use inplace=True if you want to mutate the current dataframe
df.rename(columns={"age":"age_years"} )
Out[68]:
In [69]:
df['num_children'].dtype
Out[69]:
In [72]:
# we don't need 64 bits for num_children
df['num_children'] = df['num_children'].astype('int32')
df['num_children'].dtype
# it looks the same but takes less space
df
Out[72]:
In [73]:
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,45,33],
'state':['iowa','dc','california','texas','washington','dc'],
'num_children':[2,2,0,1,2,1],
'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
# if the method is passed a simple list, it matches
# those values anywhere in the dataframe
df.isin([2,4])
Out[73]:
In [74]:
# you can also pass a dict or another dataframe
# as argument
df.isin({'num_pets':[4,5]})
Out[74]:
In [75]:
# set column names and dtypes
new_df = pd.DataFrame(columns=['col_a','col_b']).astype({'col_a':'float32', 'col_b':'int8'})
# must reassign since the append method does not work in place
new_df = new_df.append({'col_a':5,'col_b':10}, ignore_index=True)
new_df = new_df.append({'col_a':1,'col_b':100}, ignore_index=True)
new_df
Out[75]:
In [76]:
new_df = pd.DataFrame(columns=['id','name'])
data_dict = [
{'id':1,'name':"john"},
{'id':2,'name':"mary"},
{'id':3,'name':"peter"}
]
# must reassign since the append method does not work in place
new_df = new_df.from_records(data_dict)
new_df
Out[76]:
In [77]:
df = pd.DataFrame({
'name':['john','mary','peter'],
"date_of_birth": ['27/05/2002','10/10/1999','01/04/1985']
})
df
Out[77]:
In [79]:
df['date_of_birth']=pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y')
df
Out[79]:
In [80]:
df.dtypes
Out[80]:
In [ ]: