In [1]:
import pandas as pd
import numpy as np
# load the example dataset (comma is read_csv's default separator)
df = pd.read_csv("data.csv")
In [2]:
# display the installed pandas version string
pd.__version__
Out[2]:
In [3]:
# display the full dataframe loaded from data.csv
df
Out[3]:
In [4]:
# the first two rows, by position
df.head(2)
Out[4]:
In [5]:
# the final two rows, by position
df.tail(2)
Out[5]:
In [6]:
# .loc slices by index label and the end label is inclusive,
# so this returns every row up to and including label 2
df.loc[:2, :]
Out[6]:
In [7]:
# keep only the rows whose age is strictly greater than 30
df.loc[df["age"].gt(30)]
Out[7]:
In [8]:
# keep rows where one column exceeds another: more pets than children
df.loc[df["num_pets"].gt(df["num_children"])]
Out[8]:
In [9]:
# combine two conditions with element-wise AND (&):
# older than 40 and owning at least one pet
df.loc[df["age"].gt(40) & df["num_pets"].gt(0)]
Out[9]:
In [10]:
# return a copy without the two listed columns (df itself is unchanged)
df.drop(columns=["age", "num_children"])
Out[10]:
In [11]:
# mean of each selected column: with axis="index" (same as axis=0)
# the function receives one whole column at a time
df[["age", "num_pets", "num_children"]].apply(
    lambda column: np.mean(column),
    axis="index",
)
Out[11]:
In [12]:
# per-row total of age, num_pets and num_children: with axis="columns"
# (same as axis=1) the function receives one row at a time
df[["age", "num_pets", "num_children"]].apply(
    lambda record: np.sum(record),
    axis="columns",
)
Out[12]:
In [13]:
# double every age; apply hands the lambda the whole "age" column
df[["age"]].apply(lambda column: column * 2)
Out[13]:
In [14]:
# arithmetic can be applied to the frame directly, without apply
df[["age"]].mul(2)
Out[14]:
In [15]:
# the same approach works on text columns through the .str accessor
df[["name"]].apply(lambda column: column.str.upper())
Out[15]:
In [16]:
# Series.map calls the function once per value in the column
df["name"].map(lambda text: text.upper())
Out[16]:
In [17]:
# new column: element-wise sum of num_pets and num_children
df["pets_and_children"] = df["num_pets"].add(df["num_children"])
df
Out[17]:
In [18]:
# a new column can also come from a function passed to apply;
# the lambda receives the "name" column and upper-cases it
df["name_uppercase"] = df[["name"]].apply(lambda column: column.str.upper())
df
Out[18]:
In [19]:
# shuffle the rows by reindexing with a random permutation of the index
df.reindex(index=np.random.permutation(df.index))
Out[19]:
In [20]:
# iterate row by row: iterrows yields (index label, row as a Series)
for index, row in df.iterrows():
    print("{0} has name: {1}".format(index, row["name"]))
In [21]:
# sample 4 distinct rows from df (replace=False: no row is drawn twice)
random_indices = np.random.choice(df.index.values, 4, replace=False)
# the sampled values are index *labels*, so select with .loc (label-based);
# .iloc is positional and would only agree for a default 0..n-1 index
sampled_df = df.loc[random_indices]
sampled_df
Out[21]:
In [22]:
# order rows by age, from largest to smallest
df.sort_values(by="age", ascending=False)
Out[22]:
In [23]:
# primary key num_pets (descending); ties broken by age (ascending)
df.sort_values(
    by=["num_pets", "age"],
    ascending=[False, True],
)
Out[23]:
In [24]:
# show matplotlib plots here
%matplotlib inline
# a scatter plot comparing num_children and num_pets
df.plot(kind='scatter',x='num_children',y='num_pets')
Out[24]:
In [25]:
# line plot of age (y axis) against name (x axis)
df.plot.line(x='name', y='age')
Out[25]:
In [26]:
# keep rows whose name starts with 'j'. The vectorized .str accessor replaces
# a row-wise apply: it yields a proper boolean mask even for an empty frame,
# and na=False treats missing names as "no match" instead of raising.
df[df['name'].str.startswith('j', na=False)]
Out[26]:
In [27]:
# one-hot encode "state" and attach the indicator columns to the right of df
pd.concat(
    [df, pd.get_dummies(df["state"])],
    axis="columns",
)
Out[27]:
In [28]:
# rename returns a new frame; df itself is only changed if inplace=True is passed
df.rename({"age": "age_years"}, axis="columns")
Out[28]:
In [29]:
# look up the storage dtype of the num_children column
df.dtypes['num_children']
Out[29]:
In [30]:
# 64 bits are more than num_children needs, so store it as 32-bit integers
df['num_children'] = df['num_children'].astype(np.int32)
df['num_children'].dtype
Out[30]:
In [31]:
# given a plain list, isin checks every cell of the dataframe
# against those values, whatever the column
df.isin(values=[2, 4])
Out[31]:
In [32]:
# a dict (or another dataframe) restricts the check per column:
# here only num_pets is tested against 4 and 5
df.isin(values={'num_pets': [4, 5]})
Out[32]:
In [33]:
# set column names and dtypes
new_df = pd.DataFrame(columns=['col_a','col_b']).astype({'col_a':'float32', 'col_b':'int8'})
# must reassign since the append method does not work in place
new_df = new_df.append({'col_a':5,'col_b':10}, ignore_index=True)
new_df = new_df.append({'col_a':1,'col_b':100}, ignore_index=True)
new_df
Out[33]:
In [34]:
# one dict per row (note: despite its name, data_dict is a list of records)
data_dict = [
    {'id': 1, 'name': "john"},
    {'id': 2, 'name': "mary"},
    {'id': 3, 'name': "peter"},
]
# from_records is a classmethod that builds a brand-new frame, so call it on
# pd.DataFrame directly; `columns` fixes the column order
new_df = pd.DataFrame.from_records(data_dict, columns=['id', 'name'])
new_df
Out[34]:
In [35]:
# replace the index with a fresh 0..n-1 range; drop=True discards the old
# index rather than inserting it as an "index" column that then has to be
# removed. Reassigning (no inplace=True) keeps the cell safe to re-run.
new_df = new_df.reset_index(drop=True)
new_df
Out[35]:
In [46]:
# column-oriented input: each key is a column, each list holds its values;
# the dates are plain day/month/year strings at this point
data_dict = {
    'name': ['john', 'mary', 'peter'],
    "date_of_birth": ['27/05/2002', '10/10/1999', '01/04/1985'],
}
column_order = ['name', 'date_of_birth']
df = pd.DataFrame(data=data_dict, columns=column_order)
df
Out[46]:
In [47]:
# parse the day/month/year strings into datetimes. The source column must be
# df's own date_of_birth: new_df as built in this notebook only has id/name,
# so reading from it raises KeyError on a fresh run.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], format='%d/%m/%Y')
df
Out[47]:
In [48]:
# list the dtype of every column in df
df.dtypes
Out[48]:
In [ ]: