ToC
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Build a Series from a plain Python list.
l1 = list(range(1, 6))
# No index labels are supplied, so pandas autogenerates an integer index.
ser1 = pd.Series(l1)
ser1
Out[2]:
In [3]:
# Build a Series from a NumPy array, this time with explicit index labels.
l2 = ['a', 'b', 'c', 'e', 'd']
arr1 = np.array(l1)
# Index labels can be of any data type; here they are strings.
ser2 = pd.Series(arr1, index=l2)
ser2
Out[3]:
In [4]:
# Build a Series from a dictionary: keys become the index labels, values the data.
# Every value is an integer so the Series gets a numeric dtype; a single string
# value would turn the whole Series into dtype object and break numeric reductions.
d1 = {'usa':1, 'india':2, 'germany':3, 'japan':5, 'china':4}
ser3 = pd.Series(d1)
ser3
Out[4]:
In [5]:
# A second Series built from the same list, so it shares ser1's integer labels.
ser1a = pd.Series(data=l1)
# Addition aligns on index labels: elements with a matching label are summed.
ser1 + ser1a
Out[5]:
When labels don't match, the result holds `NaN` for that label. Thus, when two Series are added, the result may or may not have the same number of elements as the operands.
In [6]:
# ser1 carries integer labels while ser3 carries country names, so no label matches
ser1 + ser3
Out[6]:
In [2]:
# Draw the 4x4 sample from a seeded generator so the values (and every filter
# result derived from them) are identical on each Restart & Run All.
# With this seed at least one value in the first column exceeds 0.85.
# A local RandomState is used so the global NumPy RNG state is left untouched.
arr1 = np.random.RandomState(0).rand(4, 4)
arr1
Out[2]:
In [3]:
# Column labels name the attributes, row labels name the cars.
col_labels = ['reliability', 'cost', 'competition', 'halflife']
row_lables = [f'Car{n}' for n in range(1, 5)]
# Wrap the 2-D array in a DataFrame with those labels.
df1 = pd.DataFrame(arr1, index=row_lables, columns=col_labels)
df1
Out[3]:
In [15]:
# Accessing a whole column with bracket notation and the column label
df1['reliability']
Out[15]:
In [16]:
#a column can also be accessed as a property, but this is not advisable:
#a column name that matches a builtin DataFrame method or property would clash with it
df1.reliability
Out[16]:
In [18]:
# Accessing a whole row by its index label with .loc
df1.loc['Car4']
Out[18]:
In [19]:
# Check the type of the object that a single-row .loc lookup returns
type(df1.loc['Car3'])
Out[19]:
In [21]:
# Positional lookup with .iloc: row 0, column 0 yields a single value.
val1 = df1.iloc[0, 0]
# Show the value and its type on separate lines.
print(val1, type(val1), sep='\n')
In [23]:
# A single positional row index selects the entire first row (all columns).
val2 = df1.iloc[0]
val2
Out[23]:
In [24]:
# Check the type of the full first row extracted with .iloc
type(val2)
Out[24]:
In [26]:
# Dice by label: rows Car2 and Car3, columns cost and competition.
df1.loc[
    ['Car2', 'Car3'],
    ['cost', 'competition'],
]
Out[26]:
With index numbers (positions), dice using
`DataFrameObj.iloc[[row_indices], [col_indices]]`
In [27]:
# Dice by zero-based position: rows 1-2 and columns 1-2, i.e. the same cells as the label-based dice
df1.iloc[[1,2], [1,2]]
Out[27]:
In [9]:
# Show the full frame again as a reference for the conditional selections
df1
Out[9]:
In [10]:
# Element-wise comparison: one boolean per car, True where reliability exceeds 0.85.
df1['reliability'].gt(0.85)
Out[10]:
In [11]:
# Use the boolean Series as a row mask to keep only the matching cars.
high_reliability = df1['reliability'] > 0.85
df1[high_reliability]
Out[11]:
In [20]:
# To get only the car name, which in this case is the index label.
# Filter the index directly and guard the positional lookup: taking [0] of an
# empty result raises IndexError when no car clears the threshold, so the
# expression evaluates to None in that case instead.
reliable_cars = df1.index[df1['reliability'] > 0.85]
reliable_cars[0] if len(reliable_cars) else None
Out[20]:
In [21]:
# Get the actual reliability value(s) of the matching car(s).
# A single .loc call selects the rows and the column together instead of
# chaining two [] lookups, which is the form that is also safe for assignment.
df1.loc[df1['reliability'] > 0.85, 'reliability']
Out[21]:
In [22]:
# Get both reliability and cost of the matching car(s).
# Rows (boolean mask) and columns (label list) are selected in one .loc call
# rather than through chained [] indexing.
df1.loc[df1['reliability'] > 0.85, ['reliability', 'cost']]
Out[22]:
In [24]:
# Combine two conditions with a logical AND (&):
# reliability above 0.7 together with competition below 0.5.
is_reliable = df1['reliability'].gt(0.7)
low_competition = df1['competition'].lt(0.5)
df1[is_reliable & low_competition]
Out[24]:
In [26]:
# Combine two conditions with a logical OR (|):
# half life above 0.5, or competition below 0.4.
long_halflife = df1['halflife'] > 0.5
few_competitors = df1['competition'] < 0.4
df1[long_halflife | few_competitors]
Out[26]:
In [4]:
# Add a full_life column holding twice the half life.
# Multiplying a Series by a scalar broadcasts over every element, as with arrays.
df1['full_life'] = 2 * df1['halflife']
df1
Out[4]:
In [5]:
# Drop a column by label; the result is a new frame and df1 is left unchanged.
df1.drop(columns='full_life')
Out[5]:
In [6]:
# Drop a row by its index label; rows are the default axis for drop.
df1.drop(index='Car3')
Out[6]:
Drop a row based on a condition.
In [7]:
# Collect the index labels of the rows whose cost exceeds 0.65,
# then drop those rows by label (a new frame is returned).
costly_cars = df1.index[df1['cost'] > 0.65]
df1.drop(index=costly_cars)
Out[7]:
In [30]:
# Car names that will later be set as the index of the data frame.
car_names = ['altima', 'outback', 'taurus', 'mustang']
car_names
Out[30]:
In [36]:
# Attach the car names as a new column; the list must have one entry per row.
# NOTE: this mutates df1 in place, so earlier displays of df1 no longer show every column.
df1['car_names'] = car_names
df1
Out[36]:
In [37]:
# Build a new frame indexed by car name. drop=False keeps the car_names
# column alongside the new index, and df1 itself is not modified.
df_new_index = df1.set_index('car_names', drop=False)
df_new_index
Out[37]:
In [38]:
#reset the index to numerals and convert the existing index to a column
#NOTE: the result is only displayed, not assigned, so df1 keeps its current index
df1.reset_index()
Out[38]: