In [ ]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

Pandas has two important data structures: Series and DataFrame.

Series

Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.)

>>> s = pd.Series(data, index=index)



In [ ]:

    
# Heights in feet, one entry per person; the person labels form the index.
person_height_ft = pd.Series(
    data=[5.5, 5.2, 5.8, 6.1, 4.8],
    index=['person_a', 'person_b', 'person_c', 'person_d', 'person_e'],
    dtype=np.float64,
    name='height',
)
person_height_ft



In [ ]:

    
# The data of the Series without its labels
person_height_ft.values



In [ ]:

    
# The labels of the Series
person_height_ft.index

A Series is like a fixed-size dict in that you can get and set values by index label



In [ ]:

    
# Dict-style lookup by index label
person_height_ft['person_c']

You can also use the index position to get and set the values



In [ ]:

    
# Positional lookup (fourth element). .iloc is used because the index holds string
# labels: a bare integer key in [] only works through a positional fallback that
# pandas has deprecated.
person_height_ft.iloc[3]



In [ ]:

    
# First three entries, selected by position
person_height_ft[:3]

Vectorized operations and label alignment with Series



In [ ]:

    
# Feet -> metres for the whole Series at once: 12 inches per foot,
# 2.54 cm per inch, 100 cm per metre. The index labels are carried over.
person_height_mtr = (12 * 2.54) * person_height_ft / 100
person_height_mtr



In [ ]:

    
# Weights in kg. Note that the labels are deliberately listed in a different
# order from the height Series.
person_weight_kg = pd.Series(
    data=[70, 55, 73, 68, 66],
    index=['person_b', 'person_d', 'person_e', 'person_c', 'person_a'],
    dtype=np.float64,
    name='weight',
)
person_weight_kg



In [ ]:

    
# BMI = weight (kg) / height (m) squared.
# The two Series are matched on their labels, not on element position.
bmi = person_weight_kg.div(person_height_mtr.pow(2))
bmi



In [ ]:

    
# BMI of person_c worked out by hand, as a check on the label-aligned result:
# 68 is person_c's weight in kg and 1.76784 is 5.8 ft in metres (5.8 * 12 * 2.54 / 100).
68/(1.76784*1.76784)

DataFrame

DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object.



In [ ]:

    
# Build the example frame from a dict of equal-length columns; `index` supplies
# the row labels and is reused by later cells.
index = ['person_a', 'person_b', 'person_c', 'person_d', 'person_e']
df_person = pd.DataFrame(
    data={
        'height': [5.5, 5.2, 5.8, 6.1, 4.8],
        'weight': [66, 70, 68, 55, 73],
        'gender': ['male', 'male', 'female', 'male', 'female'],
    },
    index=index,
)



In [ ]:

    
# Summary statistics; include='all' also covers the non-numeric 'gender' column
df_person.describe(include='all')



In [ ]:

    
# Structural summary, one item per line: number of axes, (rows, columns),
# per-column dtypes, column labels, row labels and row count.
print(
    df_person.ndim,
    df_person.shape,
    df_person.dtypes,
    df_person.columns,
    df_person.index,
    len(df_person),
    sep="\n",
)



In [ ]:

    
# Concise overview: index, columns, non-null counts and dtypes
df_person.info()



In [ ]:

    
# Move the row labels into an ordinary column and fall back to a default integer index.
# The result is only displayed, not assigned, so df_person keeps its labels.
df_person.reset_index()



In [ ]:

    
# Use the 'gender' column as the index instead (display only; nothing is assigned)
df_person.reset_index().set_index('gender')



In [ ]:

    
# Replace the person labels with short letter labels: reset_index() first turns
# the old labels into a regular column, then the new index is assigned.
another_index = list('abcde')
df1 = df_person.reset_index()
df1.index = another_index
df1

Accessing Data



In [ ]:

    
# Accessing the column data
# Selecting a single column by name; the next cell inspects the type of the result.
height = df_person['height']
height



In [ ]:

    
# Type of the object produced by single-column selection
type(height)



In [ ]:

    
# Selecting several columns at once: every row, the listed columns
df_person.loc[:, ['height', 'weight']]



In [ ]:

    
# Accessing one individual cell.
# A single .loc[row_label, column_label] lookup replaces the chained form
# df['height']['person_a'], which indexes twice and is unsafe as a pattern for assignment.
df_person.loc['person_a', 'height']



In [ ]:

    
# Accessing rows by index keys
# .loc with a single row label selects that one row
df_person.loc['person_a']



In [ ]:

    
# Accessing multiple rows using a label range.
# .loc makes it explicit that rows are being sliced (bare [] normally selects columns);
# label slices include both end points.
df_person.loc['person_a':'person_c']



In [ ]:

    
# Accessing using index position (zero-based)
# iloc[0] -> the whole first row
print(df_person.iloc[0])
print("--------------")
# iloc[0,2] -> first row, third column; which column that is depends on column order
print(df_person.iloc[0,2])



In [ ]:

    
# Boolean indexing
# Keep only the rows where the height exceeds 5.2 feet
df_person.loc[df_person['height'] > 5.2]



In [ ]:

    
# Boolean indexing with two conditions
# Rows with height > 5.2 feet AND weight > 60 kg; each condition needs its own
# parentheses because & binds tighter than the comparisons.
df_person.loc[(df_person['height'] > 5.2) & (df_person['weight'] > 60)]



In [ ]:

    
# Show the frame as it stands before a column is added
df_person



In [ ]:

    
# Let's add new column "age" to the DataFrame
# The Series carries the same labels as the frame (the `index` list defined when
# df_person was built), so each value is matched to its row by label.
df_person['age'] = pd.Series([30,28,26,19,42], index=index)



In [ ]:

    
# Show the frame with the new 'age' column
df_person



In [ ]:

    
# Find all persons with age > 28



In [ ]:

    
# Find females with age > 28



In [ ]:

    
# Find the oldest person: rows whose age equals the maximum age
# (returns every such row if several people share it)
df_person[df_person['age'] == df_person['age'].max()]



In [ ]:

    
# Find max aged male person



In [ ]:

    
# Find all persons having height > average height of the group

Handling missing values



In [ ]:

    
# Snapshot of the frame taken before the cells below start modifying df_person
df_backup = df_person.copy()



In [ ]:

    
# Rebuild df_person from the snapshot; presumably here so the missing-value cells
# below can be re-run from a clean frame
df_person = df_backup.copy()



In [ ]:

    
# Let's introduce a few NaN values.
# np.nan is the canonical spelling; the np.NaN alias no longer exists in NumPy 2.0.
df_person.loc['person_a', 'age'] = np.nan
# NOTE(review): positional write to row 2, column 2. Which column that is depends on
# the frame's column order (on pandas versions that keep dict insertion order it is
# 'gender', on versions that sort dict keys it is 'weight') - confirm the intended target.
df_person.iloc[2, 2] = np.nan
df_person.loc['person_e', 'height'] = np.nan
# Assigning to a label that does not exist yet appends a new, all-NaN row
df_person.loc['person_f'] = np.nan
# New column that is NaN for every row
df_person['married'] = np.nan



In [ ]:

    
# Show the frame with the NaN values introduced above
df_person



In [ ]:

    
# `how` accepts 'all' or 'any'.
# Drop the rows in which every value is NaN (display only; df_person is not reassigned)
df_person.dropna(axis=0, how='all')



In [ ]:

    
# `how` accepts 'all' or 'any'.
# Drop the columns in which every value is NaN (display only; df_person is not reassigned)
df_person.dropna(axis='columns', how='all')



In [ ]:

    
# Replace every NaN in the frame with zero (display only)
df_person.fillna(value=0)



In [ ]:

    
# Replace missing weights with the group's average weight (display only).
# mean() skips NaN values when computing the average.
df_person['weight'].fillna(value=df_person['weight'].mean())



In [ ]:

    
# Forward fill: each NaN takes the last valid value above it in the same column.
# .ffill() replaces fillna(method='ffill'), which newer pandas releases deprecate.
df_person.ffill()



In [ ]:

    
# Backward fill: each NaN takes the next valid value below it in the same column.
# .bfill() replaces fillna(method='bfill'), which newer pandas releases deprecate.
df_person.bfill()



In [ ]:

    
# Keep only the rows that have a weight recorded
df_person[df_person['weight'].notna()]



In [ ]:

    
# Keep only the rows that have an age recorded
df_person[df_person['age'].notnull()]



In [ ]:

    
# Permanently discard the all-NaN rows first, then the all-NaN columns
df_person = df_person.dropna(how='all').dropna(axis=1, how='all')
df_person



In [ ]:

    
# Fill the remaining gaps with the previous row's value in each column.
# .ffill() replaces fillna(method='ffill'), which newer pandas releases deprecate.
# A NaN in the very first row has nothing above it and stays NaN.
df_person = df_person.ffill()
df_person



In [ ]:

    
# BMI column: weight (kg) divided by the squared height in metres.
# Height is stored in feet, hence the 12 * 2.54 / 100 conversion.
df_person['bmi'] = df_person['weight'] / ((12 * 2.54) * df_person['height'] / 100) ** 2
df_person

GroupBy function

GroupBy involves one or more of the following steps:

- Splitting the data into groups based on some criteria
- Applying a function to each group independently
- Combining the results into a data structure



In [ ]:

    
# The cleaned frame that the grouping cells below operate on
df_person



In [ ]:

    
# Split the rows by the value of the 'gender' column; the result is a GroupBy object
# whose type is printed here and whose groups are inspected in the next cells.
df_person_grp = df_person.groupby('gender')
print(type(df_person_grp))



In [ ]:

    
# Iterating over a GroupBy yields (group key, sub-frame for that key) pairs
for group,data in df_person_grp:
    print(group, data)
    print("--------------------------------")



In [ ]:

    
# Apply + combine: the mean of each remaining column, one row per gender
df_person_grp.mean()



In [ ]:

    
# Bar chart of the per-gender means
df_person_grp.mean().plot.bar()

Working with Text Data



In [ ]:

    
# Row-by-row version: upper-case each person's gender and keep its first letter
df_person.apply(lambda row: row['gender'].upper()[0], axis='columns')



In [ ]:

    
# Column labels currently present in the frame
df_person.columns



In [ ]:

    
# Without axis=1, apply() hands the function one column at a time;
# here it simply reports each column's dtype.
df_person[['weight', 'height']].apply(lambda col: col.dtype)



In [ ]:

    
# Column means. numeric_only=True restricts the reduction to numeric columns;
# without it, newer pandas raises TypeError on the string-valued 'gender' column
# instead of silently skipping it.
df_person.mean(numeric_only=True)



In [ ]:

    
# Vectorised alternative to the row-wise apply: the .str accessor upper-cases every
# value, then takes the first character of each
df_person['gender'].str.upper().str[0]



In [ ]:

Working with Dates and TimeSeries Data



In [ ]:

    
# Seed NumPy's global random generator so the random prices are the same on every run.
np.random.seed(5)
# 150 random integer prices in [100, 150), one per business day (freq='B')
# counting from 2000-1-1.
price = pd.Series(
    data=np.random.randint(100, 150, size=150),
    index=pd.date_range(start='2000-1-1', periods=150, freq='B'),
    name='col1',
)



In [ ]:

    
# First few rows of the business-day price series
price.head()



In [ ]:

    
# Highest price in each calendar month.
# pd.TimeGrouper no longer exists in pandas; grouping on the index converted to monthly
# periods gives the same monthly buckets without depending on offset alias spellings.
# The y-axis is limited to 146-150 to zoom in on the maxima.
price.groupby(price.index.to_period('M')).max().plot(ylim=(146,150))



In [ ]:

    
# Re-seed so this series is reproducible independently of the cells above.
np.random.seed(5)
# 500 random integer prices in [200, 250), one per calendar day (freq='D')
# counting from 2000-1-1.
price1 = pd.Series(
    data=np.random.randint(200, 250, size=500),
    index=pd.date_range(start='2000-1-1', periods=500, freq='D'),
    name='col2',
)



In [ ]:

    
# Combine both series into one frame, aligned on their date labels.
# price has business days only while price1 has every calendar day, so 'col1'
# is expected to be NaN on dates that appear only in price1.
df_time = pd.DataFrame({'col1':price,'col2':price1})



In [ ]:

    
# First few rows of the combined frame
df_time.head()



In [ ]:

    
# Number of rows (dates) in the combined frame
len(df_time)



In [ ]:

    
# Column dtypes after alignment
df_time.dtypes



In [ ]:

    
# Line plot of both columns against the date index on a 16 x 8 inch figure
df_time.plot(figsize=(16,8))



In [ ]:

    
# Monthly average of each column.
# pd.TimeGrouper no longer exists in pandas; grouping on the index converted to monthly
# periods gives the same monthly buckets without depending on offset alias spellings.
df_time.groupby(df_time.index.to_period('M')).mean().plot()



In [ ]:

    
# IPython help lookup (trailing ?) for pd.Categorical; only works in IPython/Jupyter
pd.Categorical?



In [ ]:

    
# IPython help lookup (trailing ?) for pd.CategoricalIndex; only works in IPython/Jupyter
pd.CategoricalIndex?



In [ ]:

    
# Preview the time-series frame. The original cell referenced `df`, a name that is never
# defined in this notebook (NameError on a fresh kernel); df_time is presumably what was
# meant - confirm. head() avoids dumping all rows.
df_time.head()



In [ ]:

    
# Build an ordered Categorical from the 'label' column.
# NOTE(review): df_time is created above with only 'col1' and 'col2'; no 'label' column
# is added anywhere in the visible notebook, so this lookup presumably raises KeyError
# on a fresh kernel - confirm where 'label' should come from.
x = pd.Categorical(df_time['label'],ordered=True)



In [ ]:

    
# Series of labels stored with the categorical dtype of `x`.
# Two fixes: `df` is never defined in this notebook (df_time is the frame used to
# build `x`), and `dtype` must be a dtype object (x.dtype), not the Categorical itself.
# NOTE(review): still requires a 'label' column on df_time, which the visible notebook
# does not create - confirm.
cat = pd.Series(df_time['label'], dtype=x.dtype)



In [ ]:

    
# Display the categorical Series (only defined if the cell that builds `cat` ran successfully)
cat



In [ ]:

    
# IPython help lookup (trailing ?) for pd.merge; only works in IPython/Jupyter
pd.merge?



In [ ]:

    
# IPython help lookup (trailing ?) for pd.concat; only works in IPython/Jupyter
pd.concat?



In [ ]:

    
# IPython help lookup (trailing ?) for pd.Timestamp; only works in IPython/Jupyter
pd.Timestamp?



In [ ]:

    
# Dashed green line through five points, each marked with a blue-filled circle
plt.plot(
    [1, 2, 3, 3.5, 4.0],
    [1, 2, 3, 3.2, 3.8],
    color='green',
    linestyle='dashed',
    marker='o',
    markerfacecolor='blue',
    markersize=8,
)



In [ ]:

    
# Categorical restricted to the categories 0 and 1; the values 2, 3 and 4 in the
# input are not among the categories and therefore come out as missing.
mylist = [0, 1, 0, 1, 2, 3, 4, 0, 1]
mycat = pd.Categorical(values=mylist, categories=[0, 1])
mycat



In [ ]:

    
# Widen the allowed categories to 0-4. The result is only displayed, not assigned,
# so mycat keeps its original two categories.
mycat.set_categories([0,1,2,3,4])