What is special about Pandas?

  • Allows the use of labels for rows and columns
  • Can calculate rolling statistics on time series data
  • Handles NaN values easily
  • Can load data of different formats into DataFrames
  • Can join and merge different datasets together (see the short sketch after this list)
  • Integrates with NumPy and Matplotlib
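
Most of these points are demonstrated in the cells below; joining and merging is not, so here is a minimal sketch of pd.merge using two small made-up DataFrames (the column names and values are only for illustration).

In [ ]:
import pandas as pd

# two tiny tables that share an 'item' key column
left = pd.DataFrame({'item': ['bike', 'watch'], 'price': [245, 55]})
right = pd.DataFrame({'item': ['bike', 'watch'], 'stock': [3, 10]})
pd.merge(left, right, on='item')  # inner join on the shared 'item' column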

In [27]:
import pandas as pd
import numpy as np

In [12]:
# pandas Series
goods = pd.Series(data=[1,2,4,'ba'], index=['hat', 'gloves', 'cup', 'mouse'])
print(goods)
print('goods dimensions: ', goods.ndim)
print('goods shape: ', goods.shape)
print('goods index: ', goods.index)
print('goods values: ', goods.values)
print('check index banana in goods: ', 'banana' in goods.index)


hat        1
gloves     2
cup        4
mouse     ba
dtype: object
goods dimensions:  1
goods shape:  (4,)
goods index:  Index(['hat', 'gloves', 'cup', 'mouse'], dtype='object')
goods values:  [1 2 4 'ba']
check index banana in goods:  False

In [19]:
# accessing and deleting elements
# loc: label-based location
# iloc: integer location
# Series.drop(label, inplace=True) removes an element by its label (see the sketch after this cell)
print(goods['hat'])
print()
print(goods.loc[['hat', 'cup']])
print()
print('access by index:\n ', goods.iloc[[0,2]])
print()
print(goods[[0,2]])  # integer lists fall back to positional indexing here; iloc is the explicit form


1

hat    1
cup    4
dtype: object

access by index:
  hat    1
cup    4
dtype: object

hat    1
cup    4
dtype: object
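
The comment above mentions Series.drop but the cell never calls it; a minimal sketch (out of place unless inplace=True is passed):

In [ ]:
# drop the 'mouse' entry by its label; returns a new Series, goods itself is unchanged
goods.drop('mouse')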

In [20]:
# Pandas Series arithmetic operations
# make sure the operation can be applied to every data type in the Series (see the note after this cell)
vegies = pd.Series(data=[2,78,45,42], index=['kale', 'cabbage', 'carroat', 'lettice'])
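
A quick illustration of that warning, using the mixed-type goods Series from earlier: arithmetic on an object-dtype Series is applied element by element, so multiplication still "works" on the string entry by repeating it, while division would fail.

In [ ]:
goods * 2    # numbers are doubled, the string 'ba' becomes 'baba'
# goods / 2  # raises a TypeError because '/' is not defined for strings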

In [21]:
vegies * 2


Out[21]:
kale         4
cabbage    156
carroat     90
lettice     84
dtype: int64

In [22]:
vegies / 2


Out[22]:
kale        1.0
cabbage    39.0
carroat    22.5
lettice    21.0
dtype: float64

In [25]:
vegies ** 2


Out[25]:
kale          4
cabbage    6084
carroat    2025
lettice    1764
dtype: int64

In [28]:
np.sqrt(vegies)


Out[28]:
kale       1.414214
cabbage    8.831761
carroat    6.708204
lettice    6.480741
dtype: float64

In [29]:
np.exp(vegies)


Out[29]:
kale       7.389056e+00
cabbage    7.498417e+33
carroat    3.493427e+19
lettice    1.739275e+18
dtype: float64

In [30]:
vegies['kale'] * 2


Out[30]:
4

In [44]:
vegies[vegies < 2]


Out[44]:
Series([], dtype: int64)

In [46]:
# pandas DataFrame
# create from a dictionary
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants'])}
df = pd.DataFrame(items)
df
df.shape
df.ndim
type(df)
df.size  # only the last expression in the cell is echoed below: 5 rows x 2 columns = 10


Out[46]:
10

In [47]:
df.index


Out[47]:
Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

In [48]:
df.columns


Out[48]:
Index(['Alice', 'Bob'], dtype='object')

In [51]:
# create a DataFrame with only some of the columns
df2 = pd.DataFrame(items, columns=['Alice'])
df2


Out[51]:
Alice
book 40
glasses 110
bike 500
pants 45

In [52]:
# create a DataFrame with only some of the rows
df3 = pd.DataFrame(items, index=['bike', 'watch'])
df3


Out[52]:
Alice Bob
bike 500.0 245
watch NaN 55

In [53]:
# create a DataFrame from a list
df4 = pd.DataFrame([x for x in np.arange(3)])
df4


Out[53]:
0
0 0
1 1
2 2

In [74]:
# create a DataFrame from a list of dicts, specifying the index
df5 = pd.DataFrame(data=[{'price:': 2}, {'weight': 2}], index=['colary','s'])
df5


Out[74]:
price: weight
colary 2.0 NaN
s NaN 2.0

In [34]:
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35}, 
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5},
         {'watches2': 10, 'glasses2': 50, 'bikes2': 15, 'pants2':5}]

# We create a DataFrame 
store_items = pd.DataFrame(items2)
store_items


Out[34]:
bikes bikes2 glasses glasses2 pants pants2 watches watches2
0 20.0 NaN NaN NaN 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 NaN 5.0 NaN 10.0 NaN
2 NaN 15.0 NaN 50.0 NaN 5.0 NaN 10.0

In [80]:
# access a column in a DataFrame; double brackets return a one-column DataFrame
store_items[['watches2']]


Out[80]:
watches2
0 NaN
1 NaN
2 10.0

In [91]:
# access rows with loc (by label) or iloc (by position)
store_items.iloc[[1]]


Out[91]:
bikes bikes2 glasses glasses2 pants pants2 watches watches2
1 15.0 NaN 50.0 NaN 5.0 NaN 10.0 NaN

In [92]:
# access individual elements in a DataFrame; the column label comes first (see the note after this cell)
store_items['bikes'][0]


Out[92]:
20.0
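
The chained lookup above works for reading, but a single .loc call does the same thing in one step (row label first, then column label) and is the form to use when assigning, to avoid chained-indexing warnings:

In [ ]:
store_items.loc[0, 'bikes']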

In [93]:
# add a column
store_items['cup'] = [12,45,69]

In [94]:
store_items


Out[94]:
bikes bikes2 glasses glasses2 pants pants2 watches watches2 cup
0 20.0 NaN NaN NaN 30.0 NaN 35.0 NaN 12
1 15.0 NaN 50.0 NaN 5.0 NaN 10.0 NaN 45
2 NaN 15.0 NaN 50.0 NaN 5.0 NaN 10.0 69

In [99]:
store_items + 9


Out[99]:
bikes bikes2 glasses glasses2 pants pants2 watches watches2 cup
0 29.0 NaN NaN NaN 39.0 NaN 44.0 NaN 21
1 24.0 NaN 59.0 NaN 14.0 NaN 19.0 NaN 54
2 NaN 24.0 NaN 59.0 NaN 14.0 NaN 19.0 78

In [101]:
new_items = [{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4}]
new_store = pd.DataFrame(new_items, index=['store3'])
new_store


Out[101]:
bikes glasses pants watches
store3 20 4 30 35

In [102]:
# add a row (a pd.concat equivalent is sketched after this cell)
store_items.append(new_store)


Out[102]:
bikes bikes2 cup glasses glasses2 pants pants2 watches watches2
0 20.0 NaN 12.0 NaN NaN 30.0 NaN 35.0 NaN
1 15.0 NaN 45.0 50.0 NaN 5.0 NaN 10.0 NaN
2 NaN 15.0 69.0 NaN 50.0 NaN 5.0 NaN 10.0
store3 20.0 NaN NaN 4.0 NaN 30.0 NaN 35.0 NaN
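
DataFrame.append was later deprecated and removed in pandas 2.x; with a recent pandas release, pd.concat is the equivalent call (a minimal sketch producing the same out-of-place result as the cell above):

In [ ]:
# concatenate along the row axis; columns are unioned and missing entries become NaN
pd.concat([store_items, new_store])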

In [103]:
# insert a column with df.insert(loc, label, data)
store_items.insert(4, 'plants', [4,5,9])

In [104]:
store_items


Out[104]:
bikes bikes2 glasses glasses2 plants pants pants2 watches watches2 cup
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN 12
1 15.0 NaN 50.0 NaN 5 5.0 NaN 10.0 NaN 45
2 NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0 69

In [111]:
# delete a column or row
#store_items.pop('cup') # pop removes the named column ('cup' here, which was the last one added)
store_items


Out[111]:
bikes bikes2 glasses glasses2 plants pants pants2 watches watches2
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 NaN 5 5.0 NaN 10.0 NaN
2 NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0

In [113]:
# drop removes a column when axis=1 is specified
store_items.drop('glasses2', axis=1)


Out[113]:
bikes bikes2 glasses plants pants pants2 watches watches2
0 20.0 NaN NaN 4 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 5 5.0 NaN 10.0 NaN
2 NaN 15.0 NaN 9 NaN 5.0 NaN 10.0

In [114]:
# remove a row with axis=0
store_items.drop(1, axis=0)


Out[114]:
bikes bikes2 glasses glasses2 plants pants pants2 watches watches2
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN
2 NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0

In [115]:
# rename a row or column
store_items.rename({'bikes': 'renamed_bikes'}, axis=1)


Out[115]:
renamed_bikes bikes2 glasses glasses2 plants pants pants2 watches watches2
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 NaN 5 5.0 NaN 10.0 NaN
2 NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0

In [117]:
store_items.rename({2: 'last_store'}, axis=0)


Out[117]:
bikes bikes2 glasses glasses2 plants pants pants2 watches watches2
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 NaN 5 5.0 NaN 10.0 NaN
last_store NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0

In [120]:
# Dealing with NaN
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes':8, 'suits':45},
{'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5, 'shirts': 2, 'shoes':5, 'suits':7},
{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes':10}]
df6 = pd.DataFrame(items2, index=['store1', 'store2', 'store3'])
df6


Out[120]:
bikes glasses pants shirts shoes suits watches
store1 20 NaN 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 NaN 10 NaN 35

In [125]:
df6.isnull()


Out[125]:
bikes glasses pants shirts shoes suits watches
store1 False True False False False False False
store2 False False False False False False False
store3 False False False True False True False

In [126]:
df6.isnull().sum()


Out[126]:
bikes      0
glasses    1
pants      0
shirts     1
shoes      0
suits      1
watches    0
dtype: int64

In [127]:
df6.isnull().sum().sum()


Out[127]:
3

In [128]:
# count the non-NaN values
df6.count()


Out[128]:
bikes      3
glasses    2
pants      3
shirts     2
shoes      3
suits      2
watches    3
dtype: int64

In [129]:
# drop NaN rows or columns; works out of place, specify inplace=True if needed
df6.dropna(axis=0) # drop nan on rows out of place


Out[129]:
bikes glasses pants shirts shoes suits watches
store2 15 50.0 5 2.0 5 7.0 10

In [130]:
# drop NaN columns out of place
df6.dropna(axis=1)


Out[130]:
bikes pants shoes watches
store1 20 30 8 35
store2 15 5 5 10
store3 20 30 10 35

In [132]:
# fill NaN with fillna(method='ffill', axis); works out of place, specify inplace=True if needed
df6
df6.fillna(method='ffill', axis=0) # use the previous value in the column to fill NaN


Out[132]:
bikes glasses pants shirts shoes suits watches
store1 20 NaN 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 2.0 10 7.0 35

In [133]:
df6.fillna(method='backfill', axis=0)


Out[133]:
bikes glasses pants shirts shoes suits watches
store1 20 50.0 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 NaN 10 NaN 35
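
In newer pandas releases the method= argument of fillna is deprecated; the dedicated ffill/bfill methods give the same forward and backward fills as the two cells above (a minimal sketch):

In [ ]:
df6.ffill()  # same result as fillna(method='ffill')
df6.bfill()  # same result as fillna(method='backfill')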

In [134]:
df6


Out[134]:
bikes glasses pants shirts shoes suits watches
store1 20 NaN 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 NaN 10 NaN 35

In [135]:
# linear fill with interpolate(method, axis)
df6.interpolate(method='linear', axis=0)


Out[135]:
bikes glasses pants shirts shoes suits watches
store1 20 NaN 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 2.0 10 7.0 35

In [136]:
df6.interpolate(method='linear', axis=1)


Out[136]:
bikes glasses pants shirts shoes suits watches
store1 20.0 25.0 30.0 15.0 8.0 45.0 35.0
store2 15.0 50.0 5.0 2.0 5.0 7.0 10.0
store3 20.0 4.0 30.0 20.0 10.0 22.5 35.0

In [137]:
# fill NaN with each column's mean
df6.fillna(df6.mean(), axis=0)


Out[137]:
bikes glasses pants shirts shoes suits watches
store1 20 27.0 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 8.5 10 26.0 35

In [138]:
# best_rated = book_ratings[(book_ratings == 5).any(axis = 1)]['Book Title'].values
# The line above returns a NumPy ndarray containing only the titles of books that received a rating of 5
# (a runnable version of this pattern follows this cell).
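
book_ratings is not defined anywhere in this notebook, so here is a self-contained, hypothetical version of the same boolean-filtering pattern (the DataFrame and its values are made up for illustration):

In [ ]:
book_ratings = pd.DataFrame({'Book Title': ['A', 'B', 'C'],
                             'reader1': [5, 3, 4],
                             'reader2': [2, 5, 4]})
# keep the rows where any column holds a 5, then pull the titles out as a NumPy ndarray
best_rated = book_ratings[(book_ratings == 5).any(axis=1)]['Book Title'].values
best_rated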

In [139]:
# loading data into a DataFrame
df7 = pd.read_csv('goog-1.csv')
df7.head(3)


Out[139]:
Date Open High Low Close Adj Close Volume
0 2004-08-19 49.676899 51.693783 47.669952 49.845802 49.845802 44994500
1 2004-08-20 50.178635 54.187561 49.925285 53.805050 53.805050 23005800
2 2004-08-23 55.017166 56.373344 54.172661 54.346527 54.346527 18393200

In [140]:
df7.tail(3)


Out[140]:
Date Open High Low Close Adj Close Volume
3310 2017-10-11 973.719971 990.710022 972.25 989.250000 989.250000 1693300
3311 2017-10-12 987.450012 994.119995 985.00 987.830017 987.830017 1262400
3312 2017-10-13 992.000000 997.210022 989.00 989.679993 989.679993 1157700

In [142]:
df7.isnull().any()


Out[142]:
Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [144]:
# Get summary statistics with describe()
df7.describe()


Out[144]:
Open High Low Close Adj Close Volume
count 3313.000000 3313.000000 3313.000000 3313.000000 3313.000000 3.313000e+03
mean 380.186092 383.493740 376.519309 380.072458 380.072458 8.038476e+06
std 223.818650 224.974534 222.473232 223.853780 223.853780 8.399521e+06
min 49.274517 50.541279 47.669952 49.681866 49.681866 7.900000e+03
25% 226.556473 228.394516 224.003082 226.407440 226.407440 2.584900e+06
50% 293.312286 295.433502 289.929291 293.029114 293.029114 5.281300e+06
75% 536.650024 540.000000 532.409973 536.690002 536.690002 1.065370e+07
max 992.000000 997.210022 989.000000 989.679993 989.679993 8.276810e+07

In [146]:
df7['Open'].describe()


Out[146]:
count    3313.000000
mean      380.186092
std       223.818650
min        49.274517
25%       226.556473
50%       293.312286
75%       536.650024
max       992.000000
Name: Open, dtype: float64

In [147]:
# Get pairwise correlation between columns: values near 1 (or -1) indicate strong correlation, values near 0 indicate little or none
df7.corr()


Out[147]:
Open High Low Close Adj Close Volume
Open 1.000000 0.999904 0.999845 0.999745 0.999745 -0.564258
High 0.999904 1.000000 0.999834 0.999868 0.999868 -0.562749
Low 0.999845 0.999834 1.000000 0.999899 0.999899 -0.567007
Close 0.999745 0.999868 0.999899 1.000000 1.000000 -0.564967
Adj Close 0.999745 0.999868 0.999899 1.000000 1.000000 -0.564967
Volume -0.564258 -0.562749 -0.567007 -0.564967 -0.564967 1.000000

In [152]:
# Groupby (a more typical grouping example follows this cell's output)
df7.groupby(['Adj Close']).sum().head(2)


Out[152]:
Open High Low Close Volume
Adj Close
49.681866 50.148830 50.541279 49.339096 49.681866 5191000
49.801090 51.018177 51.152302 49.512966 49.801090 9206800
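
Grouping by 'Adj Close' mostly yields one group per unique price, so as a more typical use of groupby, here is a sketch that groups the same data by year (assuming the Date column parses with pd.to_datetime, which it should for this CSV's YYYY-MM-DD strings):

In [ ]:
# average closing price per calendar year
df7.groupby(pd.to_datetime(df7['Date']).dt.year)['Close'].mean().head()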

In [157]:
df7['Open'].rolling(150).mean()


Out[157]:
0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
5              NaN
6              NaN
7              NaN
8              NaN
9              NaN
10             NaN
11             NaN
12             NaN
13             NaN
14             NaN
15             NaN
16             NaN
17             NaN
18             NaN
19             NaN
20             NaN
21             NaN
22             NaN
23             NaN
24             NaN
25             NaN
26             NaN
27             NaN
28             NaN
29             NaN
           ...    
3283    894.046732
3284    894.954865
3285    895.824666
3286    896.744199
3287    897.634199
3288    898.531199
3289    899.388532
3290    900.212932
3291    901.024532
3292    901.777598
3293    902.470999
3294    903.127132
3295    903.817932
3296    904.571732
3297    905.269932
3298    905.915199
3299    906.548932
3300    907.199732
3301    907.957265
3302    908.806932
3303    909.702732
3304    910.537065
3305    911.358066
3306    912.190932
3307    913.122599
3308    914.139932
3309    915.116532
3310    916.034665
3311    916.995798
3312    917.982465
Name: Open, Length: 3313, dtype: float64

In [158]:
import matplotlib.pyplot as plt
%matplotlib inline

In [167]:
plt.plot(df7['Open'].rolling(150).mean())
plt.plot(df7['Open'])
plt.legend(['Open rolling 150', 'Open'])  # legend order matches the plot order above
plt.show()



In [ ]: