What is special about Pandas?

  • Allows the use of labels for rows and columns
  • Can calculate rolling statistics on time series data
  • Handles NaN values easily
  • Can load data of different formats into DataFrames
  • Can join and merge different datasets together (see the short sketch after this list)
  • Integrates with NumPy and Matplotlib
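
Most of these points are demonstrated in the cells below; joining and merging is not, so here is a minimal sketch of pd.merge using two small made-up DataFrames (the column names and values are only for illustration).

In [ ]:
import pandas as pd

# two tiny tables that share an 'item' key column
left = pd.DataFrame({'item': ['bike', 'watch'], 'price': [245, 55]})
right = pd.DataFrame({'item': ['bike', 'watch'], 'stock': [3, 10]})
pd.merge(left, right, on='item')  # inner join on the shared 'item' column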

In [27]:
import pandas as pd
import numpy as np

In [12]:
# pandas Series
goods = pd.Series(data=[1,2,4,'ba'], index=['hat', 'gloves', 'cup', 'mouse'])
print(goods)
print('goods dimensions: ', goods.ndim)
print('goods shape: ', goods.shape)
print('goods index: ', goods.index)
print('goods values: ', goods.values)
print('check index banana in goods: ', 'banana' in goods.index)


hat        1
gloves     2
cup        4
mouse     ba
dtype: object
goods dimensions:  1
goods shape:  (4,)
goods index:  Index(['hat', 'gloves', 'cup', 'mouse'], dtype='object')
goods values:  [1 2 4 'ba']
check index banana in goods:  False

In [19]:
# accessing and deleting elements
# loc: label-based location
# iloc: integer location
# Series.drop(label, inplace=True) removes an element by its label (see the sketch after this cell)
print(goods['hat'])
print()
print(goods.loc[['hat', 'cup']])
print()
print('access by index:\n ', goods.iloc[[0,2]])
print()
print(goods[[0,2]])  # integer lists fall back to positional indexing here; iloc is the explicit form


1

hat    1
cup    4
dtype: object

access by index:
  hat    1
cup    4
dtype: object

hat    1
cup    4
dtype: object
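
The comment above mentions Series.drop but the cell never calls it; a minimal sketch (out of place unless inplace=True is passed):

In [ ]:
# drop the 'mouse' entry by its label; returns a new Series, goods itself is unchanged
goods.drop('mouse')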

In [20]:
# Pandas Series arithmetic operations
# make sure the operation can be applied to every data type in the Series (see the note after this cell)
vegies = pd.Series(data=[2,78,45,42], index=['kale', 'cabbage', 'carroat', 'lettice'])
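
A quick illustration of that warning, using the mixed-type goods Series from earlier: arithmetic on an object-dtype Series is applied element by element, so multiplication still "works" on the string entry by repeating it, while division would fail.

In [ ]:
goods * 2    # numbers are doubled, the string 'ba' becomes 'baba'
# goods / 2  # raises a TypeError because '/' is not defined for strings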

In [21]:
vegies * 2


Out[21]:
kale         4
cabbage    156
carroat     90
lettice     84
dtype: int64

In [22]:
vegies / 2


Out[22]:
kale        1.0
cabbage    39.0
carroat    22.5
lettice    21.0
dtype: float64

In [25]:
vegies ** 2


Out[25]:
kale          4
cabbage    6084
carroat    2025
lettice    1764
dtype: int64

In [28]:
np.sqrt(vegies)


Out[28]:
kale       1.414214
cabbage    8.831761
carroat    6.708204
lettice    6.480741
dtype: float64

In [29]:
np.exp(vegies)


Out[29]:
kale       7.389056e+00
cabbage    7.498417e+33
carroat    3.493427e+19
lettice    1.739275e+18
dtype: float64

In [30]:
vegies['kale'] * 2


Out[30]:
4

In [44]:
vegies[vegies < 2]


Out[44]:
Series([], dtype: int64)

In [46]:
# pandas DataFrame
# create from a dictionary
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants'])}
df = pd.DataFrame(items)
df
df.shape
df.ndim
type(df)
df.size  # only the last expression in the cell is echoed below: 5 rows x 2 columns = 10


Out[46]:
10

In [47]:
df.index


Out[47]:
Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

In [48]:
df.columns


Out[48]:
Index(['Alice', 'Bob'], dtype='object')

In [51]:
# create a DataFrame with only some of the columns
df2 = pd.DataFrame(items, columns=['Alice'])
df2


Out[51]:
Alice
book 40
glasses 110
bike 500
pants 45

In [52]:
# create a DataFrame with only some of the rows
df3 = pd.DataFrame(items, index=['bike', 'watch'])
df3


Out[52]:
Alice Bob
bike 500.0 245
watch NaN 55

In [53]:
# create a DataFrame from a list
df4 = pd.DataFrame([x for x in np.arange(3)])
df4


Out[53]:
0
0 0
1 1
2 2

In [74]:
# create a DataFrame from a list of dicts, specifying the index
df5 = pd.DataFrame(data=[{'price:': 2}, {'weight': 2}], index=['colary','s'])
df5


Out[74]:
price: weight
colary 2.0 NaN
s NaN 2.0

In [34]:
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35}, 
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5},
         {'watches2': 10, 'glasses2': 50, 'bikes2': 15, 'pants2':5}]

# We create a DataFrame 
store_items = pd.DataFrame(items2)
store_items


Out[34]:
bikes bikes2 glasses glasses2 pants pants2 watches watches2
0 20.0 NaN NaN NaN 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 NaN 5.0 NaN 10.0 NaN
2 NaN 15.0 NaN 50.0 NaN 5.0 NaN 10.0

In [80]:
# access a column in a DataFrame; double brackets return a one-column DataFrame
store_items[['watches2']]


Out[80]:
watches2
0 NaN
1 NaN
2 10.0

In [91]:
# access rows with loc (by label) or iloc (by position)
store_items.iloc[[1]]


Out[91]:
bikes bikes2 glasses glasses2 pants pants2 watches watches2
1 15.0 NaN 50.0 NaN 5.0 NaN 10.0 NaN

In [92]:
# access individual elements in a DataFrame; the column label comes first (see the note after this cell)
store_items['bikes'][0]


Out[92]:
20.0
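
The chained lookup above works for reading, but a single .loc call does the same thing in one step (row label first, then column label) and is the form to use when assigning, to avoid chained-indexing warnings:

In [ ]:
store_items.loc[0, 'bikes']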

In [93]:
# add a column
store_items['cup'] = [12,45,69]

In [94]:
store_items


Out[94]:
bikes bikes2 glasses glasses2 pants pants2 watches watches2 cup
0 20.0 NaN NaN NaN 30.0 NaN 35.0 NaN 12
1 15.0 NaN 50.0 NaN 5.0 NaN 10.0 NaN 45
2 NaN 15.0 NaN 50.0 NaN 5.0 NaN 10.0 69

In [99]:
store_items + 9


Out[99]:
bikes bikes2 glasses glasses2 pants pants2 watches watches2 cup
0 29.0 NaN NaN NaN 39.0 NaN 44.0 NaN 21
1 24.0 NaN 59.0 NaN 14.0 NaN 19.0 NaN 54
2 NaN 24.0 NaN 59.0 NaN 14.0 NaN 19.0 78

In [101]:
new_items = [{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4}]
new_store = pd.DataFrame(new_items, index=['store3'])
new_store


Out[101]:
bikes glasses pants watches
store3 20 4 30 35

In [102]:
# add a row (a pd.concat equivalent is sketched after this cell)
store_items.append(new_store)


Out[102]:
bikes bikes2 cup glasses glasses2 pants pants2 watches watches2
0 20.0 NaN 12.0 NaN NaN 30.0 NaN 35.0 NaN
1 15.0 NaN 45.0 50.0 NaN 5.0 NaN 10.0 NaN
2 NaN 15.0 69.0 NaN 50.0 NaN 5.0 NaN 10.0
store3 20.0 NaN NaN 4.0 NaN 30.0 NaN 35.0 NaN
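
DataFrame.append was later deprecated and removed in pandas 2.x; with a recent pandas release, pd.concat is the equivalent call (a minimal sketch producing the same out-of-place result as the cell above):

In [ ]:
# concatenate along the row axis; columns are unioned and missing entries become NaN
pd.concat([store_items, new_store])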

In [103]:
# insert a column with df.insert(loc, label, data)
store_items.insert(4, 'plants', [4,5,9])

In [104]:
store_items


Out[104]:
bikes bikes2 glasses glasses2 plants pants pants2 watches watches2 cup
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN 12
1 15.0 NaN 50.0 NaN 5 5.0 NaN 10.0 NaN 45
2 NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0 69

In [111]:
# delete a column or row
#store_items.pop('cup') # pop removes the named column ('cup' here, which was the last one added)
store_items


Out[111]:
bikes bikes2 glasses glasses2 plants pants pants2 watches watches2
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 NaN 5 5.0 NaN 10.0 NaN
2 NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0

In [113]:
# drop removes a column when axis=1 is specified
store_items.drop('glasses2', axis=1)


Out[113]:
bikes bikes2 glasses plants pants pants2 watches watches2
0 20.0 NaN NaN 4 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 5 5.0 NaN 10.0 NaN
2 NaN 15.0 NaN 9 NaN 5.0 NaN 10.0

In [114]:
# remove a row with axis=0
store_items.drop(1, axis=0)


Out[114]:
bikes bikes2 glasses glasses2 plants pants pants2 watches watches2
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN
2 NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0

In [115]:
# rename a row or column
store_items.rename({'bikes': 'renamed_bikes'}, axis=1)


Out[115]:
renamed_bikes bikes2 glasses glasses2 plants pants pants2 watches watches2
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 NaN 5 5.0 NaN 10.0 NaN
2 NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0

In [117]:
store_items.rename({2: 'last_store'}, axis=0)


Out[117]:
bikes bikes2 glasses glasses2 plants pants pants2 watches watches2
0 20.0 NaN NaN NaN 4 30.0 NaN 35.0 NaN
1 15.0 NaN 50.0 NaN 5 5.0 NaN 10.0 NaN
last_store NaN 15.0 NaN 50.0 9 NaN 5.0 NaN 10.0

In [120]:
# Dealing with NaN
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes':8, 'suits':45},
{'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5, 'shirts': 2, 'shoes':5, 'suits':7},
{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes':10}]
df6 = pd.DataFrame(items2, index=['store1', 'store2', 'store3'])
df6


Out[120]:
bikes glasses pants shirts shoes suits watches
store1 20 NaN 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 NaN 10 NaN 35

In [125]:
df6.isnull()


Out[125]:
bikes glasses pants shirts shoes suits watches
store1 False True False False False False False
store2 False False False False False False False
store3 False False False True False True False

In [126]:
df6.isnull().sum()


Out[126]:
bikes      0
glasses    1
pants      0
shirts     1
shoes      0
suits      1
watches    0
dtype: int64

In [127]:
df6.isnull().sum().sum()


Out[127]:
3

In [128]:
# count the non-NaN values
df6.count()


Out[128]:
bikes      3
glasses    2
pants      3
shirts     2
shoes      3
suits      2
watches    3
dtype: int64

In [129]:
# drop NaN rows or columns; works out of place, specify inplace=True if needed
df6.dropna(axis=0) # drop nan on rows out of place


Out[129]:
bikes glasses pants shirts shoes suits watches
store2 15 50.0 5 2.0 5 7.0 10

In [130]:
# drop NaN columns out of place
df6.dropna(axis=1)


Out[130]:
bikes pants shoes watches
store1 20 30 8 35
store2 15 5 5 10
store3 20 30 10 35

In [132]:
# fill NaN with fillna(method='ffill', axis); works out of place, specify inplace=True if needed
df6
df6.fillna(method='ffill', axis=0) # use the previous value in the column to fill NaN


Out[132]:
bikes glasses pants shirts shoes suits watches
store1 20 NaN 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 2.0 10 7.0 35

In [133]:
df6.fillna(method='backfill', axis=0)


Out[133]:
bikes glasses pants shirts shoes suits watches
store1 20 50.0 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 NaN 10 NaN 35
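
In newer pandas releases the method= argument of fillna is deprecated; the dedicated ffill/bfill methods give the same forward and backward fills as the two cells above (a minimal sketch):

In [ ]:
df6.ffill()  # same result as fillna(method='ffill')
df6.bfill()  # same result as fillna(method='backfill')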

In [134]:
df6


Out[134]:
bikes glasses pants shirts shoes suits watches
store1 20 NaN 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 NaN 10 NaN 35

In [135]:
# linear fill with interpolate(method, axis)
df6.interpolate(method='linear', axis=0)


Out[135]:
bikes glasses pants shirts shoes suits watches
store1 20 NaN 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 2.0 10 7.0 35

In [136]:
df6.interpolate(method='linear', axis=1)


Out[136]:
bikes glasses pants shirts shoes suits watches
store1 20.0 25.0 30.0 15.0 8.0 45.0 35.0
store2 15.0 50.0 5.0 2.0 5.0 7.0 10.0
store3 20.0 4.0 30.0 20.0 10.0 22.5 35.0

In [137]:
# fill NaN with each column's mean
df6.fillna(df6.mean(), axis=0)


Out[137]:
bikes glasses pants shirts shoes suits watches
store1 20 27.0 30 15.0 8 45.0 35
store2 15 50.0 5 2.0 5 7.0 10
store3 20 4.0 30 8.5 10 26.0 35

In [138]:
# best_rated = book_ratings[(book_ratings == 5).any(axis = 1)]['Book Title'].values
# The line above returns a NumPy ndarray containing only the titles of books that received a rating of 5
# (a runnable version of this pattern follows this cell).
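
book_ratings is not defined anywhere in this notebook, so here is a self-contained, hypothetical version of the same boolean-filtering pattern (the DataFrame and its values are made up for illustration):

In [ ]:
book_ratings = pd.DataFrame({'Book Title': ['A', 'B', 'C'],
                             'reader1': [5, 3, 4],
                             'reader2': [2, 5, 4]})
# keep the rows where any column holds a 5, then pull the titles out as a NumPy ndarray
best_rated = book_ratings[(book_ratings == 5).any(axis=1)]['Book Title'].values
best_rated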

In [139]:
# loading data into a DataFrame
df7 = pd.read_csv('goog-1.csv')
df7.head(3)


Out[139]:
Date Open High Low Close Adj Close Volume
0 2004-08-19 49.676899 51.693783 47.669952 49.845802 49.845802 44994500
1 2004-08-20 50.178635 54.187561 49.925285 53.805050 53.805050 23005800
2 2004-08-23 55.017166 56.373344 54.172661 54.346527 54.346527 18393200

In [140]:
df7.tail(3)


Out[140]:
Date Open High Low Close Adj Close Volume
3310 2017-10-11 973.719971 990.710022 972.25 989.250000 989.250000 1693300
3311 2017-10-12 987.450012 994.119995 985.00 987.830017 987.830017 1262400
3312 2017-10-13 992.000000 997.210022 989.00 989.679993 989.679993 1157700

In [142]:
df7.isnull().any()


Out[142]:
Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [144]:
# Get summary statistics with describe()
df7.describe()


Out[144]:
Open High Low Close Adj Close Volume
count 3313.000000 3313.000000 3313.000000 3313.000000 3313.000000 3.313000e+03
mean 380.186092 383.493740 376.519309 380.072458 380.072458 8.038476e+06
std 223.818650 224.974534 222.473232 223.853780 223.853780 8.399521e+06
min 49.274517 50.541279 47.669952 49.681866 49.681866 7.900000e+03
25% 226.556473 228.394516 224.003082 226.407440 226.407440 2.584900e+06
50% 293.312286 295.433502 289.929291 293.029114 293.029114 5.281300e+06
75% 536.650024 540.000000 532.409973 536.690002 536.690002 1.065370e+07
max 992.000000 997.210022 989.000000 989.679993 989.679993 8.276810e+07

In [146]:
df7['Open'].describe()


Out[146]:
count    3313.000000
mean      380.186092
std       223.818650
min        49.274517
25%       226.556473
50%       293.312286
75%       536.650024
max       992.000000
Name: Open, dtype: float64

In [147]:
# Get pairwise correlation between columns: values near 1 (or -1) indicate strong correlation, values near 0 indicate little or none
df7.corr()


Out[147]:
Open High Low Close Adj Close Volume
Open 1.000000 0.999904 0.999845 0.999745 0.999745 -0.564258
High 0.999904 1.000000 0.999834 0.999868 0.999868 -0.562749
Low 0.999845 0.999834 1.000000 0.999899 0.999899 -0.567007
Close 0.999745 0.999868 0.999899 1.000000 1.000000 -0.564967
Adj Close 0.999745 0.999868 0.999899 1.000000 1.000000 -0.564967
Volume -0.564258 -0.562749 -0.567007 -0.564967 -0.564967 1.000000

In [152]:
# Groupby (a more typical grouping example follows this cell's output)
df7.groupby(['Adj Close']).sum().head(2)


Out[152]:
Open High Low Close Volume
Adj Close
49.681866 50.148830 50.541279 49.339096 49.681866 5191000
49.801090 51.018177 51.152302 49.512966 49.801090 9206800
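
Grouping by 'Adj Close' mostly yields one group per unique price, so as a more typical use of groupby, here is a sketch that groups the same data by year (assuming the Date column parses with pd.to_datetime, which it should for this CSV's YYYY-MM-DD strings):

In [ ]:
# average closing price per calendar year
df7.groupby(pd.to_datetime(df7['Date']).dt.year)['Close'].mean().head()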

In [157]:
df7['Open'].rolling(150).mean()


Out[157]:
0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
5              NaN
6              NaN
7              NaN
8              NaN
9              NaN
10             NaN
11             NaN
12             NaN
13             NaN
14             NaN
15             NaN
16             NaN
17             NaN
18             NaN
19             NaN
20             NaN
21             NaN
22             NaN
23             NaN
24             NaN
25             NaN
26             NaN
27             NaN
28             NaN
29             NaN
           ...    
3283    894.046732
3284    894.954865
3285    895.824666
3286    896.744199
3287    897.634199
3288    898.531199
3289    899.388532
3290    900.212932
3291    901.024532
3292    901.777598
3293    902.470999
3294    903.127132
3295    903.817932
3296    904.571732
3297    905.269932
3298    905.915199
3299    906.548932
3300    907.199732
3301    907.957265
3302    908.806932
3303    909.702732
3304    910.537065
3305    911.358066
3306    912.190932
3307    913.122599
3308    914.139932
3309    915.116532
3310    916.034665
3311    916.995798
3312    917.982465
Name: Open, Length: 3313, dtype: float64

In [158]:
import matplotlib.pyplot as plt
%matplotlib inline

In [167]:
plt.plot(df7['Open'].rolling(150).mean())
plt.plot(df7['Open'])
plt.legend(['Open rolling 150', 'Open'])  # legend order matches the plot order above
plt.show()



In [ ]: