Pandas

Series


In [1]:
import numpy as np
import pandas as pd

In [2]:
labels = ['a','b','c']
my_list = [10,20,30]

arr = np.array([10,20,30])
d = {'a': 10, 'b': 20, 'c': 100}

Creating a Series: the multiple ways


In [3]:
pd.Series(my_list)    # pd.Series(data, index)


Out[3]:
0    10
1    20
2    30
dtype: int64

In [7]:
pd.Series(data = my_list, index = labels)


Out[7]:
a    10
b    20
c    30
dtype: int64

In [8]:
pd.Series(arr)


Out[8]:
0    10
1    20
2    30
dtype: int32

In [9]:
pd.Series(arr, labels)


Out[9]:
a    10
b    20
c    30
dtype: int32

In [10]:
pd.Series(d)


Out[10]:
a     10
b     20
c    100
dtype: int64

In [11]:
pd.Series(data = labels)


Out[11]:
0    a
1    b
2    c
dtype: object

In [13]:
# Extremely flexible: a Series can even store functions
pd.Series([sum, print, len])


Out[13]:
0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

Series operations


In [18]:
ser1 = pd.Series([1,2,3,4], index=['USA', 'CHINA', 'INDIA', 'GERMANY'])
ser1


Out[18]:
USA        1
CHINA      2
INDIA      3
GERMANY    4
dtype: int64

In [19]:
ser2 = pd.Series([1,2,3,4], index=['UK', 'CHINA', 'INDIA', 'JAPAN'])
ser2


Out[19]:
UK       1
CHINA    2
INDIA    3
JAPAN    4
dtype: int64

In [20]:
ser1['USA']


Out[20]:
1

In [21]:
ser2['JAPAN']


Out[21]:
4

In [23]:
# ser2['CANADA']   # produces a KeyError
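# a safer lookup (a sketch, not in the original run):
# Series.get returns a default instead of raising on a missing label
ser2.get('CANADA')       # returns None instead of raising
ser2.get('CANADA', 0)    # or a supplied default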

In [24]:
ser1 + ser2


Out[24]:
CHINA      4.0
GERMANY    NaN
INDIA      6.0
JAPAN      NaN
UK         NaN
USA        NaN
dtype: float64
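
The NaNs appear wherever a label exists in only one Series. A hedged sketch of avoiding them: the add method treats a missing label as fill_value before adding.

In [ ]:
ser1.add(ser2, fill_value=0)   # e.g. GERMANY becomes 4.0 instead of NaN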

DataFrames


In [28]:
import numpy as np
import pandas as pd

In [35]:
from numpy.random import randn
np.random.seed(101)

In [36]:
df = pd.DataFrame(data = randn(5,4), index = ['a','b','c','d','e'], columns= ['w', 'x', 'y', 'z'])
df


Out[36]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

Grabbing single and multiple columns


In [104]:
df['w']     # bracket notation for grabbing a column (common practice)


Out[104]:
a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [45]:
type(df['w'])


Out[45]:
pandas.core.series.Series

In [46]:
type(df)


Out[46]:
pandas.core.frame.DataFrame

In [49]:
df[['w', 'z']]  # grabbing multiple columns


Out[49]:
w z
a 2.706850 0.503826
b 0.651118 0.605965
c -2.018168 -0.589001
d 0.188695 0.955057
e 0.190794 0.683509

In [50]:
type(df[['w', 'z']] )


Out[50]:
pandas.core.frame.DataFrame

Creating and deleting columns


In [59]:
df['new'] = df['w'] + df['y']

In [60]:
df


Out[60]:
w x y z new
a 2.706850 0.628133 0.907969 0.503826 3.614819
b 0.651118 -0.319318 -0.848077 0.605965 -0.196959
c -2.018168 0.740122 0.528813 -0.589001 -1.489355
d 0.188695 -0.758872 -0.933237 0.955057 -0.744542
e 0.190794 1.978757 2.605967 0.683509 2.796762

In [61]:
# dropping a column: drop's default axis is 0 (the row axis), hence the error below
df.drop('new')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-61-9ad6a7c9aa4c> in <module>()
      1 # dropping column, default axis is 0, referring row axis, hence error
----> 2 df.drop('new')

E:\Anaconda3\lib\site-packages\pandas\core\generic.py in drop(self, labels, axis, level, inplace, errors)
   2159                 new_axis = axis.drop(labels, level=level, errors=errors)
   2160             else:
-> 2161                 new_axis = axis.drop(labels, errors=errors)
   2162             dropped = self.reindex(**{axis_name: new_axis})
   2163             try:

E:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in drop(self, labels, errors)
   3622             if errors != 'ignore':
   3623                 raise ValueError('labels %s not contained in axis' %
-> 3624                                  labels[mask])
   3625             indexer = indexer[~mask]
   3626         return self.delete(indexer)

ValueError: labels ['new'] not contained in axis

In [62]:
df.drop('new', axis = 1)


Out[62]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [63]:
df


Out[63]:
w x y z new
a 2.706850 0.628133 0.907969 0.503826 3.614819
b 0.651118 -0.319318 -0.848077 0.605965 -0.196959
c -2.018168 0.740122 0.528813 -0.589001 -1.489355
d 0.188695 -0.758872 -0.933237 0.955057 -0.744542
e 0.190794 1.978757 2.605967 0.683509 2.796762

In [64]:
# deleting the column from the original DataFrame in place
df.drop('new', axis = 1, inplace = True)

In [65]:
df


Out[65]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [66]:
df.drop('e')


Out[66]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057

In [67]:
df


Out[67]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [68]:
df.shape


Out[68]:
(5, 4)

Grabbing single and multiple rows


In [69]:
# Selecting rows
df


Out[69]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [74]:
df.loc['a']   # loc = label-based location


Out[74]:
w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [75]:
type(df.loc['a'])


Out[75]:
pandas.core.series.Series

In [76]:
df.iloc[0]    # accessing a row by integer position; iloc = integer location


Out[76]:
w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [77]:
type(df.iloc[0])


Out[77]:
pandas.core.series.Series

In [87]:
# grabbing multiple rows
df.loc[['a', 'b']]


Out[87]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965

In [81]:
# grabbing a cell
df.loc['b', 'y']     # df.loc[row, column]


Out[81]:
-0.84807698340363147

In [82]:
df.loc[['a', 'b'], ['w', 'y']]   # returns a subset of the DataFrame


Out[82]:
w y
a 2.706850 0.907969
b 0.651118 -0.848077
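
The same subset by integer position rather than by label, as a sketch (iloc takes row and column positions):

In [ ]:
df.iloc[[0, 1], [0, 2]]   # rows a, b and columns w, y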

DataFrames continued... 2

Conditional selection


In [89]:
df > 0


Out[89]:
w x y z
a True True True True
b True False False True
c False True True False
d True False False True
e True True True True

In [90]:
booldf = df > 0
df[booldf]


Out[90]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 NaN NaN 0.605965
c NaN 0.740122 0.528813 NaN
d 0.188695 NaN NaN 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [91]:
df[df>0]


Out[91]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 NaN NaN 0.605965
c NaN 0.740122 0.528813 NaN
d 0.188695 NaN NaN 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [97]:
df


Out[97]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [98]:
# boolean Series for conditionally filtering rows based on a column
df['w'] > 0


Out[98]:
a     True
b     True
c    False
d     True
e     True
Name: w, dtype: bool

In [99]:
df[df['w'] > 0]   # conditional selection of rows based on values in column w


Out[99]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [100]:
df[df['z'] < 0]


Out[100]:
w x y z
c -2.018168 0.740122 0.528813 -0.589001

In [101]:
resultdf = df[df['w'] > 0]
resultdf['x']


Out[101]:
a    0.628133
b   -0.319318
d   -0.758872
e    1.978757
Name: x, dtype: float64

In [103]:
# One-liner filter of the DataFrame:
# we filter the original df for rows where w is greater than zero
# and then grab only the column x
df[df['w'] > 0]['x']


Out[103]:
a    0.628133
b   -0.319318
d   -0.758872
e    1.978757
Name: x, dtype: float64

In [105]:
df[df['w'] > 0][['y','x']]


Out[105]:
y x
a 0.907969 0.628133
b -0.848077 -0.319318
d -0.933237 -0.758872
e 2.605967 1.978757

Multiple conditional selection


In [106]:
df


Out[106]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [109]:
# Note the & (ampersand) 'and' operator; Python's plain 'and' will not work here
df[(df['w'] > 0) & (df['y'] > 1)]


Out[109]:
w x y z
e 0.190794 1.978757 2.605967 0.683509

In [111]:
# Note the | (pipe) 'or' operator; Python's plain 'or' will not work here
df[(df['w'] > 0) | (df['y'] > 1)]


Out[111]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [ ]:
# ~ is the 'not' operator in pandas
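# a minimal sketch (not in the original run):
df[~(df['w'] > 0)]       # rows where w is NOT greater than zero (only row c here)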

resetting the row index


In [113]:
df.reset_index()   # inplace=False by default, so this returns a new DataFrame


Out[113]:
index w x y z
0 a 2.706850 0.628133 0.907969 0.503826
1 b 0.651118 -0.319318 -0.848077 0.605965
2 c -2.018168 0.740122 0.528813 -0.589001
3 d 0.188695 -0.758872 -0.933237 0.955057
4 e 0.190794 1.978757 2.605967 0.683509

In [114]:
df


Out[114]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509
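
reset_index keeps the old index as a column by default; a sketch (not in the original run) of discarding it instead:

In [ ]:
df.reset_index(drop=True)   # drops the old a..e labels entirely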

setting an index


In [117]:
newind = 'CA NY WY OR CO'.split()
newind


Out[117]:
['CA', 'NY', 'WY', 'OR', 'CO']

In [118]:
df['states'] = newind

In [119]:
df


Out[119]:
w x y z states
a 2.706850 0.628133 0.907969 0.503826 CA
b 0.651118 -0.319318 -0.848077 0.605965 NY
c -2.018168 0.740122 0.528813 -0.589001 WY
d 0.188695 -0.758872 -0.933237 0.955057 OR
e 0.190794 1.978757 2.605967 0.683509 CO

In [123]:
# if you have a column that you want to use as the row index, do the following:
df.set_index('states')   # inplace=False by default, so df itself is unchanged


Out[123]:
               w         x         y         z
states
CA      2.706850  0.628133  0.907969  0.503826
NY      0.651118 -0.319318 -0.848077  0.605965
WY     -2.018168  0.740122  0.528813 -0.589001
OR      0.188695 -0.758872 -0.933237  0.955057
CO      0.190794  1.978757  2.605967  0.683509

In [124]:
df


Out[124]:
w x y z states
a 2.706850 0.628133 0.907969 0.503826 CA
b 0.651118 -0.319318 -0.848077 0.605965 NY
c -2.018168 0.740122 0.528813 -0.589001 WY
d 0.188695 -0.758872 -0.933237 0.955057 OR
e 0.190794 1.978757 2.605967 0.683509 CO

DataFrames continued... 3


In [3]:
import numpy as np
import pandas as pd

# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
hier_index


Out[3]:
[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [5]:
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index


Out[5]:
MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

Multi-level dataframe


In [9]:
df = pd.DataFrame(np.random.randn(6,2), index = hier_index, columns = ['A', 'B'])
df


Out[9]:
             A         B
G1 1 -0.642193 -0.612548
   2 -1.644231 -0.884035
   3  0.194001  0.144947
G2 1  2.623592  0.126022
   2 -0.037992 -1.017579
   3 -0.323930  0.854341

In [11]:
# grabbing data out of a multi-level index
df.loc['G1']


Out[11]:
A B
1 -0.642193 -0.612548
2 -1.644231 -0.884035
3 0.194001 0.144947

In [12]:
df.loc['G1'].loc[1]


Out[12]:
A   -0.642193
B   -0.612548
Name: 1, dtype: float64

setting index names


In [13]:
df.index.names


Out[13]:
FrozenList([None, None])

In [14]:
df.index.names = ['Groups', 'Num']

In [15]:
df


Out[15]:
                   A         B
Groups Num
G1     1   -0.642193 -0.612548
       2   -1.644231 -0.884035
       3    0.194001  0.144947
G2     1    2.623592  0.126022
       2   -0.037992 -1.017579
       3   -0.323930  0.854341

In [16]:
df.loc['G2'].loc[2]['B']


Out[16]:
-1.0175789047254056

Cross-section


In [19]:
df


Out[19]:
                   A         B
Groups Num
G1     1   -0.642193 -0.612548
       2   -1.644231 -0.884035
       3    0.194001  0.144947
G2     1    2.623592  0.126022
       2   -0.037992 -1.017579
       3   -0.323930  0.854341

In [20]:
df.xs('G1')


Out[20]:
            A         B
Num
1   -0.642193 -0.612548
2   -1.644231 -0.884035
3    0.194001  0.144947

In [22]:
# grabbing cross-section from both groups
df.xs(1, level='Num')


Out[22]:
               A         B
Groups
G1     -0.642193 -0.612548
G2      2.623592  0.126022

Pandas - Missing Data


In [23]:
import numpy as np
import pandas as pd

In [24]:
d = {'A': [1,2,np.nan], 'B': [5, np.nan, np.nan], 'C': [1,2,3]}
df = pd.DataFrame(d)
df


Out[24]:
A B C
0 1.0 5.0 1
1 2.0 NaN 2
2 NaN NaN 3

Dropping values


In [26]:
# dropping rows that contain any NaN (the default axis is 0)
df.dropna()


Out[26]:
A B C
0 1.0 5.0 1

In [27]:
# dropping columns that contain any NaN
df.dropna(axis = 1)


Out[27]:
C
0 1
1 2
2 3

In [28]:
df


Out[28]:
A B C
0 1.0 5.0 1
1 2.0 NaN 2
2 NaN NaN 3

In [30]:
# thresh (int) sets a threshold:
# keep only rows with at least 2 non-NA values
df.dropna(thresh = 2)


Out[30]:
A B C
0 1.0 5.0 1
1 2.0 NaN 2
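
dropna can also restrict the check to particular columns; a sketch using the subset parameter:

In [ ]:
df.dropna(subset=['B'])   # drop rows where column B is NaN (keeps only row 0 here)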

Filling values


In [31]:
df


Out[31]:
A B C
0 1.0 5.0 1
1 2.0 NaN 2
2 NaN NaN 3

In [32]:
df.fillna(value = 'FILL VALUE')


Out[32]:
            A           B  C
0           1           5  1
1           2  FILL VALUE  2
2  FILL VALUE  FILL VALUE  3

In [34]:
# filling the missing value in A with that column's mean
df['A'].fillna(value = df['A'].mean())


Out[34]:
0    1.0
1    2.0
2    1.5
Name: A, dtype: float64
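
The same idea extends to the whole frame, as a sketch: df.mean() is a Series of column means, and fillna aligns it column by column.

In [ ]:
df.fillna(df.mean())   # each NaN replaced by its own column's mean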

Group By with Pandas


In [1]:
import pandas as pd
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}

In [2]:
df = pd.DataFrame(data)
df


Out[2]:
Company Person Sales
0 GOOG Sam 200
1 GOOG Charlie 120
2 MSFT Amy 340
3 MSFT Vanessa 124
4 FB Carl 243
5 FB Sarah 350

In [3]:
df.groupby('Company')


Out[3]:
<pandas.core.groupby.DataFrameGroupBy object at 0x00000298F08DEF98>

In [10]:
byComp = df.groupby('Company')
# mean of sales
byComp.mean()


Out[10]:
         Sales
Company
FB       296.5
GOOG     160.0
MSFT     232.0

In [11]:
# sum of sales
byComp.sum()


Out[11]:
         Sales
Company
FB         593
GOOG       320
MSFT       464

In [12]:
# standard deviation
byComp.std()


Out[12]:
              Sales
Company
FB        75.660426
GOOG      56.568542
MSFT     152.735065

In [13]:
# getting the total sales for FB
byComp.sum().loc['FB']


Out[13]:
Sales    593
Name: FB, dtype: int64

In [14]:
# One-liner
df.groupby('Company').count()


Out[14]:
         Person  Sales
Company
FB            2      2
GOOG          2      2
MSFT          2      2

In [15]:
df.groupby('Company').max()


Out[15]:
          Person  Sales
Company
FB         Sarah    350
GOOG         Sam    200
MSFT     Vanessa    340

In [16]:
df.groupby('Company').min()


Out[16]:
          Person  Sales
Company
FB          Carl    243
GOOG     Charlie    120
MSFT         Amy    124

In [17]:
# groupby with the describe method
df.groupby('Company').describe()


Out[17]:
        Sales
        count   mean         std    min     25%    50%     75%    max
Company
FB        2.0  296.5   75.660426  243.0  269.75  296.5  323.25  350.0
GOOG      2.0  160.0   56.568542  120.0  140.00  160.0  180.00  200.0
MSFT      2.0  232.0  152.735065  124.0  178.00  232.0  286.00  340.0

In [18]:
df.groupby('Company').describe().transpose()


Out[18]:
Company              FB        GOOG        MSFT
Sales count    2.000000    2.000000    2.000000
      mean   296.500000  160.000000  232.000000
      std     75.660426   56.568542  152.735065
      min    243.000000  120.000000  124.000000
      25%    269.750000  140.000000  178.000000
      50%    296.500000  160.000000  232.000000
      75%    323.250000  180.000000  286.000000
      max    350.000000  200.000000  340.000000

In [21]:
df.groupby('Company').describe().transpose()['GOOG']


Out[21]:
Sales  count      2.000000
       mean     160.000000
       std       56.568542
       min      120.000000
       25%      140.000000
       50%      160.000000
       75%      180.000000
       max      200.000000
Name: GOOG, dtype: float64
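
Beyond the built-in aggregates, the agg method requests several at once; a sketch with the same grouping:

In [ ]:
df.groupby('Company')['Sales'].agg(['mean', 'sum', 'std'])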

Merging, Joining, and Concatenating DataFrames

!!! Concatenating stacks DataFrames, merging works like a SQL join on columns, and join merges on the index instead of a column.


In [22]:
import pandas as pd

In [23]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])

In [24]:
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7])

In [25]:
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])

In [26]:
df1


Out[26]:
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3

In [27]:
df2


Out[27]:
A B C D
4 A4 B4 C4 D4
5 A5 B5 C5 D5
6 A6 B6 C6 D6
7 A7 B7 C7 D7

In [28]:
df3


Out[28]:
A B C D
8 A8 B8 C8 D8
9 A9 B9 C9 D9
10 A10 B10 C10 D10
11 A11 B11 C11 D11

Concatenation

Concatenation basically glues together DataFrames. Keep in mind that dimensions should match along the axis you are concatenating on. You can use pd.concat and pass in a list of DataFrames to concatenate together:


In [29]:
pd.concat([df1,df2,df3])


Out[29]:
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
4 A4 B4 C4 D4
5 A5 B5 C5 D5
6 A6 B6 C6 D6
7 A7 B7 C7 D7
8 A8 B8 C8 D8
9 A9 B9 C9 D9
10 A10 B10 C10 D10
11 A11 B11 C11 D11

In [30]:
pd.concat([df1,df2,df3], axis = 1)


Out[30]:
A B C D A B C D A B C D
0 A0 B0 C0 D0 NaN NaN NaN NaN NaN NaN NaN NaN
1 A1 B1 C1 D1 NaN NaN NaN NaN NaN NaN NaN NaN
2 A2 B2 C2 D2 NaN NaN NaN NaN NaN NaN NaN NaN
3 A3 B3 C3 D3 NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN A4 B4 C4 D4 NaN NaN NaN NaN
5 NaN NaN NaN NaN A5 B5 C5 D5 NaN NaN NaN NaN
6 NaN NaN NaN NaN A6 B6 C6 D6 NaN NaN NaN NaN
7 NaN NaN NaN NaN A7 B7 C7 D7 NaN NaN NaN NaN
8 NaN NaN NaN NaN NaN NaN NaN NaN A8 B8 C8 D8
9 NaN NaN NaN NaN NaN NaN NaN NaN A9 B9 C9 D9
10 NaN NaN NaN NaN NaN NaN NaN NaN A10 B10 C10 D10
11 NaN NaN NaN NaN NaN NaN NaN NaN A11 B11 C11 D11

Example DataFrames


In [39]:
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})

In [40]:
left


Out[40]:
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K2
3 A3 B3 K3

In [41]:
right


Out[41]:
C D key
0 C0 D0 K0
1 C1 D1 K1
2 C2 D2 K2
3 C3 D3 K3

Merging

The merge function allows you to merge DataFrames together using a similar logic as merging SQL Tables together. For example:


In [42]:
# inner join on dataframes
pd.merge(left, right, how='inner', on='key')


Out[42]:
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3

Or to show a more complicated example:


In [43]:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                               'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})

In [44]:
left


Out[44]:
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1

In [45]:
right


Out[45]:
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0

In [46]:
# inner join on multiple columns
pd.merge(left, right, on=['key1', 'key2'])


Out[46]:
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2

In [47]:
# outer join on multiple columns
pd.merge(left, right, how='outer', on=['key1', 'key2'])


Out[47]:
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3

In [48]:
# right join on multiple columns
pd.merge(left, right, how='right', on=['key1', 'key2'])


Out[48]:
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3

In [49]:
# left join on multiple columns
pd.merge(left, right, how='left', on=['key1', 'key2'])


Out[49]:
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN

Joining

Joining is a convenient method for combining the columns of two potentially differently-indexed DataFrames into a single result DataFrame.


In [50]:
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 

right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])

In [51]:
left


Out[51]:
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2

In [52]:
right


Out[52]:
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3

In [53]:
# if the keys we want to merge on are in the index rather than in columns, use join
left.join(right)


Out[53]:
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2

In [54]:
left.join(right, how='outer')


Out[54]:
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3

Pandas Common Operations


In [55]:
import numpy as np
import pandas as pd

In [56]:
df = pd.DataFrame({'col1':[1,2,3,4],
                   'col2':[444,555,666,444],
                   'col3':['abc','def','ghi','xyz']})
df.head()


Out[56]:
col1 col2 col3
0 1 444 abc
1 2 555 def
2 3 666 ghi
3 4 444 xyz

In [58]:
# unique values in column
df['col2'].unique()


Out[58]:
array([444, 555, 666], dtype=int64)

In [59]:
# count of unique values in column
df['col2'].nunique()


Out[59]:
3

In [61]:
# number of times each value occurs in the column
df['col2'].value_counts()


Out[61]:
444    2
555    1
666    1
Name: col2, dtype: int64

In [64]:
# conditional selection reminder
df[(df['col1'] > 2) & (df['col2'] == 444)]


Out[64]:
col1 col2 col3
3 4 444 xyz

the apply method (!important)


In [65]:
def times2(x):
    return x * 2

df['col1'].apply(times2)


Out[65]:
0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [67]:
df['col3'].apply(len)


Out[67]:
0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [69]:
df['col2'].apply(lambda x:x*2)


Out[69]:
0     888
1    1110
2    1332
3     888
Name: col2, dtype: int64

In [70]:
# removing columns
df.drop('col1', axis = 1)


Out[70]:
col2 col3
0 444 abc
1 555 def
2 666 ghi
3 444 xyz

In [72]:
df.columns


Out[72]:
Index(['col1', 'col2', 'col3'], dtype='object')

In [73]:
df.index


Out[73]:
RangeIndex(start=0, stop=4, step=1)

sorting a DataFrame


In [74]:
df


Out[74]:
col1 col2 col3
0 1 444 abc
1 2 555 def
2 3 666 ghi
3 4 444 xyz

In [75]:
df.sort_values('col2')


Out[75]:
col1 col2 col3
0 1 444 abc
3 4 444 xyz
1 2 555 def
2 3 666 ghi

In [76]:
df.isnull()


Out[76]:
col1 col2 col3
0 False False False
1 False False False
2 False False False
3 False False False
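
A common follow-up, as a sketch: chain sum to count the missing values per column.

In [ ]:
df.isnull().sum()   # all zeros here, since nothing is missing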

Pivot tables


In [77]:
data = {'A':['foo','foo','foo','bar','bar','bar'],
     'B':['one','one','two','two','one','one'],
       'C':['x','y','x','y','x','y'],
       'D':[1,3,2,5,4,1]}

df = pd.DataFrame(data)

In [78]:
df


Out[78]:
A B C D
0 foo one x 1
1 foo one y 3
2 foo two x 2
3 bar two y 5
4 bar one x 4
5 bar one y 1

In [80]:
df.pivot_table(values = 'D', index=['A', 'B'], columns='C')


Out[80]:
C          x    y
A   B
bar one  4.0  1.0
    two  NaN  5.0
foo one  1.0  3.0
    two  2.0  NaN
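
The NaNs mark A/B combinations with no entry for that C value; a sketch of filling them at pivot time with the fill_value parameter:

In [ ]:
df.pivot_table(values='D', index=['A', 'B'], columns='C', fill_value=0)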

Pandas - Data Input and Output

  • CSV
  • Excel
  • HTML
  • SQL

In [82]:
import pandas as pd
pd.read_csv('example.csv')


Out[82]:
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15

In [84]:
df = pd.read_csv('example.csv')
df


Out[84]:
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15

In [87]:
# save the CSV without the index column
df.to_csv('my_output.csv', index = False)

In [88]:
pd.read_csv('my_output.csv')


Out[88]:
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15

In [89]:
# conda install xlrd  (https://pypi.python.org/pypi/xlrd)

In [92]:
df = pd.read_excel('Excel_Sample.xlsx', sheet_name='Sheet1')

In [93]:
df.to_excel('Excel_Sample2.xlsx', sheet_name='NewSheet')

In [94]:
# reading HTML
data = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html')

In [95]:
type(data)


Out[95]:
list

In [102]:
type(data[0])


Out[102]:
pandas.core.frame.DataFrame

In [103]:
data[0]


Out[103]:
Bank Name City ST CERT Acquiring Institution Closing Date Updated Date
0 Fayette County Bank Saint Elmo IL 1802 United Fidelity Bank, fsb May 26, 2017 July 26, 2017
1 Guaranty Bank, (d/b/a BestBank in Georgia & Mi... Milwaukee WI 30003 First-Citizens Bank & Trust Company May 5, 2017 July 26, 2017
2 First NBC Bank New Orleans LA 58302 Whitney Bank April 28, 2017 July 26, 2017
3 Proficio Bank Cottonwood Heights UT 35495 Cache Valley Bank March 3, 2017 May 18, 2017
4 Seaway Bank and Trust Company Chicago IL 19328 State Bank of Texas January 27, 2017 May 18, 2017
5 Harvest Community Bank Pennsville NJ 34951 First-Citizens Bank & Trust Company January 13, 2017 May 18, 2017
6 Allied Bank Mulberry AR 91 Today's Bank September 23, 2016 September 25, 2017
7 The Woodbury Banking Company Woodbury GA 11297 United Bank August 19, 2016 June 1, 2017
8 First CornerStone Bank King of Prussia PA 35312 First-Citizens Bank & Trust Company May 6, 2016 September 6, 2016
9 Trust Company Bank Memphis TN 9956 The Bank of Fayette County April 29, 2016 September 6, 2016
10 North Milwaukee State Bank Milwaukee WI 20364 First-Citizens Bank & Trust Company March 11, 2016 March 13, 2017
11 Hometown National Bank Longview WA 35156 Twin City Bank October 2, 2015 April 13, 2016
12 The Bank of Georgia Peachtree City GA 35259 Fidelity Bank October 2, 2015 October 24, 2016
13 Premier Bank Denver CO 34112 United Fidelity Bank, fsb July 10, 2015 August 17, 2016
14 Edgebrook Bank Chicago IL 57772 Republic Bank of Chicago May 8, 2015 July 12, 2016
15 Doral Bank En Espanol San Juan PR 32102 Banco Popular de Puerto Rico February 27, 2015 May 13, 2015
16 Capitol City Bank & Trust Company Atlanta GA 33938 First-Citizens Bank & Trust Company February 13, 2015 April 21, 2015
17 Highland Community Bank Chicago IL 20290 United Fidelity Bank, fsb January 23, 2015 April 21, 2015
18 First National Bank of Crestview Crestview FL 17557 First NBC Bank January 16, 2015 May 8, 2017
19 Northern Star Bank Mankato MN 34983 BankVista December 19, 2014 January 6, 2016
20 Frontier Bank, FSB D/B/A El Paseo Bank Palm Desert CA 34738 Bank of Southern California, N.A. November 7, 2014 November 10, 2016
21 The National Republic Bank of Chicago Chicago IL 916 State Bank of Texas October 24, 2014 January 6, 2016
22 NBRS Financial Rising Sun MD 4862 Howard Bank October 17, 2014 March 26, 2015
23 GreenChoice Bank, fsb Chicago IL 28462 Providence Bank, LLC July 25, 2014 December 12, 2016
24 Eastside Commercial Bank Conyers GA 58125 Community & Southern Bank July 18, 2014 July 11, 2016
25 The Freedom State Bank Freedom OK 12483 Alva State Bank & Trust Company June 27, 2014 March 25, 2016
26 Valley Bank Fort Lauderdale FL 21793 Landmark Bank, National Association June 20, 2014 June 29, 2015
27 Valley Bank Moline IL 10450 Great Southern Bank June 20, 2014 June 26, 2015
28 Slavie Federal Savings Bank Bel Air MD 32368 Bay Bank, FSB May 30, 2014 December 12, 2016
29 Columbia Savings Bank Cincinnati OH 32284 United Fidelity Bank, fsb May 23, 2014 November 10, 2016
... ... ... ... ... ... ... ...
523 ANB Financial, NA Bentonville AR 33901 Pulaski Bank and Trust Company May 9, 2008 August 28, 2012
524 Hume Bank Hume MO 1971 Security Bank March 7, 2008 August 28, 2012
525 Douglass National Bank Kansas City MO 24660 Liberty Bank and Trust Company January 25, 2008 October 26, 2012
526 Miami Valley Bank Lakeview OH 16848 The Citizens Banking Company October 4, 2007 September 12, 2016
527 NetBank Alpharetta GA 32575 ING DIRECT September 28, 2007 August 28, 2012
528 Metropolitan Savings Bank Pittsburgh PA 35353 Allegheny Valley Bank of Pittsburgh February 2, 2007 October 27, 2010
529 Bank of Ephraim Ephraim UT 1249 Far West Bank June 25, 2004 April 9, 2008
530 Reliance Bank White Plains NY 26778 Union State Bank March 19, 2004 April 9, 2008
531 Guaranty National Bank of Tallahassee Tallahassee FL 26838 Hancock Bank of Florida March 12, 2004 June 5, 2012
532 Dollar Savings Bank Newark NJ 31330 No Acquirer February 14, 2004 April 9, 2008
533 Pulaski Savings Bank Philadelphia PA 27203 Earthstar Bank November 14, 2003 July 22, 2005
534 First National Bank of Blanchardville Blanchardville WI 11639 The Park Bank May 9, 2003 June 5, 2012
535 Southern Pacific Bank Torrance CA 27094 Beal Bank February 7, 2003 October 20, 2008
536 Farmers Bank of Cheneyville Cheneyville LA 16445 Sabine State Bank & Trust December 17, 2002 October 20, 2004
537 Bank of Alamo Alamo TN 9961 No Acquirer November 8, 2002 March 18, 2005
538 AmTrade International Bank En Espanol Atlanta GA 33784 No Acquirer September 30, 2002 September 11, 2006
539 Universal Federal Savings Bank Chicago IL 29355 Chicago Community Bank June 27, 2002 April 9, 2008
540 Connecticut Bank of Commerce Stamford CT 19183 Hudson United Bank June 26, 2002 February 14, 2012
541 New Century Bank Shelby Township MI 34979 No Acquirer March 28, 2002 March 18, 2005
542 Net 1st National Bank Boca Raton FL 26652 Bank Leumi USA March 1, 2002 April 9, 2008
543 NextBank, NA Phoenix AZ 22314 No Acquirer February 7, 2002 February 5, 2015
544 Oakwood Deposit Bank Co. Oakwood OH 8966 The State Bank & Trust Company February 1, 2002 October 25, 2012
545 Bank of Sierra Blanca Sierra Blanca TX 22002 The Security State Bank of Pecos January 18, 2002 November 6, 2003
546 Hamilton Bank, NA En Espanol Miami FL 24382 Israel Discount Bank of New York January 11, 2002 September 21, 2015
547 Sinclair National Bank Gravette AR 34248 Delta Trust & Bank September 7, 2001 February 10, 2004
548 Superior Bank, FSB Hinsdale IL 32646 Superior Federal, FSB July 27, 2001 August 19, 2014
549 Malta National Bank Malta OH 6629 North Valley Bank May 3, 2001 November 18, 2002
550 First Alliance Bank & Trust Co. Manchester NH 34264 Southern New Hampshire Bank & Trust February 2, 2001 February 18, 2003
551 National State Bank of Metropolis Metropolis IL 3815 Banterra Bank of Marion December 14, 2000 March 17, 2005
552 Bank of Honolulu Honolulu HI 21029 Bank of the Orient October 13, 2000 March 17, 2005

553 rows × 7 columns

SQLite


In [104]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')
df.to_sql('my_table', engine)
sqldf = pd.read_sql('my_table', con=engine)
sqldf


Out[104]:
index a b c d
0 0 0 1 2 3
1 1 4 5 6 7
2 2 8 9 10 11
3 3 12 13 14 15
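
read_sql reads a whole table; a sketch (against the same in-memory engine) of running an arbitrary query instead:

In [ ]:
pd.read_sql_query('SELECT a, b FROM my_table WHERE a > 4', con=engine)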

In [ ]: