6.32 - Pandas operations


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame; note that col2 contains a repeated value (444).
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 4],
        'col2': [444, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df.head()


Out[2]:
col1 col2 col3
0 1 444 abc
1 2 555 def
2 3 666 ghi
3 4 444 xyz

Finding unique values in a DataFrame

.unique(), .nunique(), and .value_counts()

In [4]:
# Distinct values of col2, returned as a NumPy array
df["col2"].unique()


Out[4]:
array([444, 555, 666])

In [6]:
# Count the distinct values by measuring the array that .unique() returns
len(df["col2"].unique())


Out[6]:
3

In [8]:
# .nunique() gives the distinct count in one call. It agrees with len(.unique()) here,
# but it skips NaN by default (dropna=True), whereas .unique() keeps NaN in its result.
df['col2'].nunique(dropna=True)


Out[8]:
3

In [9]:
# How many times each distinct col2 value occurs, most frequent first
df["col2"].value_counts()


Out[9]:
444    2
555    1
666    1
Name: col2, dtype: int64

Conditional selection again


In [11]:
# Keep only the rows whose col1 value exceeds 2 (boolean-mask selection)
df.loc[df['col1'] > 2]


Out[11]:
col1 col2 col3
2 3 666 ghi
3 4 444 xyz

In [12]:
# Combine two masks with & (element-wise AND); each comparison needs its own
# parentheses because & binds more tightly than > and ==.
df.loc[(df['col1'] > 2) & (df['col2'] == 444)]


Out[12]:
col1 col2 col3
3 4 444 xyz

The .apply() method


In [13]:
def times2(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

In [14]:
# Call times2 on every element of col1 and collect the results in a new Series
df["col1"].apply(times2)


Out[14]:
0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [15]:
# Built-in functions work too: len gives the character count of each string in col3
df["col3"].apply(len)


Out[15]:
0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [17]:
# An anonymous function avoids defining a named helper for a one-off transformation
df['col2'].apply(lambda value: value * 2)


Out[17]:
0     888
1    1110
2    1332
3     888
Name: col2, dtype: int64

DataFrame attributes


In [20]:
# Column labels of the frame, returned as a pandas Index
df.columns


Out[20]:
Index([u'col1', u'col2', u'col3'], dtype='object')

In [22]:
# Row labels; this frame was built without an explicit index, so it has a RangeIndex
df.index


Out[22]:
RangeIndex(start=0, stop=4, step=1)

Sorting and ordering a DataFrame


In [24]:
# Returns a new frame ordered by col2 (ascending); df itself is left unchanged
df.sort_values(by='col2')


Out[24]:
col1 col2 col3
0 1 444 abc
3 4 444 xyz
1 2 555 def
2 3 666 ghi

In [25]:
# Element-wise missing-value check: True marks a null entry (there are none in this frame)
df.isnull()


Out[25]:
col1 col2 col3
0 False False False
1 False False False
2 False False False
3 False False False

PivotTables


In [28]:
# Column-oriented records for the pivot-table example.
# NOTE: this rebinds `df`, replacing the col1/col2/col3 frame used by the earlier cells.
data = {
    'A': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    'B': ['one', 'one', 'two', 'two', 'one', 'one'],
    'C': ['x', 'y', 'x', 'y', 'x', 'y'],
    'D': [1, 3, 2, 5, 4, 1],
}

df = pd.DataFrame(data)
df


Out[28]:
A B C D
0 foo one x 1
1 foo one y 3
2 foo two x 2
3 bar two y 5
4 bar one x 4
5 bar one y 1

In [29]:
# Rows are keyed by (A, B) and columns by C; each cell holds the D values for that
# combination aggregated with the default aggfunc (mean). Combinations that never
# occur in the data appear as NaN.
df.pivot_table(
    values='D',
    index=['A', 'B'],
    columns='C',
)


Out[29]:
C x y
A B
bar one 4.0 1.0
two NaN 5.0
foo one 1.0 3.0
two 2.0 NaN

In [ ]: