Working with missing data

Reference: pandas user guide, "Working with missing data"


In [1]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Build the 5x3 demo frame from a seeded generator so that Restart & Run All
# rebuilds exactly the same values every time. (Saved outputs in this notebook
# come from an earlier unseeded run, so their numbers will differ.)
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((5, 3)), index=['a', 'c', 'e', 'f', 'h'],
                  columns=['one', 'two', 'three'])

In [5]:
df['four']='bar'

In [6]:
df['five'] = df['one'] > 0

In [7]:
df


Out[7]:
one two three four five
a 0.493574 0.690375 0.338309 bar True
c 0.940038 0.545834 0.963170 bar True
e 0.674076 0.518232 0.366761 bar True
f 0.015938 0.786997 0.184278 bar True
h 0.923214 0.462628 0.749403 bar True

In [8]:
# Re-label onto a-h; labels absent from df ('b', 'd', 'g') come back as all-NaN rows.
df2 = df.reindex(list('abcdefgh'))

In [9]:
df2


Out[9]:
one two three four five
a 0.493574 0.690375 0.338309 bar True
b NaN NaN NaN NaN NaN
c 0.940038 0.545834 0.963170 bar True
d NaN NaN NaN NaN NaN
e 0.674076 0.518232 0.366761 bar True
f 0.015938 0.786997 0.184278 bar True
g NaN NaN NaN NaN NaN
h 0.923214 0.462628 0.749403 bar True
  • Drop rows that contain missing (NaN) values

In [11]:
df2.dropna()


Out[11]:
one two three four five
a 0.493574 0.690375 0.338309 bar True
c 0.940038 0.545834 0.963170 bar True
e 0.674076 0.518232 0.366761 bar True
f 0.015938 0.786997 0.184278 bar True
h 0.923214 0.462628 0.749403 bar True

In [13]:
df2.dropna()


Out[13]:
one two three four five
a 0.493574 0.690375 0.338309 bar True
c 0.940038 0.545834 0.963170 bar True
e 0.674076 0.518232 0.366761 bar True
f 0.015938 0.786997 0.184278 bar True
h 0.923214 0.462628 0.749403 bar True
  • Replace NaN with an empty string

In [16]:
# Blank out missing cells for display. NOTE(review): the float columns print with
# a different precision in the output, presumably because mixing '' into them
# makes them object dtype — confirm before doing arithmetic on the result.
df2.fillna('')


Out[16]:
one two three four five
a 0.4935738 0.6903748 0.3383088 bar True
b
c 0.9400384 0.5458337 0.9631703 bar True
d
e 0.6740759 0.5182319 0.3667608 bar True
f 0.01593832 0.7869971 0.1842777 bar True
g
h 0.923214 0.4626276 0.7494026 bar True

In [17]:
# Select the 'one' column as a Series (bracket access works for any column label).
df2['one']


Out[17]:
a    0.493574
b         NaN
c    0.940038
d         NaN
e    0.674076
f    0.015938
g         NaN
h    0.923214
Name: one, dtype: float64

In [19]:
# Boolean mask: True where 'four' is missing.
df2['four'].isna()


Out[19]:
a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: four, dtype: bool

In [20]:
# Boolean mask: True where 'four' holds a value.
df2['four'].notna()


Out[20]:
a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

Datetimes

  • The missing-value marker is NaN for numeric data, whereas for datetime data it is NaT

In [21]:
df2=df.copy()

In [24]:
df2['timestamp'] = pd.Timestamp('20150808')

In [25]:
df2


Out[25]:
one two three four five timestamp
a 0.493574 0.690375 0.338309 bar True 2015-08-08
c 0.940038 0.545834 0.963170 bar True 2015-08-08
e 0.674076 0.518232 0.366761 bar True 2015-08-08
f 0.015938 0.786997 0.184278 bar True 2015-08-08
h 0.923214 0.462628 0.749403 bar True 2015-08-08

In [26]:
# .ix was removed from pandas; label-based assignment goes through .loc.
# Assigning NaN into the datetime column shows up as NaT.
df2.loc[['a', 'c', 'h'], ['one', 'timestamp']] = np.nan

In [27]:
df2


Out[27]:
one two three four five timestamp
a NaN 0.690375 0.338309 bar True NaT
c NaN 0.545834 0.963170 bar True NaT
e 0.674076 0.518232 0.366761 bar True 2015-08-08
f 0.015938 0.786997 0.184278 bar True 2015-08-08
h NaN 0.462628 0.749403 bar True NaT

In [28]:
# get_dtype_counts() was removed from pandas; count the column dtypes directly.
df2.dtypes.value_counts()


Out[28]:
bool              1
datetime64[ns]    1
float64           3
object            1
dtype: int64
  • Getting help

In [31]:
df2.get_dtype_counts?

Inserting missing data


In [32]:
s = pd.Series([1, 2, 3])

In [34]:
s.loc[0]=None

In [35]:
s


Out[35]:
0    None
1       2
2       3
dtype: object

In [36]:
s = pd.Series(["a", "b", "c"])

In [37]:
s.loc[0] = None

In [38]:
s.loc[1] = np.nan

In [39]:
s


Out[39]:
0    None
1     NaN
2       c
dtype: object

In [41]:
df


Out[41]:
one two three four five
a 0.493574 0.690375 0.338309 bar True
c 0.940038 0.545834 0.963170 bar True
e 0.674076 0.518232 0.366761 bar True
f 0.015938 0.786997 0.184278 bar True
h 0.923214 0.462628 0.749403 bar True

In [42]:
df['one'].sum()


Out[42]:
3.0468404382793168

In [43]:
# Row-wise mean over the numeric/boolean columns only; without numeric_only=True
# recent pandas raises on the string column 'four'.
df.mean(axis=1, numeric_only=True)


Out[43]:
a    0.630564
c    0.862261
e    0.639767
f    0.496803
h    0.783811
dtype: float64

In [44]:
df.cumsum()


Out[44]:
one two three four five
a 0.4935738 0.6903748 0.3383088 bar True
c 1.433612 1.236208 1.301479 barbar 2
e 2.107688 1.75444 1.66824 barbarbar 3
f 2.123626 2.541437 1.852518 barbarbarbar 4
h 3.04684 3.004065 2.60192 barbarbarbarbar 5

In [45]:
df.cumsum?

In [46]:
df2


Out[46]:
one two three four five timestamp
a NaN 0.690375 0.338309 bar True NaT
c NaN 0.545834 0.963170 bar True NaT
e 0.674076 0.518232 0.366761 bar True 2015-08-08
f 0.015938 0.786997 0.184278 bar True 2015-08-08
h NaN 0.462628 0.749403 bar True NaT

In [47]:
# NOTE(review): a scalar fill hits every column — in the saved output the NaT
# entries of 'timestamp' become 1970-01-01. Pass a per-column dict
# (e.g. {'one': 0}) if only the numeric columns should be filled.
df2.fillna(0)


Out[47]:
one two three four five timestamp
a 0.000000 0.690375 0.338309 bar True 1970-01-01
c 0.000000 0.545834 0.963170 bar True 1970-01-01
e 0.674076 0.518232 0.366761 bar True 2015-08-08
f 0.015938 0.786997 0.184278 bar True 2015-08-08
h 0.000000 0.462628 0.749403 bar True 1970-01-01

In [48]:
df2['four'].fillna('missing')


Out[48]:
a    bar
c    bar
e    bar
f    bar
h    bar
Name: four, dtype: object

In [49]:
df


Out[49]:
one two three four five
a 0.493574 0.690375 0.338309 bar True
c 0.940038 0.545834 0.963170 bar True
e 0.674076 0.518232 0.366761 bar True
f 0.015938 0.786997 0.184278 bar True
h 0.923214 0.462628 0.749403 bar True

In [50]:
# Forward-fill gaps with the last valid value; ffill() replaces the deprecated
# fillna(method='pad').
df.ffill()


Out[50]:
one two three four five
a 0.493574 0.690375 0.338309 bar True
c 0.940038 0.545834 0.963170 bar True
e 0.674076 0.518232 0.366761 bar True
f 0.015938 0.786997 0.184278 bar True
h 0.923214 0.462628 0.749403 bar True

In [51]:
# Forward-fill, but carry a value across at most one consecutive gap.
df.ffill(limit=1)


Out[51]:
one two three four five
a 0.493574 0.690375 0.338309 bar True
c 0.940038 0.545834 0.963170 bar True
e 0.674076 0.518232 0.366761 bar True
f 0.015938 0.786997 0.184278 bar True
h 0.923214 0.462628 0.749403 bar True

Interpolation


In [53]:
# pd.tseries is a module, not a callable, so the original cell only raised TypeError.
# Demonstrate interpolation instead: the default linear method fills the two
# interior NaNs, giving 1.0, 2.0, 3.0, 4.0.
pd.Series([1, np.nan, np.nan, 4]).interpolate()

In [ ]: