In [45]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas
import pandas as pd
np.set_printoptions(precision=4, threshold=500)
pd.options.display.max_rows = 100

In [46]:
%matplotlib inline

Database-style DataFrame merges


In [47]:
employee_data = DataFrame({'employee_name': ['Rafferty', 'Jones', 'Steinberg', 'Robinson', 'Smith', 'Jasper'],
                 'DepartmentID': [31, 33, 33, 34, 34, 'N/A']})
department_data = DataFrame({'DepartmentID': [31, 33, 34, 35],
                 'DepartmentName': ['Sales', 'Engineering', 'Admin', 'Marketing']})
employee_data


Out[47]:
DepartmentID employee_name
0 31 Rafferty
1 33 Jones
2 33 Steinberg
3 34 Robinson
4 34 Smith
5 N/A Jasper

In [48]:
department_data


Out[48]:
DepartmentID DepartmentName
0 31 Sales
1 33 Engineering
2 34 Admin
3 35 Marketing

In [49]:
pd.merge(employee_data, department_data, on = 'DepartmentID')


Out[49]:
DepartmentID employee_name DepartmentName
0 31 Rafferty Sales
1 33 Jones Engineering
2 33 Steinberg Engineering
3 34 Robinson Admin
4 34 Smith Admin

In [50]:
df3 = DataFrame({'employee_name': ['Rafferty', 'Jones', 'Steinberg', 'Robinson', 'Smith', 'Jasper'],
                 'department_id': [31, 33, 33, 34, 34, 'N/A']})
df4 = DataFrame({'DepartmentID': [31, 33, 34, 35],
                 'DepartmentName': ['Sales', 'Engineering', 'Admin', 'Marketing']})
pd.merge(df3, df4, left_on = 'department_id', right_on = 'DepartmentID')[['employee_name', 'DepartmentID', 'DepartmentName']]


Out[50]:
employee_name DepartmentID DepartmentName
0 Rafferty 31 Sales
1 Jones 33 Engineering
2 Steinberg 33 Engineering
3 Robinson 34 Admin
4 Smith 34 Admin

In [51]:
pd.merge(employee_data, department_data, on = 'DepartmentID', how = 'left')


Out[51]:
DepartmentID employee_name DepartmentName
0 31 Rafferty Sales
1 33 Jones Engineering
2 33 Steinberg Engineering
3 34 Robinson Admin
4 34 Smith Admin
5 N/A Jasper NaN

In [52]:
pd.merge(employee_data, department_data, on = 'DepartmentID', how = 'right')


Out[52]:
DepartmentID employee_name DepartmentName
0 31.0 Rafferty Sales
1 33.0 Jones Engineering
2 33.0 Steinberg Engineering
3 34.0 Robinson Admin
4 34.0 Smith Admin
5 35.0 NaN Marketing

In [53]:
pd.merge(employee_data, department_data, on = 'DepartmentID', how = 'outer')


Out[53]:
DepartmentID employee_name DepartmentName
0 31 Rafferty Sales
1 33 Jones Engineering
2 33 Steinberg Engineering
3 34 Robinson Admin
4 34 Smith Admin
5 N/A Jasper NaN
6 35 NaN Marketing

Data Cleaning - Removing Duplicates


In [54]:
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data


Out[54]:
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4

In [55]:
data.duplicated()


Out[55]:
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [56]:
data.drop_duplicates()


Out[56]:
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4

In [57]:
data['v1'] = range(7)
data.drop_duplicates(['k1'])


Out[57]:
k1 k2 v1
0 one 1 0
3 two 3 3

In [58]:
data


Out[58]:
k1 k2 v1
0 one 1 0
1 one 1 1
2 one 2 2
3 two 3 3
4 two 3 4
5 two 4 5
6 two 4 6

In [59]:
data.drop_duplicates(['k1', 'k2'], keep='last')


Out[59]:
k1 k2 v1
1 one 1 1
2 one 2 2
4 two 3 4
6 two 4 6

Data Cleaning - Missing Value Handling


In [60]:
data = DataFrame([[1.0, 6.5, 8], 
                  [3, np.NAN, np.NAN], 
                  [np.NAN, np.NAN, np.NAN], 
                  [np.NAN, 6.5, 9], 
                  [3, -1, 8], 
                  [3, 6.5, 8]])
data


Out[60]:
0 1 2
0 1.0 6.5 8.0
1 3.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 9.0
4 3.0 -1.0 8.0
5 3.0 6.5 8.0

In [61]:
clean_data = data.dropna()
clean_data


Out[61]:
0 1 2
0 1.0 6.5 8.0
4 3.0 -1.0 8.0
5 3.0 6.5 8.0

In [62]:
def count_missing(x):
  return sum(x.isnull())
total =  data.shape[0]
missing_stat = data.apply(count_missing, axis = 0)
# print total
missing_stat.apply(lambda x: x / float(total))


Out[62]:
0    0.333333
1    0.333333
2    0.333333
dtype: float64

In [63]:
data.fillna(value = '?', inplace = False)


Out[63]:
0 1 2
0 1 6.5 8
1 3 ? ?
2 ? ? ?
3 ? 6.5 9
4 3 -1 8
5 3 6.5 8

In [64]:
data.fillna(value = {0:'other', 1:2, 2:5}, inplace = False)


Out[64]:
0 1 2
0 1 6.5 8.0
1 3 2.0 5.0
2 other 2.0 5.0
3 other 6.5 9.0
4 3 -1.0 8.0
5 3 6.5 8.0

In [65]:
data.fillna(data.mean(), inplace = False)


Out[65]:
0 1 2
0 1.0 6.500 8.00
1 3.0 4.625 8.25
2 2.5 4.625 8.25
3 2.5 6.500 9.00
4 3.0 -1.000 8.00
5 3.0 6.500 8.00

In [66]:
data.fillna(data.median(), inplace = False)
data.median()


Out[66]:
0    3.0
1    6.5
2    8.0
dtype: float64

In [67]:
mode = data.mode()
data.fillna(mode.iloc[0], inplace = False)
# data.fillna(value = {0:mode.loc[0][0], 
#                     1:mode.loc[0][1], 
#                     2:mode.loc[0][2]}, inplace = False)


Out[67]:
0 1 2
0 1.0 6.5 8.0
1 3.0 6.5 8.0
2 3.0 6.5 8.0
3 3.0 6.5 9.0
4 3.0 -1.0 8.0
5 3.0 6.5 8.0

Detecting and filtering outliers


In [68]:
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()


Out[68]:
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067684 0.067924 0.025598 -0.002298
std 0.998035 0.992106 1.006835 0.996794
min -3.428254 -3.548824 -3.184377 -3.745356
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.366626 2.653656 3.260383 3.927528

In [69]:
col = data[3]
col[np.abs(col) > 3]


Out[69]:
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

In [70]:
outliers = (np.abs(data) > 3).any(1)
data[outliers]


Out[70]:
0 1 2 3
5 -0.539741 0.476985 3.248944 -1.021228
97 -0.774363 0.552936 0.106061 3.927528
102 -0.655054 -0.565230 3.176873 0.959533
305 -2.315555 0.457246 -0.025907 -3.399312
324 0.050188 1.951312 3.260383 0.963301
400 0.146326 0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990 1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586 0.275144 1.179227 -3.184377 1.369891
808 -0.362528 -3.548824 1.553205 -2.186301
900 3.366626 -2.372214 0.851010 1.332846

In [71]:
data[(np.abs(data) <= 3).all(1)]


Out[71]:
0 1 2 3
0 -0.204708 0.478943 -0.519439 -0.555730
1 1.965781 1.393406 0.092908 0.281746
2 0.769023 1.246435 1.007189 -1.296221
3 0.274992 0.228913 1.352917 0.886429
4 -2.001637 -0.371843 1.669025 -0.438570
6 -0.577087 0.124121 0.302614 0.523772
7 0.000940 1.343810 -0.713544 -0.831154
8 -2.370232 -1.860761 -0.860757 0.560145
9 -1.265934 0.119827 -1.063512 0.332883
10 -2.359419 -0.199543 -1.541996 -0.970736
11 -1.307030 0.286350 0.377984 -0.753887
12 0.331286 1.349742 0.069877 0.246674
13 -0.011862 1.004812 1.327195 -0.919262
14 -1.549106 0.022185 0.758363 -0.660524
15 0.862580 -0.010032 0.050009 0.670216
16 0.852965 -0.955869 -0.023493 -2.304234
17 -0.652469 -1.218302 -1.332610 1.074623
18 0.723642 0.690002 1.001543 -0.503087
19 -0.622274 -0.921169 -0.726213 0.222896
20 0.051316 -1.157719 0.816707 0.433610
21 1.010737 1.824875 -0.997518 0.850591
22 -0.131578 0.912414 0.188211 2.169461
23 -0.114928 2.003697 0.029610 0.795253
24 0.118110 -0.748532 0.584970 0.152677
25 -1.565657 -0.562540 -0.032664 -0.929006
26 -0.482573 -0.036264 1.095390 0.980928
27 -0.589488 1.581700 -0.528735 0.457002
28 0.929969 -1.569271 -1.022487 -0.402827
29 0.220487 -0.193401 0.669158 -1.648985
30 -2.252797 -1.166832 0.353607 0.702110
31 -0.274569 -0.139142 0.107657 -0.606545
32 -0.417064 -0.017007 -1.224145 -1.800840
33 1.634736 0.989008 0.457940 0.555154
34 1.306720 -0.440554 -0.301350 0.498791
35 -0.823991 1.320566 0.507965 -0.653438
36 0.186980 -0.391725 -0.272293 -0.017141
37 0.680321 0.635512 -0.757177 0.718086
38 -0.304273 -1.677790 0.426986 -1.563740
39 -0.367488 1.045913 1.219954 -0.247699
40 -0.416232 -0.116747 -1.844788 2.068708
41 -0.776967 1.440167 -0.110557 1.227387
42 1.920784 0.746433 2.224660 -0.679400
43 0.727369 -0.868731 -1.213851 -0.470631
44 -0.919242 -0.838827 0.435155 -0.557805
45 -0.567455 -0.372642 -0.926557 1.755108
46 1.209810 1.270025 -0.974378 -0.634709
47 -0.395701 -0.289436 -0.734297 -0.728505
48 0.838775 0.266893 0.721194 0.910983
49 -1.020903 -1.413416 1.296608 0.252275
50 1.127481 -0.568363 0.309362 -0.577385
... ... ... ... ...
950 -0.984182 -0.335652 -0.161431 -0.127566
951 -1.713153 0.679703 -0.491096 -1.166500
952 2.330699 -0.178186 -0.156789 -1.785832
953 0.732040 -0.926019 0.098611 1.653691
954 -2.091554 0.289307 1.056266 -1.067813
955 0.772558 0.152847 -0.737986 -0.778785
956 0.721543 0.355294 -0.348657 -0.512455
957 -1.763876 0.036658 -1.075637 0.629870
958 0.183235 1.392379 -2.241099 1.245722
959 -0.211846 0.209227 -1.459767 -0.472795
960 0.586390 -2.748685 -1.936741 -1.285545
961 -0.262842 -0.460670 -0.526905 -0.087861
962 -0.202099 0.917099 0.981538 0.280666
963 -0.572757 -0.520393 -0.776701 -0.988353
964 0.537197 0.132493 0.361434 1.348550
965 -0.742698 1.130580 0.482667 -0.051903
966 0.747073 0.682296 -0.728508 -0.546905
967 0.100963 0.139397 0.096666 -0.330189
968 0.728368 0.328072 2.531127 0.437636
969 -0.952522 0.792203 0.125309 -1.024313
970 1.140886 0.113279 0.206782 -1.460094
971 -1.523554 -0.590572 -0.324064 0.132262
972 0.097878 1.244456 -0.048534 -1.091849
973 -0.738296 0.115881 0.016253 -1.243078
974 -1.969172 -1.141360 0.405558 -0.521155
975 -0.813280 -0.461413 1.159668 -0.307670
976 0.669147 0.397355 0.612537 1.381155
977 -0.042541 1.182125 0.153036 0.075798
978 0.615651 -1.992291 0.963044 0.102469
979 0.359400 0.355787 -0.799366 1.120895
980 -0.261844 -0.474138 -0.112021 -0.285176
981 -0.136429 1.285628 0.753586 0.053410
982 -0.355393 0.790731 1.274794 0.233619
983 -1.316005 1.396082 0.336700 0.604880
984 0.675317 -0.060329 1.801953 -0.335099
985 -0.754114 0.650691 0.365401 1.367213
986 -0.105654 -0.001860 0.316802 -0.512542
987 -0.052736 -0.457225 2.452835 -1.050942
988 -0.172078 0.853049 -0.356263 -0.387002
989 0.881601 1.067288 1.079688 1.642339
990 -0.103672 -0.882949 -0.197357 1.336648
991 0.926782 -0.527103 0.633763 0.153988
992 -0.527754 0.815104 -0.677870 -1.466596
993 0.957144 0.377342 -0.383241 1.101972
994 1.221058 -0.326481 0.750008 0.604072
995 1.089085 0.251232 -1.451985 1.653126
996 -0.478509 -0.010663 -1.060881 -1.502870
997 -1.946267 1.013592 0.037333 0.133304
998 -1.293122 -0.322542 -0.782960 -0.303340
999 0.089987 0.292291 1.177706 0.882755

989 rows × 4 columns


In [72]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()


Out[72]:
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067623 0.068473 0.025153 -0.002081
std 0.995485 0.990253 1.003977 0.989736
min -3.000000 -3.000000 -3.000000 -3.000000
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.000000 2.653656 3.000000 3.000000

Permutation and random sampling


In [73]:
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
print len(df)
sampler = np.random.permutation(len(df))
sampler


5
Out[73]:
array([1, 0, 2, 3, 4])

In [74]:
df


Out[74]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19

In [75]:
df.take(sampler)


Out[75]:
0 1 2 3
1 4 5 6 7
0 0 1 2 3
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19

In [76]:
df.take(np.random.permutation(len(df))[:3])


Out[76]:
0 1 2 3
1 4 5 6 7
3 12 13 14 15
4 16 17 18 19

Sampling w/ Replacement


In [77]:
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size = 10)

In [78]:
sampler


Out[78]:
array([4, 4, 2, 2, 2, 0, 3, 0, 4, 1])

In [79]:
draws = bag.take(sampler)
draws


Out[79]:
array([ 4,  4, -1, -1, -1,  5,  6,  5,  4,  7])

Sampling w/o Replacement


In [80]:
import random
sampler = random.sample(xrange(len(bag)), 4)
draws = bag.take(sampler)
draws


Out[80]:
array([ 7,  6, -1,  5])