In [1]:
import pandas as pd
%matplotlib inline
In [2]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
In [3]:
df
Out[3]:
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
In [4]:
pd.get_dummies(df['key'],prefix='key')
Out[4]:
   key_a  key_b  key_c
0    0.0    1.0    0.0
1    0.0    1.0    0.0
2    1.0    0.0    0.0
3    0.0    0.0    1.0
4    1.0    0.0    0.0
5    0.0    1.0    0.0
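get_dummies builds one indicator column per distinct value of key, with a 1.0 marking each row's category. To use the indicators as features you would usually join them back onto the original frame; a minimal sketch (the dummies name is ours):

dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummies = df.join(dummies)  #original columns plus key_a, key_b, key_c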
In [5]:
df = pd.read_csv('data/ontime_reports_may_2015_ny.csv')
In [6]:
#count the number of NaNs in the ARR_DELAY column
df['ARR_DELAY'].isnull().sum()
Out[6]:
945
In [7]:
#calculate the percentage this represents of the total number of instances
#(note: the denominator here is the sum of the delay values, not the row count; see the corrected sketch after the output)
df['ARR_DELAY'].isnull().sum()/df['ARR_DELAY'].sum()
Out[7]:
0.0086069493146318145
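Out[7] divides the NaN count by the sum of the ARR_DELAY values (total delay minutes), not by the number of rows, so 0.0086 is not the share of instances the comment asks for. A corrected sketch; since 44703 rows survive the filter below, the full column has 44703 + 945 = 45648 entries:

#share of rows with a missing ARR_DELAY
df['ARR_DELAY'].isnull().sum() / len(df)  #945 / 45648, roughly 0.0207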
In [8]:
#filter ARR_DELAY NaNs
df = df[pd.notnull(df['ARR_DELAY'])]
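The boolean mask keeps only the rows where ARR_DELAY is present. dropna with subset= is an equivalent, arguably more readable spelling:

#equivalent: drop rows whose ARR_DELAY is NaN
df = df.dropna(subset=['ARR_DELAY'])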
In [9]:
#code each flight as delayed (1) or not delayed (0)
df['IS_DELAYED'] = df['ARR_DELAY'].apply(lambda x: 1 if x > 0 else 0)
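apply with a lambda evaluates the condition row by row; a vectorized comparison does the same work in one shot and is faster on 40,000+ rows. Under either rule a flight arriving exactly on time (ARR_DELAY == 0) is coded 0:

#equivalent, vectorized: boolean comparison cast to 1/0
df['IS_DELAYED'] = (df['ARR_DELAY'] > 0).astype(int)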
In [10]:
#Let's check that our column was created properly
df[['ARR_DELAY','IS_DELAYED']]
Out[10]:
       ARR_DELAY  IS_DELAYED
0          -17.0           0
1          -14.0           0
2          -11.0           0
3            4.0           1
4          -18.0           0
5           -6.0           0
6          -11.0           0
7          -15.0           0
8            5.0           1
9           31.0           1
10         -11.0           0
11         -10.0           0
12         -10.0           0
13         157.0           1
14          -8.0           0
15           5.0           1
16           7.0           1
17         -10.0           0
18         -24.0           0
19          -8.0           0
20         -29.0           0
21          89.0           1
22         -16.0           0
23         -13.0           0
24         -11.0           0
25          30.0           1
26           8.0           1
27          -8.0           0
28         -20.0           0
29         -10.0           0
...          ...         ...
45610      243.0           1
45613      -10.0           0
45614       -1.0           0
45615      -10.0           0
45616       14.0           1
45617       -2.0           0
45618      -13.0           0
45619       -7.0           0
45620        7.0           1
45621      -10.0           0
45622       17.0           1
45623       -7.0           0
45624        0.0           0
45625       52.0           1
45626       24.0           1
45627      -11.0           0
45628      -13.0           0
45629       -4.0           0
45633       -5.0           0
45634      138.0           1
45635        1.0           1
45636       -8.0           0
45637       13.0           1
45639      -18.0           0
45642        7.0           1
45643       -9.0           0
45644       50.0           1
45645        3.0           1
45646      -14.0           0
45647       -2.0           0

44703 rows × 2 columns
In [11]:
pd.get_dummies(df['ORIGIN'],prefix='origin') #We'd want to drop one of these columns before actually using them in a model; see the sketch after the output
Out[11]:
       origin_ABQ  origin_ACK  origin_ALB  origin_ATL  ...  origin_SYR  origin_TPA  origin_XNA
0             0.0         0.0         0.0         0.0  ...         0.0         0.0         0.0
1             0.0         0.0         0.0         0.0  ...         0.0         0.0         0.0
2             0.0         0.0         0.0         0.0  ...         0.0         0.0         0.0
...           ...         ...         ...         ...  ...         ...         ...         ...
45645         0.0         0.0         0.0         0.0  ...         0.0         1.0         0.0
45646         0.0         0.0         0.0         0.0  ...         0.0         1.0         0.0
45647         0.0         0.0         0.0         0.0  ...         0.0         1.0         0.0

44703 rows × 92 columns

(display truncated; the full output shows 20 of the 92 columns, origin_ABQ through origin_BQN and origin_SJU through origin_XNA, with each row holding a single 1.0 in its origin airport's column, e.g. origin_STL for row 45639 and origin_TPA for rows 45642-45647)
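As the comment on In [11] notes, one of the 92 origin columns should be dropped before modeling: each row's dummies sum to 1, so any single column is exactly determined by the other 91 (the "dummy variable trap"). In pandas 0.18 and later, get_dummies can drop one for you; a sketch under that assumption:

#drop the first category so the remaining columns are linearly independent
origin_dummies = pd.get_dummies(df['ORIGIN'], prefix='origin', drop_first=True)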
In [12]:
df = pd.read_csv('data/heights_weights_genders.csv')
In [13]:
pd.get_dummies(df['Gender'],prefix='gender').corr()
Out[13]:
               gender_Female  gender_Male
gender_Female            1.0         -1.0
gender_Male             -1.0          1.0
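The -1.0 off-diagonal makes the redundancy concrete: with only two categories, gender_Female is always 1 - gender_Male, so either column alone carries all the information. A quick check (the dummies name is ours):

dummies = pd.get_dummies(df['Gender'], prefix='gender')
(dummies['gender_Female'] == 1 - dummies['gender_Male']).all()  #True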
In [14]:
from sklearn import preprocessing
x = df[['Height','Weight']].values  #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)  #rescale each column to [0, 1]
df_normalized = pd.DataFrame(x_scaled)
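MinMaxScaler rescales each column independently to [0, 1] via x' = (x - min) / (max - min). The same transform in plain pandas, equivalent up to floating-point rounding (a sketch):

#manual min-max scaling, column by column
cols = df[['Height','Weight']]
df_manual = (cols - cols.min()) / (cols.max() - cols.min())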
In [16]:
df['Height'].describe()
Out[16]:
count 10000.000000
mean 66.367560
std 3.847528
min 54.263133
25% 63.505620
50% 66.318070
75% 69.174262
max 78.998742
Name: Height, dtype: float64
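The describe output gives everything needed to spot-check the scaler by hand: the mean height should land at (66.367560 - 54.263133) / (78.998742 - 54.263133), about 0.489, on the scaled axis, consistent with where the first column of df_normalized clusters below:

#spot-check: min-max scale the mean height by hand
(66.367560 - 54.263133) / (78.998742 - 54.263133)  #about 0.489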
In [15]:
df_normalized
Out[15]:
             0         1
0     0.791728  0.863139
1     0.586958  0.475476
2     0.802364  0.721131
3     0.706182  0.756699
4     0.631424  0.689999
5     0.525149  0.426286
6     0.587087  0.580778
7     0.569437  0.503050
8     0.515686  0.541817
9     0.371665  0.446684
10    0.684529  0.593819
11    0.702537  0.726004
12    0.424618  0.498941
13    0.607219  0.607659
14    0.605629  0.592987
15    0.541021  0.523586
16    0.733970  0.639723
17    0.392600  0.526979
18    0.621651  0.590794
19    0.552761  0.573466
20    0.551914  0.532983
21    0.613541  0.648018
22    0.480522  0.411484
23    0.846668  0.799172
24    0.551030  0.473996
25    0.561171  0.621775
26    0.599399  0.583250
27    0.749434  0.692330
28    0.531950  0.538331
29    0.575417  0.436664
...        ...       ...
9970  0.459079  0.422819
9971  0.419521  0.441416
9972  0.258356  0.284399
9973  0.261724  0.273888
9974  0.449991  0.381499
9975  0.484677  0.288957
9976  0.291595  0.276132
9977  0.413643  0.412600
9978  0.125836  0.241084
9979  0.315047  0.294393
9980  0.251020  0.224405
9981  0.249848  0.345167
9982  0.630887  0.551864
9983  0.467649  0.331862
9984  0.193401  0.228980
9985  0.557008  0.515438
9986  0.367469  0.376085
9987  0.458736  0.421207
9988  0.213279  0.275439
9989  0.270539  0.150941
9990  0.360467  0.372966
9991  0.338522  0.185852
9992  0.315929  0.360425
9993  0.233158  0.160687
9994  0.195472  0.223243
9995  0.481473  0.351101
9996  0.517635  0.517161
9997  0.388301  0.310660
9998  0.597160  0.482988
9999  0.310529  0.238439

10000 rows × 2 columns
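Because fit_transform returns a bare NumPy array, the scaled frame loses its labels, which is why the columns above are just 0 and 1. Passing the names and index back in keeps it readable; a sketch:

#preserve the original column names and row index on the scaled frame
df_normalized = pd.DataFrame(x_scaled, columns=['Height','Weight'], index=df.index)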