In [18]:
import pandas as pd
import matplotlib.pyplot as plt
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [19]:
# First attempt: parse with pandas' default separator, spelled out here as ",".
# The preview shows each row collapsed into a single cell containing literal
# "\t" characters, i.e. the file is not actually comma-delimited.
data = pd.read_csv("wages_hours.csv", sep=",")
data.head()
Out[19]:
HRS RATE ERSP ERNO NEIN ASSET AGE DEP RACE SCHOOL
0
2157\t2.905\t1121\t291\t380\t7250\t38.5\t2.340...
1
2174\t2.970\t1128\t301\t398\t7744\t39.3\t2.335...
2
2062\t2.350\t1214\t326\t185\t3068\t40.1\t2.851...
3
2111\t2.511\t1203\t49\t117\t1632\t22.4\t1.159\...
4
2134\t2.791\t1013\t594\t730\t12710\t57.7\t1.22...
In [20]:
# The fields are tab-separated, so name the delimiter explicitly when loading.
# NOTE(review): the preview shows "*" in some cells (e.g. RACE in row 2). These
# look like missing-value markers -- confirm, and consider na_values="*" so the
# affected columns parse as numbers instead of strings.
data = pd.read_csv("wages_hours.csv", delimiter="\t")
data.head()
Out[20]:
HRS
RATE
ERSP
ERNO
NEIN
ASSET
AGE
DEP
RACE
SCHOOL
0
2157
2.905
1121
291
380
7250
38.5
2.340
32.1
10.5
1
2174
2.970
1128
301
398
7744
39.3
2.335
31.2
10.5
2
2062
2.350
1214
326
185
3068
40.1
2.851
*
8.9
3
2111
2.511
1203
49
117
1632
22.4
1.159
27.5
11.5
4
2134
2.791
1013
594
730
12710
57.7
1.229
32.5
8.8
In [21]:
# Display the whole frame rather than just the head; the output lists rows 0-38.
# NOTE(review): for larger files prefer data.head() or data.describe() here so
# the notebook output stays readable.
data
Out[21]:
HRS
RATE
ERSP
ERNO
NEIN
ASSET
AGE
DEP
RACE
SCHOOL
0
2157
2.905
1121
291
380
7250
38.5
2.340
32.1
10.5
1
2174
2.970
1128
301
398
7744
39.3
2.335
31.2
10.5
2
2062
2.350
1214
326
185
3068
40.1
2.851
*
8.9
3
2111
2.511
1203
49
117
1632
22.4
1.159
27.5
11.5
4
2134
2.791
1013
594
730
12710
57.7
1.229
32.5
8.8
5
2185
3.040
1135
287
382
7706
38.6
2.602
31.4
10.7
6
2210
3.222
1100
295
474
9338
39.0
2.187
10.1
11.2
7
2105
2.493
1180
310
255
4730
39.9
2.616
71.1
9.3
8
2267
2.838
1298
252
431
8317
38.9
2.024
9.7
11.1
9
2205
2.356
885
264
373
6789
38.8
2.662
25.2
9.5
10
2121
2.922
1251
328
312
5907
39.8
2.287
51.1
10.3
11
2109
2.499
1207
347
271
5069
39.7
3.193
*
8.9
12
2108
2.796
1036
300
259
4614
38.2
2.040
*
9.2
13
2047
2.453
1213
297
139
1987
40.3
2.545
*
9.1
14
2174
3.582
1141
414
498
10239
40.0
2.064
*
11.7
15
2067
2.909
1805
290
239
4439
39.1
2.301
*
10.5
16
2159
2.511
1075
289
308
5621
39.3
2.486
43.6
9.5
17
2257
2.516
1093
176
392
7293
37.9
2.042
*
10.1
18
1985
1.423
553
381
146
1866
40.6
3.833
*
6.6
19
2184
3.636
1091
291
560
11240
39.1
2.328
13.6
11.6
20
2084
2.983
1327
331
296
5653
39.8
2.208
58.4
10.2
21
2051
2.573
1194
279
172
2806
40.0
2.362
77.9
9.1
22
2127
3.262
1226
314
408
8042
39.5
2.259
39.2
10.8
23
2102
3.234
1188
414
352
7557
39.8
2.019
29.8
10.7
24
2098
2.280
973
364
272
4400
40.6
2.661
53.6
8.4
25
2042
2.304
1085
328
140
1739
41.8
2.444
83.1
8.2
26
2181
2.912
1072
304
383
7340
39.0
2.337
30.2
10.2
27
2186
3.015
1122
30
352
7292
37.2
2.046
29.5
10.9
28
2108
2.786
1757
*
506
9658
43.4
*
32.6
10.2
29
2188
3.010
990
366
374
7325
38.4
2.847
30.9
10.6
30
2203
3.273
*
*
430
8221
38.2
2.324
22.1
11.0
31
2077
1.901
350
209
95
1370
37.4
4.158
61.3
8.2
32
2196
3.009
947
294
342
6888
37.5
3.047
31.8
10.6
33
2093
1.899
342
311
120
1425
37.5
4.512
62.8
8.1
34
2173
2.959
1116
296
387
7625
39.2
2.342
31.0
10.5
35
2179
2.971
1128
312
397
7779
39.4
2.341
31.2
10.5
36
2200
2.980
1126
204
393
7885
39.2
2.341
31.0
10.6
37
2052
2.630
*
*
154
3331
40.5
*
45.8
10.3
38
2197
3.413
1078
300
512
10450
39.1
2.297
15.5
11.3
In [22]:
# Keep only the two columns used in the rest of the notebook, in this order.
data2 = data.loc[:, ["AGE", "RATE"]]
data2.head()
Out[22]:
AGE
RATE
0
38.5
2.905
1
39.3
2.970
2
40.1
2.350
3
22.4
2.511
4
57.7
2.791
In [27]:
# Order the rows by AGE (ascending), keeping the original row labels.
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported replacement. It returns a new frame, so
# data2 itself is left untouched.
data_sorted = data2.sort_values(by="AGE")
data_sorted.head()
Out[27]:
AGE
RATE
3
22.4
2.511
27
37.2
3.015
31
37.4
1.901
33
37.5
1.899
32
37.5
3.009
In [28]:
# Promote AGE from a column to the index, leaving RATE as the only column.
# Rebind the returned frame instead of mutating with inplace=True: the
# non-inplace form is the recommended pandas idiom and can be chained.
data_sorted = data_sorted.set_index("AGE")
data_sorted.head()
Out[28]:
RATE
AGE
22.4
2.511
37.2
3.015
37.4
1.901
37.5
1.899
37.5
3.009
In [29]:
# Draw the frame as a line plot (pandas' default kind) on an explicitly
# created axes; the AGE index supplies the x values and RATE is the plotted column.
fig, ax = plt.subplots()
data_sorted.plot(ax=ax)
plt.show()
In [ ]:
Content source: shantnu/Intro-to-Pandas
Similar notebooks: