In [47]:
import pandas as pd
import matplotlib.pyplot as plt
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [48]:
# First attempt: read the file with pandas' default comma separator and
# preview the first five rows to see how it was parsed.
data = pd.read_csv("wages_hours.csv", sep=",")
data.head(n=5)
Out[48]:
HRS RATE ERSP ERNO NEIN ASSET AGE DEP RACE SCHOOL
0
NaN
1
2157\t2.905\t1121\t291\t380\t7250\t38.5\t2.340...
2
2174\t2.970\t1128\t301\t398\t7744\t39.3\t2.335...
3
2062\t2.350\t1214\t326\t185\t3068\t40.1\t2.851...
4
2111\t2.511\t1203\t49\t117\t1632\t22.4\t1.159\...
Oops, we have a problem: the data is separated by tabs, not commas, so every row was read into a single column.
In [49]:
# Second attempt: the file is tab-delimited, so pass the tab character as the
# field delimiter and preview the first five rows again.
data = pd.read_csv("wages_hours.csv", delimiter='\t')
data.head(n=5)
Out[49]:
HRS
RATE
ERSP
ERNO
NEIN
ASSET
AGE
DEP
RACE
SCHOOL
0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
1
2157
2.905
1121
291
380
7250
38.5
2.340
32.1
10.5
2
2174
2.970
1128
301
398
7744
39.3
2.335
31.2
10.5
3
2062
2.350
1214
326
185
3068
40.1
2.851
*
8.9
4
2111
2.511
1203
49
117
1632
22.4
1.159
27.5
11.5
In [50]:
# Drop every row that contains at least one NaN (such as the all-NaN row 0).
# Rebinding the name instead of using inplace=True avoids mutating the frame
# object that the previous cell displayed.
# NOTE: the '*' placeholders in the file are ordinary strings, not NaN, so
# rows containing them are kept.
data = data.dropna()
data.head()
Out[50]:
HRS
RATE
ERSP
ERNO
NEIN
ASSET
AGE
DEP
RACE
SCHOOL
1
2157
2.905
1121
291
380
7250
38.5
2.340
32.1
10.5
2
2174
2.970
1128
301
398
7744
39.3
2.335
31.2
10.5
3
2062
2.350
1214
326
185
3068
40.1
2.851
*
8.9
4
2111
2.511
1203
49
117
1632
22.4
1.159
27.5
11.5
5
2134
2.791
1013
594
730
12710
57.7
1.229
32.5
8.8
Let's plot a small section of this data: RATE against AGE, with the rows sorted by AGE.
In [51]:
# Order the rows by AGE, ascending.
# DataFrame.sort() was deprecated and later removed from pandas, so it raises
# AttributeError on current versions; sort_values() is its replacement.
data = data.sort_values(by='AGE')
data.head()
Out[51]:
HRS
RATE
ERSP
ERNO
NEIN
ASSET
AGE
DEP
RACE
SCHOOL
4
2111
2.511
1203
49
117
1632
22.4
1.159
27.5
11.5
28
2186
3.015
1122
30
352
7292
37.2
2.046
29.5
10.9
32
2077
1.901
350
209
95
1370
37.4
4.158
61.3
8.2
34
2093
1.899
342
311
120
1425
37.5
4.512
62.8
8.1
33
2196
3.009
947
294
342
6888
37.5
3.047
31.8
10.6
In [52]:
# Show the first 15 rows of the frame in its current (AGE-sorted) order.
data.head(15)
Out[52]:
HRS
RATE
ERSP
ERNO
NEIN
ASSET
AGE
DEP
RACE
SCHOOL
4
2111
2.511
1203
49
117
1632
22.4
1.159
27.5
11.5
28
2186
3.015
1122
30
352
7292
37.2
2.046
29.5
10.9
32
2077
1.901
350
209
95
1370
37.4
4.158
61.3
8.2
34
2093
1.899
342
311
120
1425
37.5
4.512
62.8
8.1
33
2196
3.009
947
294
342
6888
37.5
3.047
31.8
10.6
18
2257
2.516
1093
176
392
7293
37.9
2.042
*
10.1
31
2203
3.273
*
*
430
8221
38.2
2.324
22.1
11.0
13
2108
2.796
1036
300
259
4614
38.2
2.040
*
9.2
30
2188
3.010
990
366
374
7325
38.4
2.847
30.9
10.6
1
2157
2.905
1121
291
380
7250
38.5
2.340
32.1
10.5
6
2185
3.040
1135
287
382
7706
38.6
2.602
31.4
10.7
10
2205
2.356
885
264
373
6789
38.8
2.662
25.2
9.5
9
2267
2.838
1298
252
431
8317
38.9
2.024
9.7
11.1
7
2210
3.222
1100
295
474
9338
39.0
2.187
10.1
11.2
27
2181
2.912
1072
304
383
7340
39.0
2.337
30.2
10.2
In [53]:
# Keep only the two columns needed for the plot. The explicit .copy() makes
# data2 an independent frame, so modifying it afterwards is unambiguous and is
# not treated as a write to a selection taken from `data`.
data2 = data[['AGE', 'RATE']].copy()
data2.head()
Out[53]:
AGE
RATE
4
22.4
2.511
28
37.2
3.015
32
37.4
1.901
34
37.5
1.899
33
37.5
3.009
In [54]:
# Make AGE the index so that it is used as the x-axis when data2 is plotted.
# Rebinding (rather than inplace=True) avoids mutating in place a frame that
# was selected out of `data`, which may emit SettingWithCopyWarning.
# NOTE: re-running this cell raises KeyError, because AGE is no longer a column.
data2 = data2.set_index('AGE')
data2
Out[54]:
RATE
AGE
22.4
2.511
37.2
3.015
37.4
1.901
37.5
1.899
37.5
3.009
37.9
2.516
38.2
3.273
38.2
2.796
38.4
3.010
38.5
2.905
38.6
3.040
38.8
2.356
38.9
2.838
39.0
3.222
39.0
2.912
39.1
3.636
39.1
3.413
39.1
2.909
39.2
2.980
39.2
2.959
39.3
2.970
39.3
2.511
39.4
2.971
39.5
3.262
39.7
2.499
39.8
2.983
39.8
2.922
39.8
3.234
39.9
2.493
40.0
2.573
40.0
3.582
40.1
2.350
40.3
2.453
40.5
2.630
40.6
1.423
40.6
2.280
41.8
2.304
43.4
2.786
57.7
2.791
In [55]:
# Line plot of the RATE column against the AGE index, drawn on an explicit
# Axes so the figure can carry a title and axis labels and stand on its own.
fig, ax = plt.subplots()
data2.plot(ax=ax)
ax.set_title('RATE by AGE')
ax.set_xlabel('AGE')
ax.set_ylabel('RATE')
plt.show()
In [55]:
Content source: shantnu/data_mine
Similar notebooks: