In [47]:
import pandas as pd
import matplotlib.pyplot as plt
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [48]:
# First attempt: parse the file with read_csv's default (comma) delimiter
# and preview the first five rows to check how the columns were split.
data = pd.read_csv(filepath_or_buffer="wages_hours.csv")
data.head(5)


Out[48]:
HRS RATE ERSP ERNO NEIN ASSET AGE DEP RACE SCHOOL
0 NaN
1 2157\t2.905\t1121\t291\t380\t7250\t38.5\t2.340...
2 2174\t2.970\t1128\t301\t398\t7744\t39.3\t2.335...
3 2062\t2.350\t1214\t326\t185\t3068\t40.1\t2.851...
4 2111\t2.511\t1203\t49\t117\t1632\t22.4\t1.159\...

Oops, we have a problem: the data is separated by tabs, not commas, so every row was read into a single column.


In [49]:
# Reload the same file, this time telling the parser that fields are
# tab-delimited, then preview the first five rows.
data = pd.read_csv("wages_hours.csv", delimiter='\t')
data.head(5)


Out[49]:
HRS RATE ERSP ERNO NEIN ASSET AGE DEP RACE SCHOOL
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2157 2.905 1121 291 380 7250 38.5 2.340 32.1 10.5
2 2174 2.970 1128 301 398 7744 39.3 2.335 31.2 10.5
3 2062 2.350 1214 326 185 3068 40.1 2.851 * 8.9
4 2111 2.511 1203 49 117 1632 22.4 1.159 27.5 11.5

In [50]:
# Remove, in place, every row that contains at least one NaN
# (this discards the all-NaN first row seen in the previous preview).
data.dropna(axis=0, how='any', inplace=True)
data.head(5)


Out[50]:
HRS RATE ERSP ERNO NEIN ASSET AGE DEP RACE SCHOOL
1 2157 2.905 1121 291 380 7250 38.5 2.340 32.1 10.5
2 2174 2.970 1128 301 398 7744 39.3 2.335 31.2 10.5
3 2062 2.350 1214 326 185 3068 40.1 2.851 * 8.9
4 2111 2.511 1203 49 117 1632 22.4 1.159 27.5 11.5
5 2134 2.791 1013 594 730 12710 57.7 1.229 32.5 8.8

Let's plot a small section of this


In [51]:
# Order the rows by the AGE column, in place (ascending, the default).
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported replacement for sorting by column values.
data.sort_values(by=['AGE'], inplace=True)
data.head()


Out[51]:
HRS RATE ERSP ERNO NEIN ASSET AGE DEP RACE SCHOOL
4 2111 2.511 1203 49 117 1632 22.4 1.159 27.5 11.5
28 2186 3.015 1122 30 352 7292 37.2 2.046 29.5 10.9
32 2077 1.901 350 209 95 1370 37.4 4.158 61.3 8.2
34 2093 1.899 342 311 120 1425 37.5 4.512 62.8 8.1
33 2196 3.009 947 294 342 6888 37.5 3.047 31.8 10.6

In [52]:
# Show the first 15 rows of the frame in its current row order.
data.head(15)


Out[52]:
HRS RATE ERSP ERNO NEIN ASSET AGE DEP RACE SCHOOL
4 2111 2.511 1203 49 117 1632 22.4 1.159 27.5 11.5
28 2186 3.015 1122 30 352 7292 37.2 2.046 29.5 10.9
32 2077 1.901 350 209 95 1370 37.4 4.158 61.3 8.2
34 2093 1.899 342 311 120 1425 37.5 4.512 62.8 8.1
33 2196 3.009 947 294 342 6888 37.5 3.047 31.8 10.6
18 2257 2.516 1093 176 392 7293 37.9 2.042 * 10.1
31 2203 3.273 * * 430 8221 38.2 2.324 22.1 11.0
13 2108 2.796 1036 300 259 4614 38.2 2.040 * 9.2
30 2188 3.010 990 366 374 7325 38.4 2.847 30.9 10.6
1 2157 2.905 1121 291 380 7250 38.5 2.340 32.1 10.5
6 2185 3.040 1135 287 382 7706 38.6 2.602 31.4 10.7
10 2205 2.356 885 264 373 6789 38.8 2.662 25.2 9.5
9 2267 2.838 1298 252 431 8317 38.9 2.024 9.7 11.1
7 2210 3.222 1100 295 474 9338 39.0 2.187 10.1 11.2
27 2181 2.912 1072 304 383 7340 39.0 2.337 30.2 10.2

In [53]:
# Keep all rows but only the two columns of interest, AGE and RATE,
# then preview the first five rows of the narrower frame.
data2 = data.loc[:, ['AGE', 'RATE']]
data2.head(5)


Out[53]:
AGE RATE
4 22.4 2.511
28 37.2 3.015
32 37.4 1.901
34 37.5 1.899
33 37.5 3.009

In [54]:
# Build the AGE-indexed RATE table directly from `data` instead of mutating
# `data2` in place. The in-place version could only be executed once: after
# the first run 'AGE' is the index rather than a column, so re-running the
# cell raised KeyError. Deriving the frame from `data` makes the cell
# re-runnable and yields the same result.
data2 = data[['AGE', 'RATE']].set_index('AGE')
data2


Out[54]:
RATE
AGE
22.4 2.511
37.2 3.015
37.4 1.901
37.5 1.899
37.5 3.009
37.9 2.516
38.2 3.273
38.2 2.796
38.4 3.010
38.5 2.905
38.6 3.040
38.8 2.356
38.9 2.838
39.0 3.222
39.0 2.912
39.1 3.636
39.1 3.413
39.1 2.909
39.2 2.980
39.2 2.959
39.3 2.970
39.3 2.511
39.4 2.971
39.5 3.262
39.7 2.499
39.8 2.983
39.8 2.922
39.8 3.234
39.9 2.493
40.0 2.573
40.0 3.582
40.1 2.350
40.3 2.453
40.5 2.630
40.6 1.423
40.6 2.280
41.8 2.304
43.4 2.786
57.7 2.791

In [55]:
# Line plot of the RATE column against the frame's index (AGE).
# Label the y-axis and add a title so the figure can be read on its own;
# the x-axis label comes from the index name.
ax = data2.plot()
ax.set_ylabel('RATE')
ax.set_title('RATE by AGE')

plt.show()



In [55]: