In [1]:
import pandas as pd
import numpy as np

# Load the California housing dataset (expects housing.csv in the working directory).
df = pd.read_csv('housing.csv') # df is the pandas DataFrame (the original "document frequency" comment was misleading)
df


Out[1]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
5 -122.25 37.85 52.0 919.0 213.0 413.0 193.0 4.0368 269700.0 NEAR BAY
6 -122.25 37.84 52.0 2535.0 489.0 1094.0 514.0 3.6591 299200.0 NEAR BAY
7 -122.25 37.84 52.0 3104.0 687.0 1157.0 647.0 3.1200 241400.0 NEAR BAY
8 -122.26 37.84 42.0 2555.0 665.0 1206.0 595.0 2.0804 226700.0 NEAR BAY
9 -122.25 37.84 52.0 3549.0 707.0 1551.0 714.0 3.6912 261100.0 NEAR BAY
10 -122.26 37.85 52.0 2202.0 434.0 910.0 402.0 3.2031 281500.0 NEAR BAY
11 -122.26 37.85 52.0 3503.0 752.0 1504.0 734.0 3.2705 241800.0 NEAR BAY
12 -122.26 37.85 52.0 2491.0 474.0 1098.0 468.0 3.0750 213500.0 NEAR BAY
13 -122.26 37.84 52.0 696.0 191.0 345.0 174.0 2.6736 191300.0 NEAR BAY
14 -122.26 37.85 52.0 2643.0 626.0 1212.0 620.0 1.9167 159200.0 NEAR BAY
15 -122.26 37.85 50.0 1120.0 283.0 697.0 264.0 2.1250 140000.0 NEAR BAY
16 -122.27 37.85 52.0 1966.0 347.0 793.0 331.0 2.7750 152500.0 NEAR BAY
17 -122.27 37.85 52.0 1228.0 293.0 648.0 303.0 2.1202 155500.0 NEAR BAY
18 -122.26 37.84 50.0 2239.0 455.0 990.0 419.0 1.9911 158700.0 NEAR BAY
19 -122.27 37.84 52.0 1503.0 298.0 690.0 275.0 2.6033 162900.0 NEAR BAY
20 -122.27 37.85 40.0 751.0 184.0 409.0 166.0 1.3578 147500.0 NEAR BAY
21 -122.27 37.85 42.0 1639.0 367.0 929.0 366.0 1.7135 159800.0 NEAR BAY
22 -122.27 37.84 52.0 2436.0 541.0 1015.0 478.0 1.7250 113900.0 NEAR BAY
23 -122.27 37.84 52.0 1688.0 337.0 853.0 325.0 2.1806 99700.0 NEAR BAY
24 -122.27 37.84 52.0 2224.0 437.0 1006.0 422.0 2.6000 132600.0 NEAR BAY
25 -122.28 37.85 41.0 535.0 123.0 317.0 119.0 2.4038 107500.0 NEAR BAY
26 -122.28 37.85 49.0 1130.0 244.0 607.0 239.0 2.4597 93800.0 NEAR BAY
27 -122.28 37.85 52.0 1898.0 421.0 1102.0 397.0 1.8080 105500.0 NEAR BAY
28 -122.28 37.84 50.0 2082.0 492.0 1131.0 473.0 1.6424 108900.0 NEAR BAY
29 -122.28 37.84 52.0 729.0 160.0 395.0 155.0 1.6875 132000.0 NEAR BAY
... ... ... ... ... ... ... ... ... ... ...
20610 -121.56 39.10 28.0 2130.0 484.0 1195.0 439.0 1.3631 45500.0 INLAND
20611 -121.55 39.10 27.0 1783.0 441.0 1163.0 409.0 1.2857 47000.0 INLAND
20612 -121.56 39.08 26.0 1377.0 289.0 761.0 267.0 1.4934 48300.0 INLAND
20613 -121.55 39.09 31.0 1728.0 365.0 1167.0 384.0 1.4958 53400.0 INLAND
20614 -121.54 39.08 26.0 2276.0 460.0 1455.0 474.0 2.4695 58000.0 INLAND
20615 -121.54 39.08 23.0 1076.0 216.0 724.0 197.0 2.3598 57500.0 INLAND
20616 -121.53 39.08 15.0 1810.0 441.0 1157.0 375.0 2.0469 55100.0 INLAND
20617 -121.53 39.06 20.0 561.0 109.0 308.0 114.0 3.3021 70800.0 INLAND
20618 -121.55 39.06 25.0 1332.0 247.0 726.0 226.0 2.2500 63400.0 INLAND
20619 -121.56 39.01 22.0 1891.0 340.0 1023.0 296.0 2.7303 99100.0 INLAND
20620 -121.48 39.05 40.0 198.0 41.0 151.0 48.0 4.5625 100000.0 INLAND
20621 -121.47 39.01 37.0 1244.0 247.0 484.0 157.0 2.3661 77500.0 INLAND
20622 -121.44 39.00 20.0 755.0 147.0 457.0 157.0 2.4167 67000.0 INLAND
20623 -121.37 39.03 32.0 1158.0 244.0 598.0 227.0 2.8235 65500.0 INLAND
20624 -121.41 39.04 16.0 1698.0 300.0 731.0 291.0 3.0739 87200.0 INLAND
20625 -121.52 39.12 37.0 102.0 17.0 29.0 14.0 4.1250 72000.0 INLAND
20626 -121.43 39.18 36.0 1124.0 184.0 504.0 171.0 2.1667 93800.0 INLAND
20627 -121.32 39.13 5.0 358.0 65.0 169.0 59.0 3.0000 162500.0 INLAND
20628 -121.48 39.10 19.0 2043.0 421.0 1018.0 390.0 2.5952 92400.0 INLAND
20629 -121.39 39.12 28.0 10035.0 1856.0 6912.0 1818.0 2.0943 108300.0 INLAND
20630 -121.32 39.29 11.0 2640.0 505.0 1257.0 445.0 3.5673 112000.0 INLAND
20631 -121.40 39.33 15.0 2655.0 493.0 1200.0 432.0 3.5179 107200.0 INLAND
20632 -121.45 39.26 15.0 2319.0 416.0 1047.0 385.0 3.1250 115600.0 INLAND
20633 -121.53 39.19 27.0 2080.0 412.0 1082.0 382.0 2.5495 98300.0 INLAND
20634 -121.56 39.27 28.0 2332.0 395.0 1041.0 344.0 3.7125 116800.0 INLAND
20635 -121.09 39.48 25.0 1665.0 374.0 845.0 330.0 1.5603 78100.0 INLAND
20636 -121.21 39.49 18.0 697.0 150.0 356.0 114.0 2.5568 77100.0 INLAND
20637 -121.22 39.43 17.0 2254.0 485.0 1007.0 433.0 1.7000 92300.0 INLAND
20638 -121.32 39.43 18.0 1860.0 409.0 741.0 349.0 1.8672 84700.0 INLAND
20639 -121.24 39.37 16.0 2785.0 616.0 1387.0 530.0 2.3886 89400.0 INLAND

20640 rows × 10 columns


In [2]:
# To see only the features (an empty slice keeps the header and drops all rows)
df[:0]


Out[2]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity

In [3]:
# A better way to list the features (column labels):
df.columns


Out[3]:
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

In [5]:
# Number of rows and columns, respectively
print(df.shape)


(20640, 10)

In [6]:
# Number of rows only
print(df.shape[0])


20640

In [7]:
# Checking the variable type — it is an int!
type(df.shape[0])


Out[7]:
int

In [8]:
# And here the number of columns
print(df.shape[1])


10

In [9]:
type(df.shape[1])


Out[9]:
int

In [10]:
# Getting the attributes (features)
print(df.keys())


Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [11]:
print(df.dtypes) # para obter os tipos de todos os atributos


longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [12]:
# Show only the first 10 rows
df.head(10)


Out[12]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
5 -122.25 37.85 52.0 919.0 213.0 413.0 193.0 4.0368 269700.0 NEAR BAY
6 -122.25 37.84 52.0 2535.0 489.0 1094.0 514.0 3.6591 299200.0 NEAR BAY
7 -122.25 37.84 52.0 3104.0 687.0 1157.0 647.0 3.1200 241400.0 NEAR BAY
8 -122.26 37.84 42.0 2555.0 665.0 1206.0 595.0 2.0804 226700.0 NEAR BAY
9 -122.25 37.84 52.0 3549.0 707.0 1551.0 714.0 3.6912 261100.0 NEAR BAY

In [13]:
# Show the last 10 rows
df.tail(10)


Out[13]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
20630 -121.32 39.29 11.0 2640.0 505.0 1257.0 445.0 3.5673 112000.0 INLAND
20631 -121.40 39.33 15.0 2655.0 493.0 1200.0 432.0 3.5179 107200.0 INLAND
20632 -121.45 39.26 15.0 2319.0 416.0 1047.0 385.0 3.1250 115600.0 INLAND
20633 -121.53 39.19 27.0 2080.0 412.0 1082.0 382.0 2.5495 98300.0 INLAND
20634 -121.56 39.27 28.0 2332.0 395.0 1041.0 344.0 3.7125 116800.0 INLAND
20635 -121.09 39.48 25.0 1665.0 374.0 845.0 330.0 1.5603 78100.0 INLAND
20636 -121.21 39.49 18.0 697.0 150.0 356.0 114.0 2.5568 77100.0 INLAND
20637 -121.22 39.43 17.0 2254.0 485.0 1007.0 433.0 1.7000 92300.0 INLAND
20638 -121.32 39.43 18.0 1860.0 409.0 741.0 349.0 1.8672 84700.0 INLAND
20639 -121.24 39.37 16.0 2785.0 616.0 1387.0 530.0 2.3886 89400.0 INLAND

acessando a primeira linha


In [14]:
df.iloc[0]


Out[14]:
longitude              -122.23
latitude                 37.88
housing_median_age          41
total_rooms                880
total_bedrooms             129
population                 322
households                 126
median_income           8.3252
median_house_value      452600
ocean_proximity       NEAR BAY
Name: 0, dtype: object

In [15]:
# Accessing the 'longitude' column (attribute-style access)
df.longitude


Out[15]:
0       -122.23
1       -122.22
2       -122.24
3       -122.25
4       -122.25
5       -122.25
6       -122.25
7       -122.25
8       -122.26
9       -122.25
10      -122.26
11      -122.26
12      -122.26
13      -122.26
14      -122.26
15      -122.26
16      -122.27
17      -122.27
18      -122.26
19      -122.27
20      -122.27
21      -122.27
22      -122.27
23      -122.27
24      -122.27
25      -122.28
26      -122.28
27      -122.28
28      -122.28
29      -122.28
          ...  
20610   -121.56
20611   -121.55
20612   -121.56
20613   -121.55
20614   -121.54
20615   -121.54
20616   -121.53
20617   -121.53
20618   -121.55
20619   -121.56
20620   -121.48
20621   -121.47
20622   -121.44
20623   -121.37
20624   -121.41
20625   -121.52
20626   -121.43
20627   -121.32
20628   -121.48
20629   -121.39
20630   -121.32
20631   -121.40
20632   -121.45
20633   -121.53
20634   -121.56
20635   -121.09
20636   -121.21
20637   -121.22
20638   -121.32
20639   -121.24
Name: longitude, Length: 20640, dtype: float64

In [16]:
# The longitude value of the first row
df.iloc[0].longitude


Out[16]:
-122.23

In [17]:
# Only the median house price column
df.median_house_value


Out[17]:
0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
5        269700.0
6        299200.0
7        241400.0
8        226700.0
9        261100.0
10       281500.0
11       241800.0
12       213500.0
13       191300.0
14       159200.0
15       140000.0
16       152500.0
17       155500.0
18       158700.0
19       162900.0
20       147500.0
21       159800.0
22       113900.0
23        99700.0
24       132600.0
25       107500.0
26        93800.0
27       105500.0
28       108900.0
29       132000.0
           ...   
20610     45500.0
20611     47000.0
20612     48300.0
20613     53400.0
20614     58000.0
20615     57500.0
20616     55100.0
20617     70800.0
20618     63400.0
20619     99100.0
20620    100000.0
20621     77500.0
20622     67000.0
20623     65500.0
20624     87200.0
20625     72000.0
20626     93800.0
20627    162500.0
20628     92400.0
20629    108300.0
20630    112000.0
20631    107200.0
20632    115600.0
20633     98300.0
20634    116800.0
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64

In [18]:
df.iloc[0].median_house_value # o primeiro valor do preço


Out[18]:
452600.0

In [19]:
# Brute-force search for the largest median house value
# (didactic; equivalent to df['median_house_value'].max())

rows = df.shape[0]  # row count, reused by the following cells
prices = df['median_house_value']

higher = prices.iloc[0]  # assume the first price is the maximum so far
for price in prices:
    if price > higher:
        higher = price

higher


Out[19]:
500001.0

In [20]:
# Brute-force search for the smallest median house value
# (didactic; equivalent to df['median_house_value'].min())

lower = df['median_house_value'].iloc[0]  # assume the first price is the minimum so far

for price in df['median_house_value']:
    if price < lower:
        lower = price

lower


Out[20]:
14999.0

In [21]:
# Brute-force mean of the median house values
# (didactic; equivalent to df['median_house_value'].mean())

total = 0
for price in df['median_house_value']:
    total += price

mean = total / rows  # `rows` was set in the max-value cell above

mean


Out[21]:
206855.81690891474

In [22]:
format(mean, '.2f')


Out[22]:
'206855.82'

In [23]:
# The practical way: use the pandas Series aggregations directly
# (np.max / np.min / np.mean on a Series delegate to these anyway)

print('Max value of median_house_value:' , format(df.median_house_value.max(), '.2f'))
print('Min value of median_house_value:' , format(df.median_house_value.min(), '.2f'))
print('Mean value of median_house_value:' , format(df.median_house_value.mean(), '.2f'))


Max value of median_house_value: 500001.00
Min value of median_house_value: 14999.00
Mean value of median_house_value: 206855.82

In [24]:
# Counts of the distinct 'ocean_proximity' categories

df['ocean_proximity'].value_counts()


Out[24]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [25]:
# Pandas describe() shows a summary of the data: count, mean,
# standard deviation (std), minimum (min), 25th/50th/75th percentiles, maximum (max)
df.describe()


Out[25]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000

In [26]:
# Changing the display precision to two decimal places.
# NOTE: the bare 'precision' alias was deprecated and later removed from pandas;
# the fully qualified option name 'display.precision' works across versions.
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html#pandas.set_option
pd.set_option('display.precision', 2)
df.describe()


Out[26]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.00 20640.00 20640.00 20640.00 20433.00 20640.00 20640.00 20640.00 20640.00
mean -119.57 35.63 28.64 2635.76 537.87 1425.48 499.54 3.87 206855.82
std 2.00 2.14 12.59 2181.62 421.39 1132.46 382.33 1.90 115395.62
min -124.35 32.54 1.00 2.00 1.00 3.00 1.00 0.50 14999.00
25% -121.80 33.93 18.00 1447.75 296.00 787.00 280.00 2.56 119600.00
50% -118.49 34.26 29.00 2127.00 435.00 1166.00 409.00 3.53 179700.00
75% -118.01 37.71 37.00 3148.00 647.00 1725.00 605.00 4.74 264725.00
max -114.31 41.95 52.00 39320.00 6445.00 35682.00 6082.00 15.00 500001.00

In [27]:
# Show histograms
import matplotlib.pyplot as plt # https://matplotlib.org/api/pyplot_api.html

df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1) 
# https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hist
plt.show()

# sharex, sharey, xlabelsize and ylabelsize are only there to lay the plots out better...



In [28]:
# Another way, with more detail: 

%matplotlib inline
df.hist(bins=50, figsize=(20,15))
# save_fig("attribute_histogram_plots")
plt.show()



In [29]:
df['median_house_value'].hist()


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f47c784d400>

In [30]:
# Histogram of median_house_value

#hist(x, bins=None, range=None, normed=False, weights=None, cumulative=False, bottom=None, 
# histtype='bar', align='mid', orientation='vertical', rwidth=None, log=False, color=None, 
# label=None, stacked=False, hold=None, data=None, **kwargs)

df.hist('median_house_value', bins=50, figsize=(20,15)) 
plt.show()



In [31]:
# Showing the quartiles with a box plot:

# boxplot(x, notch=None, sym=None, vert=None, whis=None, positions=None, widths=None, patch_artist=None, 
# bootstrap=None, usermedians=None, conf_intervals=None, meanline=None, showmeans=None, showcaps=None, 
# showbox=None, showfliers=None, boxprops=None, labels=None, flierprops=None, medianprops=None, 
# meanprops=None, capprops=None, whiskerprops=None, manage_xticks=True, autorange=False, zorder=None, 
# hold=None, data=None)
df.boxplot("median_house_value", patch_artist=True)


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f47c7715f60>

In [32]:
# Histogram of median_house_value with mean and quartile markers.
# np.percentile is called with a scalar q so it returns a scalar — axvline
# expects a scalar x; passing [q] (as before) returns a 1-element array.
df.hist('median_house_value', bins=50, figsize=(20,15)) 
plt.axvline(df['median_house_value'].mean(), color='g', linestyle='dashed', linewidth=2) # mean, in green
plt.axvline(np.percentile(df['median_house_value'], 25), color='b', linestyle='dashed', linewidth=2) # 1st quartile, in blue
plt.axvline(np.percentile(df['median_house_value'], 50), color='r', linestyle='dashed', linewidth=2) # 2nd quartile (median), in red
plt.axvline(np.percentile(df['median_house_value'], 75), color='m', linestyle='dashed', linewidth=2) # 3rd quartile, in magenta
plt.show()



In [33]:
# Count the observations in the first quartile.
# The threshold is computed from the data instead of hard-coding 119600.00
# (the 25th percentile reported by df.describe()).
q1_limit = df['median_house_value'].quantile(0.25)

q1 = 0
for price in df['median_house_value']:
    if price <= q1_limit:
        q1 += 1

q1


Out[33]:
5162

In [34]:
# Count the observations in the second quartile (25th–50th percentile).
# Thresholds computed from the data instead of hard-coded 119600.00 / 179700.00.
q1_limit = df['median_house_value'].quantile(0.25)
q2_limit = df['median_house_value'].quantile(0.50)

q2 = 0
for price in df['median_house_value']:
    if q1_limit < price <= q2_limit:
        q2 += 1

q2


Out[34]:
5161

In [35]:
# Count the observations in the third quartile (50th–75th percentile).
# Thresholds computed from the data instead of hard-coded 179700.00 / 264725.00.
q2_limit = df['median_house_value'].quantile(0.50)
q3_limit = df['median_house_value'].quantile(0.75)

q3 = 0
for price in df['median_house_value']:
    if q2_limit < price <= q3_limit:
        q3 += 1

q3


Out[35]:
5157

In [36]:
# Count the observations in the fourth quartile (above the 75th percentile).
# Threshold computed from the data instead of hard-coded 264725.00.
q3_limit = df['median_house_value'].quantile(0.75)

q4 = 0
for price in df['median_house_value']:
    if price > q3_limit:
        q4 += 1

q4


Out[36]:
5160

In [37]:
q1+q2+q3+q4 == len(df['median_house_value'])


Out[37]:
True

In [38]:
# Show the density plots

# https://pandas.pydata.org/pandas-docs/version/0.18.1/generated/pandas.DataFrame.plot.html
df.plot(kind='density', subplots=True, layout=(3, 3), sharex=False, sharey=False, legend=False, fontsize=1)
plt.show()

# Legends removed so the plots remain readable.



In [39]:
# ...otherwise: 

df.plot(kind='density', subplots=True, layout=(3, 3), sharex=False, sharey=False, legend=True, fontsize=1)
plt.show()

# Note, however, that the panels are not in the same order as the histograms!


E agora uma matriz de correlação:


In [40]:
# Correlation-matrix heatmap with matplotlib.
corr = df.corr()  # numeric columns only — ocean_proximity is excluded

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(corr, vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
# One tick per column of the correlation matrix. The original used
# np.arange(0, 14) with df.keys() as labels, which mislabels the axes:
# df.corr() drops the non-numeric ocean_proximity column, so the matrix has
# only len(corr.columns) rows/cols — not 10, and certainly not 14.
ticks = np.arange(len(corr.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(corr.columns, rotation = 'vertical') # https://matplotlib.org/devdocs/api/_as_gen/matplotlib.axes.Axes.set_yticklabels.html#matplotlib-axes-axes-set-yticklabels
ax.set_yticklabels(corr.columns) 
plt.show()



In [41]:
# Another way to draw the correlation matrix, using seaborn
import seaborn as sns
import matplotlib.pyplot as plt  # already imported above; harmless re-import

sns.heatmap(df.corr())
plt.show()


Aqui uns testes para aplicar nos dados das casas. Verificando a função train_test_split, se ela divide aleatoriamente os dados de treinamento e teste.


In [42]:
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

# Checking whether the data is split randomly.

from sklearn.model_selection import train_test_split

# Toy data: 5 samples with 2 features, plus 5 labels
X, y = np.arange(10).reshape((5, 2)), range(5)

In [43]:
X


Out[43]:
array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [44]:
list(y)


Out[44]:
[0, 1, 2, 3, 4]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Por padrão, pega os dados aleatoriamente.

Sobre o random_state: Can be any integer between 0 and 2**32 - 1 inclusive, an array (or other sequence) of such integers, or None (the default). If the seed is None, RandomState will try to read data from /dev/urandom (or the Windows analogue) if available, or seed from the clock otherwise.

https://stackoverflow.com/questions/3674409/numpy-how-to-split-partition-a-dataset-array-into-training-and-test-datasets

data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.20, random_state=42)


In [46]:
X_train


Out[46]:
array([[4, 5],
       [0, 1],
       [6, 7]])

In [47]:
X_test


Out[47]:
array([[2, 3],
       [8, 9]])

In [48]:
y_train


Out[48]:
[2, 0, 3]

In [49]:
y_test


Out[49]:
[1, 4]

In [50]:
# Now testing with a single data set X only: 

X = np.arange(10).reshape((5, 2))
X_train, X_test = train_test_split(X, test_size=0.33, random_state=42)

In [51]:
X_train


Out[51]:
array([[4, 5],
       [0, 1],
       [6, 7]])

In [52]:
X_test


Out[52]:
array([[2, 3],
       [8, 9]])

It works!


In [53]:
from sklearn.model_selection import train_test_split  # re-import; already available

# 70/30 split of the housing data, seeded for reproducibility
train_set, test_set = train_test_split(df, test_size=0.3, random_state=42)

In [54]:
test_set.head()


Out[54]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
20046 -119.01 36.06 25.0 1505.0 NaN 1392.0 359.0 1.68 47700.0 INLAND
3024 -119.46 35.14 30.0 2943.0 NaN 1565.0 584.0 2.53 45800.0 INLAND
15663 -122.44 37.80 52.0 3830.0 NaN 1310.0 963.0 3.48 500001.0 NEAR BAY
20484 -118.72 34.28 17.0 3051.0 NaN 1705.0 495.0 5.74 218600.0 <1H OCEAN
9814 -121.93 36.62 34.0 2351.0 NaN 1063.0 428.0 3.73 278000.0 NEAR OCEAN

In [55]:
test_set.tail()


Out[55]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
17505 -121.90 37.36 47.0 1007.0 245.0 581.0 240.0 2.95 237500.0 <1H OCEAN
13512 -117.32 34.11 41.0 1229.0 302.0 994.0 270.0 1.49 67300.0 INLAND
10842 -117.91 33.67 16.0 7961.0 2276.0 5014.0 2116.0 3.51 218400.0 <1H OCEAN
16559 -121.28 37.82 10.0 9205.0 1774.0 5935.0 1673.0 3.65 119400.0 INLAND
5786 -118.24 34.15 17.0 5282.0 1605.0 4116.0 1574.0 3.05 209800.0 <1H OCEAN

In [56]:
df["median_income"].hist()


Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f47bcd2add8>

In [57]:
# Divide by 1.5 to limit the number of income categories
df["income_cat"] = np.ceil(df["median_income"] / 1.5)

# Cap them into categories 1 to 5: where the condition (< 5) fails, replace with 5.0
df["income_cat"].where(df["income_cat"] < 5, 5.0, inplace=True)

In [58]:
df["income_cat"].value_counts()


Out[58]:
3.0    7236
2.0    6581
4.0    3639
5.0    2362
1.0     822
Name: income_cat, dtype: int64

In [59]:
df["income_cat"].hist()


Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f47bcc6b9b0>

In [60]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split that preserves the income_cat proportions
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df["income_cat"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

In [61]:
strat_test_set["income_cat"].value_counts() / len(strat_test_set)


Out[61]:
3.0    0.35
2.0    0.32
4.0    0.18
5.0    0.11
1.0    0.04
Name: income_cat, dtype: float64

In [62]:
df["income_cat"].value_counts() / len(df)


Out[62]:
3.0    0.35
2.0    0.32
4.0    0.18
5.0    0.11
1.0    0.04
Name: income_cat, dtype: float64

In [63]:
# Drop the helper income_cat column so both splits are back to the original schema
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [64]:
df = strat_train_set.copy()

Visualização dos dados


In [65]:
# NOTE(review): this copy is redundant — the previous cell already did df = strat_train_set.copy()
df = strat_train_set.copy()

df.plot(kind="scatter", x="longitude", y="latitude")


Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f47c7572550>

In [66]:
df.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)


Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f47c7f1a8d0>

In [67]:
# Scatter plot: circle size = population, color = median house value
df.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
    s=df["population"]/100, label="population", figsize=(10,7),
    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
    sharex=False)
plt.legend()


Out[67]:
<matplotlib.legend.Legend at 0x7f47c77bbd68>

In [68]:
import matplotlib.image as mpimg
# Overlay the scatter plot on a picture of California
# (expects pics/california_pic.png relative to the notebook)
california_img=mpimg.imread('pics/california_pic.png')
ax = df.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=df["population"]/100, label="population", figsize=(10,7),
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)

# extent maps the image onto the longitude/latitude bounds of California
plt.imshow(california_img, extent=[-124.55, -113.8, 32.45, 42.05], alpha=0.5)
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

plt.legend(fontsize=16)
plt.show()


Matriz de correlação

Correlation coefficient: A statistic used to show how the scores from one measure relate to scores on a second measure for the same group of individuals. A high value (approaching +1.00) is a strong direct relationship, a low negative value (approaching -1.00) is a strong inverse relationship, and values near 0.00 indicate little, if any, relationship.

http://www.ncme.org/ncme/NCME/Resource_Center/Glossary/NCME/Resource_Center/Glossary1.aspx?hkey=4bb87415-44dc-4088-9ed9-e8515326a061#anchorC


In [69]:
# Standard (Pearson) correlation coefficient
corr_matrix = df.corr()

# Signature: df.corr(method='pearson', min_periods=1)
# Docstring: Compute pairwise correlation of columns, excluding NA/null values

In [70]:
corr_matrix["median_house_value"].sort_values(ascending=False)


Out[70]:
median_house_value    1.00
median_income         0.69
total_rooms           0.14
housing_median_age    0.11
households            0.06
total_bedrooms        0.05
population           -0.03
longitude            -0.05
latitude             -0.14
Name: median_house_value, dtype: float64

Os três atributos mais relevantes para median_house_value são median_income (mediana da renda), total_rooms (total de cômodos) e housing_median_age (mediana da idade das casas)

Outra forma de verificar a correlação, plotando pelo Pandas cada atributo numérico contra todos os outros atributos numéricos.

Para otimizar e evitar gráficos desnecessários, foram escolhidos os atributos mais importantes.


In [71]:
from pandas.plotting import scatter_matrix

# Plot only the most promising attributes against each other (a matrix of all
# numeric columns would produce far too many panels to read).
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(df[attributes], figsize=(12, 8))


Out[71]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f47c7a0d438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47c7e3fe48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47c79ca668>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47bcc4d9e8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f47c7a2bb70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47c7a2bba8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47bc36d0f0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47bc3608d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f47bc2b7a20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47bc274a20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47bc204c18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47bc216518>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f47bc152c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47bc0b6438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47bc09fa90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f47aefd9e10>]], dtype=object)

Por estes gráficos, vemos que o mais promissor para median_house_value é o median_income (gráfico crescente e com algum padrão/tendência). Vendo-os mais próximos:


In [72]:
# Zoom in on the most promising attribute: median_income vs. the target.
df.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1)
# Fixed axes make the horizontal value-cap lines (e.g. at 500k) visible.
plt.axis([0, 16, 0, 550000])


Out[72]:
[0, 16, 0, 550000]

Há poucos pontos que ficam dispersos. E há duas curiosas retas horizontais, na linha dos 500k e por volta dos 350k.


In [74]:
len(df["median_house_value"])


Out[74]:
16512

In [75]:
len(df)


Out[75]:
16512

Criando alguns novos atributos para tentar combinações dos atributos. Serão criados 3: a quantidade de cômodos por família, o total de quartos por total de cômodos e a população por família:


In [99]:
# Experiment with attribute combinations: three derived ratio features.
# Each new column is the elementwise quotient of two existing columns —
# the same result as assigning each ratio individually.
_ratio_features = {
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
    "population_per_household": ("population", "households"),
}
for _new_col, (_numerator, _denominator) in _ratio_features.items():
    df[_new_col] = df[_numerator] / df[_denominator]

In [100]:
# Recompute the correlations now that the three ratio features exist.
corr_matrix = df.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)


Out[100]:
median_house_value          1.00
median_income               0.69
rooms_per_household         0.15
total_rooms                 0.14
housing_median_age          0.11
households                  0.06
total_bedrooms              0.05
population_per_household   -0.02
population                 -0.03
longitude                  -0.05
latitude                   -0.14
bedrooms_per_room          -0.26
Name: median_house_value, dtype: float64

Vemos que o novo atributo rooms_per_household ficou entre os 3 melhores. Como era de se esperar, as casas maiores são mais caras.


In [101]:
# The new rooms_per_household feature against the target.
df.plot(kind="scatter", x="rooms_per_household", y="median_house_value",
             alpha=0.2)
# Clip the x-axis: a few districts have extreme room ratios (max ~141).
plt.axis([0, 5, 0, 520000])
plt.show()



In [102]:
df.describe()


Out[102]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value rooms_per_household bedrooms_per_room population_per_household
count 16512.00 16512.00 16512.00 16512.00 16354.00 16512.00 16512.00 16512.00 16512.00 16512.00 16354.00 16512.00
mean -119.58 35.64 28.65 2622.73 534.97 1419.79 497.06 3.88 206990.92 5.44 0.21 3.10
std 2.00 2.14 12.57 2138.46 412.70 1115.69 375.72 1.90 115703.01 2.61 0.06 11.58
min -124.35 32.54 1.00 6.00 2.00 3.00 2.00 0.50 14999.00 1.13 0.10 0.69
25% -121.80 33.94 18.00 1443.00 295.00 784.00 279.00 2.57 119800.00 4.44 0.18 2.43
50% -118.51 34.26 29.00 2119.50 433.00 1164.00 408.00 3.54 179500.00 5.23 0.20 2.82
75% -118.01 37.72 37.00 3141.00 644.00 1719.25 602.00 4.74 263900.00 6.06 0.24 3.28
max -114.31 41.95 52.00 39320.00 6210.00 35682.00 5358.00 15.00 500001.00 141.91 1.00 1243.33

Preparando os dados para os algoritmos de Aprendizado de Máquina


In [103]:
# Split into features and labels, respectively.
# NOTE(review): strat_train_set comes from an earlier, stratified train/test
# split (not visible here). Rebinding the name `df` replaces the full dataset
# explored above with the label-less TRAINING set for every later cell.

df = strat_train_set.drop("median_house_value", axis=1) # training set without the labels
# drop() returns a copy; strat_train_set itself is not modified

df_labels = strat_train_set["median_house_value"].copy()

Sobre o axis:


In [104]:
df


Out[104]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity
17606 -121.89 37.29 38.0 1568.0 351.0 710.0 339.0 2.70 <1H OCEAN
18632 -121.93 37.05 14.0 679.0 108.0 306.0 113.0 6.42 <1H OCEAN
14650 -117.20 32.77 31.0 1952.0 471.0 936.0 462.0 2.86 NEAR OCEAN
3230 -119.61 36.31 25.0 1847.0 371.0 1460.0 353.0 1.88 INLAND
3555 -118.59 34.23 17.0 6592.0 1525.0 4459.0 1463.0 3.03 <1H OCEAN
19480 -120.97 37.66 24.0 2930.0 588.0 1448.0 570.0 3.54 INLAND
8879 -118.50 34.04 52.0 2233.0 317.0 769.0 277.0 8.38 <1H OCEAN
13685 -117.24 34.15 26.0 2041.0 293.0 936.0 375.0 6.00 INLAND
4937 -118.26 33.99 47.0 1865.0 465.0 1916.0 438.0 1.82 <1H OCEAN
4861 -118.28 34.02 29.0 515.0 229.0 2690.0 217.0 0.50 <1H OCEAN
16365 -121.31 38.02 24.0 4157.0 951.0 2734.0 879.0 2.80 INLAND
19684 -121.62 39.14 41.0 2183.0 559.0 1202.0 506.0 1.69 INLAND
19234 -122.69 38.51 18.0 3364.0 501.0 1442.0 506.0 6.69 <1H OCEAN
13956 -117.06 34.17 21.0 2520.0 582.0 416.0 151.0 2.71 INLAND
2390 -119.46 36.91 12.0 2980.0 495.0 1184.0 429.0 3.91 INLAND
11176 -117.96 33.83 30.0 2838.0 649.0 1758.0 593.0 3.38 <1H OCEAN
15614 -122.41 37.81 25.0 1178.0 545.0 592.0 441.0 3.67 NEAR BAY
2953 -119.02 35.35 42.0 1239.0 251.0 776.0 272.0 1.98 INLAND
13209 -117.72 34.05 8.0 1841.0 409.0 1243.0 394.0 4.06 INLAND
6569 -118.15 34.20 46.0 1505.0 261.0 857.0 269.0 4.50 INLAND
5825 -118.30 34.19 14.0 3615.0 913.0 1924.0 852.0 3.51 <1H OCEAN
18086 -122.05 37.31 25.0 4111.0 538.0 1585.0 568.0 9.23 <1H OCEAN
16718 -120.66 35.49 17.0 4422.0 945.0 2307.0 885.0 2.83 <1H OCEAN
13600 -117.25 34.16 37.0 1709.0 278.0 744.0 274.0 3.72 INLAND
13989 -117.19 34.94 31.0 2034.0 444.0 1097.0 367.0 2.15 INLAND
15168 -117.06 33.02 24.0 830.0 190.0 279.0 196.0 1.92 <1H OCEAN
6747 -118.07 34.11 41.0 2869.0 563.0 1627.0 533.0 5.07 <1H OCEAN
7398 -118.24 33.96 44.0 1338.0 366.0 1765.0 388.0 1.78 <1H OCEAN
5562 -118.28 33.91 41.0 620.0 133.0 642.0 162.0 2.65 <1H OCEAN
16121 -122.46 37.79 52.0 2059.0 416.0 999.0 402.0 3.74 NEAR BAY
... ... ... ... ... ... ... ... ... ...
12380 -116.47 33.77 26.0 4300.0 767.0 1557.0 669.0 4.41 INLAND
5618 -118.23 33.78 20.0 59.0 24.0 69.0 23.0 2.56 NEAR OCEAN
10060 -121.06 39.25 17.0 3127.0 539.0 1390.0 520.0 3.95 INLAND
18067 -122.03 37.29 22.0 3118.0 438.0 1147.0 425.0 10.37 <1H OCEAN
4471 -118.17 34.09 33.0 2907.0 797.0 3212.0 793.0 2.23 <1H OCEAN
19786 -122.86 40.56 12.0 1350.0 300.0 423.0 172.0 1.74 INLAND
9969 -122.48 38.51 49.0 1977.0 393.0 741.0 339.0 3.13 INLAND
14621 -117.17 32.78 17.0 3845.0 1051.0 3102.0 944.0 2.37 NEAR OCEAN
579 -122.07 37.71 40.0 1808.0 302.0 746.0 270.0 5.30 NEAR BAY
11682 -118.01 33.87 25.0 6348.0 1615.0 4188.0 1497.0 3.14 <1H OCEAN
245 -122.21 37.78 43.0 1702.0 460.0 1227.0 407.0 1.72 NEAR BAY
12130 -117.23 33.94 8.0 2405.0 537.0 1594.0 517.0 3.08 INLAND
16441 -121.29 38.14 34.0 2770.0 544.0 1409.0 535.0 3.23 INLAND
11016 -117.82 33.76 33.0 2774.0 428.0 1229.0 407.0 6.29 <1H OCEAN
19934 -119.34 36.31 14.0 1635.0 422.0 870.0 399.0 2.70 INLAND
1364 -122.14 38.03 42.0 118.0 34.0 54.0 30.0 2.58 NEAR BAY
1236 -120.37 38.23 13.0 4401.0 829.0 924.0 383.0 2.69 INLAND
5364 -118.42 34.04 52.0 1358.0 272.0 574.0 267.0 5.65 <1H OCEAN
11703 -117.97 33.88 16.0 2003.0 300.0 1172.0 318.0 6.04 <1H OCEAN
10356 -117.67 33.60 25.0 3164.0 449.0 1517.0 453.0 6.79 <1H OCEAN
15270 -117.29 33.08 18.0 3225.0 515.0 1463.0 476.0 5.78 NEAR OCEAN
3754 -118.37 34.18 36.0 1608.0 373.0 1217.0 374.0 2.97 <1H OCEAN
12166 -117.14 33.81 13.0 4496.0 756.0 2044.0 695.0 3.28 INLAND
6003 -117.77 34.08 27.0 5929.0 932.0 2817.0 828.0 6.04 INLAND
7364 -118.20 33.97 43.0 825.0 212.0 820.0 184.0 1.89 <1H OCEAN
6563 -118.13 34.20 46.0 1271.0 236.0 573.0 210.0 4.93 INLAND
12053 -117.56 33.88 40.0 1196.0 294.0 1052.0 258.0 2.07 INLAND
13908 -116.40 34.09 9.0 4855.0 872.0 2098.0 765.0 3.27 INLAND
11159 -118.01 33.82 31.0 1960.0 380.0 1356.0 356.0 4.06 <1H OCEAN
15775 -122.45 37.77 52.0 3095.0 682.0 1269.0 639.0 3.58 NEAR BAY

16512 rows × 9 columns


In [105]:
df_labels #median_house_value


Out[105]:
17606    286600.0
18632    340600.0
14650    196900.0
3230      46300.0
3555     254500.0
19480    127900.0
8879     500001.0
13685    140200.0
4937      95000.0
4861     500001.0
16365     92100.0
19684     61500.0
19234    313000.0
13956     89000.0
2390     123900.0
11176    197400.0
15614    500001.0
2953      63300.0
13209    107000.0
6569     184200.0
5825     280900.0
18086    500001.0
16718    171300.0
13600    116600.0
13989     60800.0
15168    121100.0
6747     270700.0
7398     109900.0
5562     159600.0
16121    500001.0
           ...   
12380    122500.0
5618     350000.0
10060    172800.0
18067    500001.0
4471     146600.0
19786     81300.0
9969     247600.0
14621    164100.0
579      254900.0
11682    185700.0
245      126800.0
12130    114200.0
16441    101800.0
11016    265600.0
19934     88900.0
1364     225000.0
1236     123500.0
5364     500001.0
11703    321600.0
10356    266000.0
15270    346700.0
3754     190200.0
12166    148800.0
6003     214800.0
7364     174300.0
6563     240200.0
12053    113000.0
13908     97800.0
11159    225900.0
15775    500001.0
Name: median_house_value, Length: 16512, dtype: float64

In [106]:
# Check for NaN (not-a-number) values: these 5 rows have missing total_bedrooms.
sample_incomplete_rows = df[df.isnull().any(axis=1)].head()
sample_incomplete_rows


Out[106]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity
4629 -118.30 34.07 18.0 3759.0 NaN 3296.0 1462.0 2.27 <1H OCEAN
6068 -117.86 34.01 16.0 4632.0 NaN 3038.0 727.0 5.18 <1H OCEAN
17923 -121.97 37.35 30.0 1955.0 NaN 999.0 386.0 4.63 <1H OCEAN
13656 -117.30 34.05 6.0 2155.0 NaN 1039.0 391.0 1.67 INLAND
19252 -122.79 38.48 7.0 6837.0 NaN 3468.0 1405.0 3.17 <1H OCEAN

In [107]:
# Three options for handling the missing (NaN) values in total_bedrooms.

# Option 1 - drop the rows whose total_bedrooms is NaN
# (result not assigned: shown for illustration only).
sample_incomplete_rows.dropna(subset=["total_bedrooms"])
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html
# Return object with labels on given axis omitted where alternately any or all of the data are missing

# Option 2 - drop the whole column (also not kept).
sample_incomplete_rows.drop("total_bedrooms", axis=1)
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html
# axis=0 applies down the rows of each column (the default)
# axis=1 applies across the columns of each row (drops a column)

# Option 3 (used here) - fill the holes with the median value.
# The median is computed on the full training set, not just these 5 rows.
median = df["total_bedrooms"].median()
# Fix: assign the filled column back instead of fillna(..., inplace=True) on a
# column selection. That form is chained assignment: it raises
# SettingWithCopyWarning and, under pandas >= 2.0 copy-on-write, silently
# stops modifying sample_incomplete_rows at all.
sample_incomplete_rows["total_bedrooms"] = sample_incomplete_rows["total_bedrooms"].fillna(median)
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html

In [108]:
sample_incomplete_rows


Out[108]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity
4629 -118.30 34.07 18.0 3759.0 433.0 3296.0 1462.0 2.27 <1H OCEAN
6068 -117.86 34.01 16.0 4632.0 433.0 3038.0 727.0 5.18 <1H OCEAN
17923 -121.97 37.35 30.0 1955.0 433.0 999.0 386.0 4.63 <1H OCEAN
13656 -117.30 34.05 6.0 2155.0 433.0 1039.0 391.0 1.67 INLAND
19252 -122.79 38.48 7.0 6837.0 433.0 3468.0 1405.0 3.17 <1H OCEAN

In [110]:
# Imputation transformer for completing missing values.
# Fix: sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22. Its replacement, sklearn.impute.SimpleImputer, provides the
# same fit/transform/statistics_ API used below, so alias it under the old
# name to keep this and the later pipeline cells working on any version.
try:
    from sklearn.impute import SimpleImputer as Imputer  # scikit-learn >= 0.20
except ImportError:
    from sklearn.preprocessing import Imputer  # scikit-learn < 0.20

# Learn the per-column median of every numeric attribute (the text attribute
# is removed in the next cell, since medians need numeric data).
imputer = Imputer(strategy="median")

In [111]:
# Copy the data without the text attribute ocean_proximity
# (statistical computations below require numeric columns only).
housing_num = df.drop("ocean_proximity", axis=1)

In [112]:
# Fit the imputer: learn the median of each numeric column.
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html#sklearn.preprocessing.Imputer.fit
# __init__(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
imputer.fit(housing_num)


Out[112]:
Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

O imputer computou a mediana de cada atributo (pois foi configurado com strategy="median") e armazenou em statistics_.


In [113]:
# statistics_ : array of shape (n_features,)
# The imputation fill value (here the median) learned for each feature.
imputer.statistics_


Out[113]:
array([ -118.51  ,    34.26  ,    29.    ,  2119.5   ,   433.    ,
        1164.    ,   408.    ,     3.5409])

In [114]:
# Cross-check: the learned statistics match the columns' actual medians.

housing_num.median().values


Out[114]:
array([ -118.51  ,    34.26  ,    29.    ,  2119.5   ,   433.    ,
        1164.    ,   408.    ,     3.5409])

In [115]:
# Transform the training set, replacing missing values with the learned medians.

X = imputer.transform(housing_num)

In [116]:
# transform() returned a plain NumPy array of the transformed features; wrap
# it back into a pandas DataFrame, reusing the column names and row index.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index = (df.index.values))

In [117]:
# List the previously incomplete rows: total_bedrooms is now 433.0 (the median).
housing_tr.loc[sample_incomplete_rows.index.values]


Out[117]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income
4629 -118.30 34.07 18.0 3759.0 433.0 3296.0 1462.0 2.27
6068 -117.86 34.01 16.0 4632.0 433.0 3038.0 727.0 5.18
17923 -121.97 37.35 30.0 1955.0 433.0 999.0 386.0 4.63
13656 -117.30 34.05 6.0 2155.0 433.0 1039.0 391.0 1.67
19252 -122.79 38.48 7.0 6837.0 433.0 3468.0 1405.0 3.17

In [118]:
imputer.strategy


Out[118]:
'median'

In [119]:
# NOTE(review): this rebuilds housing_tr WITHOUT the index argument, so the
# row labels reset to 0..n-1 (the earlier version preserved df's index).
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr.head()


Out[119]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income
0 -121.89 37.29 38.0 1568.0 351.0 710.0 339.0 2.70
1 -121.93 37.05 14.0 679.0 108.0 306.0 113.0 6.42
2 -117.20 32.77 31.0 1952.0 471.0 936.0 462.0 2.86
3 -119.61 36.31 25.0 1847.0 371.0 1460.0 353.0 1.88
4 -118.59 34.23 17.0 6592.0 1525.0 4459.0 1463.0 3.03

Logo acima deixamos de lado o ocean_proximity por ser um atributo textual, que impossibilitava o cálculo da mediana.

O sk-learn possui um transformador, o LabelEncoder:


In [120]:
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# Encode labels with value between 0 and n_classes-1.
# NOTE(review): LabelEncoder is designed for target labels, not input
# features; modern scikit-learn offers OrdinalEncoder for this use case.

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
housing_cat = df["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded


Out[120]:
array([0, 0, 4, ..., 1, 0, 3])

In [121]:
print(encoder.classes_)


['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']
  • <1H OCEAN: 0
  • INLAND: 1
  • ISLAND: 2
  • NEAR BAY: 3
  • NEAR OCEAN: 4

Vemos que os mais similares ficaram distantes na numeração, como 0 e 4. Para isso, pode-se criar um atributo binário para cada categoria. Por exemplo, um atributo igual a 1 quando a categoria é <1H OCEAN (e 0 se não for), da mesma forma para os outros atributos.

No sklearn há o OneHotEncoder: o atributo correspondente à categoria presente vale 1 ("quente"), enquanto os outros valem 0 ("frio").


In [122]:
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
# reshape(-1, 1) turns the 1-D integer codes into the 2-D column this
# OneHotEncoder expects; the result is a SciPy sparse matrix, not an array.
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot


Out[122]:
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

Na matriz vemos que cada linha possui um 1 e o resto é zero.


In [123]:
housing_cat_1hot.toarray()


Out[123]:
array([[ 1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       ..., 
       [ 0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.]])

In [124]:
print(housing_cat_1hot) # tuplas com a linha da matriz e a posição do valor 1, respectivamente.


  (0, 0)	1.0
  (1, 0)	1.0
  (2, 4)	1.0
  (3, 1)	1.0
  (4, 0)	1.0
  (5, 1)	1.0
  (6, 0)	1.0
  (7, 1)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 1)	1.0
  (11, 1)	1.0
  (12, 0)	1.0
  (13, 1)	1.0
  (14, 1)	1.0
  (15, 0)	1.0
  (16, 3)	1.0
  (17, 1)	1.0
  (18, 1)	1.0
  (19, 1)	1.0
  (20, 0)	1.0
  (21, 0)	1.0
  (22, 0)	1.0
  (23, 1)	1.0
  (24, 1)	1.0
  :	:
  (16487, 1)	1.0
  (16488, 1)	1.0
  (16489, 4)	1.0
  (16490, 3)	1.0
  (16491, 0)	1.0
  (16492, 3)	1.0
  (16493, 1)	1.0
  (16494, 1)	1.0
  (16495, 0)	1.0
  (16496, 1)	1.0
  (16497, 3)	1.0
  (16498, 1)	1.0
  (16499, 0)	1.0
  (16500, 0)	1.0
  (16501, 0)	1.0
  (16502, 4)	1.0
  (16503, 0)	1.0
  (16504, 1)	1.0
  (16505, 1)	1.0
  (16506, 0)	1.0
  (16507, 1)	1.0
  (16508, 1)	1.0
  (16509, 1)	1.0
  (16510, 0)	1.0
  (16511, 3)	1.0

In [125]:
# LabelBinarizer does in one step what LabelEncoder + OneHotEncoder did above
# (here producing a dense array rather than a sparse matrix).

# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot


Out[125]:
array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ..., 
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

In [126]:
# Custom transformer that adds the combined (ratio) attributes.
from sklearn.base import BaseEstimator, TransformerMixin

# Column indices in the NumPy array built from the DataFrame
# (longitude=0, latitude=1, housing_median_age=2, then these four).
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append rooms_per_household, population_per_household and (optionally)
    bedrooms_per_room as extra columns of a NumPy feature array."""
    def __init__(self, add_bedrooms_per_room = True):
        # Hyperparameter: lets a grid search toggle the extra ratio feature.
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            # np.c_ appends the new ratios as extra columns on the right.
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

# Demonstration: add only the two ratio features to the raw training array.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(df.values)

In [127]:
df.columns


Out[127]:
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')

In [128]:
list(zip(df.columns))


Out[128]:
[('longitude',),
 ('latitude',),
 ('housing_median_age',),
 ('total_rooms',),
 ('total_bedrooms',),
 ('population',),
 ('households',),
 ('median_income',),
 ('ocean_proximity',)]

In [129]:
# Rebuild a DataFrame from the augmented array. The adder was created with
# add_bedrooms_per_room=False, so exactly two columns were appended.
housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(df.columns)+["rooms_per_household", "population_per_household"])

housing_extra_attribs.head()


Out[129]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity rooms_per_household population_per_household
0 -1.2e+02 37 38 1.6e+03 3.5e+02 7.1e+02 3.4e+02 2.7 <1H OCEAN 4.6 2.1
1 -1.2e+02 37 14 6.8e+02 1.1e+02 3.1e+02 1.1e+02 6.4 <1H OCEAN 6 2.7
2 -1.2e+02 33 31 2e+03 4.7e+02 9.4e+02 4.6e+02 2.9 NEAR OCEAN 4.2 2
3 -1.2e+02 36 25 1.8e+03 3.7e+02 1.5e+03 3.5e+02 1.9 INLAND 5.2 4.1
4 -1.2e+02 34 17 6.6e+03 1.5e+03 4.5e+03 1.5e+03 3 <1H OCEAN 4.5 3

In [130]:
# Pipeline to preprocess the numeric attributes:
# impute medians -> add ratio features -> standardize.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
# 0.22; on modern versions use sklearn.impute.SimpleImputer here instead.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [131]:
housing_num_tr


Out[131]:
array([[-1.15604281,  0.77194962,  0.74333089, ..., -0.31205452,
        -0.08649871,  0.15531753],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.21768338,
        -0.03353391, -0.83628902],
       [ 1.18684903, -1.34218285,  0.18664186, ..., -0.46531516,
        -0.09240499,  0.4222004 ],
       ..., 
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.3469342 ,
        -0.03055414, -0.52177644],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.02499488,
         0.06150916, -0.30340741],
       [-1.43579109,  0.99645926,  1.85670895, ..., -0.22852947,
        -0.09586294,  0.10180567]])

In [132]:
# Transformer to select a subset of the columns of a pandas DataFrame.
from sklearn.base import BaseEstimator, TransformerMixin

# Selects either the numeric or the categorical columns and returns them
# as a NumPy array (the representation downstream sklearn steps expect).
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        # List of column names to keep.
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self
    def transform(self, X):
        # .values converts the selected columns to a NumPy array.
        return X[self.attribute_names].values

In [133]:
# Join all components into pipelines that preprocess the numeric and the
# categorical features, respectively.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Fix: Pipeline calls fit_transform(X, y) on every intermediate step, but
# LabelBinarizer.fit_transform() accepts a single argument (it was designed
# for target labels, not input features). Since scikit-learn 0.19, putting a
# bare LabelBinarizer in a Pipeline raises
# "TypeError: fit_transform() takes 2 positional arguments but 3 were given".
# This thin wrapper simply accepts and ignores the extra y argument.
class PipelineFriendlyLabelBinarizer(LabelBinarizer):
    def fit(self, X, y=None):
        return super(PipelineFriendlyLabelBinarizer, self).fit(X)
    def fit_transform(self, X, y=None):
        return super(PipelineFriendlyLabelBinarizer, self).fit_transform(X)

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('label_binarizer', PipelineFriendlyLabelBinarizer()),
    ])

In [134]:
# FeatureUnion runs both pipelines on the same input and concatenates
# their output columns side by side.
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [135]:
# Run the full preprocessing pipeline on the training features:
# 11 scaled numeric columns + 5 one-hot category columns (shape below).
housing_prepared = full_pipeline.fit_transform(df)
housing_prepared


Out[135]:
array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ..., 
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [136]:
housing_prepared.shape


Out[136]:
(16512, 16)

Escolhendo e treinando um modelo


In [137]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, df_labels) # Fit linear model
# From the documentation:
# lin_reg.fit(X, y, sample_weight=None)
# X : numpy array or sparse matrix of shape [n_samples,n_features]
#     Training data
# y : numpy array of shape [n_samples, n_targets]
#     Target values


Out[137]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [138]:
# Try the full pipeline + model on a few training instances.

some_data = df.iloc[:5]
some_labels = df_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data) # Transform some_data separately by each transformer, concatenate results.

print("Predictions:", lin_reg.predict(some_data_prepared))


Predictions: [ 210644.60459286  317768.80697211  210956.43331178   59218.98886849
  189747.55849879]

Comparando com os valores reais:


In [139]:
print("Labels:", list(some_labels))


Labels: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]

In [140]:
some_data_prepared


Out[140]:
array([[-1.15604281,  0.77194962,  0.74333089, -0.49323393, -0.44543821,
        -0.63621141, -0.42069842, -0.61493744, -0.31205452, -0.08649871,
         0.15531753,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , -0.90896655, -1.0369278 ,
        -0.99833135, -1.02222705,  1.33645936,  0.21768338, -0.03353391,
        -0.83628902,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, -0.31365989, -0.15334458,
        -0.43363936, -0.0933178 , -0.5320456 , -0.46531516, -0.09240499,
         0.4222004 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.01706767,  0.31357576, -0.29052016, -0.36276217, -0.39675594,
         0.03604096, -0.38343559, -1.04556555, -0.07966124,  0.08973561,
        -0.19645314,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.49247384, -0.65929936, -0.92673619,  1.85619316,  2.41221109,
         2.72415407,  2.57097492, -0.44143679, -0.35783383, -0.00419445,
         0.2699277 ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])

In [141]:
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html
from sklearn.metrics import mean_squared_error

# Training-set RMSE of the linear model (in dollars of house value).
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(df_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


Out[141]:
68628.198198489219

In [142]:
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html
from sklearn.metrics import mean_absolute_error

# MAE is less sensitive to outliers than RMSE.
lin_mae = mean_absolute_error(df_labels, housing_predictions)
lin_mae


Out[142]:
49439.895990018973

In [143]:
# http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
from sklearn.tree import DecisionTreeRegressor

# random_state fixed for reproducibility.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, df_labels)


Out[143]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=42,
           splitter='best')

In [144]:
# Training-set RMSE of the decision tree.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(df_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse


Out[144]:
0.0

Estranho aqui ter dado zero de erro, é mais provável que o modelo tenha se sobreajustado (overfitting) aos dados.


In [145]:
# MAE is also 0.0 — the tree memorized the training set (overfitting).
tree_mae = mean_absolute_error(df_labels, housing_predictions)
tree_mae


Out[145]:
0.0

Melhorando a avaliação com Validação Cruzada k-fold, que randomicamente divide o conjunto de treinamento em 10 subconjuntos distintos chamados folds, que treina e avalia o modelo de Árvore de Decisão 10 vezes, pegando um fold diferente para avaliação a cada vez e treinando os outros 9 folds.


In [146]:
from sklearn.model_selection import cross_val_score

# 10-fold CV. sklearn scoring is "greater is better", so it returns the
# NEGATIVE MSE — negate before taking the square root.
scores = cross_val_score(tree_reg, housing_prepared, df_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [147]:
def display_scores(scores):
    """Print a cross-validation score array with its mean and standard deviation."""
    stats = [
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    ]
    for label, value in stats:
        print(label, value)

display_scores(tree_rmse_scores)


Scores: [ 70232.0136482   66828.46839892  72444.08721003  70761.50186201
  71125.52697653  75581.29319857  70169.59286164  70055.37863456
  75370.49116773  71222.39081244]
Mean: 71379.0744771
Standard deviation: 2458.31882043

In [148]:
# Now compute the same cross-validation scores for the Linear Regression model.
lin_scores = cross_val_score(lin_reg, housing_prepared, df_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)


Scores: [ 66782.73843989  66960.118071    70347.95244419  74739.57052552
  68031.13388938  71193.84183426  64969.63056405  68281.61137997
  71552.91566558  67665.10082067]
Mean: 69052.4613635
Standard deviation: 2731.6740018

In [149]:
from sklearn.ensemble import RandomForestRegressor

# Ensemble of decision trees; random_state fixed for reproducibility.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, df_labels)


Out[149]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [150]:
# Training-set RMSE of the random forest (optimistic compared to CV below).
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(df_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse


Out[150]:
21941.911027380233

In [151]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated RMSE for the random forest.
forest_scores = cross_val_score(forest_reg, housing_prepared, df_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)


Scores: [ 51650.94405471  48920.80645498  52979.16096752  54412.74042021
  50861.29381163  56488.55699727  51866.90120786  49752.24599537
  55399.50713191  53309.74548294]
Mean: 52564.1902524
Standard deviation: 2301.87380392

In [152]:
# The linear-regression CV scores summarized with pandas instead of
# display_scores. NOTE(review): the std differs from display_scores above
# (2879 vs 2731) — pandas describe() uses ddof=1 while numpy std() uses
# ddof=0; confirm which convention you want to report.
scores = cross_val_score(lin_reg, housing_prepared, df_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()


Out[152]:
count       10.00
mean     69052.46
std       2879.44
min      64969.63
25%      67136.36
50%      68156.37
75%      70982.37
max      74739.57
dtype: float64

In [ ]: