lin_reg_assignment



In [48]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

pop = pd.read_csv('../data/population.csv', skiprows=4)
pop.rename(columns=lambda c: c.lower().replace(' ', '_'), inplace=True)
pop.drop(['indicator_name', 'indicator_code','unnamed:_60', '2015'], axis=1, inplace=True)
pop = pd.melt(pop, id_vars=['country_name', 'country_code'], var_name='year', value_name='population')
pop.year = pd.to_numeric(pop.year)
pop['log_pop'] = np.log(pop.population)
pop.head()
pop[pop.year==2012].log_pop.hist(bins=100)
pop[pop.country_name == 'United States'].plot(kind='scatter', x='year', y='population')


Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x117fcca58>

In [82]:
usa = pop[pop.country_name == 'United States']
# usa.plot(kind='scatter', x='year', y='population')

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(usa[['year']], usa['population'])
model.coef_
# model.intercept_
model.predict(2016)
model.predict([[2017], [2015], [2016]])
pred_pop = model.predict(usa[['year']])
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mean_absolute_error(pred_pop, usa.population)
np.sqrt(mean_squared_error(pred_pop, usa.population))

# plt.scatter(usa.year, (pred_pop -usa.population)/usa.population )
error = pd.Series((pred_pop -usa.population)/usa.population)
error.hist()
error.abs().mean()


Out[82]:
0.010354515316421731

In [106]:
uae = pop[pop.country_code=='ARE']
uae.plot(kind='scatter', x='year', y='population')
from sklearn.linear_model import LinearRegression

model_uae = LinearRegression()
model_uae.fit(uae[['year']], uae['log_pop'])
model_uae.coef_
# model.intercept_
# model.predict(2016)
# model.predict([[2017], [2015], [2016]])
pred_pop = model_uae.predict(uae[['year']])
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# mean_absolute_error(pred_pop, uae.population)
# np.sqrt(mean_squared_error(pred_pop, uae.population))

# plt.scatter(uae.year, (pred_pop -uae.population)/uae.population )

plt.plot(uae.year, np.exp(pred_pop) )
error = pd.Series((pred_pop -uae.log_pop))
# error.hist()
error.abs().mean()


Out[106]:
0.19277159807165747

In [103]:
# plt.plot(uae.year, np.exp(pred_pop) )
np.exp()


Out[103]:
array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
        inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,
        inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,
        inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,
        inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf,  inf])

In [21]:
pop


Out[21]:
country_name country_code year pop
0 Aruba ABW 1960 5.420800e+04
1 Andorra AND 1960 1.341400e+04
2 Afghanistan AFG 1960 8.994793e+06
3 Angola AGO 1960 5.270844e+06
4 Albania ALB 1960 1.608800e+06
5 Arab World ARB 1960 9.249590e+07
6 United Arab Emirates ARE 1960 9.261200e+04
7 Argentina ARG 1960 2.061908e+07
8 Armenia ARM 1960 1.867396e+06
9 American Samoa ASM 1960 2.001200e+04
10 Antigua and Barbuda ATG 1960 5.468100e+04
11 Australia AUS 1960 1.027648e+07
12 Austria AUT 1960 7.047539e+06
13 Azerbaijan AZE 1960 3.897889e+06
14 Burundi BDI 1960 2.786740e+06
15 Belgium BEL 1960 9.153489e+06
16 Benin BEN 1960 2.431620e+06
17 Burkina Faso BFA 1960 4.829291e+06
18 Bangladesh BGD 1960 4.820070e+07
19 Bulgaria BGR 1960 7.867374e+06
20 Bahrain BHR 1960 1.625010e+05
21 Bahamas, The BHS 1960 1.095260e+05
22 Bosnia and Herzegovina BIH 1960 3.214520e+06
23 Belarus BLR 1960 8.198000e+06
24 Belize BLZ 1960 9.206800e+04
25 Bermuda BMU 1960 4.440000e+04
26 Bolivia BOL 1960 3.693451e+06
27 Brazil BRA 1960 7.249358e+07
28 Barbados BRB 1960 2.309340e+05
29 Brunei Darussalam BRN 1960 8.182500e+04
... ... ... ... ...
13610 Togo TGO 2014 7.115163e+06
13611 Thailand THA 2014 6.772598e+07
13612 Tajikistan TJK 2014 8.295840e+06
13613 Turkmenistan TKM 2014 5.307188e+06
13614 Timor-Leste TLS 2014 1.212107e+06
13615 Tonga TON 2014 1.055860e+05
13616 Trinidad and Tobago TTO 2014 1.354483e+06
13617 Tunisia TUN 2014 1.099660e+07
13618 Turkey TUR 2014 7.593235e+07
13619 Tuvalu TUV 2014 9.893000e+03
13620 Tanzania TZA 2014 5.182262e+07
13621 Uganda UGA 2014 3.778297e+07
13622 Ukraine UKR 2014 4.536290e+07
13623 Upper middle income UMC 2014 2.360818e+09
13624 Uruguay URY 2014 3.419516e+06
13625 United States USA 2014 3.188571e+08
13626 Uzbekistan UZB 2014 3.075770e+07
13627 St. Vincent and the Grenadines VCT 2014 1.093600e+05
13628 Venezuela, RB VEN 2014 3.069383e+07
13629 Virgin Islands (U.S.) VIR 2014 1.041700e+05
13630 Vietnam VNM 2014 9.072890e+07
13631 Vanuatu VUT 2014 2.588830e+05
13632 West Bank and Gaza PSE 2014 4.294682e+06
13633 World WLD 2014 7.259692e+09
13634 Samoa WSM 2014 1.918450e+05
13635 Yemen, Rep. YEM 2014 2.618368e+07
13636 South Africa ZAF 2014 5.400195e+07
13637 Congo, Dem. Rep. COD 2014 7.487703e+07
13638 Zambia ZMB 2014 1.572134e+07
13639 Zimbabwe ZWE 2014 1.524586e+07

13640 rows × 4 columns


In [44]:
pop.country_name.value_counts().so


Out[44]:
Belize                                          55
Sub-Saharan Africa (developing only)            55
Romania                                         55
Lebanon                                         55
Belgium                                         55
Cuba                                            55
Sweden                                          55
Liechtenstein                                   55
Fiji                                            55
St. Vincent and the Grenadines                  55
Angola                                          55
Botswana                                        55
Saudi Arabia                                    55
Chile                                           55
Australia                                       55
Pakistan                                        55
Comoros                                         55
American Samoa                                  55
Russian Federation                              55
Latin America & Caribbean (developing only)     55
Kuwait                                          55
Kazakhstan                                      55
Bhutan                                          55
Macao SAR, China                                55
United Kingdom                                  55
Timor-Leste                                     55
United States                                   55
Croatia                                         55
Mauritania                                      55
Portugal                                        55
                                                ..
Europe & Central Asia (all income levels)       55
Morocco                                         55
Zambia                                          55
Tonga                                           55
Indonesia                                       55
Cyprus                                          55
Central African Republic                        55
Cabo Verde                                      55
Turkmenistan                                    55
Colombia                                        55
Curacao                                         55
Peru                                            55
Ethiopia                                        55
Mexico                                          55
Bahrain                                         55
Poland                                          55
Thailand                                        55
Denmark                                         55
Venezuela, RB                                   55
High income                                     55
Hong Kong SAR, China                            55
Eritrea                                         55
Middle East & North Africa (developing only)    55
Upper middle income                             55
United Arab Emirates                            55
Small states                                    55
Uruguay                                         55
Uzbekistan                                      55
Lower middle income                             55
Brunei Darussalam                               55
Name: country_name, dtype: int64
ERROR! Session/line number was not unique in database. History logging moved to new session 95

In [ ]: