In [48]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Load the population CSV and reshape it from one column per year into a
# long table with one row per (country, year), then add a log-scale column.
pop = (
    pd.read_csv('../data/population.csv', skiprows=4)
    # Normalise headers: lower-case, with spaces turned into underscores.
    .rename(columns=lambda header: header.lower().replace(' ', '_'))
    # Discard the columns this analysis does not use.
    .drop(['indicator_name', 'indicator_code', 'unnamed:_60', '2015'], axis=1)
    # Every remaining non-id column becomes a `year` / `population` pair.
    .pipe(
        pd.melt,
        id_vars=['country_name', 'country_code'],
        var_name='year',
        value_name='population',
    )
    .assign(
        # The melted year labels come from column headers; make them numeric.
        year=lambda frame: pd.to_numeric(frame.year),
        log_pop=lambda frame: np.log(frame.population),
    )
)

# Bare expression in the middle of a cell: evaluated, but only the last
# expression of a cell is rendered as its output.
pop.head()

# Distribution of log-population across all entries for the year 2012.
in_2012 = pop.year == 2012
pop.loc[in_2012, 'log_pop'].hist(bins=100)

# Population against year for the United States.
is_united_states = pop.country_name == 'United States'
pop[is_united_states].plot.scatter(x='year', y='population')
Out[48]:
In [82]:
# Fit a straight-line trend of United States population against year and
# measure how far the fitted line is from the observed values.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

usa = pop[pop.country_name == 'United States']
# usa.plot(kind='scatter', x='year', y='population')

# The single feature is passed as a one-column frame (`usa[['year']]`) so the
# model receives a 2-D input.
model = LinearRegression()
model.fit(usa[['year']], usa['population'])
model.coef_
# model.intercept_

# predict() needs a 2-D (n_samples, n_features) input, so a single year is
# wrapped as one row with one column; a bare scalar such as
# `model.predict(2016)` is rejected.
model.predict([[2016]])
model.predict([[2017], [2015], [2016]])

# In-sample predictions for the same years the model was fitted on.
pred_pop = model.predict(usa[['year']])

# Absolute error and RMSE, in the units of `population`. Arguments follow
# the (y_true, y_pred) convention; both metrics are symmetric, so the values
# are the same in either order.
mean_absolute_error(usa.population, pred_pop)
np.sqrt(mean_squared_error(usa.population, pred_pop))

# Relative error per year: (predicted - actual) / actual.
# plt.scatter(usa.year, (pred_pop -usa.population)/usa.population )
error = pd.Series((pred_pop - usa.population) / usa.population)
error.hist()

# Mean absolute relative error; as the last expression it is the cell output.
error.abs().mean()
Out[82]:
In [106]:
# Fit a linear trend to the log of the population for country code 'ARE',
# then draw the back-transformed fit on top of the raw scatter plot.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

uae = pop[pop.country_code == 'ARE']
uae.plot.scatter(x='year', y='population')

# fit() returns the estimator itself, so construction and fitting are chained.
model_uae = LinearRegression().fit(uae[['year']], uae.log_pop)
model_uae.coef_

# Predictions are on the log scale because the model was fitted to `log_pop`.
pred_pop = model_uae.predict(uae[['year']])

# exp() undoes the log so the fitted curve is in the same units as the scatter.
plt.plot(uae.year, np.exp(pred_pop))

# Residuals on the log scale; their mean absolute value is the cell output.
error = pd.Series(pred_pop - uae.log_pop)
error.abs().mean()
Out[106]:
In [103]:
# Back-transform predictions with exp(). np.exp requires an input array, so
# the previous argument-less call `np.exp()` raised a TypeError.
# NOTE(review): assumes `pred_pop` currently holds the log-scale predictions
# from the 'ARE' cell (it is also assigned in the United States cell) - run
# that cell immediately before this one.
# plt.plot(uae.year, np.exp(pred_pop) )
np.exp(pred_pop)
Out[103]:
In [21]:
# Show the `pop` DataFrame as this cell's output.
pop
Out[21]:
In [44]:
# Number of rows per country name, sorted ascending so that the names with the
# fewest rows come first. The previous `.so` was a truncated attribute name
# and raised an AttributeError.
# NOTE(review): `.sort_values()` is a guess at the intended completion; use
# `.sort_index()` instead if alphabetical order by country was wanted.
pop.country_name.value_counts().sort_values()
Out[44]:
In [ ]: