In [21]:
import pandas as pd
In [22]:
empl_data = pd.read_csv('fixtures/training_data.csv')
In [23]:
empl_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 108 entries, 0 to 107
Data columns (total 34 columns):
job_growth 108 non-null float64
job_growth_min_1 108 non-null float64
job_growth_min_3 108 non-null float64
job_growth_min_6 108 non-null float64
job_growth_min_12 108 non-null float64
avg_weeks_unemp 108 non-null float64
avg_weeks_unemp_min_1 108 non-null float64
avg_weeks_unemp_min_3 108 non-null float64
avg_weeks_unemp_min_6 108 non-null float64
avg_weeks_unemp_min_12 108 non-null float64
emp_pop_ratio 108 non-null float64
lbr_frc_prtcp 108 non-null float64
not_in_lbr_frc 108 non-null float64
totl_emp_payrl 108 non-null float64
unemp_rate 108 non-null float64
fed_fund_rate 108 non-null float64
mortgage_rate 108 non-null float64
import_idx 108 non-null float64
export_idx 108 non-null float64
cpi_idx 108 non-null float64
cpi_index_min_1 108 non-null float64
cpi_index_min_3 108 non-null float64
cpi_index_min_6 108 non-null float64
cpi_index_min_12 108 non-null float64
taxes 108 non-null float64
taxes_min_1 108 non-null int64
taxes_min_3 108 non-null int64
taxes_min_6 108 non-null int64
taxes_min_12 108 non-null int64
spending 108 non-null int64
spending_min_1 108 non-null int64
spending_min_3 108 non-null int64
spending_min_6 108 non-null int64
spending_min_12 108 non-null int64
dtypes: float64(25), int64(9)
In [24]:
empl_data.head()
Out[24]:
job_growth
job_growth_min_1
job_growth_min_3
job_growth_min_6
job_growth_min_12
avg_weeks_unemp
avg_weeks_unemp_min_1
avg_weeks_unemp_min_3
avg_weeks_unemp_min_6
avg_weeks_unemp_min_12
...
taxes
taxes_min_1
taxes_min_3
taxes_min_6
taxes_min_12
spending
spending_min_1
spending_min_3
spending_min_6
spending_min_12
0
0.085638
-0.075590
0.175644
0.274477
0.050552
-6.8
-6.890756
-6.790756
-7.490756
-6.190756
...
5886.6
19419
-59434
-61915
-104321
-64514
-40315
-65111
-55050
-69916
1
0.099825
0.085638
0.357112
0.012181
-0.064240
-7.2
-6.790756
-6.590756
-7.490756
-6.490756
...
-95459.4
5887
-61783
-58601
-63905
-43965
-64514
-66908
-79764
-53288
2
0.191616
0.099825
-0.075590
-0.061617
0.163955
-6.8
-7.190756
-6.890756
-6.890756
-6.690756
...
-47571.4
-95459
19419
11021
23761
-38142
-43965
-40315
-75885
-56112
3
0.426579
0.191616
0.085638
0.175644
0.124027
-6.7
-6.790756
-6.790756
-6.790756
-6.490756
...
81283.6
-47571
5887
-59434
-80880
-38935
-38142
-64514
-65111
-80712
4
0.251320
0.426579
0.099825
0.357112
0.231902
-7.7
-6.690756
-7.190756
-6.590756
-5.790756
...
-43599.4
81284
-95459
-61783
18052
-69705
-38935
-43965
-66908
-63367
5 rows × 34 columns
In [25]:
import statsmodels.api as sm
In [35]:
y = empl_data.job_growth
In [36]:
X = empl_data.ix[:,1:34]
X.head()
Out[36]:
job_growth_min_1
job_growth_min_3
job_growth_min_6
job_growth_min_12
avg_weeks_unemp
avg_weeks_unemp_min_1
avg_weeks_unemp_min_3
avg_weeks_unemp_min_6
avg_weeks_unemp_min_12
emp_pop_ratio
...
taxes
taxes_min_1
taxes_min_3
taxes_min_6
taxes_min_12
spending
spending_min_1
spending_min_3
spending_min_6
spending_min_12
0
-0.075590
0.175644
0.274477
0.050552
-6.8
-6.890756
-6.790756
-7.490756
-6.190756
1.7
...
5886.6
19419
-59434
-61915
-104321
-64514
-40315
-65111
-55050
-69916
1
0.085638
0.357112
0.012181
-0.064240
-7.2
-6.790756
-6.590756
-7.490756
-6.490756
1.7
...
-95459.4
5887
-61783
-58601
-63905
-43965
-64514
-66908
-79764
-53288
2
0.099825
-0.075590
-0.061617
0.163955
-6.8
-7.190756
-6.890756
-6.890756
-6.690756
1.7
...
-47571.4
-95459
19419
11021
23761
-38142
-43965
-40315
-75885
-56112
3
0.191616
0.085638
0.175644
0.124027
-6.7
-6.790756
-6.790756
-6.790756
-6.490756
2.0
...
81283.6
-47571
5887
-59434
-80880
-38935
-38142
-64514
-65111
-80712
4
0.426579
0.099825
0.357112
0.231902
-7.7
-6.690756
-7.190756
-6.590756
-5.790756
2.1
...
-43599.4
81284
-95459
-61783
18052
-69705
-38935
-43965
-66908
-63367
5 rows × 33 columns
In [40]:
X = sm.add_constant(X)
In [41]:
est = sm.OLS(y,X).fit()
In [42]:
est.summary()
Out[42]:
OLS Regression Results
Dep. Variable: job_growth R-squared: 0.619
Model: OLS Adj. R-squared: 0.449
Method: Least Squares F-statistic: 3.643
Date: Sat, 07 Mar 2015 Prob (F-statistic): 2.00e-06
Time: 13:11:28 Log-Likelihood: 47.212
No. Observations: 108 AIC: -26.42
Df Residuals: 74 BIC: 64.77
Df Model: 33
coef std err t P>|t| [95.0% Conf. Int.]
const 0.1105 0.096 1.147 0.255 -0.081 0.302
job_growth_min_1 -0.2519 0.104 -2.422 0.018 -0.459 -0.045
job_growth_min_3 -0.0113 0.116 -0.097 0.923 -0.243 0.220
job_growth_min_6 -0.1333 0.114 -1.168 0.247 -0.361 0.094
job_growth_min_12 -0.1083 0.117 -0.928 0.357 -0.341 0.124
avg_weeks_unemp 0.0895 0.033 2.700 0.009 0.023 0.156
avg_weeks_unemp_min_1 -0.0389 0.036 -1.085 0.281 -0.110 0.033
avg_weeks_unemp_min_3 -0.0063 0.029 -0.221 0.826 -0.063 0.051
avg_weeks_unemp_min_6 0.0673 0.028 2.388 0.019 0.011 0.123
avg_weeks_unemp_min_12 -0.0543 0.019 -2.783 0.007 -0.093 -0.015
emp_pop_ratio 0.6522 0.484 1.347 0.182 -0.313 1.617
lbr_frc_prtcp -0.3543 0.448 -0.790 0.432 -1.248 0.539
not_in_lbr_frc 0.0002 0.000 0.657 0.513 -0.000 0.001
totl_emp_payrl -0.0001 9.24e-05 -1.421 0.160 -0.000 5.28e-05
unemp_rate -0.0204 0.332 -0.061 0.951 -0.683 0.642
fed_fund_rate 0.0198 0.041 0.482 0.631 -0.062 0.102
mortgage_rate 0.0028 0.101 0.027 0.978 -0.199 0.204
import_idx 0.0057 0.016 0.366 0.715 -0.025 0.037
export_idx -0.0379 0.020 -1.895 0.062 -0.078 0.002
cpi_idx 0.0527 0.041 1.283 0.204 -0.029 0.134
cpi_index_min_1 -0.0195 0.032 -0.606 0.547 -0.084 0.045
cpi_index_min_3 0.0078 0.018 0.432 0.667 -0.028 0.044
cpi_index_min_6 -0.0036 0.014 -0.265 0.792 -0.031 0.024
cpi_index_min_12 0.0112 0.017 0.667 0.507 -0.022 0.045
taxes -4.267e-07 4.08e-07 -1.046 0.299 -1.24e-06 3.86e-07
taxes_min_1 -2.865e-07 4.19e-07 -0.684 0.496 -1.12e-06 5.48e-07
taxes_min_3 -2.41e-07 3.65e-07 -0.659 0.512 -9.69e-07 4.87e-07
taxes_min_6 -3.078e-07 3.45e-07 -0.891 0.376 -9.96e-07 3.81e-07
taxes_min_12 -1.241e-07 4.45e-07 -0.279 0.781 -1.01e-06 7.64e-07
spending -8.608e-07 7.6e-07 -1.133 0.261 -2.38e-06 6.54e-07
spending_min_1 -2.734e-07 8.06e-07 -0.339 0.735 -1.88e-06 1.33e-06
spending_min_3 -3.801e-07 7.04e-07 -0.540 0.591 -1.78e-06 1.02e-06
spending_min_6 3.091e-07 7.06e-07 0.438 0.663 -1.1e-06 1.72e-06
spending_min_12 -8.209e-08 7.41e-07 -0.111 0.912 -1.56e-06 1.39e-06
Omnibus: 0.311 Durbin-Watson: 1.874
Prob(Omnibus): 0.856 Jarque-Bera (JB): 0.091
Skew: 0.058 Prob(JB): 0.955
Kurtosis: 3.083 Cond. No. 3.54e+06
In [43]:
test_data = pd.read_csv('fixtures/test_data.csv')
In [44]:
y_test = test_data.job_growth
X_test = test_data.ix[:,1:34]
In [45]:
est.predict(X_test)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-45-f916d1250932> in <module>()
----> 1 est.predict(X_test)
/Users/itadmin/anaconda/lib/python2.7/site-packages/statsmodels/base/model.pyc in predict(self, exog, transform, *args, **kwargs)
878 exog = dmatrix(self.model.data.orig_exog.design_info.builder,
879 exog)
--> 880 return self.model.predict(self.params, exog, *args, **kwargs)
881
882
/Users/itadmin/anaconda/lib/python2.7/site-packages/statsmodels/regression/linear_model.pyc in predict(self, params, exog)
175 if exog is None:
176 exog = self.exog
--> 177 return np.dot(exog, params)
178
179 class GLS(RegressionModel):
ValueError: shapes (12,33) and (34,) not aligned: 33 (dim 1) != 34 (dim 0)
In [46]:
X_test.head()
Out[46]:
job_growth_min_1
job_growth_min_3
job_growth_min_6
job_growth_min_12
avg_weeks_unemp
avg_weeks_unemp_min_1
avg_weeks_unemp_min_3
avg_weeks_unemp_min_6
avg_weeks_unemp_min_12
emp_pop_ratio
...
taxes
taxes_min_1
taxes_min_3
taxes_min_6
taxes_min_12
spending
spending_min_1
spending_min_3
spending_min_6
spending_min_12
0
0.12527
-0.58490
0.17358
0.03350
9.00924
10.50924
9.20924
10.80924
9.20924
-1.8521
...
99666.56303
88710.56303
2596.56303
3699.56303
75894.56303
47622.15966
-26803.84034
30886.15966
39002.15966
10715.15966
1
0.10481
0.72288
0.00901
0.07047
10.60924
9.00924
10.50924
11.10924
10.40924
-1.8521
...
-51981.43697
99666.56303
-13877.43697
-10960.43697
-73515.43697
79255.15966
47622.15966
59054.15966
74650.15966
67729.15966
2
0.16141
0.12527
0.00624
-0.03835
8.90924
10.60924
10.50924
10.90924
10.60924
-1.6521
...
19515.56303
-51981.43697
88710.56303
105138.56303
-10312.43697
-5883.84034
79255.15966
-26803.84034
-32269.84034
33923.15966
3
0.14736
0.10481
-0.58490
0.20297
8.50924
8.90924
9.00924
9.20924
10.20924
-1.7521
...
217906.56303
19515.56303
99666.56303
2596.56303
210392.56303
48758.15966
-5883.84034
47622.15966
30886.15966
35209.15966
4
0.22035
0.16141
0.72288
0.15661
8.00924
8.50924
10.60924
10.50924
10.60924
-1.7521
...
3558.56303
217906.56303
-51981.43697
-13877.43697
851.56303
71235.15966
48758.15966
79255.15966
59054.15966
77289.15966
5 rows × 33 columns
In [47]:
X_test = sm.add_constant(X_test)
In [50]:
y_predicted = est.predict(X_test)
In [49]:
print(y_test)
0 0.10481
1 0.16141
2 0.14736
3 0.22035
4 0.16562
5 0.19278
6 0.17512
7 0.14604
8 0.19467
9 0.18712
10 0.25261
11 0.17988
Name: job_growth, dtype: float64
In [64]:
SSE = ((y_predicted - y_test)**2).sum()
In [65]:
SST = ((y_predicted - y_test.mean())**2).sum()
In [66]:
# Sum of Squared Errors/Residuals
print(SSE)
0.316107835165
In [67]:
# Sum of Squared Total
print(SST)
0.275184725735
In [69]:
# R-squared
R2 = 1 - (SSE/SST)
print(R2)
-0.148711413107
In [ ]:
Content source: DistrictDataLabs/02-ppm-data
Similar notebooks: