In [1]:
import numpy as np, pandas as pd, os
from synthicity.utils import misc
from drcog.models import transition, dataset
dset = dataset.DRCOGDataset(os.path.join(misc.data_dir(),'drcog.h5'))
from drcog.variables import variable_library
variable_library.calculate_variables(dset)


Fetching parcels
Fetching modify_table
Fetching buildings
Fetching establishments
Fetching modify_table
Fetching modify_table
Fetching households_for_estimation
Fetching modify_table
Fetching households
Fetching modify_table
Fetching zones
Fetching modify_table
Fetching travel_data
Fetching modify_table

In [9]:
#hh_controls = dset.store.annual_household_control_totals
ct = dset.fetch('annual_household_control_totals')
if 'persons' in ct.columns:
    del ct['persons']
# hh = dset.households
# persons = dset.persons

# hh = pd.read_csv('c://users//janowicz//desktop//transition_model_test_data//drcog_test_hh.csv')
# persons = pd.read_csv('c://users//janowicz//desktop//transition_model_test_data//drcog_test_persons.csv')
# hh = hh.set_index('household_id')

hh = dset.fetch('households')
persons = dset.fetch('persons')

In [10]:
tran = transition.TabularTotalsTransition(ct, 'total_number_of_households')
model = transition.TransitionModel(tran)
new, added, new_linked = model.transition(
        hh, 2013, linked_tables={'linked': (persons, 'household_id')})
new.loc[added,'building_id'] = -1
dset.d['households'] = new
dset.d['persons'] = new_linked['linked']


      total_number_of_households
2013                     1231459

In [11]:
hh = dset.fetch('households')
persons = dset.fetch('persons')

In [16]:
persons.tail()


Out[16]:
serialno pnum household_id
3017653 2008000631914 4 1220546
3017654 2007000519562 1 1226681
3017655 2011000321139 1 1227461
3017656 2011000321139 2 1227461
3017657 2011000321139 3 1227461

In [17]:
hh[['building_id']].tail()


Out[17]:
building_id
1231456 -1
1231457 -1
1231458 -1
1231459 -1
1231460 -1

In [19]:
hh


Out[19]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1231459 entries, 1 to 1231460
Data columns (total 21 columns):
age_of_head         1231459  non-null values
cars                1231459  non-null values
children            1231459  non-null values
income              1231459  non-null values
persons             1231459  non-null values
tenure              1231459  non-null values
workers             1231459  non-null values
serialno            1231459  non-null values
taz                 1231459  non-null values
building_id         1231459  non-null values
zone_id             1231459  non-null values
building_type_id    1231459  non-null values
county_id           1231459  non-null values
btype               1231459  non-null values
income_3_tenure     1231459  non-null values
younghead           1231459  non-null values
hh_with_child       1231459  non-null values
ln_income           1230685  non-null values
income5xlt          1231459  non-null values
income10xlt         1231459  non-null values
wkrs_hhs            1231459  non-null values
dtypes: bool(2), float64(7), int32(3), int64(9)

In [6]:
hh['building_id'] = 200
hh.head()


Out[6]:
taz serialno building_id
household_id
1 202120 2007001314602 200
2 202120 2011001251394 200
3 202120 2011001449309 200
4 202120 2010001198948 200
5 202120 2007000291766 200

In [10]:
hh_controls = dset.store.annual_household_control_totals

In [13]:
'persons' in hh_controls.columns


Out[13]:
True

In [3]:
hh_controls.to_csv('c://users//janowicz//desktop//annual_household_control_totals.csv')

In [4]:
print hh.describe()
print persons.describe()


                  taz      serialno
count  1146690.000000  1.146690e+06
mean    373742.032089  2.009001e+12
std     168002.014688  1.426718e+09
min     101020.000000  2.007000e+12
25%     211120.000000  2.008000e+12
50%     404270.000000  2.009001e+12
75%     506090.000000  2.010001e+12
max     803120.000000  2.011002e+12

[8 rows x 2 columns]
           person_id    household_id            pnum      serialno
count  2807816.00000  2807816.000000  2807816.000000  2.807816e+06
mean   1403908.67021   566294.052037        2.159760  2.009019e+12
std     810547.05069   338661.154242        1.377123  1.423980e+09
min          1.00000        1.000000        1.000000  2.007000e+12
25%     701954.75000   259172.000000        1.000000  2.008000e+12
50%    1403908.50000   579327.500000        2.000000  2.009001e+12
75%    2105862.25000   866371.000000        3.000000  2.010001e+12
max    2807817.00000  1146691.000000       15.000000  2.011002e+12

[8 rows x 4 columns]

In [4]:
hh_controls


Out[4]:
total_number_of_households
year
2000 977781
2001 993027
2002 1008273
2003 1023518
2004 1038764
2005 1054009
2006 1075377
2007 1096745
2008 1118112
2009 1139480
2010 1160847
2011 1184385
2012 1207922
2013 1231459
2014 1254996
2015 1278534
2016 1303538
2017 1328543
2018 1353547
2019 1378552
2020 1403557
2021 1426381
2022 1449206
2023 1472031
2024 1494856
2025 1517681
2026 1537997
2027 1558313
2028 1578628
2029 1598944
2030 1619260
2031 1635902
2032 1652545
2033 1669188
2034 1685831
2035 1702473
2036 1716286
2037 1730099
2038 1743913
2039 1757726
2040 1771539

41 rows × 1 columns


In [7]:
tran = transition.TabularTotalsTransition(hh_controls, 'total_number_of_households')

In [8]:
model = transition.TransitionModel(tran)

In [9]:
#new = model.transition(hh,2012)
new, added, new_linked = model.transition(
        hh, 2012, linked_tables={'linked': (persons, 'household_id')})


      total_number_of_households
2012                     1207922

In [14]:
new.loc[added,'building_id'] = -1

In [15]:
print new.head()
new.describe()


      taz       serialno  building_id
1  202120  2007001314602          200
2  202120  2011001251394          200
3  202120  2011001449309          200
4  202120  2010001198948          200
5  202120  2007000291766          200
Out[15]:
taz serialno building_id
count 1207922.000000 1.207922e+06 1207922.000000
mean 373752.079013 2.009001e+12 189.810905
std 167986.326333 1.426793e+09 44.092993
min 101020.000000 2.007000e+12 -1.000000
25% 211120.000000 2.008000e+12 200.000000
50% 404270.000000 2.009001e+12 200.000000
75% 506090.000000 2.010001e+12 200.000000
max 803120.000000 2.011002e+12 200.000000

In [16]:
new.head(50)


Out[16]:
taz serialno building_id
1 202120 2007001314602 200
2 202120 2011001251394 200
3 202120 2011001449309 200
4 202120 2010001198948 200
5 202120 2007000291766 200
6 202120 2007000478712 200
7 202120 2007000814600 200
8 202120 2009000842149 200
9 202120 2010000087634 200
10 202120 2010000513482 200
11 202120 2011000339471 200
12 202120 2007000067272 200
13 202120 2007000404889 200
14 202120 2007000525443 200
15 202120 2007000923137 200
16 202120 2007001227948 200
17 202120 2008000133968 200
18 202120 2008000506716 200
19 202120 2008001093697 200
20 202120 2009000585876 200
21 202120 2009001178833 200
22 202120 2009001198940 200
23 202120 2010000118543 200
24 202120 2010000916332 200
25 202120 2010001214928 200
26 202120 2010001341008 200
27 202120 2011000828021 200
28 202120 2011000920002 200
29 202120 2011000925831 200
30 202120 2011001036405 200
31 202120 2008001271830 200
32 202120 2011000504854 200
33 202120 2007000308304 200
34 202120 2008000158867 200
35 202120 2008000392581 200
36 202120 2009000244322 200
37 202120 2008000177951 200
38 202120 2010001240314 200
39 202120 2007001072212 200
40 202120 2011000184536 200
41 202120 2008000846341 200
42 202120 2011001460351 200
43 202120 2009001331313 200
44 202120 2008000210473 200
45 202120 2008000843711 200
46 202120 2011001039054 200
47 202120 2010000832214 200
48 202120 2008001027370 200
49 202120 2009000112289 200
50 202120 2009000551075 200

In [17]:
new.tail(50)


Out[17]:
taz serialno building_id
1207874 408130 2009000772512 -1
1207875 311220 2010000859107 -1
1207876 412422 2011000699084 -1
1207877 204240 2009000831754 -1
1207878 514030 2011000689107 -1
1207879 501060 2011000279707 -1
1207880 108170 2007000968152 -1
1207881 104350 2009000432057 -1
1207882 607850 2008001012467 -1
1207883 410070 2010000103455 -1
1207884 311220 2010001049235 -1
1207885 405070 2008001074449 -1
1207886 214230 2009000526894 -1
1207887 304460 2008000109777 -1
1207888 505030 2009000686753 -1
1207889 606170 2007001029356 -1
1207890 209440 2011000613093 -1
1207891 803100 2011001311078 -1
1207892 502150 2007000357124 -1
1207893 401200 2009000268170 -1
1207894 408020 2009000420668 -1
1207895 206180 2010000737542 -1
1207896 702080 2010001009642 -1
1207897 509090 2008000842533 -1
1207898 401220 2008001324109 -1
1207899 408090 2008000882865 -1
1207900 102830 2007000779133 -1
1207901 505120 2007000667667 -1
1207902 307200 2009000776341 -1
1207903 509240 2011000267088 -1
1207904 402210 2007000834922 -1
1207905 512080 2009000661079 -1
1207906 308120 2010001183476 -1
1207907 405270 2010001039152 -1
1207908 507040 2010000392286 -1
1207909 404450 2010000988822 -1
1207910 104380 2010001192931 -1
1207911 310220 2008000436381 -1
1207912 214280 2009001242708 -1
1207913 410020 2010001376420 -1
1207914 212130 2007000731881 -1
1207915 802360 2007001021946 -1
1207916 406310 2011000623000 -1
1207917 501050 2008001095522 -1
1207918 209400 2008001377801 -1
1207919 407040 2011000167020 -1
1207920 506240 2007000267635 -1
1207921 313120 2011000956783 -1
1207922 410290 2009000120884 -1
1207923 401150 2007000098886 -1

In [ ]:


In [7]:
new_linked['linked'].describe()


Out[7]:
person_id household_id pnum serialno
count 2958183.000000 2958183.000000 2958183.000000 2.958183e+06
mean 1403639.390951 597351.827304 2.160003 2.009019e+12
std 810618.631844 356215.004738 1.377144 1.423930e+09
min 1.000000 1.000000 1.000000 2.007000e+12
25% 701691.500000 274311.000000 1.000000 2.008000e+12
50% 1403373.000000 605771.000000 2.000000 2.009001e+12
75% 2105879.500000 912553.500000 3.000000 2.010001e+12
max 2807817.000000 1207923.000000 15.000000 2.011002e+12

In [8]:
persons.describe()


Out[8]:
person_id household_id pnum serialno
count 2807816.00000 2807816.000000 2807816.000000 2.807816e+06
mean 1403908.67021 566294.052037 2.159760 2.009019e+12
std 810547.05069 338661.154242 1.377123 1.423980e+09
min 1.00000 1.000000 1.000000 2.007000e+12
25% 701954.75000 259172.000000 1.000000 2.008000e+12
50% 1403908.50000 579327.500000 2.000000 2.009001e+12
75% 2105862.25000 866371.000000 3.000000 2.010001e+12
max 2807817.00000 1146691.000000 15.000000 2.011002e+12

8 rows × 4 columns


In [8]:
new[0].building_id.describe()


Out[8]:
count    1207922.000000
mean         -13.707595
std       294538.358958
min            1.000000
25%       192387.000000
50%       466589.000000
75%       711655.750000
max      1015490.000000
Name: building_id, dtype: float64

In [14]:
len(new[1])


Out[14]:
47535

In [ ]:
hh =