In [1]:
import pandas as pd, numpy as np
import psycopg2
import pandas.io.sql as sql
conn_string = "host='paris.urbansim.org' port=5433 dbname='denver' user='urbancanvas' password='Visua1ization'"
conn = psycopg2.connect(conn_string)
cur = conn.cursor()
pums_hh = sql.read_frame('select * from pums_hh',conn)
pums_p = sql.read_frame('select * from pums_person',conn)
pums_hh = pums_hh.set_index('serialno')
pums_p = pums_p.set_index('serialno')
pums_hh = pums_hh[(pums_hh.type==1)*(pums_hh.np>0)]
## Need not replace all 99's right?  They won't join?

In [2]:
pums_p.naicsp07[pums_p.naicsp07=='N.A.////'] = '0'
pums_p.naicsp07[pums_p.naicsp07==''] = '0'
pums_p.naicsp07 = pums_p.naicsp07.str.slice(0,3)
pums_p.naicsp07[pums_p.naicsp07=='22S'] = '221'
pums_p.naicsp07[pums_p.naicsp07=='23'] = '230'
pums_p.naicsp07[pums_p.naicsp07=='31M'] = '313'
pums_p.naicsp07[pums_p.naicsp07=='33M'] = '331'
pums_p.naicsp07[pums_p.naicsp07=='3MS'] = '311'
pums_p.naicsp07[pums_p.naicsp07=='42S'] = '423'
pums_p.naicsp07[pums_p.naicsp07=='4M'] = '443'
pums_p.naicsp07[pums_p.naicsp07=='4MS'] = '443'
pums_p.naicsp07[pums_p.naicsp07=='52M'] = '522'
pums_p.naicsp07[pums_p.naicsp07=='53M'] = '532'
pums_p.naicsp07[pums_p.naicsp07=='55'] = '561'
pums_p.naicsp07[pums_p.naicsp07=='92M'] = '921'

In [5]:
pums_p['indnaics'] = pums_p.naicsp07.astype('float').astype('int32')

In [12]:
pums_p.agep[(pums_p.relp==0)*(pums_p.agep<16)] = 16

In [13]:
pums_p[(pums_p.relp==0)*(pums_p.agep<16)]


Out[13]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 21 columns):
puma        0  non-null values
agep        0  non-null values
relp        0  non-null values
esr         0  non-null values
sch         0  non-null values
schg        0  non-null values
sporder     0  non-null values
paug        0  non-null values
ddp         0  non-null values
sex         0  non-null values
wkw         0  non-null values
wkhp        0  non-null values
rac1p       0  non-null values
hisp        0  non-null values
msp         0  non-null values
povpip      0  non-null values
pernp       0  non-null values
pincp       0  non-null values
indp07      0  non-null values
naicsp07    0  non-null values
indnaics    0  non-null values
dtypes: int32(1), int64(18), object(2)

In [3]:
pums_p.groupby('naicsp07').size()


Out[3]:
naicsp07
0           63621
111           133
112            69
113            46
114             7
115            34
211           109
212            78
213           332
221           504
230          4558
311           532
312           228
313            21
314            49
...
623          752
624         1325
711          528
712          333
713         1141
721          669
722         3934
811          851
812         1052
813         1105
814          497
921         2398
923          320
928          404
992          440
Length: 89, dtype: int64

In [4]:
for x in np.unique(pums_p.naicsp07):  print x


0
111
112
113
114
115
211
212
213
221
230
311
312
313
314
315
316
321
322
323
324
325
326
327
331
332
333
334
335
336
337
339
423
424
425
441
442
443
444
445
446
447
448
451
452
453
454
481
482
483
484
485
486
487
488
491
492
493
511
512
515
517
518
519
522
524
531
532
541
561
562
611
621
622
623
624
711
712
713
721
722
811
812
813
814
921
923
928
992

In [12]:
pums_p.indp07.isnull().sum()


Out[12]:
0

In [13]:
pums_p.indp07[pums_p.indp07=='N.A.'] = '0'
pums_p.indp07[pums_p.indp07==''] = '0'

In [18]:
pums_p['indnaics'] = pums_p.indp07.astype('float').astype('int32')

In [ ]:
pums_p.astype

In [3]:
pums_p['age'] = pums_p.agep
pums_p['relate'] = pums_p.relp
pums_p['esr_recode'] = 0*(pums_p.esr.isin([-99,6])) + 1*(pums_p.esr.isin([1,4])) + 2*(pums_p.esr.isin([2,5])) + 3*(pums_p.esr==3)
pums_p['grade'] = pums_p.schg*(pums_p.schg>0) + 0*(pums_p.schg==-99)
pums_p['pnum'] = pums_p.sporder
pums_p['paug'] = 0
pums_p['ddp'] = 0
pums_p['sex'] = pums_p.sex
pums_p['weeks'] = 0*(pums_p.wkw==-99) + 7*(pums_p.wkw==6) + 20*(pums_p.wkw==5) + 33*(pums_p.wkw==4) + 44*(pums_p.wkw==3) + 49*(pums_p.wkw==2) + 52*(pums_p.wkw==1)
pums_p['hours'] = pums_p.wkhp*(pums_p.wkhp>0) + 0*(pums_p.wkhp==-99)
pums_p['race1'] = pums_p.rac1p
pums_p['hispan'] = pums_p.hisp
pums_p['msp'] = pums_p.msp ##Get rid of -99's in the whole dataframe at end
pums_p['poverty'] = pums_p.povpip
#pums_p['earns'] = pums_p.pernp ##Needs the inflation adjustment #####NEED PERNP INSTEAD OF PINC
#pums_p['indnaics'] = pums_p.naicsp07  ####NEED INDP07
pums_p['pemploy'] = 1*(pums_p.wkhp>=25) + 2*(pums_p.wkhp<25)*(pums_p.wkhp>0) + 3*(pums_p.wkhp<1)
pums_p['pstudent'] = 1*(pums_p.schg<=5)*(pums_p.schg>0) + 2*(pums_p.schg>5) + 3*(pums_p.schg==-99)
pums_p['padkid'] = 1*(pums_p.agep>=18)*(pums_p.agep<=24)*(pums_p.relp.isin([1,2,4,5]))
pums_p.padkid[pums_p.padkid==0] = 2
pums_p['pagecat'] = 0*(pums_p.agep<=5) + 1*(pums_p.agep>=6)*(pums_p.agep<=11) + 2*(pums_p.agep>=12)*(pums_p.agep<=15) + 3*(pums_p.agep>=16)*(pums_p.agep<=17) \
                    + 4*(pums_p.agep>=18)*(pums_p.agep<=24) + 5*(pums_p.agep>=25)*(pums_p.agep<=34) + 6*(pums_p.agep>=35)*(pums_p.agep<=49) \
                    + 7*(pums_p.agep>=50)*(pums_p.agep<=64) + 8*(pums_p.agep>=65)*(pums_p.agep<=79) + 9*(pums_p.agep>=80)
pums_p['ptype'] = 1*(pums_p.agep>=16)*(pums_p.wkhp>=25)*(pums_p.schg==-99) + 2*(pums_p.agep>=16)*(pums_p.wkhp>0)*(pums_p.wkhp<25)*(pums_p.schg==-99) + 3*(pums_p.schg>5) \
                  + 4*(pums_p.schg==-99)*(pums_p.wkhp==-99)*(pums_p.agep>=16)*(pums_p.agep<=64) + 5*(pums_p.schg==-99)*(pums_p.wkhp==-99)*(pums_p.agep>=65) \
                  + 6*(pums_p.schg<6)*(pums_p.wkhp<25)*(pums_p.agep>=16)*(pums_p.agep<=19) + 7*(pums_p.agep>=6)*(pums_p.agep<=15) + 8*(pums_p.agep<=5)  
        ##The categories are not mutually exclusive, I need better bin defintion
joined = pd.merge(pums_p, pums_hh, left_index=True, right_index=True)

In [4]:
pums_p.ptype.value_counts()


Out[4]:
1     59859
7     17234
5     12697
4     11197
8     10354
3      9187
2      6942
6      3872
0       648
10      259
dtype: int64

In [5]:
joined['serialno'] = joined.index.values
joined['is_worker'] = np.in1d(joined.esr,[1,2,3,4,5]).astype('int32')

In [6]:
num_children = joined[joined.agep<18].groupby('serialno').size()
num_adults = joined[joined.agep>=18].groupby('serialno').size()
num_workers = joined.groupby('serialno').is_worker.sum()

In [7]:
pums_p.index.values


Out[7]:
array([2007000000027, 2007000000027, 2007000000027, ..., 2011001501376,
       2011001501693, 2011001501745], dtype=int64)

In [8]:
pums_hh['age_of_head'] = joined[joined.relp==0].agep

In [9]:
pums_hh['hhagecat'] = 1*(pums_hh.age_of_head<65) + 2*(pums_hh.age_of_head>=65)
pums_hh['hhagecat3'] = 1*(pums_hh.age_of_head<45) + 2*(pums_hh.age_of_head>=45)*(pums_hh.age_of_head<65) + 3*(pums_hh.age_of_head>=65)

In [10]:
pums_hh['hhsizecat'] = pums_hh.np*(pums_hh.np<=5) + 5*(pums_hh.np>5)
pums_hh['hhsizecat6'] = pums_hh.np*(pums_hh.np<=6) + 6*(pums_hh.np>6)
pums_hh['hfamily'] = 1*(pums_hh.hht>3) + 2*(pums_hh.hht<=3)
pums_hh['hunittype'] = 0
pums_hh['hnoccat'] = 0*(pums_hh.noc==0) + 1*(pums_hh.noc>0)
pums_hh['hnccat6'] = num_children*(num_children<5) + 5*(num_children>=5)
pums_hh['hnacat6'] =  num_adults*(num_adults<5) + 5*(num_adults>=5)
pums_hh['hwrkrcat'] = num_workers*(num_workers<3) + 3*(num_workers>=3)
pums_hh['hwrkrcat5'] = num_workers*(num_workers<4) + 4*(num_workers>=4)
pums_hh['h0005'] = joined[joined.agep<=5].groupby('serialno').size()
pums_hh['h0611'] = joined[(joined.agep>=6)*(joined.agep<=11)].groupby('serialno').size()
pums_hh['h1215'] = joined[(joined.agep>=12)*(joined.agep<=15)].groupby('serialno').size()
pums_hh['h1617'] = joined[(joined.agep>=16)*(joined.agep<=17)].groupby('serialno').size()
pums_hh['h1824'] = joined[(joined.agep>=18)*(joined.agep<=24)].groupby('serialno').size()
pums_hh['h2534'] = joined[(joined.agep>=25)*(joined.agep<=34)].groupby('serialno').size()
pums_hh['h3549'] = joined[(joined.agep>=35)*(joined.agep<=49)].groupby('serialno').size()
pums_hh['h5064'] = joined[(joined.agep>=50)*(joined.agep<=64)].groupby('serialno').size()
pums_hh['h6579'] = joined[(joined.agep>=65)*(joined.agep<=79)].groupby('serialno').size()
pums_hh['h80up'] = joined[(joined.agep>=80)].groupby('serialno').size()
pums_hh['hworkers'] = num_workers
pums_hh['hwork_f'] = joined[joined.ptype==1].groupby('serialno').size()
pums_hh['hwork_p'] = joined[joined.ptype==2].groupby('serialno').size()
pums_hh['huniv'] = joined[joined.ptype==3].groupby('serialno').size()
pums_hh['hnwork'] = joined[joined.ptype==4].groupby('serialno').size()
pums_hh['hretire'] = joined[joined.ptype==5].groupby('serialno').size()
pums_hh['hpresch'] = joined[joined.ptype==8].groupby('serialno').size()
pums_hh['hschpred'] = joined[joined.ptype==7].groupby('serialno').size()
pums_hh['hschdriv'] = joined[joined.ptype==6].groupby('serialno').size()
pums_hh['hadnwst'] = joined[(joined.agep>=16)*(joined.wkhp==-99)*(joined.schg>0)].groupby('serialno').size()
pums_hh['hadwpst'] = joined[(joined.agep>=16)*(joined.wkhp>0)*(joined.schg>0)].groupby('serialno').size()
pums_hh['hadkids'] = joined[(joined.agep>=18)*(joined.relp==2)].groupby('serialno').size()

In [11]:
pums_hh.columns


Out[11]:
Index([u'puma', u'hincp', u'wif', u'veh', u'noc', u'nrc', u'np', u'hht', u'type', u'ten', u'age_of_head', u'hhagecat', u'hhagecat3', u'hhsizecat', u'hhsizecat6', u'hfamily', u'hunittype', u'hnoccat', u'hnccat6', u'hnacat6', u'hwrkrcat', u'hwrkrcat5', u'h0005', u'h0611', u'h1215', u'h1617', u'h1824', u'h2534', u'h3549', u'h5064', u'h6579', u'h80up', u'hworkers', u'hwork_f', u'hwork_p', u'huniv', u'hnwork', u'hretire', u'hpresch', u'hschpred', u'hschdriv', u'hadnwst', u'hadwpst', u'hadkids'], dtype=object)

In [12]:
print pums_p.esr.describe()
pums_p.columns


count    132249.000000
mean        -18.411754
std          41.221715
min         -99.000000
25%           1.000000
50%           1.000000
75%           3.000000
max           6.000000
dtype: float64
Out[12]:
Index([u'puma', u'agep', u'relp', u'esr', u'sch', u'schg', u'sporder', u'paug', u'ddp', u'sex', u'wkw', u'wkhp', u'rac1p', u'hisp', u'msp', u'povpip', u'pincp', u'naicsp07', u'age', u'relate', u'esr_recode', u'grade', u'pnum', u'weeks', u'hours', u'race1', u'hispan', u'poverty', u'pemploy', u'pstudent', u'padkid', u'pagecat', u'ptype'], dtype=object)

In [13]:
joined.esr.describe()


Out[13]:
count    129813.000000
mean        -18.826966
std          41.456634
min         -99.000000
25%           1.000000
50%           1.000000
75%           3.000000
max           6.000000
dtype: float64

In [14]:
pums_hh['puma5'] = pums_hh.puma  ##Needs to be updated by urbansim!!
pums_hh['hinc'] = pums_hh.hincp ##Still need ADJINC  to adjust HINCP to constant dollars.  Also, consider putting in terms of 1999 dollars.
pums_hh['persons'] = pums_hh.np
pums_hh.hht[pums_hh.hht==-99] = 0
pums_hh['unittype'] = pums_hh.type - 1
pums_hh['hinccat1'] = 1*(pums_hh.hinc<20000) + 2*(pums_hh.hinc>=20000)*(pums_hh.hinc<50000) + 3*(pums_hh.hinc>=50000)*(pums_hh.hinc<100000) + 4*(pums_hh.hinc>=100000)
pums_hh['hinccat2'] = 1*(pums_hh.hinc<10000) + 2*(pums_hh.hinc>=10000)*(pums_hh.hinc<20000) + 3*(pums_hh.hinc>=20000)*(pums_hh.hinc<30000) + \
                      4*(pums_hh.hinc>=30000)*(pums_hh.hinc<40000) + 5*(pums_hh.hinc>=40000)*(pums_hh.hinc<50000) + 6*(pums_hh.hinc>=50000)*(pums_hh.hinc<60000) + \
                      7*(pums_hh.hinc>=60000)*(pums_hh.hinc<75000) + 8*(pums_hh.hinc>=75000)*(pums_hh.hinc<100000) +  9*(pums_hh.hinc>=100000)
pums_hh['hownrent'] = 1*(pums_hh.ten<3) + 2*(pums_hh.ten>2)
pums_hh['bucketBin'] = 0
pums_hh['originalpuma'] = pums_hh.puma

In [16]:
household = sql.read_frame('select * from synthetic_hh',conn)
person = sql.read_frame('select * from synthetic_persons',conn)

In [21]:
household.head()


Out[21]:
state county taz serialno
0 8 1 202120 2010000000000
1 8 1 202120 2010000000000
2 8 1 202120 2010000000000
3 8 1 202120 2010000000000
4 8 1 202120 2010000000000

In [20]:
person.head()


Out[20]:
state county taz serialno pnum
0 8 1 202120 2007000000000 1
1 8 1 202120 2011000000000 1
2 8 1 202120 2011000000000 1
3 8 1 202120 2010000000000 1
4 8 1 202120 2007000000000 1

In [10]:
parcels['centroid_x'] = parcels_ref.centroid_x
parcels['centroid_y'] = parcels_ref.centroid_y

In [11]:
parcels[['parcel_sqft','centroid_x','centroid_y']].head()


Out[11]:
parcel_sqft centroid_x centroid_y
parcel_id
1 20479 3174135 1695429
2 17777 3180889 1695592
3 59025 3175519 1699844
4 1640178 3192128 1747750
5 0 3154372 1747898

In [12]:
parcels_ref.head()


Out[12]:
parcel_sqft centroid_x centroid_y
parcel_id
1 20479 3174135 1695429
2 17777 3180889 1695592
3 59025 3175519 1699844
4 285497 3192128 1747750
5 27156 3154372 1747898

In [13]:
store = pd.HDFStore('c:\\urbansim\\data\\drcog.h5')

In [14]:
store['parcels'] = parcels

In [15]:
store.close()