In [10]:
import pandas as pd

In [11]:
df = pd.read_csv('MedianValueSqFt_by_zip.csv', converters={'RegionName': lambda x: str(x)})

In [12]:
df2 = pd.read_csv('ACS.csv', converters={'zip.code.tabulation.area.': lambda x: str(x)})

In [13]:
df2.head()


Out[13]:
Unnamed: 0 X..B01003_001E B01001_011E B01001_035E B02001_002E B02001_003E B02001_004E B02001_005E B02001_006E B02001_007E ... B25002_002E B25003_002E B25075_023E B25075_024E B25075_025E B25002_003E B25077_001E B25064_001E zip.code.tabulation.area. X
0 1 18088 592 592 17408 175 5 0 0 431 ... 5820 3307 71 0 9 1336 105400 367 00601 NaN
1 2 40859 1294 1299 23919 1097 27 75 0 1520 ... 13046 10143 68 0 15 3797 91200 411 00602 NaN
2 3 53162 1508 1640 38809 1913 75 629 0 3567 ... 18020 11384 192 27 77 6306 128700 369 00603 NaN
3 4 6415 125 207 5621 154 0 0 0 519 ... 1898 1334 49 0 0 659 105800 326 00606 NaN
4 5 28805 853 838 19336 1002 0 26 0 5568 ... 9449 7165 147 22 0 2538 113700 419 00610 NaN

5 rows × 25 columns


In [14]:
df2['RegionName'] = df2['zip.code.tabulation.area.']

In [15]:
res_df = df.merge(df2, how='right', on='RegionName')

In [16]:
res_df.reset_index(drop=True).head()


Out[16]:
RegionID RegionName City State Metro CountyName SizeRank 2007-01 2007-02 2007-03 ... B25002_002E B25003_002E B25075_023E B25075_024E B25075_025E B25002_003E B25077_001E B25064_001E zip.code.tabulation.area. X
0 62080.0 11368 New York NY New York Queens 1.0 377.0 378.0 378.0 ... 29031 6009 1801 706 100 2089 461500 1410 11368 NaN
1 93144.0 79936 El Paso TX El Paso El Paso 2.0 88.0 89.0 89.0 ... 34921 24470 62 28 32 1576 121200 963 79936 NaN
2 84630.0 60629 Chicago IL Chicago Cook 3.0 207.0 208.0 208.0 ... 30692 17872 61 8 1 4100 149000 883 60629 NaN
3 96193.0 90650 Norwalk CA Los Angeles-Long Beach-Anaheim, CA Los Angeles 4.0 395.0 396.0 397.0 ... 27132 17722 419 48 88 1166 320900 1318 90650 NaN
4 62037.0 11226 New York NY New York Kings 5.0 328.0 333.0 336.0 ... 35998 4662 1037 511 663 3476 485900 1164 11226 NaN

5 rows × 142 columns


In [8]:
res_df['RegionName'] = res_df['RegionName'].astype(str)

In [17]:
res_df.head()


Out[17]:
RegionID RegionName City State Metro CountyName SizeRank 2007-01 2007-02 2007-03 ... B25002_002E B25003_002E B25075_023E B25075_024E B25075_025E B25002_003E B25077_001E B25064_001E zip.code.tabulation.area. X
0 62080.0 11368 New York NY New York Queens 1.0 377.0 378.0 378.0 ... 29031 6009 1801 706 100 2089 461500 1410 11368 NaN
1 93144.0 79936 El Paso TX El Paso El Paso 2.0 88.0 89.0 89.0 ... 34921 24470 62 28 32 1576 121200 963 79936 NaN
2 84630.0 60629 Chicago IL Chicago Cook 3.0 207.0 208.0 208.0 ... 30692 17872 61 8 1 4100 149000 883 60629 NaN
3 96193.0 90650 Norwalk CA Los Angeles-Long Beach-Anaheim, CA Los Angeles 4.0 395.0 396.0 397.0 ... 27132 17722 419 48 88 1166 320900 1318 90650 NaN
4 62037.0 11226 New York NY New York Kings 5.0 328.0 333.0 336.0 ... 35998 4662 1037 511 663 3476 485900 1164 11226 NaN

5 rows × 142 columns


In [18]:
res_df.count()


Out[18]:
RegionID                     12765
RegionName                   33120
City                         12765
State                        12765
Metro                        12308
CountyName                   12765
SizeRank                     12765
2007-01                      12307
2007-02                      12317
2007-03                      12318
2007-04                      12318
2007-05                      12318
2007-06                      12319
2007-07                      12319
2007-08                      12333
2007-09                      12333
2007-10                      12334
2007-11                      12334
2007-12                      12334
2008-01                      12334
2008-02                      12339
2008-03                      12340
2008-04                      12340
2008-05                      12340
2008-06                      12340
2008-07                      12340
2008-08                      12359
2008-09                      12360
2008-10                      12360
2008-11                      12360
                             ...  
2015-10                      12765
2015-11                      12765
2015-12                      12765
2016-01                      12765
2016-02                      12765
Unnamed: 0                   33120
X..B01003_001E               33120
B01001_011E                  33120
B01001_035E                  33120
B02001_002E                  33120
B02001_003E                  33120
B02001_004E                  33120
B02001_005E                  33120
B02001_006E                  33120
B02001_007E                  33120
B02001_008E                  33120
B03001_003E                  33120
B19013_001E                  33120
B19113_001E                  33120
B25001_001E                  33120
B25002_002E                  33120
B25003_002E                  33120
B25075_023E                  33120
B25075_024E                  33120
B25075_025E                  33120
B25002_003E                  33120
B25077_001E                  33120
B25064_001E                  33120
zip.code.tabulation.area.    33120
X                                0
dtype: int64

In [19]:
res_df.to_csv('right_joined_census.csv')

In [20]:
trunc_csv = res_df[['RegionID', 'RegionName', 'City', 'State', 'Metro', 'CountyName', '2015-01', '2016-01', '2016-02', 'B01001_011E', 'B01001_035E', 'B02001_002E', 'B02001_003E', 'B02001_004E', 'B02001_005E', 'B02001_006E', 'B02001_007E', 'B02001_008E', 'B03001_003E', 'B19013_001E', 'B19113_001E', 'B25001_001E', 'B25002_002E', 'B25003_002E', 'B25075_023E', 'B25075_024E', 'B25075_025E', 'B25002_003E', 'B25077_001E', 'B25064_001E'
]]

In [21]:
trunc_csv.count()


Out[21]:
RegionID       12765
RegionName     33120
City           12765
State          12765
Metro          12308
CountyName     12765
2015-01        12765
2016-01        12765
2016-02        12765
B01001_011E    33120
B01001_035E    33120
B02001_002E    33120
B02001_003E    33120
B02001_004E    33120
B02001_005E    33120
B02001_006E    33120
B02001_007E    33120
B02001_008E    33120
B03001_003E    33120
B19013_001E    33120
B19113_001E    33120
B25001_001E    33120
B25002_002E    33120
B25003_002E    33120
B25075_023E    33120
B25075_024E    33120
B25075_025E    33120
B25002_003E    33120
B25077_001E    33120
B25064_001E    33120
dtype: int64

In [23]:
trunc_csv.to_csv('truncated_csv.csv',index=False)

In [ ]: