notebook.community

Edit and run



In [10]:

    
import pandas as pd



In [11]:

    
df = pd.read_csv('MedianValueSqFt_by_zip.csv', converters={'RegionName': lambda x: str(x)})



In [12]:

    
df2 = pd.read_csv('ACS.csv', converters={'zip.code.tabulation.area.': lambda x: str(x)})



In [13]:

    
df2.head()









    Out[13]:






  
    
      
      Unnamed: 0
      X..B01003_001E
      B01001_011E
      B01001_035E
      B02001_002E
      B02001_003E
      B02001_004E
      B02001_005E
      B02001_006E
      B02001_007E
      ...
      B25002_002E
      B25003_002E
      B25075_023E
      B25075_024E
      B25075_025E
      B25002_003E
      B25077_001E
      B25064_001E
      zip.code.tabulation.area.
      X
    
  
  
    
      0
      1
      18088
      592
      592
      17408
      175
      5
      0
      0
      431
      ...
      5820
      3307
      71
      0
      9
      1336
      105400
      367
      00601
      NaN
    
    
      1
      2
      40859
      1294
      1299
      23919
      1097
      27
      75
      0
      1520
      ...
      13046
      10143
      68
      0
      15
      3797
      91200
      411
      00602
      NaN
    
    
      2
      3
      53162
      1508
      1640
      38809
      1913
      75
      629
      0
      3567
      ...
      18020
      11384
      192
      27
      77
      6306
      128700
      369
      00603
      NaN
    
    
      3
      4
      6415
      125
      207
      5621
      154
      0
      0
      0
      519
      ...
      1898
      1334
      49
      0
      0
      659
      105800
      326
      00606
      NaN
    
    
      4
      5
      28805
      853
      838
      19336
      1002
      0
      26
      0
      5568
      ...
      9449
      7165
      147
      22
      0
      2538
      113700
      419
      00610
      NaN
    
  

5 rows × 25 columns



In [14]:

    
df2['RegionName'] = df2['zip.code.tabulation.area.']



In [15]:

    
res_df = df.merge(df2, how='right', on='RegionName')



In [16]:

    
res_df.reset_index(drop=True).head()









    Out[16]:






  
    
      
      RegionID
      RegionName
      City
      State
      Metro
      CountyName
      SizeRank
      2007-01
      2007-02
      2007-03
      ...
      B25002_002E
      B25003_002E
      B25075_023E
      B25075_024E
      B25075_025E
      B25002_003E
      B25077_001E
      B25064_001E
      zip.code.tabulation.area.
      X
    
  
  
    
      0
      62080.0
      11368
      New York
      NY
      New York
      Queens
      1.0
      377.0
      378.0
      378.0
      ...
      29031
      6009
      1801
      706
      100
      2089
      461500
      1410
      11368
      NaN
    
    
      1
      93144.0
      79936
      El Paso
      TX
      El Paso
      El Paso
      2.0
      88.0
      89.0
      89.0
      ...
      34921
      24470
      62
      28
      32
      1576
      121200
      963
      79936
      NaN
    
    
      2
      84630.0
      60629
      Chicago
      IL
      Chicago
      Cook
      3.0
      207.0
      208.0
      208.0
      ...
      30692
      17872
      61
      8
      1
      4100
      149000
      883
      60629
      NaN
    
    
      3
      96193.0
      90650
      Norwalk
      CA
      Los Angeles-Long Beach-Anaheim, CA
      Los Angeles
      4.0
      395.0
      396.0
      397.0
      ...
      27132
      17722
      419
      48
      88
      1166
      320900
      1318
      90650
      NaN
    
    
      4
      62037.0
      11226
      New York
      NY
      New York
      Kings
      5.0
      328.0
      333.0
      336.0
      ...
      35998
      4662
      1037
      511
      663
      3476
      485900
      1164
      11226
      NaN
    
  

5 rows × 142 columns



In [8]:

    
res_df['RegionName'] = res_df['RegionName'].astype(str)



In [17]:

    
res_df.head()









    Out[17]:






  
    
      
      RegionID
      RegionName
      City
      State
      Metro
      CountyName
      SizeRank
      2007-01
      2007-02
      2007-03
      ...
      B25002_002E
      B25003_002E
      B25075_023E
      B25075_024E
      B25075_025E
      B25002_003E
      B25077_001E
      B25064_001E
      zip.code.tabulation.area.
      X
    
  
  
    
      0
      62080.0
      11368
      New York
      NY
      New York
      Queens
      1.0
      377.0
      378.0
      378.0
      ...
      29031
      6009
      1801
      706
      100
      2089
      461500
      1410
      11368
      NaN
    
    
      1
      93144.0
      79936
      El Paso
      TX
      El Paso
      El Paso
      2.0
      88.0
      89.0
      89.0
      ...
      34921
      24470
      62
      28
      32
      1576
      121200
      963
      79936
      NaN
    
    
      2
      84630.0
      60629
      Chicago
      IL
      Chicago
      Cook
      3.0
      207.0
      208.0
      208.0
      ...
      30692
      17872
      61
      8
      1
      4100
      149000
      883
      60629
      NaN
    
    
      3
      96193.0
      90650
      Norwalk
      CA
      Los Angeles-Long Beach-Anaheim, CA
      Los Angeles
      4.0
      395.0
      396.0
      397.0
      ...
      27132
      17722
      419
      48
      88
      1166
      320900
      1318
      90650
      NaN
    
    
      4
      62037.0
      11226
      New York
      NY
      New York
      Kings
      5.0
      328.0
      333.0
      336.0
      ...
      35998
      4662
      1037
      511
      663
      3476
      485900
      1164
      11226
      NaN
    
  

5 rows × 142 columns



In [18]:

    
res_df.count()









    Out[18]:





RegionID                     12765
RegionName                   33120
City                         12765
State                        12765
Metro                        12308
CountyName                   12765
SizeRank                     12765
2007-01                      12307
2007-02                      12317
2007-03                      12318
2007-04                      12318
2007-05                      12318
2007-06                      12319
2007-07                      12319
2007-08                      12333
2007-09                      12333
2007-10                      12334
2007-11                      12334
2007-12                      12334
2008-01                      12334
2008-02                      12339
2008-03                      12340
2008-04                      12340
2008-05                      12340
2008-06                      12340
2008-07                      12340
2008-08                      12359
2008-09                      12360
2008-10                      12360
2008-11                      12360
                             ...  
2015-10                      12765
2015-11                      12765
2015-12                      12765
2016-01                      12765
2016-02                      12765
Unnamed: 0                   33120
X..B01003_001E               33120
B01001_011E                  33120
B01001_035E                  33120
B02001_002E                  33120
B02001_003E                  33120
B02001_004E                  33120
B02001_005E                  33120
B02001_006E                  33120
B02001_007E                  33120
B02001_008E                  33120
B03001_003E                  33120
B19013_001E                  33120
B19113_001E                  33120
B25001_001E                  33120
B25002_002E                  33120
B25003_002E                  33120
B25075_023E                  33120
B25075_024E                  33120
B25075_025E                  33120
B25002_003E                  33120
B25077_001E                  33120
B25064_001E                  33120
zip.code.tabulation.area.    33120
X                                0
dtype: int64



In [19]:

    
res_df.to_csv('right_joined_census.csv')



In [20]:

    
trunc_csv = res_df[['RegionID', 'RegionName', 'City', 'State', 'Metro', 'CountyName', '2015-01', '2016-01', '2016-02', 'B01001_011E', 'B01001_035E', 'B02001_002E', 'B02001_003E', 'B02001_004E', 'B02001_005E', 'B02001_006E', 'B02001_007E', 'B02001_008E', 'B03001_003E', 'B19013_001E', 'B19113_001E', 'B25001_001E', 'B25002_002E', 'B25003_002E', 'B25075_023E', 'B25075_024E', 'B25075_025E', 'B25002_003E', 'B25077_001E', 'B25064_001E'
]]



In [21]:

    
trunc_csv.count()









    Out[21]:





RegionID       12765
RegionName     33120
City           12765
State          12765
Metro          12308
CountyName     12765
2015-01        12765
2016-01        12765
2016-02        12765
B01001_011E    33120
B01001_035E    33120
B02001_002E    33120
B02001_003E    33120
B02001_004E    33120
B02001_005E    33120
B02001_006E    33120
B02001_007E    33120
B02001_008E    33120
B03001_003E    33120
B19013_001E    33120
B19113_001E    33120
B25001_001E    33120
B25002_002E    33120
B25003_002E    33120
B25075_023E    33120
B25075_024E    33120
B25075_025E    33120
B25002_003E    33120
B25077_001E    33120
B25064_001E    33120
dtype: int64



In [23]:

    
trunc_csv.to_csv('truncated_csv.csv',index=False)



In [ ]:

	Unnamed: 0	X..B01003_001E	B01001_011E	B01001_035E	B02001_002E	B02001_003E	B02001_004E	B02001_005E	B02001_007E	...	B25002_002E	B25003_002E	B25075_023E	B25075_024E	B25075_025E	B25002_003E	B25077_001E	B25064_001E	zip.code.tabulation.area.	X
0	1	18088	592	592	17408	175	5	0	431	...	5820	3307	71	0	9	1336	105400	367	00601	NaN
1	2	40859	1294	1299	23919	1097	27	75	1520	...	13046	10143	68	0	15	3797	91200	411	00602	NaN
2	3	53162	1508	1640	38809	1913	75	629	3567	...	18020	11384	192	27	77	6306	128700	369	00603	NaN
3	4	6415	125	207	5621	154	0	0	519	...	1898	1334	49	0	0	659	105800	326	00606	NaN
4	5	28805	853	838	19336	1002	0	26	5568	...	9449	7165	147	22	0	2538	113700	419	00610	NaN

	RegionID	RegionName	City	State	Metro	CountyName	SizeRank	2007-01	2007-02	2007-03	...	B25002_002E	B25003_002E	B25075_023E	B25075_024E	B25075_025E	B25002_003E	B25077_001E	B25064_001E	zip.code.tabulation.area.	X
0	62080.0	11368	New York	NY	New York	Queens	1.0	377.0	378.0	378.0	...	29031	6009	1801	706	100	2089	461500	1410	11368	NaN
1	93144.0	79936	El Paso	TX	El Paso	El Paso	2.0	88.0	89.0	89.0	...	34921	24470	62	28	32	1576	121200	963	79936	NaN
2	84630.0	60629	Chicago	IL	Chicago	Cook	3.0	207.0	208.0	208.0	...	30692	17872	61	8	1	4100	149000	883	60629	NaN
3	96193.0	90650	Norwalk	CA	Los Angeles-Long Beach-Anaheim, CA	Los Angeles	4.0	395.0	396.0	397.0	...	27132	17722	419	48	88	1166	320900	1318	90650	NaN
4	62037.0	11226	New York	NY	New York	Kings	5.0	328.0	333.0	336.0	...	35998	4662	1037	511	663	3476	485900	1164	11226	NaN