In [1]:
import pandas as pd
In [2]:
DATA_FOLDER = '.\\Data\\Processed Data\\Zip\\'
In [3]:
data = pd.read_csv(DATA_FOLDER + 'joined data.csv', index_col=[0,1], usecols=[x for x in range(87) if x not in [1,2]])
data.head()
Out[3]:
Zip_HomesSoldAsForeclosures-Ratio_AllHomes
Zip_MedianListingPricePerSqft_1Bedroom
Zip_MedianListingPricePerSqft_2Bedroom
Zip_MedianListingPricePerSqft_3Bedroom
Zip_MedianListingPricePerSqft_4Bedroom
Zip_MedianListingPricePerSqft_5BedroomOrMore
Zip_MedianListingPricePerSqft_AllHomes
Zip_MedianListingPricePerSqft_CondoCoop
Zip_MedianListingPricePerSqft_DuplexTriplex
Zip_MedianListingPricePerSqft_Sfr
...
Zip_Zhvi_MiddleTier
Zip_Zhvi_MiddleTier_yoy
Zip_Zhvi_SingleFamilyResidence
Zip_Zhvi_TopTier
Zip_Zhvi_TopTier_yoy
Zip_ZriPerSqft_AllHomes
Zip_Zri_AllHomes
Zip_Zri_AllHomesPlusMultifamily
Zip_Zri_MultiFamilyResidenceRental
Zip_Zri_SingleFamilyResidenceRental
RegionName
Period
1001
2011-01
2.5503
NaN
141.901724
NaN
NaN
NaN
142.220114
NaN
NaN
150.107659
...
176800.0
-0.053026
189400.0
235000.0
-0.046653
1.068
1392.0
1378.0
1225.0
1433.0
2011-02
3.9721
NaN
141.370474
NaN
NaN
NaN
144.645833
NaN
NaN
153.088546
...
177300.0
-0.055911
189400.0
235700.0
-0.042648
1.082
1408.0
1397.0
1228.0
1450.0
2011-03
4.9612
NaN
144.645833
NaN
NaN
NaN
144.292433
NaN
NaN
148.055556
...
178300.0
-0.048051
190200.0
237700.0
-0.024220
1.080
1412.0
1400.0
1213.0
1449.0
2011-04
4.6598
NaN
144.645833
NaN
NaN
NaN
143.456660
NaN
NaN
145.918491
...
178600.0
-0.041845
190000.0
238300.0
-0.015696
1.086
1424.0
1411.0
1216.0
1459.0
2011-05
4.0497
NaN
142.712671
NaN
NaN
NaN
142.356336
NaN
NaN
143.487395
...
178200.0
-0.037797
189100.0
237000.0
-0.017005
1.088
1446.0
1434.0
1231.0
1475.0
5 rows × 83 columns
In [4]:
col_pop = {}
for col in data.axes[1]:
not_nan = 0
col_data = pd.Series(data[col])
for row in col_data:
if row < 0 or row >= 0: # Get rid of NaN
not_nan += 1
col_pop[col] = not_nan / len(col_data)
print('{}:\t\t{:.2f}'.format(col, col_pop[col]))
Zip_HomesSoldAsForeclosures-Ratio_AllHomes: 0.44
Zip_MedianListingPricePerSqft_1Bedroom: 0.00
Zip_MedianListingPricePerSqft_2Bedroom: 0.04
Zip_MedianListingPricePerSqft_3Bedroom: 0.20
Zip_MedianListingPricePerSqft_4Bedroom: 0.10
Zip_MedianListingPricePerSqft_5BedroomOrMore: 0.01
Zip_MedianListingPricePerSqft_AllHomes: 0.40
Zip_MedianListingPricePerSqft_CondoCoop: 0.04
Zip_MedianListingPricePerSqft_DuplexTriplex: 0.01
Zip_MedianListingPricePerSqft_Sfr: 0.37
Zip_MedianListingPrice_1Bedroom: 0.00
Zip_MedianListingPrice_2Bedroom: 0.03
Zip_MedianListingPrice_3Bedroom: 0.17
Zip_MedianListingPrice_4Bedroom: 0.08
Zip_MedianListingPrice_5BedroomOrMore: 0.01
Zip_MedianListingPrice_AllHomes: 0.31
Zip_MedianListingPrice_CondoCoop: 0.03
Zip_MedianListingPrice_DuplexTriplex: 0.01
Zip_MedianListingPrice_Sfr: 0.29
Zip_MedianRentalPricePerSqft_1Bedroom: 0.01
Zip_MedianRentalPricePerSqft_2Bedroom: 0.01
Zip_MedianRentalPricePerSqft_3Bedroom: 0.01
Zip_MedianRentalPricePerSqft_4Bedroom: 0.00
Zip_MedianRentalPricePerSqft_5BedroomOrMore: 0.00
Zip_MedianRentalPricePerSqft_AllHomes: 0.05
Zip_MedianRentalPricePerSqft_CondoCoop: 0.01
Zip_MedianRentalPricePerSqft_DuplexTriplex: 0.01
Zip_MedianRentalPricePerSqft_Mfr5Plus: 0.03
Zip_MedianRentalPricePerSqft_Sfr: 0.02
Zip_MedianRentalPricePerSqft_Studio: 0.00
Zip_MedianRentalPrice_1Bedroom: 0.01
Zip_MedianRentalPrice_2Bedroom: 0.02
Zip_MedianRentalPrice_3Bedroom: 0.01
Zip_MedianRentalPrice_4Bedroom: 0.00
Zip_MedianRentalPrice_5BedroomOrMore: 0.00
Zip_MedianRentalPrice_AllHomes: 0.05
Zip_MedianRentalPrice_CondoCoop: 0.01
Zip_MedianRentalPrice_DuplexTriplex: 0.02
Zip_MedianRentalPrice_Mfr5Plus: 0.04
Zip_MedianRentalPrice_Sfr: 0.02
Zip_MedianRentalPrice_Studio: 0.00
Zip_MedianSoldPricePerSqft_AllHomes: 0.29
Zip_MedianSoldPricePerSqft_Condominum: 0.03
Zip_MedianSoldPricePerSqft_SingleFamilyResidence: 0.26
Zip_MedianSoldPrice_AllHomes: 0.31
Zip_MedianValuePerSqft_AllHomes: 0.63
Zip_PctOfHomesDecreasingInValues_AllHomes: 0.61
Zip_PctOfHomesIncreasingInValues_AllHomes: 0.61
Zip_PctTransactionsThatArePreviouslyForeclosuredHomes_AllHomes: 0.30
Zip_PriceToRentRatio_AllHomes: 0.63
Zip_Turnover_AllHomes: 0.93
Zip_UnsoldReos_AllHomes: 0.68
Zip_ZhvAvgRaw_AllHomes: 0.64
Zip_ZhvAvgRaw_AllHomes_all: 0.95
Zip_ZhvAvgRaw_Condominum: 0.28
Zip_ZhvAvgRaw_Condominum_all: 0.35
Zip_ZhvAvgRaw_SingleFamilyResidence: 0.63
Zip_ZhvAvgRaw_SingleFamilyResidence_all: 0.94
Zip_ZhvAvg_AllHomes_all_yoy: 0.95
Zip_ZhvAvg_AllHomes_yoy: 0.63
Zip_ZhvAvg_Condominum_all_yoy: 0.35
Zip_ZhvAvg_Condominum_yoy: 0.27
Zip_ZhvAvg_SingleFamilyResidence_all_yoy: 0.94
Zip_ZhvAvg_SingleFamilyResidence_yoy: 0.62
Zip_Zhvi_1bedroom: 0.12
Zip_Zhvi_2bedroom: 0.46
Zip_Zhvi_3bedroom: 0.57
Zip_Zhvi_4bedroom: 0.49
Zip_Zhvi_5BedroomOrMore: 0.28
Zip_Zhvi_AllHomes: 0.64
Zip_Zhvi_BottomTier: 0.60
Zip_Zhvi_BottomTier_yoy: 0.59
Zip_Zhvi_Condominum: 0.28
Zip_Zhvi_MiddleTier: 0.64
Zip_Zhvi_MiddleTier_yoy: 0.63
Zip_Zhvi_SingleFamilyResidence: 0.63
Zip_Zhvi_TopTier: 0.63
Zip_Zhvi_TopTier_yoy: 0.63
Zip_ZriPerSqft_AllHomes: 0.69
Zip_Zri_AllHomes: 0.71
Zip_Zri_AllHomesPlusMultifamily: 0.71
Zip_Zri_MultiFamilyResidenceRental: 0.43
Zip_Zri_SingleFamilyResidenceRental: 0.71
In [5]:
for col, pop in col_pop.items():
if pop < 0.1:
print('{}:\t\t{:.2f}'.format(col, pop))
Zip_MedianListingPricePerSqft_1Bedroom: 0.00
Zip_MedianListingPricePerSqft_2Bedroom: 0.04
Zip_MedianListingPricePerSqft_4Bedroom: 0.10
Zip_MedianListingPricePerSqft_5BedroomOrMore: 0.01
Zip_MedianListingPricePerSqft_CondoCoop: 0.04
Zip_MedianListingPricePerSqft_DuplexTriplex: 0.01
Zip_MedianListingPrice_1Bedroom: 0.00
Zip_MedianListingPrice_2Bedroom: 0.03
Zip_MedianListingPrice_4Bedroom: 0.08
Zip_MedianListingPrice_5BedroomOrMore: 0.01
Zip_MedianListingPrice_CondoCoop: 0.03
Zip_MedianListingPrice_DuplexTriplex: 0.01
Zip_MedianRentalPricePerSqft_1Bedroom: 0.01
Zip_MedianRentalPricePerSqft_2Bedroom: 0.01
Zip_MedianRentalPricePerSqft_3Bedroom: 0.01
Zip_MedianRentalPricePerSqft_4Bedroom: 0.00
Zip_MedianRentalPricePerSqft_5BedroomOrMore: 0.00
Zip_MedianRentalPricePerSqft_AllHomes: 0.05
Zip_MedianRentalPricePerSqft_CondoCoop: 0.01
Zip_MedianRentalPricePerSqft_DuplexTriplex: 0.01
Zip_MedianRentalPricePerSqft_Mfr5Plus: 0.03
Zip_MedianRentalPricePerSqft_Sfr: 0.02
Zip_MedianRentalPricePerSqft_Studio: 0.00
Zip_MedianRentalPrice_1Bedroom: 0.01
Zip_MedianRentalPrice_2Bedroom: 0.02
Zip_MedianRentalPrice_3Bedroom: 0.01
Zip_MedianRentalPrice_4Bedroom: 0.00
Zip_MedianRentalPrice_5BedroomOrMore: 0.00
Zip_MedianRentalPrice_AllHomes: 0.05
Zip_MedianRentalPrice_CondoCoop: 0.01
Zip_MedianRentalPrice_DuplexTriplex: 0.02
Zip_MedianRentalPrice_Mfr5Plus: 0.04
Zip_MedianRentalPrice_Sfr: 0.02
Zip_MedianRentalPrice_Studio: 0.00
Zip_MedianSoldPricePerSqft_Condominum: 0.03
In [6]:
data2 = data.drop([col for col in col_pop if col_pop[col] < .6], axis='columns').dropna(how='any', axis='index')
print('{:.2f}'.format(len(data2.axes[0]) / len(data.axes[0])))
0.47
In [7]:
data2.head()
Out[7]:
Zip_MedianValuePerSqft_AllHomes
Zip_PctOfHomesDecreasingInValues_AllHomes
Zip_PctOfHomesIncreasingInValues_AllHomes
Zip_PriceToRentRatio_AllHomes
Zip_Turnover_AllHomes
Zip_UnsoldReos_AllHomes
Zip_ZhvAvgRaw_AllHomes
Zip_ZhvAvgRaw_AllHomes_all
Zip_ZhvAvgRaw_SingleFamilyResidence
Zip_ZhvAvgRaw_SingleFamilyResidence_all
...
Zip_Zhvi_AllHomes
Zip_Zhvi_MiddleTier
Zip_Zhvi_MiddleTier_yoy
Zip_Zhvi_SingleFamilyResidence
Zip_Zhvi_TopTier
Zip_Zhvi_TopTier_yoy
Zip_ZriPerSqft_AllHomes
Zip_Zri_AllHomes
Zip_Zri_AllHomesPlusMultifamily
Zip_Zri_SingleFamilyResidenceRental
RegionName
Period
1001
2011-01
136.0
70.48
15.17
11.26
5.5905
44.0
196000.0
196000.0
215500.0
215500.0
...
176800.0
176800.0
-0.053026
189400.0
235000.0
-0.046653
1.068
1392.0
1378.0
1433.0
2011-02
136.0
66.60
17.63
11.15
5.4682
45.7
195000.0
195000.0
214500.0
214500.0
...
177300.0
177300.0
-0.055911
189400.0
235700.0
-0.042648
1.082
1408.0
1397.0
1450.0
2011-03
137.0
62.95
20.83
11.46
5.3450
47.3
197500.0
197500.0
217500.0
217500.0
...
178300.0
178300.0
-0.048051
190200.0
237700.0
-0.024220
1.080
1412.0
1400.0
1449.0
2011-04
137.0
69.29
14.49
10.79
5.0131
48.6
196500.0
196500.0
216000.0
216000.0
...
178600.0
178600.0
-0.041845
190000.0
238300.0
-0.015696
1.086
1424.0
1411.0
1459.0
2011-05
136.0
74.57
11.57
10.37
4.8384
49.5
195500.0
195500.0
214500.0
214500.0
...
178200.0
178200.0
-0.037797
189100.0
237000.0
-0.017005
1.088
1446.0
1434.0
1475.0
5 rows × 24 columns
In [8]:
data2.to_csv('.\\Data\\Processed Data\\zip_clean.csv')
In [ ]:
Content source: Think-Machine/Zillow-Demo
Similar notebooks: