In [1]:
import pandas as pd

In [2]:
DATA_FOLDER = '.\\Data\\Processed Data\\Zip\\'

In [3]:
data = pd.read_csv(DATA_FOLDER + 'joined data.csv', index_col=[0,1], usecols=[x for x in range(87) if x not in [1,2]])
data.head()


Out[3]:
Zip_HomesSoldAsForeclosures-Ratio_AllHomes Zip_MedianListingPricePerSqft_1Bedroom Zip_MedianListingPricePerSqft_2Bedroom Zip_MedianListingPricePerSqft_3Bedroom Zip_MedianListingPricePerSqft_4Bedroom Zip_MedianListingPricePerSqft_5BedroomOrMore Zip_MedianListingPricePerSqft_AllHomes Zip_MedianListingPricePerSqft_CondoCoop Zip_MedianListingPricePerSqft_DuplexTriplex Zip_MedianListingPricePerSqft_Sfr ... Zip_Zhvi_MiddleTier Zip_Zhvi_MiddleTier_yoy Zip_Zhvi_SingleFamilyResidence Zip_Zhvi_TopTier Zip_Zhvi_TopTier_yoy Zip_ZriPerSqft_AllHomes Zip_Zri_AllHomes Zip_Zri_AllHomesPlusMultifamily Zip_Zri_MultiFamilyResidenceRental Zip_Zri_SingleFamilyResidenceRental
RegionName Period
1001 2011-01 2.5503 NaN 141.901724 NaN NaN NaN 142.220114 NaN NaN 150.107659 ... 176800.0 -0.053026 189400.0 235000.0 -0.046653 1.068 1392.0 1378.0 1225.0 1433.0
2011-02 3.9721 NaN 141.370474 NaN NaN NaN 144.645833 NaN NaN 153.088546 ... 177300.0 -0.055911 189400.0 235700.0 -0.042648 1.082 1408.0 1397.0 1228.0 1450.0
2011-03 4.9612 NaN 144.645833 NaN NaN NaN 144.292433 NaN NaN 148.055556 ... 178300.0 -0.048051 190200.0 237700.0 -0.024220 1.080 1412.0 1400.0 1213.0 1449.0
2011-04 4.6598 NaN 144.645833 NaN NaN NaN 143.456660 NaN NaN 145.918491 ... 178600.0 -0.041845 190000.0 238300.0 -0.015696 1.086 1424.0 1411.0 1216.0 1459.0
2011-05 4.0497 NaN 142.712671 NaN NaN NaN 142.356336 NaN NaN 143.487395 ... 178200.0 -0.037797 189100.0 237000.0 -0.017005 1.088 1446.0 1434.0 1231.0 1475.0

5 rows × 83 columns


In [4]:
col_pop = {}
for col in data.axes[1]:
    not_nan = 0
    col_data = pd.Series(data[col])
    for row in col_data:
        if row < 0 or row >= 0: # Get rid of NaN
            not_nan += 1
        
    col_pop[col] = not_nan / len(col_data)
    print('{}:\t\t{:.2f}'.format(col, col_pop[col]))


Zip_HomesSoldAsForeclosures-Ratio_AllHomes:		0.44
Zip_MedianListingPricePerSqft_1Bedroom:		0.00
Zip_MedianListingPricePerSqft_2Bedroom:		0.04
Zip_MedianListingPricePerSqft_3Bedroom:		0.20
Zip_MedianListingPricePerSqft_4Bedroom:		0.10
Zip_MedianListingPricePerSqft_5BedroomOrMore:		0.01
Zip_MedianListingPricePerSqft_AllHomes:		0.40
Zip_MedianListingPricePerSqft_CondoCoop:		0.04
Zip_MedianListingPricePerSqft_DuplexTriplex:		0.01
Zip_MedianListingPricePerSqft_Sfr:		0.37
Zip_MedianListingPrice_1Bedroom:		0.00
Zip_MedianListingPrice_2Bedroom:		0.03
Zip_MedianListingPrice_3Bedroom:		0.17
Zip_MedianListingPrice_4Bedroom:		0.08
Zip_MedianListingPrice_5BedroomOrMore:		0.01
Zip_MedianListingPrice_AllHomes:		0.31
Zip_MedianListingPrice_CondoCoop:		0.03
Zip_MedianListingPrice_DuplexTriplex:		0.01
Zip_MedianListingPrice_Sfr:		0.29
Zip_MedianRentalPricePerSqft_1Bedroom:		0.01
Zip_MedianRentalPricePerSqft_2Bedroom:		0.01
Zip_MedianRentalPricePerSqft_3Bedroom:		0.01
Zip_MedianRentalPricePerSqft_4Bedroom:		0.00
Zip_MedianRentalPricePerSqft_5BedroomOrMore:		0.00
Zip_MedianRentalPricePerSqft_AllHomes:		0.05
Zip_MedianRentalPricePerSqft_CondoCoop:		0.01
Zip_MedianRentalPricePerSqft_DuplexTriplex:		0.01
Zip_MedianRentalPricePerSqft_Mfr5Plus:		0.03
Zip_MedianRentalPricePerSqft_Sfr:		0.02
Zip_MedianRentalPricePerSqft_Studio:		0.00
Zip_MedianRentalPrice_1Bedroom:		0.01
Zip_MedianRentalPrice_2Bedroom:		0.02
Zip_MedianRentalPrice_3Bedroom:		0.01
Zip_MedianRentalPrice_4Bedroom:		0.00
Zip_MedianRentalPrice_5BedroomOrMore:		0.00
Zip_MedianRentalPrice_AllHomes:		0.05
Zip_MedianRentalPrice_CondoCoop:		0.01
Zip_MedianRentalPrice_DuplexTriplex:		0.02
Zip_MedianRentalPrice_Mfr5Plus:		0.04
Zip_MedianRentalPrice_Sfr:		0.02
Zip_MedianRentalPrice_Studio:		0.00
Zip_MedianSoldPricePerSqft_AllHomes:		0.29
Zip_MedianSoldPricePerSqft_Condominum:		0.03
Zip_MedianSoldPricePerSqft_SingleFamilyResidence:		0.26
Zip_MedianSoldPrice_AllHomes:		0.31
Zip_MedianValuePerSqft_AllHomes:		0.63
Zip_PctOfHomesDecreasingInValues_AllHomes:		0.61
Zip_PctOfHomesIncreasingInValues_AllHomes:		0.61
Zip_PctTransactionsThatArePreviouslyForeclosuredHomes_AllHomes:		0.30
Zip_PriceToRentRatio_AllHomes:		0.63
Zip_Turnover_AllHomes:		0.93
Zip_UnsoldReos_AllHomes:		0.68
Zip_ZhvAvgRaw_AllHomes:		0.64
Zip_ZhvAvgRaw_AllHomes_all:		0.95
Zip_ZhvAvgRaw_Condominum:		0.28
Zip_ZhvAvgRaw_Condominum_all:		0.35
Zip_ZhvAvgRaw_SingleFamilyResidence:		0.63
Zip_ZhvAvgRaw_SingleFamilyResidence_all:		0.94
Zip_ZhvAvg_AllHomes_all_yoy:		0.95
Zip_ZhvAvg_AllHomes_yoy:		0.63
Zip_ZhvAvg_Condominum_all_yoy:		0.35
Zip_ZhvAvg_Condominum_yoy:		0.27
Zip_ZhvAvg_SingleFamilyResidence_all_yoy:		0.94
Zip_ZhvAvg_SingleFamilyResidence_yoy:		0.62
Zip_Zhvi_1bedroom:		0.12
Zip_Zhvi_2bedroom:		0.46
Zip_Zhvi_3bedroom:		0.57
Zip_Zhvi_4bedroom:		0.49
Zip_Zhvi_5BedroomOrMore:		0.28
Zip_Zhvi_AllHomes:		0.64
Zip_Zhvi_BottomTier:		0.60
Zip_Zhvi_BottomTier_yoy:		0.59
Zip_Zhvi_Condominum:		0.28
Zip_Zhvi_MiddleTier:		0.64
Zip_Zhvi_MiddleTier_yoy:		0.63
Zip_Zhvi_SingleFamilyResidence:		0.63
Zip_Zhvi_TopTier:		0.63
Zip_Zhvi_TopTier_yoy:		0.63
Zip_ZriPerSqft_AllHomes:		0.69
Zip_Zri_AllHomes:		0.71
Zip_Zri_AllHomesPlusMultifamily:		0.71
Zip_Zri_MultiFamilyResidenceRental:		0.43
Zip_Zri_SingleFamilyResidenceRental:		0.71

In [5]:
for col, pop in col_pop.items():
    if pop < 0.1:
        print('{}:\t\t{:.2f}'.format(col, pop))


Zip_MedianListingPricePerSqft_1Bedroom:		0.00
Zip_MedianListingPricePerSqft_2Bedroom:		0.04
Zip_MedianListingPricePerSqft_4Bedroom:		0.10
Zip_MedianListingPricePerSqft_5BedroomOrMore:		0.01
Zip_MedianListingPricePerSqft_CondoCoop:		0.04
Zip_MedianListingPricePerSqft_DuplexTriplex:		0.01
Zip_MedianListingPrice_1Bedroom:		0.00
Zip_MedianListingPrice_2Bedroom:		0.03
Zip_MedianListingPrice_4Bedroom:		0.08
Zip_MedianListingPrice_5BedroomOrMore:		0.01
Zip_MedianListingPrice_CondoCoop:		0.03
Zip_MedianListingPrice_DuplexTriplex:		0.01
Zip_MedianRentalPricePerSqft_1Bedroom:		0.01
Zip_MedianRentalPricePerSqft_2Bedroom:		0.01
Zip_MedianRentalPricePerSqft_3Bedroom:		0.01
Zip_MedianRentalPricePerSqft_4Bedroom:		0.00
Zip_MedianRentalPricePerSqft_5BedroomOrMore:		0.00
Zip_MedianRentalPricePerSqft_AllHomes:		0.05
Zip_MedianRentalPricePerSqft_CondoCoop:		0.01
Zip_MedianRentalPricePerSqft_DuplexTriplex:		0.01
Zip_MedianRentalPricePerSqft_Mfr5Plus:		0.03
Zip_MedianRentalPricePerSqft_Sfr:		0.02
Zip_MedianRentalPricePerSqft_Studio:		0.00
Zip_MedianRentalPrice_1Bedroom:		0.01
Zip_MedianRentalPrice_2Bedroom:		0.02
Zip_MedianRentalPrice_3Bedroom:		0.01
Zip_MedianRentalPrice_4Bedroom:		0.00
Zip_MedianRentalPrice_5BedroomOrMore:		0.00
Zip_MedianRentalPrice_AllHomes:		0.05
Zip_MedianRentalPrice_CondoCoop:		0.01
Zip_MedianRentalPrice_DuplexTriplex:		0.02
Zip_MedianRentalPrice_Mfr5Plus:		0.04
Zip_MedianRentalPrice_Sfr:		0.02
Zip_MedianRentalPrice_Studio:		0.00
Zip_MedianSoldPricePerSqft_Condominum:		0.03

In [6]:
data2 = data.drop([col for col in col_pop if col_pop[col] < .6], axis='columns').dropna(how='any', axis='index')

print('{:.2f}'.format(len(data2.axes[0]) / len(data.axes[0])))


0.47

In [7]:
data2.head()


Out[7]:
Zip_MedianValuePerSqft_AllHomes Zip_PctOfHomesDecreasingInValues_AllHomes Zip_PctOfHomesIncreasingInValues_AllHomes Zip_PriceToRentRatio_AllHomes Zip_Turnover_AllHomes Zip_UnsoldReos_AllHomes Zip_ZhvAvgRaw_AllHomes Zip_ZhvAvgRaw_AllHomes_all Zip_ZhvAvgRaw_SingleFamilyResidence Zip_ZhvAvgRaw_SingleFamilyResidence_all ... Zip_Zhvi_AllHomes Zip_Zhvi_MiddleTier Zip_Zhvi_MiddleTier_yoy Zip_Zhvi_SingleFamilyResidence Zip_Zhvi_TopTier Zip_Zhvi_TopTier_yoy Zip_ZriPerSqft_AllHomes Zip_Zri_AllHomes Zip_Zri_AllHomesPlusMultifamily Zip_Zri_SingleFamilyResidenceRental
RegionName Period
1001 2011-01 136.0 70.48 15.17 11.26 5.5905 44.0 196000.0 196000.0 215500.0 215500.0 ... 176800.0 176800.0 -0.053026 189400.0 235000.0 -0.046653 1.068 1392.0 1378.0 1433.0
2011-02 136.0 66.60 17.63 11.15 5.4682 45.7 195000.0 195000.0 214500.0 214500.0 ... 177300.0 177300.0 -0.055911 189400.0 235700.0 -0.042648 1.082 1408.0 1397.0 1450.0
2011-03 137.0 62.95 20.83 11.46 5.3450 47.3 197500.0 197500.0 217500.0 217500.0 ... 178300.0 178300.0 -0.048051 190200.0 237700.0 -0.024220 1.080 1412.0 1400.0 1449.0
2011-04 137.0 69.29 14.49 10.79 5.0131 48.6 196500.0 196500.0 216000.0 216000.0 ... 178600.0 178600.0 -0.041845 190000.0 238300.0 -0.015696 1.086 1424.0 1411.0 1459.0
2011-05 136.0 74.57 11.57 10.37 4.8384 49.5 195500.0 195500.0 214500.0 214500.0 ... 178200.0 178200.0 -0.037797 189100.0 237000.0 -0.017005 1.088 1446.0 1434.0 1475.0

5 rows × 24 columns


In [8]:
data2.to_csv('.\\Data\\Processed Data\\zip_clean.csv')

In [ ]: