In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import operator
import sys
import os
from utils import zillow_helpers
sys.path.insert(0,'../')
%load_ext autoreload
%autoreload 2
Set some data folders here. Be sure to "ignore" your actual data folders when pushing to Github.
In [2]:
datadir1 = 'data/46/ZAsmt/'
datadir2 = 'data/46/ZTrans/'
datadir3 = 'data/50/ZAsmt/'
datadir4 = 'data/50/ZTrans/'
List the text files in the folder. This could be helpful for making iterables later!
In [3]:
os.listdir(datadir1)
Out[3]:
['AdditionalPropertyAddress.txt',
'BKManagedSpecific.txt',
'Building.txt',
'BuildingAreas.txt',
'CareOfName.txt',
'ExteriorWall.txt',
'ExtraFeature.txt',
'Garage.txt',
'InteriorFlooring.txt',
'InteriorWall.txt',
'LotSiteAppeal.txt',
'MailAddress.txt',
'Main.txt',
'Name.txt',
'Oby.txt',
'Pool.txt',
'SaleData.txt',
'TaxDistrict.txt',
'TaxExemption.txt',
'TypeConstruction.txt',
'Value.txt',
'VestingCodes.txt']
Known syntax for opening and "reading" lines from text files contained below
In [4]:
f = open(datadir1+'SaleData.txt', 'r')
In [5]:
# Only read a few lines at a time with these files!!!
for lines in range(3):
line = f.readline()
print(line)
A30D272B-3F28-E611-80C4-3863BB43AC67|1|||2011-06-10|||90|398|WD|WRDE|| |46107|801464
A90D272B-3F28-E611-80C4-3863BB43AC67|1|||1998-08-24|||||||| |46107|801464
AD0D272B-3F28-E611-80C4-3863BB43AC67|1|||2010-07-14|||90|14|WD|WRDE|58000.0000|AF|46107|801464
It's good to note that creation of a dataframe from these massive file-sets does not seem to require a lot of processing power. Attempting to read and print the whole dataset probably will crash your kernel! Be sure to read a few lines of the head each time to check!
In [6]:
df = pd.read_csv(datadir1+'SaleData.txt', sep="|", index_col=False, header=None, low_memory=False)
df.head(3)
Out[6]:
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
0
A30D272B-3F28-E611-80C4-3863BB43AC67
1
NaN
NaN
2011-06-10
NaN
NaN
90
398
WD
WRDE
NaN
46107
801464
1
A90D272B-3F28-E611-80C4-3863BB43AC67
1
NaN
NaN
1998-08-24
NaN
NaN
NaN
NaN
NaN
NaN
NaN
46107
801464
2
AD0D272B-3F28-E611-80C4-3863BB43AC67
1
NaN
NaN
2010-07-14
NaN
NaN
90
14
WD
WRDE
58000.0
AF
46107
801464
In [7]:
df2 = zillow_helpers.txt_to_df(datadir1+'Main.txt')
df2.head(3)
Out[7]:
0
1
2
3
4
5
6
7
8
9
...
85
86
87
88
89
90
91
92
93
94
0
01D1B108-F8A6-E611-80C9-3863BB43AC67
119614254
46137
SD
ZIEBACH
2016-11-01
72016
4
BKF
002643
...
NaN
NaN
NaN
NaN
NaN
NaN
0
1564496
2624
562086187
1
02D1B108-F8A6-E611-80C9-3863BB43AC67
119613826
46137
SD
ZIEBACH
2016-11-01
72016
4
BKF
002213
...
NaN
NaN
NaN
NaN
NaN
NaN
0
1564496
2196
-1621235809
2
03D1B108-F8A6-E611-80C9-3863BB43AC67
119615712
46137
SD
ZIEBACH
2016-11-01
72016
4
BKF
004104
...
NaN
NaN
NaN
NaN
NaN
NaN
0
1564496
4081
1892456234
3 rows × 95 columns
If your data frame is good, then pickle it! It will save your dataframe and prevent the need to re-create it later.
In [8]:
df.to_pickle("df.pickle")
In [9]:
df3 = pd.read_pickle("df.pickle")
In [10]:
df3.head(3)
Out[10]:
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
0
A30D272B-3F28-E611-80C4-3863BB43AC67
1
NaN
NaN
2011-06-10
NaN
NaN
90
398
WD
WRDE
NaN
46107
801464
1
A90D272B-3F28-E611-80C4-3863BB43AC67
1
NaN
NaN
1998-08-24
NaN
NaN
NaN
NaN
NaN
NaN
NaN
46107
801464
2
AD0D272B-3F28-E611-80C4-3863BB43AC67
1
NaN
NaN
2010-07-14
NaN
NaN
90
14
WD
WRDE
58000.0
AF
46107
801464
First, lets use our column helper files in order to create arrays to be used to name columns later.
In [90]:
chelpMain = 'column_helper/main.csv'
chelpPI = 'column_helper/propertyinfo.csv'
In [91]:
chelpMainOpen = open(chelpMain, 'r')
chelpPIOpen = open(chelpPI, 'r')
In [92]:
for lines in range(3):
line = chelpMainOpen.readline()
print(line)
TransId
FIPS
State
In [93]:
for lines in range(3):
line = chelpPIOpen.readline()
print(line)
TransId
AssessorParcelNumber
APNIndicatorStndCode
In [94]:
df4 = zillow_helpers.txt_to_df(datadir2+'Main.txt')
In [95]:
df4.head(3)
Out[95]:
0
1
2
3
4
5
6
7
8
9
...
121
122
123
124
125
126
127
128
129
130
0
315852373
46009
SD
BON HOMME
D
P
2009-01-06
NaN
96
0001
...
NM
NaN
2411791749
NaN
1702496
1
BKF
1.651814e+09
1
315852374
46009
SD
BON HOMME
D
P
2009-01-07
NaN
96
0002
...
NM
NaN
2411791750
NaN
1702496
2
BKF
1.531757e+08
2
315852375
46009
SD
BON HOMME
D
P
2009-01-09
NaN
96
0003
...
NM
NaN
2411791751
NaN
1702496
3
BKF
-6.570774e+08
3 rows × 131 columns
In [105]:
chelpMain = 'column_helper/main.csv'
chelpPI = 'column_helper/propertyinfo.csv'
chelpMainOpen = open(chelpMain, 'r')
chelpPIOpen = open(chelpPI, 'r')
chelpMainArray = []
for line in chelpMainOpen:
chelpMainArray.append(line.rstrip())
In [106]:
print(chelpMainArray)
['TransId', 'FIPS', 'State', 'County', 'DataClassStndCode', 'RecordTypeStndCode', 'RecordingDate', 'RecordingDocumentNumber', 'RecordingBookNumber', 'RecordingPageNumber', 'ReRecordedCorrectionStndCode', 'PriorRecordingDate', 'PriorDocumentDate', 'PriorDocumentNumber', 'PriorBookNumber', 'PriorPageNumber', 'DocumentTypeStndCode', 'DocumentDate', 'SignatureDate', 'EffectiveDate', 'BuyerVestingStndCode', 'BuyerMultiVestingFlag', 'PartialInterestTransferStndCode', 'PartialInterestTransferPercent', 'SalesPriceAmount', 'SalesPriceAmountStndCode', 'CityTransferTax', 'CountyTransferTax', 'StateTransferTax', 'TotalTransferTax', 'IntraFamilyTransferFlag', 'TransferTaxExemptFlag', 'PropertyUseStndCode', 'AssessmentLandUseStndCode', 'OccupancyStatusStndCode', 'LegalStndCode', 'BorrowerVestingStndCode', 'LenderName', 'LenderTypeStndCode', 'LenderIDStndCode', 'LenderDBAName', 'DBALenderTypeStndCode', 'DBALenderIDStndCode', 'LenderMailCareOfName', 'LenderMailHouseNumber', 'LenderMailHouseNumberExt', 'LenderMailStreetPreDirectional', 'LenderMailStreetName', 'LenderMailStreetSuffix', 'LenderMailStreetPostDirectional', 'LenderMailFullStreetAddress', 'LenderMailBuildingName', 'LenderMailBuildingNumber', 'LenderMailUnitDesignator', 'LenderMailUnit', 'LenderMailCity', 'LenderMailState', 'LenderMailZip', 'LenderMailZip4', 'LoanAmount', 'LoanAmountStndCode', 'MaximumLoanAmount', 'LoanTypeStndCode', 'LoanTypeClosedOpenEndStndCode', 'LoanTypeFutureAdvanceFlag', 'LoanTypeProgramStndCode', 'LoanRateTypeStndCode', 'LoanDueDate', 'LoanTermMonths', 'LoanTermYears', 'InitialInterestRate', 'ARMFirstAdjustmentDate', 'ARMFirstAdjustmentMaxRate', 'ARMFirstAdjustmentMinRate', 'ARMIndexStndCode', 'ARMAdjustmentFrequencyStndCode', 'ARMMargin', 'ARMInitialCap', 'ARMPeriodicCap', 'ARMLifetimeCap', 'ARMMaxInterestRate', 'ARMMinInterestRate', 'InterestOnlyFlag', 'InterestOnlyTerm', 'PrepaymentPenaltyFlag', 'PrepaymentPenaltyTerm', 'BiWeeklyPaymentFlag', 'AssumabilityRiderFlag', 'BalloonRiderFlag', 'CondominiumRiderFlag', 'PlannedUnitDevelopmentRiderFlag', 'SecondHomeRiderFlag', 'OneToFourFamilyRiderFlag', 'ConcurrentMtgeDocOrBkPg', 'LoanNumber', 'MERSMINNumber', 'CaseNumber', 'MERSFlag', 'TitleCompanyName', 'TitleCompanyIDStndCode', 'AccommodationRecordingFlag', 'UnpaidBalance', 'InstallmentAmount', 'InstallmentDueDate', 'TotalDelinquentAmount', 'DelinquentAsOfDate', 'CurrentLender', 'CurrentLenderTypeStndCode', 'CurrentLenderIDStndCode', 'TrusteeSaleNumber', 'AttorneyFileNumber', 'AuctionDate', 'AuctionTime', 'AuctionFullStreetAddress', 'AuctionCityName', 'StartingBid', 'KeyedDate', 'KeyerID', 'SubVendorStndCode', 'ImageFileName', 'BuilderFlag', 'MatchStndCode', 'REOStndCode', 'UpdateOwnershipFlag', 'LoadID', 'StatusInd', 'TransactionTypeStndCode', 'BatchID', 'BKFSPID', 'ZVendorStndCode', 'SourceChkSum']
In [107]:
df4.columns=chelpMainArray
In [108]:
df4
Out[108]:
TransId
FIPS
State
County
DataClassStndCode
RecordTypeStndCode
RecordingDate
RecordingDocumentNumber
RecordingBookNumber
RecordingPageNumber
...
MatchStndCode
REOStndCode
UpdateOwnershipFlag
LoadID
StatusInd
TransactionTypeStndCode
BatchID
BKFSPID
ZVendorStndCode
SourceChkSum
0
315852373
46009
SD
BON HOMME
D
P
2009-01-06
NaN
96
0001
...
NM
NaN
2411791749
NaN
1702496
1
BKF
1.651814e+09
1
315852374
46009
SD
BON HOMME
D
P
2009-01-07
NaN
96
0002
...
NM
NaN
2411791750
NaN
1702496
2
BKF
1.531757e+08
2
315852375
46009
SD
BON HOMME
D
P
2009-01-09
NaN
96
0003
...
NM
NaN
2411791751
NaN
1702496
3
BKF
-6.570774e+08
3
315852376
46009
SD
BON HOMME
D
P
2009-01-09
NaN
96
0004
...
NM
NaN
2411791752
NaN
1702496
4
BKF
-8.579398e+08
4
315852377
46009
SD
BON HOMME
D
P
2009-01-13
NaN
96
0005
...
NM
NaN
2411791753
NaN
1702496
5
BKF
3.610861e+08
5
315852378
46009
SD
BON HOMME
D
P
2009-01-14
NaN
96
0006
...
NM
NaN
2411791754
NaN
1702496
6
BKF
-1.132851e+09
6
315852379
46009
SD
BON HOMME
D
P
2009-01-15
NaN
96
0008
...
NM
NaN
2411791755
NaN
1702496
7
BKF
1.282545e+09
7
315852380
46009
SD
BON HOMME
D
P
2009-01-16
NaN
96
0010
...
NM
NaN
2411791756
NaN
1702496
8
BKF
2.096268e+09
8
315852381
46009
SD
BON HOMME
D
P
2009-01-16
NaN
96
011
...
NM
NaN
2411791757
NaN
1702496
9
BKF
2.133314e+09
9
315852382
46009
SD
BON HOMME
D
P
2009-01-20
NaN
96
0013
...
NM
NaN
2411791758
NaN
1702496
10
BKF
-6.505621e+08
10
315852383
46009
SD
BON HOMME
D
P
2009-01-20
NaN
96
0014
...
NM
NaN
2411791759
NaN
1702496
11
BKF
-1.409499e+09
11
315852384
46009
SD
BON HOMME
D
P
2009-01-20
NaN
96
0015
...
NM
NaN
2411791760
NaN
1702496
12
BKF
-5.732568e+08
12
315852385
46009
SD
BON HOMME
D
P
2009-01-20
NaN
96
0016
...
NM
NaN
2411791761
NaN
1702496
13
BKF
-2.019233e+09
13
315852386
46009
SD
BON HOMME
D
P
2009-01-23
NaN
96
0018
...
NM
NaN
2411791762
NaN
1702496
14
BKF
1.764427e+09
14
315970772
46045
SD
EDMUNDS
D
P
2007-03-05
005657
121
45
...
NM
NaN
2837193155
NaN
2240820
837
BKF
1.299543e+09
15
315970773
46045
SD
EDMUNDS
D
P
2007-03-05
005659
121
46
...
NM
NaN
2837193156
NaN
2240820
838
BKF
1.349484e+09
16
315970774
46045
SD
EDMUNDS
D
P
2007-03-05
005660
121
47
...
NM
NaN
2837193941
NaN
2240820
357
BKF
-8.435763e+08
17
315970775
46045
SD
EDMUNDS
D
P
2007-03-06
005664
121
48
...
NM
NaN
2837193942
NaN
2240820
358
BKF
-1.511650e+09
18
315970776
46045
SD
EDMUNDS
D
P
2007-03-06
005667
121
49
...
NM
NaN
2837193157
NaN
2240820
839
BKF
-7.015568e+08
19
315970777
46045
SD
EDMUNDS
D
P
2007-03-06
005668
121
50
...
NM
NaN
2837193158
NaN
2240820
840
BKF
-1.419252e+09
20
315970778
46045
SD
EDMUNDS
D
P
2007-03-07
005678
121
51
...
NM
NaN
2837193159
NaN
2240820
841
BKF
1.278553e+09
21
315975260
46047
SD
FALL RIVER
D
P
2008-08-25
041770
133
327
...
NaN
322642117
NaN
6704
1
BKF
-1.826860e+09
22
315975261
46047
SD
FALL RIVER
D
P
2008-08-25
041780
133
329
...
NaN
322642118
NaN
6704
2
BKF
-5.297475e+08
23
315975262
46047
SD
FALL RIVER
D
P
2008-08-25
041783
133
332
...
NaN
322642119
NaN
1062517
3
BKF
-1.285240e+09
24
315975263
46047
SD
FALL RIVER
D
P
2008-08-27
041789
133
337
...
NaN
322642120
NaN
6704
4
BKF
-5.618417e+07
25
315975264
46047
SD
FALL RIVER
D
P
2008-08-28
041794
133
341
...
NaN
322642121
NaN
6704
5
BKF
-9.101336e+08
26
315975265
46047
SD
FALL RIVER
D
P
2008-08-28
041797
133
343
...
NaN
322642122
NaN
6704
6
BKF
5.707815e+08
27
315975266
46047
SD
FALL RIVER
D
P
2008-08-28
041799
133
344
...
NaN
322642123
NaN
6704
7
BKF
5.342536e+07
28
315975267
46047
SD
FALL RIVER
D
P
2008-08-29
041800
133
345
...
NaN
322642124
NaN
6704
8
BKF
8.652645e+08
29
315975268
46047
SD
FALL RIVER
D
P
2008-09-02
041802
133
346
...
NaN
322642125
NaN
6704
9
BKF
1.137384e+09
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
434192
411233641
46099
SD
MINNEHAHA
D
P
2013-05-08
NaN
549
213
...
MA
NaN
2861326246
NaN
2283726
25523
BKF
-4.062573e+08
434193
411233642
46099
SD
MINNEHAHA
D
P
2013-05-08
NaN
549
215
...
NM
NaN
2795005386
NaN
2186724
25524
BKF
1.131106e+09
434194
411233643
46099
SD
MINNEHAHA
D
P
2013-05-08
NaN
549
217
...
MA
NaN
2861326322
NaN
2283726
25525
BKF
1.710760e+09
434195
411233644
46099
SD
MINNEHAHA
D
P
2013-05-08
NaN
549
218
...
NM
NaN
2795005388
NaN
2186724
25526
BKF
-1.354516e+09
434196
411233645
46099
SD
MINNEHAHA
D
P
2013-05-08
NaN
549
221
...
NM
NaN
2795005389
NaN
2186724
25527
BKF
-1.732588e+09
434197
411233646
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
222
...
NM
NaN
2795005390
NaN
2186724
25528
BKF
-1.095525e+09
434198
411233647
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
223
...
MA
NaN
2861326345
NaN
2283726
25529
BKF
8.309003e+08
434199
411233648
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
224
...
MA
NaN
2861326139
NaN
2283726
25530
BKF
-3.064645e+08
434200
411233649
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
225
...
NM
NaN
2795005393
NaN
2186724
25531
BKF
1.904139e+09
434201
411233650
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
226
...
MA
NaN
2861326136
NaN
2283726
25532
BKF
-1.619932e+09
434202
411233651
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
227
...
MA
NaN
2861326334
NaN
2283726
25533
BKF
-1.400223e+09
434203
411233652
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
228
...
MA
NaN
2861326275
NaN
2283726
25534
BKF
-7.713479e+08
434204
411233653
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
231
...
MA
NaN
2861326286
NaN
2283726
25535
BKF
-1.473133e+09
434205
411233654
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
232
...
MA
NaN
2861326223
NaN
2283726
25536
BKF
2.092558e+09
434206
411233655
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
233
...
NM
NaN
2795005399
NaN
2186724
25537
BKF
-1.097496e+09
434207
411233656
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
235
...
MA
NaN
2861326177
NaN
2283726
25538
BKF
-6.867208e+08
434208
411233657
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
237
...
MA
NaN
2861326332
NaN
2283726
25539
BKF
-6.489207e+08
434209
411233658
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
241
...
MA
NaN
2861326144
NaN
2283726
25540
BKF
1.901485e+09
434210
411233659
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
242
...
MA
NaN
2861326242
NaN
2283726
25541
BKF
7.517638e+08
434211
411233660
46099
SD
MINNEHAHA
D
P
2013-05-09
NaN
549
243
...
MA
NaN
2861326184
NaN
2283726
25542
BKF
9.468836e+08
434212
411233661
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
245
...
MA
NaN
2861326352
NaN
2283726
25543
BKF
-9.727775e+08
434213
411233662
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
246
...
NM
NaN
2795005406
NaN
2186724
25544
BKF
3.757779e+08
434214
411233663
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
249
...
MA
NaN
2861326255
NaN
2283726
25545
BKF
1.460535e+09
434215
411233664
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
250
...
MA
NaN
2861326361
NaN
2283726
25546
BKF
-1.850310e+09
434216
411233665
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
251
...
MA
NaN
2861326368
NaN
2283726
25547
BKF
3.817695e+08
434217
411233666
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
252
...
NM
NaN
2795005410
NaN
2186724
25548
BKF
-9.039898e+08
434218
411233667
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
254
...
MA
NaN
2861326133
NaN
2283726
25549
BKF
2.129362e+08
434219
411233668
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
255
...
MA
NaN
2861326204
NaN
2283726
25550
BKF
-9.487834e+08
434220
411233669
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
258
...
NM
NaN
2795005413
NaN
2186724
25551
BKF
-1.026002e+09
434221
411233670
46099
SD
MINNEHAHA
D
P
2013-05-10
NaN
549
259
...
NM
NaN
2795005414
NaN
2186724
25552
BKF
6.311876e+08
434222 rows × 131 columns
In [112]:
df5 = zillow_helpers.txt_to_column_df('data/46/ZTrans/Main.txt', 'column_helper/main.csv')
df5.head(5)
Out[112]:
TransId
FIPS
State
County
DataClassStndCode
RecordTypeStndCode
RecordingDate
RecordingDocumentNumber
RecordingBookNumber
RecordingPageNumber
...
MatchStndCode
REOStndCode
UpdateOwnershipFlag
LoadID
StatusInd
TransactionTypeStndCode
BatchID
BKFSPID
ZVendorStndCode
SourceChkSum
0
315852373
46009
SD
BON HOMME
D
P
2009-01-06
NaN
96
0001
...
NM
NaN
2411791749
NaN
1702496
1
BKF
1.651814e+09
1
315852374
46009
SD
BON HOMME
D
P
2009-01-07
NaN
96
0002
...
NM
NaN
2411791750
NaN
1702496
2
BKF
1.531757e+08
2
315852375
46009
SD
BON HOMME
D
P
2009-01-09
NaN
96
0003
...
NM
NaN
2411791751
NaN
1702496
3
BKF
-6.570774e+08
3
315852376
46009
SD
BON HOMME
D
P
2009-01-09
NaN
96
0004
...
NM
NaN
2411791752
NaN
1702496
4
BKF
-8.579398e+08
4
315852377
46009
SD
BON HOMME
D
P
2009-01-13
NaN
96
0005
...
NM
NaN
2411791753
NaN
1702496
5
BKF
3.610861e+08
5 rows × 131 columns
In [113]:
df6 = zillow_helpers.txt_to_column_df('data/46/ZTrans/PropertyInfo.txt', 'column_helper/propertyinfo.csv')
df6.head(5)
Out[113]:
TransId
AssessorParcelNumber
APNIndicatorStndCode
TaxIDNumber
TaxIDIndicatorStndCode
UnformattedAssessorParcelNumber
AlternateParcelNumber
HawaiiCondoCPRCode
PropertyHouseNumber
PropertyHouseNumberExt
...
PropertyAddressMatchType
PropertyAddressDPV
PropertyGeocodeQualityCode
PropertyAddressQualityCode
FIPS
LoadID
ImportParcelID
BKFSPID
AssessmentRecordMatchFlag
BatchID
0
315997382
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
46061
2565142377
NaN
2
0
1872223
1
315997383
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
46061
2565142378
NaN
3
0
1872223
2
315997384
103-57-022-004-000-02
NaN
NaN
NaN
1035702200400002
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
46061
2565142119
119206404.0
1
1
1872223
3
315997385
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
46061
2565142379
NaN
4
0
1000198961
4
315997386
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
46061
2565142380
NaN
5
0
1000198961
5 rows × 68 columns
In [114]:
df7 = df5.merge(df6, left_on='TransId', right_on='TransId', how='outer')
df7.head(5)
Out[114]:
TransId
FIPS_x
State
County
DataClassStndCode
RecordTypeStndCode
RecordingDate
RecordingDocumentNumber
RecordingBookNumber
RecordingPageNumber
...
PropertyAddressMatchType
PropertyAddressDPV
PropertyGeocodeQualityCode
PropertyAddressQualityCode
FIPS_y
LoadID_y
ImportParcelID
BKFSPID_y
AssessmentRecordMatchFlag
BatchID_y
0
315852373
46009
SD
BON HOMME
D
P
2009-01-06
NaN
96
0001
...
NaN
NaN
NaN
NaN
46009.0
2.411792e+09
NaN
1.0
0.0
1702496.0
1
315852374
46009
SD
BON HOMME
D
P
2009-01-07
NaN
96
0002
...
NaN
NaN
NaN
NaN
46009.0
2.411792e+09
NaN
2.0
0.0
1702496.0
2
315852375
46009
SD
BON HOMME
D
P
2009-01-09
NaN
96
0003
...
NaN
NaN
NaN
NaN
46009.0
2.411792e+09
NaN
3.0
0.0
1702496.0
3
315852376
46009
SD
BON HOMME
D
P
2009-01-09
NaN
96
0004
...
NaN
NaN
NaN
NaN
46009.0
2.411792e+09
NaN
4.0
0.0
1702496.0
4
315852377
46009
SD
BON HOMME
D
P
2009-01-13
NaN
96
0005
...
NaN
NaN
NaN
NaN
46009.0
2.411792e+09
NaN
5.0
0.0
1702496.0
5 rows × 198 columns
In [115]:
df5.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434222 entries, 0 to 434221
Columns: 131 entries, TransId to SourceChkSum
dtypes: float64(53), int64(5), object(73)
memory usage: 434.0+ MB
In [119]:
df6.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439769 entries, 0 to 439768
Data columns (total 68 columns):
TransId 439769 non-null int64
AssessorParcelNumber 217117 non-null object
APNIndicatorStndCode 361 non-null object
TaxIDNumber 0 non-null float64
TaxIDIndicatorStndCode 914 non-null object
UnformattedAssessorParcelNumber 217117 non-null object
AlternateParcelNumber 0 non-null float64
HawaiiCondoCPRCode 222390 non-null object
PropertyHouseNumber 142392 non-null object
PropertyHouseNumberExt 0 non-null float64
PropertyStreetPreDirectional 266342 non-null object
PropertyStreetName 142420 non-null object
PropertyStreetSuffix 139201 non-null object
PropertyStreetPostDirectional 232125 non-null object
PropertyBuildingNumber 0 non-null float64
PropertyFullStreetAddress 219292 non-null object
PropertyCity 217827 non-null object
PropertyState 220841 non-null object
PropertyZip 223761 non-null float64
PropertyZip4 197977 non-null float64
OriginalPropertyFullStreetAddress 3006 non-null object
OriginalPropertyAddressLastline 0 non-null float64
PropertyAddressStndCode 872 non-null object
LegalLot 297233 non-null object
LegalOtherLot 7 non-null object
LegalLotCode 263757 non-null object
LegalBlock 213365 non-null object
LegalSubdivisionName 341412 non-null object
LegalCondoProjectPUDDevName 0 non-null float64
LegalBuildingNumber 0 non-null float64
LegalUnit 3365 non-null object
LegalSection 138 non-null object
LegalPhase 1051 non-null object
LegalTract 28982 non-null object
LegalDistrict 8932 non-null object
LegalMunicipality 0 non-null float64
LegalCity 258950 non-null object
LegalTownship 2638 non-null object
LegalSTRSection 0 non-null float64
LegalSTRTownship 0 non-null float64
LegalSTRRange 0 non-null float64
LegalSTRMeridian 0 non-null float64
LegalSecTwnRngMer 32660 non-null object
LegalRecordersMapReference 9108 non-null object
LegalDescription 15180 non-null object
LegalLotSize 50904 non-null object
PropertySequenceNumber 439769 non-null int64
PropertyAddressMatchcode 217521 non-null object
PropertyAddressUnitDesignator 10745 non-null object
PropertyAddressUnitNumber 11251 non-null object
PropertyAddressCarrierRoute 190553 non-null object
PropertyAddressGeoCodeMatchCode 217379 non-null object
PropertyAddressLatitude 59856 non-null float64
PropertyAddressLongitude 59856 non-null float64
PropertyAddressCensusTractAndBlock 59856 non-null float64
PropertyAddressConfidenceScore 0 non-null float64
PropertyAddressCBSACode 0 non-null float64
PropertyAddressCBSADivisionCode 0 non-null float64
PropertyAddressMatchType 0 non-null float64
PropertyAddressDPV 0 non-null float64
PropertyGeocodeQualityCode 0 non-null float64
PropertyAddressQualityCode 0 non-null float64
FIPS 439769 non-null int64
LoadID 439769 non-null int64
ImportParcelID 209399 non-null float64
BKFSPID 439769 non-null int64
AssessmentRecordMatchFlag 439769 non-null int64
BatchID 439769 non-null int64
dtypes: float64(25), int64(7), object(36)
memory usage: 228.2+ MB
In [120]:
df7.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 439771 entries, 0 to 439770
Columns: 198 entries, TransId to BatchID_y
dtypes: float64(84), int64(5), object(109)
memory usage: 667.7+ MB
In [123]:
from utils import zillow_helpers as zh
In [122]:
datadir2 = 'data/46/ZTrans/'
datadir4 = 'data/50/ZTrans/'
chelpMain = 'column_helper/main.csv'
chelpPI = 'column_helper/propertyinfo.csv'
In [124]:
df8 = zh.txt_to_column_df(datadir2+'Main.txt', chelpMain)
df9 = zh.txt_to_column_df(datadir4+'Main.txt', chelpMain)
In [125]:
df8.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434222 entries, 0 to 434221
Columns: 131 entries, TransId to SourceChkSum
dtypes: float64(53), int64(5), object(73)
memory usage: 434.0+ MB
In [126]:
df8.head(3)
Out[126]:
TransId
FIPS
State
County
DataClassStndCode
RecordTypeStndCode
RecordingDate
RecordingDocumentNumber
RecordingBookNumber
RecordingPageNumber
...
MatchStndCode
REOStndCode
UpdateOwnershipFlag
LoadID
StatusInd
TransactionTypeStndCode
BatchID
BKFSPID
ZVendorStndCode
SourceChkSum
0
315852373
46009
SD
BON HOMME
D
P
2009-01-06
NaN
96
0001
...
NM
NaN
2411791749
NaN
1702496
1
BKF
1.651814e+09
1
315852374
46009
SD
BON HOMME
D
P
2009-01-07
NaN
96
0002
...
NM
NaN
2411791750
NaN
1702496
2
BKF
1.531757e+08
2
315852375
46009
SD
BON HOMME
D
P
2009-01-09
NaN
96
0003
...
NM
NaN
2411791751
NaN
1702496
3
BKF
-6.570774e+08
3 rows × 131 columns
In [127]:
df9.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 789800 entries, 0 to 789799
Columns: 131 entries, TransId to SourceChkSum
dtypes: float64(82), int64(6), object(43)
memory usage: 789.4+ MB
In [128]:
df9.head(3)
Out[128]:
TransId
FIPS
State
County
DataClassStndCode
RecordTypeStndCode
RecordingDate
RecordingDocumentNumber
RecordingBookNumber
RecordingPageNumber
...
MatchStndCode
REOStndCode
UpdateOwnershipFlag
LoadID
StatusInd
TransactionTypeStndCode
BatchID
BKFSPID
ZVendorStndCode
SourceChkSum
0
349481687
50005
VT
CALEDONIA
D
P
1963-11-18
845731
56
34
...
NM
NaN
2768030159
NaN
2137747
37256
BKF
-1350954867
1
349481688
50005
VT
CALEDONIA
D
P
1985-12-05
800475
NaN
NaN
...
NM
NaN
2768030160
NaN
2137747
37257
BKF
1782568710
2
349481691
50005
VT
CALEDONIA
D
P
1987-01-01
501211
NaN
NaN
...
NM
NaN
2768030164
NaN
2137747
37261
BKF
1669866767
3 rows × 131 columns
In [129]:
df10 = pd.concat([df8,df9])
In [130]:
df10.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1224022 entries, 0 to 789799
Columns: 131 entries, TransId to SourceChkSum
dtypes: float64(53), int64(5), object(73)
memory usage: 1.2+ GB
In [ ]:
Content source: vt-zillow-project/data-usage
Similar notebooks: