In [11]:
import csv
import re
import collections
import json

txt_file = r"../data/a-001.txt"
out_file = r"../data/a-001.json"
out_csv = r"../data/a-001.csv"

Loop Through Each Line in the TXT file


In [4]:
parsed_data = {}  
section = 0

with open(txt_file) as f:
    for line in f:
        line = line.strip()
        line = ' '.join(line.split())
        
        if line.startswith("Security Map of"):
            section = 1
            parsed_data['0.b'] = ' '.join(f.readline().strip().split())
        elif line.startswith("a. Increasing"):
            parsed_data['1.a.1'] = ' '.join(f.readline().strip().split())
        elif line.startswith("Decreasing"):
            parsed_data['1.a.2'] = ' '.join(f.readline().strip().split())
        elif line.startswith("Static"):
            parsed_data['1.a.3'] = ' '.join(f.readline().strip().split())
        elif line.startswith("b. Class and Occupation"):
            parsed_data['1.b'] = ' '.join(f.readline().strip().split())
        elif line.startswith("c. Foreign Families"):
            parsed_data['1.c.1'] = ' '.join(f.readline().strip().split())
        elif line.startswith("Nationalities"):
            parsed_data['1.c.2'] = ' '.join(f.readline().strip().split())
        elif line.startswith("d. Negro"):
            datum = f.readline().strip() # 0 %
            no_percentage = re.match('[\d]', datum).group(0) # if wanting only the number 
            parsed_data['1.d'] = ' '.join(no_percentage.split())
        elif line.startswith("e. Shifting or Infiltration"):
            parsed_data['1.e'] = ' '.join(f.readline().strip().split())
        elif line.startswith("PREDOMINATING"):
            parsed_data['2.2'] = ' '.join(f.readline().strip().split())
        elif line.startswith("OTHER TYPE"):
            parsed_data['2.3'] = ' '.join(f.readline().strip().split())
            
        # Two-line reading
        elif line.startswith("a. Type and Size"):    
            parsed_data['2.a.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.a.2'] = ' '.join(k.strip().split())
        elif line.startswith("b. Construction"):    
            parsed_data['2.b.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.b.2'] = ' '.join(k.strip().split())
        elif line.startswith("c. Average Age"):    
            parsed_data['2.c.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.c.2'] = ' '.join(k.strip().split())
        elif line.startswith("d. Repair"):    
            parsed_data['2.d.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.d.2'] = ' '.join(k.strip().split())
        elif line.startswith("e. Occupancy"):    
            parsed_data['2.e.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.e.2'] = ' '.join(k.strip().split())
        elif line.startswith("f. Owner-occupied"):    
            parsed_data['2.f.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.f.2'] = ' '.join(k.strip().split())
        elif line.startswith("g. 1935 Price Bracket"):    
            parsed_data['2.g.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            k = f.readline()
            parsed_data['2.g.2'] = ' '.join(k.strip().split())
        elif line.startswith("h. 1937 Price Bracket"):    
            parsed_data['2.h.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.h.2'] = ' '.join(k.strip().split())
            k = f.readline()
            parsed_data['2.h.3'] = ' '.join(k.strip().split())
            k = f.readline()
            parsed_data['2.h.4'] = ' '.join(k.strip().split())
            k = f.readline()
            if (k.strip() == 'i.'):
                k = f.readline()
                parsed_data['2.i.1'] = ' '.join(k.strip().split())
                k = f.readline()
                k = f.readline()
                
                # omitting $ sign
                r = re.compile(r"\$")
                parsed_data['2.i.3'] = ' '.join(r.sub("", k).strip().split())
                k = f.readline()
                parsed_data['2.i.4'] = ' '.join(k.strip().split())
                k = f.readline()
                parsed_data['2.i.5'] = ' '.join(k.strip().split())          
                k = f.readline()
                parsed_data['2.i.6'] = ' '.join(k.strip().split())
        elif line.startswith("j. Sales Demand"):    
            parsed_data['2.j.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.j.2'] = ' '.join(k.strip().split())
        elif line.startswith("k. Predicted Price Trend"):    
            parsed_data['2.k.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.k.2'] = ' '.join(k.strip().split())
        elif line.startswith("l. 1935 Rent Bracket"):    
            parsed_data['2.l.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            k = f.readline()
            parsed_data['2.l.2'] = ' '.join(k.strip().split())            
        elif line.startswith("m. 1937 Rent Bracket"):    
            parsed_data['2.m.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.m.2'] = ' '.join(k.strip().split())
            k = f.readline()
            parsed_data['2.m.3'] = ' '.join(k.strip().split())
            k = f.readline()
            parsed_data['2.m.4'] = ' '.join(k.strip().split())
            k = f.readline()
            if (k.strip() == 'n.'):
                k = f.readline()
                parsed_data['2.n.1'] = ' '.join(k.strip().split())
                k = f.readline()
                k = f.readline()
    
                # omitting $ sign
                r = re.compile(r"\$")
                parsed_data['2.n.3'] = ' '.join(r.sub("", k).strip().split())
                k = f.readline()
                parsed_data['2.n.4'] = ' '.join(k.strip().split())
                k = f.readline()
                parsed_data['2.n.5'] = ' '.join(k.strip().split())          
                k = f.readline()
                parsed_data['2.n.6'] = ' '.join(k.strip().split())
            
        elif line.startswith("o. Rental Demand"):    
            parsed_data['2.o.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.o.2'] = ' '.join(k.strip().split())
            
        elif line.startswith("p. Predicted Rent Trend"):    
            parsed_data['2.p.1'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['2.p.2'] = ' '.join(k.strip().split())
        elif line.startswith("3. NEW CONSTRUCTION"):    
            parsed_data['3.a'] = ' '.join(f.readline().strip().split())
        elif line.startswith("Type & Price"):
            parsed_data['3.b'] = ' '.join(f.readline().strip().split())
        elif line.startswith("How Selling"):
            parsed_data['3.c'] = ' '.join(f.readline().strip().split())
            section = 4
        elif section == 4: 
            if line.startswith("a. HOLC"):
                parsed_data['4.a'] = ' '.join(f.readline().strip().split())
            elif line.startswith("b. Institutions"):
                parsed_data['4.b'] = ' '.join(f.readline().strip().split())
                section = 5
        elif section == 5: 
            if line.startswith("5. SALE OF HOME PROPERTIES"):
                # How do you extract only the digit?
                parsed_data['5'] = ' '.join(f.readline().strip().split())
            elif line.startswith("a. HOLC"):
                parsed_data['5.a'] = ' '.join(f.readline().strip().split())
            elif line.startswith("b. Institutions"):
                parsed_data['5.b'] = ' '.join(f.readline().strip().split())
            section = 6
        
        elif line.startswith("6. MORTGAGE FUNDS"):
            parsed_data['6'] = ' '.join(f.readline().strip().split())
        elif line.startswith("7. TOTAL TAX RATE PER $1000"):
            parsed_data['7.a'] = ' '.join(f.readline().strip().split())
            k = f.readline()
            parsed_data['7.b'] = ' '.join(k.strip().split())
        elif line.startswith("8. DESCRIPTION AND CHARACTERISTICS OF AREA"):
            parsed_data['8'] = ' '.join(f.readline().strip().split()) 
            section = 9
        elif section == 9:
            if line.startswith("9. LOCATION"):
                parsed_data['9.a'] = ' '.join(f.readline().strip().split()) 
            elif line.startswith("SECURITY GRADE"):
                parsed_data['9.b'] = ' '.join(f.readline().strip().split())
            elif line.startswith("AREA NO."):
                parsed_data['9.c'] = ' '.join(f.readline().strip().split())
            elif line.startswith("DATE"):
                parsed_data['9.d'] = ' '.join(f.readline().strip().split())
                k=f.readline()
                parsed_data['9.e'] = ' '.join(k.strip().split())

Sort the parsed data in the Dictionary


In [5]:
od = collections.OrderedDict(sorted(parsed_data.items()))

In [9]:
od


Out[9]:
OrderedDict([('0.b', 'Los Angeles County'),
             ('1.a.1', 'Rapidly'),
             ('1.a.2', ''),
             ('1.a.3', ''),
             ('1.b',
              'Motion picture stars, executives & technicians, professional_and business men. Income $3600-$6000 & up.'),
             ('1.c.1', '0 %'),
             ('1.c.2', '--'),
             ('1.d', '0'),
             ('1.e', 'None apparent'),
             ('2.2', '85 %'),
             ('2.3', '15 %'),
             ('2.a.1', '5,6 & 7 rooms'),
             ('2.a.2', 'Larger type'),
             ('2.b.1', 'Frame, stucco & masonry'),
             ('2.b.2', ''),
             ('2.c.1', '2 years'),
             ('2.c.2', ''),
             ('2.d.1', 'Good'),
             ('2.d.2', ''),
             ('2.e.1', '99%'),
             ('2.e.2', ''),
             ('2.f.1', '90%'),
             ('2.f.2', ''),
             ('2.g.1', '$ --'),
             ('2.g.2', '$'),
             ('2.h.1', '$ 5000-7500 & up'),
             ('2.h.2', '%'),
             ('2.h.3', '$'),
             ('2.h.4', '%'),
             ('2.i.1', '1939'),
             ('2.i.3', '5000-7500 & up'),
             ('2.i.4', '%'),
             ('2.i.5', '$'),
             ('2.i.6', '%'),
             ('2.j.1', 'Good'),
             ('2.j.2', ''),
             ('2.k.1', 'Static & up'),
             ('2.k.2', ''),
             ('2.l.1', '$ No record of rentals'),
             ('2.l.2', '$'),
             ('2.m.1', '$ No record of rentals'),
             ('2.m.2', '%'),
             ('2.m.3', '$'),
             ('2.m.4', '%'),
             ('2.n.1', '1939'),
             ('2.n.3', '--'),
             ('2.n.4', '%'),
             ('2.n.5', '$'),
             ('2.n.6', '%'),
             ('2.o.1', 'Good'),
             ('2.o.2', ''),
             ('2.p.1', '--'),
             ('2.p.2', ''),
             ('3.a', '250'),
             ('3.b', '5,6 & 7 rm. $5500-$7500 & up $25,000 & up Mansion type'),
             ('3.c', 'Moderately'),
             ('4.a', '1'),
             ('4.b', 'Few'),
             ('5', '(3 yr.)'),
             ('6', 'Ample'),
             ('7.a', '(1937-1938)'),
             ('7.b', '$54.95'),
             ('8',
              'Terrain: Level to rolling hillside. No construction hazard or flood threat. Land improved 30%. Highly deed restricted and protected from racial hazards. Conveniences are all readily available with exception of transportation which is as yet only fair. Street improvements are still in process of construction. This area, located on the southern rim of the San Fernando Valley, was subdivided some 15 years ago, and substantial street improvements were installed. The depression retarded development. Under the stimulus of well directed promotional effort and FHA financing a revival started some 4 years ago and it is now one of the "hot spots" in the Valley. Construction and maintenance are of high quality. Architectural designs are harmonious. Location is favorable and attractive. Population is homogeneous. Lots are generally of extra size and sold on "homesite" basis and at widely varying prices according to size and location. The pattern of the area is well established and it is accorded a "low green" grade.'),
             ('9.a', 'Encino'),
             ('9.b', '1st -'),
             ('9.c', 'A-1'),
             ('9.d', '3/23/39'),
             ('9.e',
              'CAUTION: This area is currently affected in whole or in part by an Ad valorem Tax District. Individual properties should be checked for this hazard.')])

Save the Dictionary in JSON


In [12]:
with open(out_file, 'w') as outfile:
    json.dump(od, outfile)

Save the Dictionary in CSV


In [13]:
with open(out_csv, 'w') as f:  # Just use 'w' mode in 3.x
    w = csv.DictWriter(f, od.keys())
    w.writeheader()
    w.writerow(od)

In [ ]: