In [ ]:
import json
from bs4 import BeautifulSoup
import requests
from pprint import pprint
import re
import html5lib
import numpy as np
import pandas as pd

def pop_type(dens):
    """Classify a population density as 'rural', 'suburban' or 'urban'.

    Args:
        dens: Numeric population density. The unit is whatever the caller
            supplies; the thresholds are simply 300 and 1000 in that unit.

    Returns:
        'rural' when ``dens`` is below 300, 'urban' when it is above 1000,
        and 'suburban' for everything in between (both bounds inclusive).
    """
    # Guard clause for the low end, then one expression for the other two bands.
    if dens < 300:
        return 'rural'
    return 'urban' if dens > 1000 else 'suburban'
    
def parse_density_table(s):
    """Extract the population density from a page's geography infobox.

    The infobox is the ``<table>`` whose class attribute is exactly
    ``infobox geography vcard``. The first header cell containing the word
    "Density" that has a data cell beside it is used; the number in front of
    the first "/" in that cell (thousands separators removed) is returned.

    Args:
        s: Parsed page (a BeautifulSoup document or any object exposing the
            same ``find`` / ``find_all`` API).

    Returns:
        The density as a float, or None when the infobox has no usable
        "Density" row.

    Raises:
        ValueError: If the page has no infobox table, or the density cell
            does not contain a "<number>/<unit>" value.
    """
    table = s.find('table', attrs={'class': 'infobox geography vcard'})
    if table is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError("page has no 'infobox geography vcard' table")

    for row in table.find_all('tr'):
        for header in row.find_all('th'):
            if "Density" not in header.text:
                continue
            # find_next_sibling('td') skips any whitespace text node between
            # the cells, so this works whether or not the markup has one.
            cell = header.find_next_sibling('td')
            if cell is None:
                # A header without a value beside it; keep looking.
                continue
            value, slash, _unit = cell.text.replace(",", "").partition("/")
            if not slash:
                raise ValueError(
                    "density cell has no '<number>/<unit>' value: %r" % cell.text
                )
            return float(value)
    return None

def check_city(city, state, file):
    """Report whether a (city, state) pair is already present in ``file``.

    Args:
        city: City name to look for.
        state: State value that must appear on the same row as ``city``.
        file: Table indexable as ``file[0][i]`` (city column) and
            ``file[1][i]`` (state column) for ``i`` in ``range(len(file[0]))``.

    Returns:
        True if some row holds both ``city`` and ``state``, otherwise False.
    """
    names = file[0]
    # The state column is only consulted for rows whose city already matches.
    return any(
        names[idx] == city and file[1][idx] == state
        for idx in range(len(names))
    )

def _annual_temperature(row):
    """Return the value in a climate row's 13th data cell as a float."""
    cells = row.find_all('td')
    # Index 12 is the 13th cell; presumably twelve monthly columns followed
    # by the annual figure -- TODO confirm against the table layout.
    if len(cells) <= 12:
        raise ValueError("climate row has fewer than 13 data cells")
    tokens = cells[12].text.split()
    if not tokens:
        raise ValueError("climate row has an empty 13th data cell")
    # Only the first whitespace-separated token is used. A Unicode minus sign
    # (U+2212) is normalised to ASCII '-' because float() rejects it.
    return float(tokens[0].replace('\u2212', '-'))


def parse_climate_table(s):
    """Extract the "Average high" and "Average low" figures from a page.

    The climate table is the last ``<table>`` whose first row has a header
    cell starting with "climate data" (case-insensitive). Its rows are scanned
    in order: every "Average high" row updates the high value, and the first
    "Average low" row sets the low value and ends the scan.

    Args:
        s: Parsed page (a BeautifulSoup document or any object exposing the
            same ``find`` / ``find_all`` API).

    Returns:
        A ``(temp_high, temp_low)`` tuple of floats.

    Raises:
        ValueError: If the page has no climate table, a needed row is missing,
            or a value cell is absent or not numeric.
    """
    climate_table = None
    for table in s.find_all('table'):
        first_row = table.find('tr')
        title = first_row.find('th') if first_row is not None else None
        if title is not None and title.text.lower().startswith('climate data'):
            # No break: when several tables qualify, the last one is used.
            climate_table = table
    if climate_table is None:
        raise ValueError("no climate data table found")

    temp_high = None
    temp_low = None
    for row in climate_table.find_all('tr'):
        header = row.find('th')
        if header is None:
            # Rows without a header cell carry nothing to match on.
            continue
        if "Average high" in header.text:
            temp_high = _annual_temperature(row)
        if "Average low" in header.text:
            temp_low = _annual_temperature(row)
            break

    if temp_high is None or temp_low is None:
        raise ValueError("climate table lacks an 'Average high' or 'Average low' row")
    return temp_high, temp_low
    
# Input: one JSON object per line describing a city.
# Output: a header-less CSV of "city,state,density" rows, appended to on each
# run and re-read at start-up so that already-scraped cities are skipped.
CITIES_JSON_PATH = 'C:/Users/David/Downloads/all_cities_data.json'
DENSITY_CSV_PATH = 'densityTable.csv'
WIKIPEDIA_INDEX_URL = "https://en.wikipedia.org/w/index.php"
REQUEST_TIMEOUT_SECONDS = 30

with open(CITIES_JSON_PATH, 'r') as data_file:
    # Blank lines (e.g. a trailing newline) are skipped rather than fed to json.loads.
    all_cities = [json.loads(line) for line in data_file if line.strip()]

try:
    file = pd.read_csv(DENSITY_CSV_PATH, header=None)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # First run: nothing has been scraped yet, so start from an empty table
    # that still has the city (0) and state (1) columns check_city reads.
    file = pd.DataFrame(columns=[0, 1, 2])

for entry in all_cities:
    if check_city(entry['name'], entry['state'], file):
        print('Skipping: ' + entry['name'] + ', ' + entry['state'])
        continue
    # Only scrape cities that have a "job" key, a positive cost and at least
    # one zipcode.
    if not ("job" in entry and entry["cost"] > 0 and len(entry["zipcode"]) != 0):
        continue

    city = entry['name']
    state = entry['state']
    if not (isinstance(city, str) and isinstance(state, str)):
        print('No Result:', str(city) + ',', state)
        continue

    # str.replace returns a new string, so the result must be kept. The
    # underscored form is used only for the page title: `city` itself is
    # written to the CSV unchanged because check_city compares the CSV
    # against entry['name'].
    title = city.replace(' ', '_') + ",_" + state

    # Let requests build and escape the query string, and bound the wait so
    # one stalled connection cannot hang the whole run.
    try:
        handle = requests.get(
            WIKIPEDIA_INDEX_URL,
            params={'title': title, 'printable': 'yes'},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException:
        # Nothing is written, so this city is retried on the next run.
        print('No Result:', city + ',', state)
        continue

    # HTML parsing
    soup = BeautifulSoup(handle.text, 'html.parser')

    # Parse table for DENSITY. A parse failure is recorded as 0; a page with
    # an infobox but no density row is recorded as None.
    DENSITY_OUT = 0
    try:
        DENSITY_OUT = parse_density_table(soup)
    except Exception:
        # `Exception`, not a bare except, so Ctrl-C still stops the loop.
        print('No Density:', city + ',', state)

    # TODO: climate extraction (parse_climate_table) and the pop_type
    # classification are not written to the output yet.

    # Output to file. UTF-8 matches the default encoding pd.read_csv uses
    # when this file is read back on the next run.
    output = city + ',' + state + ',' + str(DENSITY_OUT) + '\n'
    print(output)
    with open(DENSITY_CSV_PATH, 'a', encoding='utf-8') as fd:
        fd.write(output)

In [ ]:


In [ ]: