In [6]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append("/usr/local/lib/python3.5/dist-packages")
Dictionary Analysis on HTML from wget
run!
In [84]:
# import necessary libraries
import os, re, fnmatch # for navigating file trees and working with strings
import csv # for reading in CSV files
#from glob import glob # for finding files within nested folders--compare with os.walk
import json, pickle # For saving and loading dictionaries, etc. to/from file in JSON and pickle formats
from datetime import datetime # For timestamping files
import sys # For working with user input
import logging # for logging output, to help with troubleshooting
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer()
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import urllib, urllib.request # for testing pages
from unicodedata import normalize # for cleaning text by converting unicode character encodings into readable format
#import shelve # For working with big dictionary files without having the whole file in memory at once
import pandas as pd # modifies data more efficiently than with a list of dicts
from tqdm import tqdm # For progress information during involved Pandas operations
# Import parser
from bs4 import BeautifulSoup # BS reads and parses even poorly/unreliably coded HTML
from bs4.element import Comment # helps with detecting inline/junk tags when parsing with BS
import lxml # for fast HTML parsing with BS, compared to "html.parser"
bsparser = "lxml"
In [85]:
# ### Set script options
Debug = False # Set to "True" for extra progress reports while algorithms run
notebook = True # Use different file paths depending on whether files are being accessed from shell (False) or within a Jupyter notebook (True)
usefile = False # Set to "True" if loading from file a dicts_list to add to. Confirms with user input first!
workstation = False # If working from office PC
if notebook:
usefile = False # Prompting user for input file is only useful in command-line
inline_tags = ["b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "dfn",
"em", "kbd", "strong", "samp", "var", "bdo", "map", "object", "q",
"span", "sub", "sup"] # this list helps with eliminating junk tags when parsing HTML
In [86]:
# ### Set directories
if workstation and notebook:
dir_prefix = "C:\\Users\\Jaren\\Documents\\" # One level further down than the others
elif notebook:
dir_prefix = "/home/jovyan/work/"
else:
dir_prefix = "/vol_b/data/"
example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"
save_dir = dir_prefix + "Charter-school-identities" + os.sep + "data" + os.sep # Directory in which to save data files
dicts_dir = dir_prefix + "Charter-school-identities" + os.sep + "dicts" + os.sep # Directory in which to find & save dictionary files
temp_dir = save_dir + "temp" + os.sep # Directory in which to save temporary data files
micro_sample13 = save_dir + "micro-sample13_coded.csv" # Random micro-sample of 300 US charter schools
URL_schooldata = save_dir + "charter_URLs_2014.csv" # 2014 population of 6,973 US charter schools
full_schooldata = save_dir + "charter_merged_2014.csv" # Above merged with PVI, EdFacts, year opened/closed
temp_data = save_dir + "school_parser_temp.json" # Full_schooldata dict with output for some schools
example_file = save_dir + "example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"
In [87]:
# Set logging options
log_file = temp_dir + "dict_parsing_" + str(datetime.today()) + ".log"
logging.basicConfig(filename=log_file,level=logging.INFO)
In [88]:
# Set input file, if any
if usefile and not notebook:
print("\nWould you like to load from file a list of dictionaries to add to? (Y/N)")
answer = input()
if answer == "Y":
print("Please indicate file path for dictionary list file.")
answer2 = input()
if os.path.exists(answer2):
input_file = answer2
usefile = True
else:
print("Invalid file path. Aborting script.")
sys.exit()
elif answer == "N":
print("OK! This script will create a new file for this list of dictionaries.")
usefile = False
else:
print("Response not interpretable. Aborting script.")
sys.exit()
In [89]:
# ### Define (non-parsing) helper functions
def get_vars(data):
"""Defines variable names based on the data source called."""
if data==URL_schooldata:
URL_variable = "TRUE_URL"
NAME_variable = "SCH_NAME"
ADDR_variable = "ADDRESS"
elif data==full_schooldata:
URL_variable = "SCH_NAME" # Work-around until URLs merged into full data file
NAME_variable = "SCH_NAME"
ADDR_variable = "ADDRESS14"
elif data==micro_sample13:
URL_variable = "URL"
NAME_variable = "SCHNAM"
ADDR_variable = "ADDRESS"
    else:
        print("ERROR: No data source established! Could not process variables from data file " + str(data))
        return None, None, None # Unrecognized data source, so there are no variable names to return
    return (URL_variable, NAME_variable, ADDR_variable)
def tag_visible(element):
"""Returns false if a web element has a non-visible tag,
i.e. one site visitors wouldn't actually read--and thus one we don't want to parse"""
if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
return False
if isinstance(element, Comment):
return False
return True
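# A minimal sketch (not part of the pipeline) of how tag_visible() could be combined with
# BeautifulSoup to pull only reader-visible text; the helper name here is illustrative.
def visible_text_sketch(html_string):
    """Hypothetical helper: return reader-visible strings from raw HTML,
    using tag_visible() to drop text inside <script>, <style>, comments, etc."""
    soup = BeautifulSoup(html_string, bsparser)
    texts = soup.findAll(text=True) # Every text string in the document
    visible = filter(tag_visible, texts) # Keep only strings whose parent tag is visible
    return " ".join(t.strip() for t in visible if t.strip())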
def webtext_from_files(datalocation):
"""Concatenate and return a single string from all webtext (with .txt format) in datalocation"""
string = ""
for root, dirs, files in os.walk(datalocation):
for file in files:
if file.endswith(".txt"):
                with open(os.path.join(root, file), "r") as fileloc: # Join with root so files in subfolders open correctly
                    string = string + fileloc.read()
return string
def remove_spaces(file_path):
"""Remove spaces from text file at file_path"""
words = [x for x in open(file_path).read().split() if x != ""]
text = ""
for word in words:
text += word + " "
return text
def write_errors(error_file, error1, error2, error3, file_count):
"""Writes to error_file three binary error flags derived from parse_school():
duplicate_flag, parse_error_flag, wget_fail_flag, and file_count."""
with open(error_file, 'w') as file_handler:
file_handler.write("duplicate_flag {}\n".format(int(error1)))
file_handler.write("parse_error_flag {}\n".format(int(error2)))
file_handler.write("wget_fail_flag {}\n".format(int(error3)))
file_handler.write("file_count {}".format(int(file_count)))
return
def write_counts(file_path, names_list, counts_list):
"""Writes to file_path the input dict_count names (a list) and counts (another list).
Assumes these two lists have same length and are in same order--
e.g., names_list[0]="ess_count" and counts_list[0]=ess_count."""
    with open(file_path, 'w') as file_handler:
        pairs = list(zip(names_list, counts_list)) # zipped list of (name, count) tuples
        for i, tup in enumerate(pairs):
            if i < len(pairs) - 1:
                file_handler.write("{} {}\n".format(tup[0], tup[1]))
            else:
                file_handler.write("{} {}".format(tup[0], tup[1])) # No trailing newline after the last pair
return
def write_list(file_path, textlist):
"""Writes textlist to file_path. Useful for recording output of parse_school()."""
with open(file_path, 'w') as file_handler:
for elem in textlist:
file_handler.write("{}\n".format(elem))
return
def load_list(file_path):
"""Loads list into memory. Must be assigned to object."""
textlist = []
with open(file_path) as file_handler:
line = file_handler.readline()
while line:
textlist.append(line)
line = file_handler.readline()
return textlist
def save_datafile(data, file, thismode):
"""BROKEN for saving to CSV Pandas DataFrames (only saves header) and lists of dicts (only saves keys).
Saves data to file using JSON, pickle, or CSV format (whichever was specified).
Works with Pandas DataFrames or other objects, e.g. a list of dictionaries.
Deletes file first to reduce risk of data duplication."""
file = str(file)
thismode = str(thismode)
try:
if os.path.exists(file):
os.remove(file) # Delete file first to reduce risk of data duplication
else:
pass
if thismode.upper()=="JSON" or thismode.upper()==".JSON":
if not file.endswith(".json"):
file += ".json"
if type(data)=="pandas.core.frame.DataFrame":
data.to_json(file)
else:
with open(file, 'w') as outfile:
json.dump(data, outfile, encoding="utf-8")
#print("Data saved to " + file + "!")
elif thismode.lower()=="pickle" or thismode.lower()==".pickle":
if not file.endswith(".pickle"):
file += ".pickle"
if type(data)=="pandas.core.frame.DataFrame":
data.to_pickle(file, encoding="utf-8")
else:
with open(file, "wb") as outfile:
pickle.dump(data, outfile, encoding="utf-8")
#print("Data saved to " + file + "!")
elif thismode.upper()=="CSV" or thismode.upper()==".CSV":
if not file.endswith(".csv"):
file += ".csv"
if type(data)=="pandas.core.frame.DataFrame":
if os.path.exists(file): # If file already exists, assume we are appending to it (with same column names)
data.to_csv(file,mode="a",index=False,sep="\t",header=False,encoding="utf-8")
else: # If file doesn't exist, create it
data.to_csv(file,mode="w",index=False,sep="\t",header=data.columns.values,encoding="utf-8")
else:
with open(file, "w") as outfile:
wr = csv.writer(outfile)
wr.writerows(data)
#print("Data saved to " + file + "!")
else:
print("ERROR! Improper arguments. Please include: data object to save (Pandas DataFrames OK), file path, and file format ('JSON', 'pickle', or 'CSV').")
except Exception as e:
print("Failed to save to " + str(file) + " into memory using " + str(thismode) + " format. Please check arguments (data, file, file format) and try again.")
print(e)
def load_datafile(file):
"""Loads dicts_list (or whatever) from file, using either JSON or pickle format.
The created object should be assigned when called."""
file = str(file)
if file.lower().endswith(".json"):
with open(file,'r') as infile:
var = json.load(infile)
if file.lower().endswith(".pickle"):
with open(file,'rb') as infile:
var = pickle.load(infile)
print(file + " successfully loaded!")
return var
def load_dict(custom_dict, file_path):
"""Loads in a dictionary. Adds each entry from the dict at file_path to the defined set custom_dict (the input),
which can also be an existing dictionary. This allows the creation of combined dictionaries!"""
with open(file_path) as file_handler:
line = file_handler.readline()
while line:
            custom_dict.add(stemmer.stem(line.replace("\n", ""))) # Stem each dictionary entry and strip the newline before adding
            line = file_handler.readline() # Move on to the next line
return custom_dict
def list_files(folder_path, extension):
"""Outputs a list of every file in folder_path or its subdirectories that has a specified extension.
Prepends specified extension with '.' if it doesn't start with it already.
If no extension is specified, it just returns all files in folder_path."""
matches = []
if extension:
extension = str(extension) # Coerce to string, just in case
if extension and not extension.startswith("."):
extension = "." + extension
    for dirpath,dirnames,filenames in os.walk(folder_path):
        if extension:
            for filename in fnmatch.filter(filenames, "*" + extension): # Use extension to filter list of files
                matches.append(os.path.join(dirpath,filename))
        else:
            for filename in filenames: # If no extension, just take all files
                matches.append(os.path.join(dirpath,filename))
return matches
def has_html(folder_path):
"""Simple function that counts .html files and returns a binary:
'True' if a specified folder has any .html files in it, 'False' otherwise."""
html_list = []
for dirpath,dirnames,filenames in os.walk(folder_path):
for file in fnmatch.filter(filenames, "*.html"): # Check if any HTML files in folder_path
html_list.append(file)
if len(html_list)==0:
return False
else:
return True
def convert_df(df):
"""Makes a Pandas DataFrame more memory-efficient through intelligent use of Pandas data types:
specifically, by storing columns with repetitive Python strings not with the object dtype for unique values
(entirely stored in memory) but as categoricals, which are represented by repeated integer values. This is a
net gain in memory when the reduced memory size of the category type outweighs the added memory cost of storing
one more thing. As such, this function checks the degree of redundancy for a given column before converting it."""
converted_df = pd.DataFrame() # Initialize DF for memory-efficient storage of strings (object types)
# TO DO: Infer dtypes of df
df_obj = df.select_dtypes(include=['object']).copy() # Filter to only those columns of object data type
for col in df.columns:
if col in df_obj:
num_unique_values = len(df_obj[col].unique())
num_total_values = len(df_obj[col])
if (num_unique_values / num_total_values) < 0.5: # Only convert data types if at least half of values are duplicates
converted_df.loc[:,col] = df[col].astype('category') # Store these columns as dtype "category"
else:
converted_df.loc[:,col] = df[col]
else:
converted_df.loc[:,col] = df[col]
    # Downcast numeric columns to smaller dtypes where possible (assign results back so the downcast sticks)
    for col in converted_df.select_dtypes(include=['float']).columns:
        converted_df[col] = pd.to_numeric(converted_df[col], downcast='float')
    for col in converted_df.select_dtypes(include=['int']).columns:
        converted_df[col] = pd.to_numeric(converted_df[col], downcast='signed')
    return converted_df
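A quick way to sanity-check the savings from convert_df (a sketch; assumes a DataFrame such as schooldf is already in memory, and uses memory_usage(deep=True) so the underlying strings are counted):
In [ ]:
def report_memory_savings(df):
    """Hypothetical check (not part of the pipeline): compare DataFrame memory before and after convert_df()."""
    before = df.memory_usage(deep=True).sum() # deep=True measures the Python strings themselves
    after = convert_df(df).memory_usage(deep=True).sum()
    print("Before: {:.1f} MB; after: {:.1f} MB".format(before / 1e6, after / 1e6))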
In [90]:
os.getcwd()
Out[90]:
In [91]:
#school_index = next((index for (index, d) in enumerate(dicts_list) if d["folder_name"] == "Natomas_Charter_CA"), None) # Find index of that school
#print(school_index)
#print(dicts_list[748]["folder_name"])
In [92]:
thispath = wget_dataloc + "Natomas_Charter_CA/"
html_list = []
for dirpath,dirnames,filenames in os.walk(thispath):
for file in fnmatch.filter(filenames, "*.html"): # Check if any HTML files in folder_path
html_list.append(file)
print(len(html_list))
#for dirpath,dirnames,filenames in os.walk(thispath):
# print(len([file for file in fnmatch.filter(filenames, "*.html")]))
In [93]:
def set_fail_flag2(folder_name):
"""The web_fail_flag indicates whether the webcrawl/download operation failed to capture any .html for a particular folder_name.
This function sets the web_fail_flag depending on two conditions:
(1) Whether or not there exists a web download folder corresponding to folder_name, and
(2) Whether or not that folder contains at least one file with the .html extension."""
global wget_dataloc,dicts_list # Need access to the dictionary file
web_fail_flag = "" # make output a str to work with currently limited Pandas dtype conversion functionality
folder_path = str(wget_dataloc) + folder_name + "/"
if (not os.path.exists(folder_path)) or (has_html(folder_path)==False):
web_fail_flag = str(1) # If folder doesn't exist, mark as fail and ignore when loading files
else:
web_fail_flag = str(0) # make str so can work with currently limited Pandas dtype conversion functionality
match_index = next((index for (index, d) in enumerate(dicts_list) if d["folder_name"] == folder_name), None) # Find dict index of input/folder_name
dicts_list[match_index]['wget_fail_flag'] = web_fail_flag # Assign output to dict entry for folder_name
return
In [94]:
#print(dicts_list[748]["wget_fail_flag"])
#print(dicts_list[748]["folder_name"])
In [95]:
#set_fail_flag2("Natomas_Charter_CA")
#print(dicts_list[748]["wget_fail_flag"])
In [96]:
# ### Set parsing keywords
keywords = ['values', 'academics', 'skills', 'purpose',
'direction', 'mission', 'vision', 'vision', 'mission', 'our purpose',
'our ideals', 'ideals', 'our cause', 'curriculum','curricular',
'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system',
'structure','philosophy', 'philosophical', 'beliefs', 'believe',
'principles', 'creed', 'credo', 'values','moral', 'history', 'our story',
'the story', 'school story', 'background', 'founding', 'founded',
'established','establishment', 'our school began', 'we began',
'doors opened', 'school opened', 'about us', 'our school', 'who we are',
'our identity', 'profile', 'highlights']
mission_keywords = ['mission','vision', 'vision:', 'mission:', 'our purpose', 'our ideals', 'ideals:', 'our cause', 'cause:', 'goals', 'objective']
curriculum_keywords = ['curriculum', 'curricular', 'program', 'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system', 'structure']
philosophy_keywords = ['philosophy', 'philosophical', 'beliefs', 'believe', 'principles', 'creed', 'credo', 'value', 'moral']
history_keywords = ['history', 'story','our story', 'the story', 'school story', 'background', 'founding', 'founded', 'established', 'establishment', 'our school began', 'we began', 'doors opened', 'school opened']
about_keywords = ['about us', 'our school', 'who we are', 'overview', 'general information', 'our identity', 'profile', 'highlights']
# Create sets for each aspect and one for all keywords
mission_keywords = set(stemmer.stem(word) for word in mission_keywords)
curriculum_keywords = set(stemmer.stem(word) for word in curriculum_keywords)
philosophy_keywords = set(stemmer.stem(word) for word in philosophy_keywords)
history_keywords = set(stemmer.stem(word) for word in history_keywords)
about_keywords = set(stemmer.stem(word) for word in about_keywords)
all_keywords = set(stemmer.stem(key) for key in keywords)
if Debug:
print("\nList of keywords:\n", list(all_keywords))
In [111]:
# ### Create dictionaries for each ideology and one for combined ideologies
ess_dict, prog_dict, rit_dict, all_ideol = set(), set(), set(), set()
all_ideol = load_dict(all_ideol, dicts_dir + "ess_dict.txt")
all_ideol = load_dict(all_ideol, dicts_dir + "prog_dict.txt")
ess_dict = load_dict(ess_dict, dicts_dir + "ess_dict.txt")
prog_dict = load_dict(prog_dict, dicts_dir + "prog_dict.txt")
rit_dict = load_dict(rit_dict, dicts_dir + "rit_dict.txt")
logging.info(str(len(all_ideol)) + " entries loaded into the combined ideology dictionary.")
list_dict = list(all_ideol)
list_dict.sort(key = lambda x: x.lower())
print("First 10 elements of combined ideology dictionary are:\n", list_dict[:10])
In [17]:
# ### Define list of tuples: keywords lists and their titles, for dictionary analyses
titles_list = ("mission","curriculum","philosophy","history","about","ideology","keywords")
keysnames_tupzip = list(zip((mission_keywords,curriculum_keywords,philosophy_keywords,history_keywords,about_keywords,\
                             all_ideol,all_keywords), titles_list)) # Wrap in list() so the pairs can be iterated more than once
dictsnames_list = ("ess", "prog", "rit", "all_ideol")
dictsnames_tupzip = list(zip((ess_dict,prog_dict,rit_dict,all_ideol), dictsnames_list)) # Same: zip objects are exhausted after one pass
if Debug:
print(list(keysnames_tupzip))
print()
print(list(dictsnames_tupzip))
In [45]:
# ### Define parsing helper functions
def parsefile_by_tags(HTML_file):
"""Cleans HTML by removing inline tags, ripping out non-visible tags,
replacing paragraph tags with a random string, and finally using this to separate HTML into chunks.
Reads in HTML from storage using a given filename, HTML_file."""
random_string = "".join(map(chr, os.urandom(75))) # Create random string for tag delimiter
soup = BeautifulSoup(open(HTML_file), "html5lib")
[s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])] # Remove non-visible tags
    for it in inline_tags:
        [s.unwrap() for s in soup(it)] # Unwrap inline tags (searching for "</b>" etc. matches nothing; pass the bare tag name)
    visible_text = soup.getText(random_string).replace("\n", "") # Insert random string between text chunks, eliminate newlines
# Split text into list using random string while also eliminating tabs and converting unicode to readable text:
visible_text = list(normalize("NFKC",elem.replace("\t","")) for elem in visible_text.split(random_string))
# TO DO: Eliminate anything with a '\x' in it (after splitting by punctuation)
visible_text = list(filter(lambda vt: vt.split() != [], visible_text)) # Eliminate empty elements
# Consider joining list elements together with newline in between by prepending with: "\n".join
return(visible_text)
In [46]:
if Debug:
example_textlist = parsefile_by_tags(example_file)
print("Output of parsefile_by_tags:\n\n", example_textlist, "\n\n")
In [64]:
# ### Define dictionary matching helper functions
def dict_count(text_list, custom_dict):
"""Performs dictionary analysis, returning number of dictionary hits found.
Removes punctuation and stems the phrase being analyzed.
Compatible with multiple-word dictionary elements."""
counts = 0 # number of matches between text_list and custom_dict
dictless_list = [] # Updated text_list with dictionary hits removed
max_entry_length = max([len(entry.split()) for entry in custom_dict]) # Get length (in words) of longest entry in combined dictionary
    for chunk in text_list: # chunk may be several sentences or possibly paragraphs long
        chunk = re.sub(r'[^\w\s]', '', chunk) # Remove punctuation with regex that keeps only letters and spaces
        # Do dictionary analysis for word chunks of lengths max_entry_length down to 1, removing matches each time.
        # This means longer dict entries will get removed first, useful in case they contain smaller entries.
        for length in range(max_entry_length, 0, -1):
            chunk, len_counts = dict_match_len(chunk, custom_dict, length) # Reassign chunk so matched words are removed before shorter lengths are tried
            counts += len_counts
        dictless_list.append(chunk) # Append each chunk once, after all match lengths have been checked
    return dictless_list, int(counts)
def dict_match_len(phrase, custom_dict, length):
"""Helper function to dict_match.
Returns # dictionary hits and updated copy of phrase with dictionary hits removed.
Stems phrases before checking for matches."""
hits_indices, counts = [], 0
splitted_phrase = phrase.split()
if len(splitted_phrase) < length:
return phrase, 0 # If text chunk is shorter than length of dict entries being matched, don't continue.
for i in range(len(splitted_phrase) - length + 1):
to_stem = ""
for j in range(length):
to_stem += splitted_phrase[i+j] + " " # Builds chunk of 'length' words
stemmed_word = stemmer.stem(to_stem[:-1]) # stem chunk
if stemmed_word in custom_dict:
hits_indices.append(i) # Store the index of the word that has a dictionary hit
counts += 1
#print(stemmed_word)
# Iterate through list of matching word indices and remove the matches
for i in range(len(hits_indices)-1, -1, -1):
splitted_phrase = splitted_phrase[:hits_indices[i]] + \
splitted_phrase[hits_indices[i] + length:]
modified_phrase = ""
for sp in splitted_phrase: # Rebuild the modified phrase, with matches removed
modified_phrase += sp + " "
return modified_phrase[:-1], counts
# @timeout_decorator.timeout(20, use_signals=False)
def dictmatch_file_helper(file, listlists, allmatch_count):
"""Counts number of matches in file for each list of terms given, and also collects the terms not matched.
listlists is a list of lists, each list containing:
a list of key terms--e.g., for dictsnames_biglist, currently essentialism, progressivism, ritualism, and all three combined (ess_dict, prog_dict, rit_dict, all_dicts);
    the variables used to store the number of matches for each term list (e.g., ess_count, prog_count, rit_count, alldict_count);
and the not-matches--that is, the list of words leftover from the file after all matches are removed (e.g., ess_dictless, prog_dictless, rit_dictless, alldict_dictless). """
    parsed_pagetext = parsefile_by_tags(file) # Parse the file once before comparing it against each dictionary
    for i in range(len(listlists)): # Iterate over dicts to find matches with parsed text of file
        # For dictsnames_list, dicts are: (ess_dict, prog_dict, rit_dict, all_ideol); count_names are: (ess_count, prog_count, rit_count, alldict_count); dictless_names are: (ess_dictless, prog_dictless, rit_dictless, alldict_dictless)
        # adict,count_name,dictless_name = dictsnames_tupzip[i]
        dictless_add,count_add = dict_count(parsed_pagetext,listlists[i][0])
listlists[i][1] += count_add
listlists[i][2] += dictless_add
allmatch_count += count_add
print("Discovered " + str(count_add) + " matches for " + str(file) + \
", a total thus far of " + str(allmatch_count) + " matches...")
return listlists,allmatch_count
In [48]:
if Debug:
print("\nOutput of dict_count with ideology dict:\n\n", dict_count(example_textlist,all_ideol), "\n\n")
In [49]:
def filter_dict_page(pagetext_list, keyslist):
"""Filters webtext of a given .html page, which is parsed and in list format, to only those strings
within pagetext_list containing an element (word or words) of inputted keyslist.
Returns list filteredtext wherein each element has original case (not coerced to lower-case)."""
    filteredtext = [] # Initialize empty list to hold strings of page
    keys_lower = [key.lower() for key in list(keyslist)] # Lower-case the keys once, for comparison with lower-cased strings
    for string in pagetext_list:
        lowercasestring = str(string).lower() # lower-case string for comparison
        for key in keys_lower:
            if key in lowercasestring and (" " in key or key in lowercasestring.split(' ')): # Whole-word check for single-word keys; substring check for multi-word keys
                filteredtext.append(string)
return filteredtext
In [50]:
if Debug:
print("Output of filter_dict_page:\n\n", filter_dict_page(example_textlist, all_keywords), "\n\n")
In [79]:
def filter_by_keycount(folder_path):
"""NOT USED.
Filters webtext for a given school to only those text chunks containing specified keywords.
Categorizes each block of text by scoring based on keyword count, using already-defined lists of keywords per category:
mission, philosophy, curriculum, history, "about"/general self-description, combined ideology, and all keywords."""
# TO DO: Fix this function! And compare speed with that of filter_dict_page() above, especially for longer pages.
# Initialize keyword lists to count over (must be defined outside function)
global mission_keywords,curriculum_keywords,philosophy_keywords,history_keywords,about_keywords,all_ideol,all_keywords
    mission_list,curriculum_list,philosophy_list,history_list,about_list,ideol_list,keys_list = [],[],[],[],[],[],[]
file_list = list_files(folder_path, ".html")
for file in tqdm(file_list, desc="Filtering by keys:"):
try:
pagetext_list = parsefile_by_tags(file)
for string in pagetext_list:
mission_score, curriculum_score, philosophy_score, history_score, about_score, ideol_score, keys_score = 0, 0, 0, 0, 0, 0, 0
for word in mission_keywords:
mission_score+=string.count(word)
if 'mission' in string.lower():
mission_score = 2
for word in curriculum_keywords:
curriculum_score+=string.count(word)
if 'curriculum' in string.lower():
curriculum_score = 2
for word in philosophy_keywords:
philosophy_score+=string.count(word)
if 'philosophy' in string.lower() or 'value' in string.lower():
philosophy_score = 2
for word in history_keywords:
history_score+=string.count(word)
if 'history' in string.lower():
history_score = 2
for word in about_keywords:
about_score+=string.count(word)
if 'about us' in string.lower() or "about-us" in string.lower():
about_score = 2
for word in all_ideol:
ideol_score+=string.count(word)
if mission_score>=2:
mission_list.append(string)
if curriculum_score>=2:
curriculum_list.append(string)
if philosophy_score>=2:
philosophy_list.append(string)
if history_score>=2:
history_list.append(string)
if about_score>=2:
about_list.append(string)
if ideol_score>=2:
ideol_list.append(string)
if ((mission_score + curriculum_score + philosophy_score + about_score) >=2):
                    keys_list.append(string) # Impute keyword counts using their ideological constituent elements--which excludes history_score
except Exception as e:
if Debug:
print(" ERROR categorizing " + str(file))
print(e)
continue
return mission_list, curriculum_list, philosophy_list, history_list, about_list, ideol_list, keys_list
In [81]:
print("Output of filter_by_keycount:\n\n", filter_by_keycount(example_folder), "\n\n")
In [118]:
def dict_bestmatch(folder_path, custom_dict):
"""Parse through all .html files in folder_path, detecting matches with custom_dict,
to find and return the full text from the html page that has the most matches with that dictionary."""
# Initialization
file_list = list_files(folder_path, ".html") # Get full list of file paths
num_pages = len(file_list) # Number of pages in school's folder
max_page_hits = (-1,-1) # Initialize tuple holding #hits, page number for HTML file with greatest # matches with custom_dict
max_weighted_score = (-1,-1) # Same as previous, but weighted by page length
max_hit_text,max_score_text = [],[] # Empty lists for each best matching pages
# Parse through pages to find maximum number of hits of custom_dict on any page
for pagenum in tqdm(range(num_pages), desc="Finding best match:"):
try:
            page_dict_hits,page_weighted_score = -1,-1 # Initialize per-page hit count and length-weighted score
page_textlist = parsefile_by_tags(file_list[pagenum]) # Parse page with index pagenum into text list
if len(page_textlist)==0: # If page is empty, don't bother with it
continue
dictless_text, page_dict_hits = dict_count(page_textlist, custom_dict) # Count matches between custom_dict and page_textlist using dict_count
numwords = len('\n'.join(page_textlist).split())
page_weighted_score = page_dict_hits / numwords # Weight score by number of words on page
logging.info("Found" + str(page_dict_hits) + "for page #" + str(pagenum) + "and " + str(page_dict_hits) + "weighting for the " + numwords + " words on that page.")
if page_dict_hits > max_page_hits[0]: # Compare matches for this page with overall max
max_page_hits = (page_dict_hits, pagenum) # If its greater, then make new page the max
if page_weighted_score > max_weighted_score[0]: # Same as previous two lines, but weighted by page length
max_weighted_score = (page_weighted_score, pagenum)
except Exception as e:
logging.debug(" ERROR counting dict matches in page #" + str(pagenum))
logging.debug(str(e))
continue
logging.info("Number matches and index of best matching page: " + str(max_page_hits[0]) + " " + str(max_page_hits[1]))
logging.info("Number matches and index of best WEIGHTED matching page: " + str(max_weighted_score[0]) + " " + str(max_weighted_score[1]))
# Use pagenum to get text for page with highest number of hits and weighted score:
max_hit_text = parsefile_by_tags(file_list[max_page_hits[1]])
max_score_text = parsefile_by_tags(file_list[max_weighted_score[1]])
logging.info("Page with the highest number of dictionary hits:\n\n" + str(max_hit_text))
logging.info("Page with the highest weighted score:\n\n" + str(max_score_text))
return max_hit_text,max_score_text
In [119]:
print("Output of dict_bestmatch for all ideologies:\n", dict_bestmatch(example_folder, mission_keywords), "\n\n" )
In [15]:
def parse_school(school_dict):
"""This core function parses webtext for a given school, using helper functions to run analyses and then saving multiple outputs to school_dict:
    counts of the number of matches between all text from a school's html pages and keywords from a defined keyword list, via dict_count();
and text contents of those individual pages best matching such keywords, via find_best_categories (in development).
For the sake of parsimony and manageable script calls, OTHER similar functions/scripts collect these additional outputs:
full (partially cleaned) webtext, by parsing webtext of each .html file (removing inline tags, etc.) within school's folder, via parsefile_by_tags();
filtered webtext, by keeping only those parsed text elements containing a keyword in previously defined keywords list, via filter_keywords_page();
and parsed webtext, having removed overlapping headers/footers common to multiple pages, via remove_overlaps()."""
# Allow function to access these variables already defined outside the function (globally)
global itervar,numschools,parsed,wget_dataloc,URL_var,NAME_var,ADDR_var
datalocation = wget_dataloc # Define path to local data storage
    school_name, school_address, school_URL = school_dict[NAME_var], school_dict[ADDR_var], school_dict[URL_var] # Define varnames from the input dict
itervar+=1 # Count this school
print("Parsing " + str(school_name) + ", which is school #" + str(itervar) + " of " + str(numschools) + "...")
# Initialize variables
school_dict['ess_strength'],school_dict['prog_strength'] = 0.0,0.0
if not usefile:
school_dict["duplicate_flag"], school_dict["parse_error_flag"] = 0, 0
# Assign folder names
folder_name = re.sub(" ","_",(school_name+" "+school_address[-8:-6]))
school_dict["folder_name"] = folder_name
school_folder = datalocation + folder_name + "/"
if school_URL==school_name:
school_URL = folder_name # Workaround for full_schooldata, which doesn't yet have URLs
# Check if folder exists. If not, exit function
if not (os.path.exists(school_folder) or os.path.exists(school_folder.lower()) or os.path.exists(school_folder.upper())):
print("!! NO DIRECTORY FOUND matching " + str(school_folder) + ".\n Aborting parsing function...\n\n")
school_dict['wget_fail_flag'] = 1
return
""" # Commented out until dict_bestmatch() works
try:
for keylist,title in list(keysnames_tupzip): # Names are: ("mission","curriculum","philosophy","history","about","ideology","keywords")
bestvar_name = title + "_best" # assign varname to use as dict key
school_dict[bestvar_name],school_dict[bestvar_name+"_weighted"] = [],[] # initialize dict key/value pair as empty string
best_page,best_page_weighted = dict_bestmatch(school_folder,keylist) # Find pages best corresponding to keyword category for each in keysnames_tupzip
school_dict[bestvar_name].extend(best_page)
school_dict[bestvar_name+"_weighted"].extend(best_page_weighted)
except Exception as e:
print(" ERROR! Failed to find best pages while parsing webtext of " + str(school_name))
print(" ",e)
"""
    try:
        school_textlist = [] # Parsed text from all .html files in the school's folder, gathered for dictionary counting
        for fname in list_files(school_folder, ".html"):
            school_textlist.extend(parsefile_by_tags(fname))
        for adict,name in list(dictsnames_tupzip): # Names are: ("ess", "prog", "rit", "all_ideol")
            dict_name = name + "_count"
            school_dict[dict_name] = dict_count(school_textlist,adict)[1] # dict_count() expects a list of text chunks, not a folder path
        school_dict['ess_strength'] = float(school_dict['ess_count'])/float(school_dict['rit_count'])
        school_dict['prog_strength'] = float(school_dict['prog_count'])/float(school_dict['rit_count'])
        print("    SUCCESS! Counted dictionary matches for " + str(school_name) + "...")
        save_datafile(dicts_list, save_dir+"school_dictcounts_temp", "JSON") # Save output so we can pick up where left off, in case something breaks before able to save final output
        return
    except Exception as e:
        print("    ERROR! Failed to count number of dict matches while parsing webtext of " + str(school_name))
        print("    ",e)
        school_dict["parse_error_flag"] = 1
        return
In [16]:
# ### Preparing data to be parsed
itervar = 0 # initialize iterator that counts number of schools already parsed
parsed = [] # initialize list of URLs that have already been parsed
dicts_list = [] # initialize list of dictionaries to hold school data
# If input_file was defined by user input in beginning of script, use that to load list of dictionaries. We'll add to it!
if usefile and not dicts_list:
dicts_list = load_datafile(input_file)
data_loc = full_schooldata # If loading data, assume we're running on full charter population
else:
# set charter school data file and corresponding varnames:
data_loc = full_schooldata # Run at scale using URL list of full charter population
# data_loc = micro_sample13 # This seems nice for debugging--except directories don't match because different data source
# Create dict list from CSV on file, with one dict per school
with open(data_loc, 'r', encoding = 'Latin1') as csvfile: # open data file
reader = csv.DictReader(csvfile) # create a reader
for row in reader: # loop through rows
dicts_list.append(row) # append each row to the list
URL_var,NAME_var,ADDR_var = get_vars(data_loc) # get varnames depending on data source
numschools = len(dicts_list) # Count number of schools in list of dictionaries
# Note on data structures: each row, dicts_list[i] is a dictionary with keys as column name and value as info.
# This will be translated into pandas data frame once (rather messy) website text is parsed into consistent variables
In [80]:
for school in dicts_list:
school["folder_name"] = re.sub(" ","_",(school[NAME_var]+" "+school[ADDR_var][-8:-6])) # This gives name and state separated by "_"
school["folder_path"] = str(wget_dataloc) + school["folder_name"] + "/" # This temporary variable simplifies next line of code
if (has_html(school["folder_path"])==False) or not os.path.exists(school["folder_path"]):
school['wget_fail_flag'] = str(1) # If folder doesn't exist, mark as fail and ignore when loading files
else:
school['wget_fail_flag'] = str(0) # make str so can work with currently limited Pandas dtype conversion functionality
In [88]:
print(schooldf[schooldf["folder_name"]=="Effie_Kokrine_Charter_School_AK"][["wget_fail_flag","folder_path"]])
In [90]:
print(schooldf[schooldf["folder_name"]=="Natomas_Charter_CA"][["wget_fail_flag","folder_path"]])
In [84]:
schooldf = pd.DataFrame.from_dict(dicts_list) # Convert dicts_list into a DataFrame
schooldf.info()
schooldf.head(4)
Out[84]:
In [43]:
schooldf["wget_fail_flag"] = schooldf["wget_fail_flag"].map({"1":True,1:True,"0":False,0:False}) # Convert to binary to use as conditional
In [49]:
schooldf[schooldf["wget_fail_flag"]==True][["folder_name","wget_fail_flag"]]
Out[49]:
In [39]:
schooldf.head(26)
Out[39]:
In [32]:
schooldf.wget_fail_flag.sum()
Out[32]:
In [79]:
tqdm.pandas(desc="Rocking pandas!")
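tqdm.pandas() registers progress_apply on DataFrames and Series; a sketch of how it could be used to re-check the download folders with a progress bar (the column name has_html_check is illustrative, not part of the pipeline):
In [ ]:
if Debug:
    # progress_apply behaves like apply but displays a tqdm progress bar during long row-wise operations
    schooldf["has_html_check"] = schooldf["folder_path"].progress_apply(has_html)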
In [ ]:
# ### Run parsing algorithm on schools (requires access to webcrawl output)
test_dicts = dicts_list[:1] # Limit number of schools to analyze (here, a one-school slice), in order to refine methods
if Debug:
for school in test_dicts:
parse_school(school)
else:
for school in dicts_list:
parse_school(school)
In [ ]:
# Check out results:
if Debug:
print(test_dicts[0])
else:
print(dicts_list[0])
In [ ]:
# Save output:
if Debug:
dictfile = "testing_dicts_" + str(datetime.today().strftime("%Y-%m-%d"))
    save_datafile(test_dicts, save_dir+dictfile, "JSON")
else:
dictfile = "school_dicts_" + str(datetime.today().strftime("%Y-%m-%d"))
    save_datafile(dicts_list, save_dir+dictfile, "JSON")