In [6]:
#!/usr/bin/env python
# -*- coding: UTF-8

import sys

sys.path.append("/usr/local/lib/python3.5/dist-packages")

Dictionary Analysis on HTML from wget run!

Initializing


In [84]:
# import necessary libraries
import os, re, fnmatch # for navigating file trees and working with strings
import csv # for reading in CSV files
#from glob import glob # for finding files within nested folders--compare with os.walk
import json, pickle # For saving a loading dictionaries, etc. from file with JSON and pickle formats
from datetime import datetime # For timestamping files
import sys # For working with user input
import logging # for logging output, to help with troubleshooting
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer()
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import urllib, urllib.request # for testing pages
from unicodedata import normalize # for cleaning text by converting unicode character encodings into readable format
#import shelve # For working with big dictionary files without having the whole file in memory at once
import pandas as pd # modifies data more efficiently than with a list of dicts
from tqdm import tqdm # For progress information during involved Pandas operations

# Import parser
from bs4 import BeautifulSoup # BS reads and parses even poorly/unreliably coded HTML 
from bs4.element import Comment # helps with detecting inline/junk tags when parsing with BS
import lxml # for fast HTML parsing with BS, compared to "html.parser"
bsparser = "lxml"

In [85]:
# ### Set script options

Debug = False # Set to "True" for extra progress reports while algorithms run
notebook = True # Use different file paths depending on whether files are being accessed from shell (False) or within a Jupyter notebook (True)
usefile = False # Set to "True" if loading from file a dicts_list to add to. Confirms with user input first!
workstation = False # If working from office PC

if notebook:
    usefile = False # Prompting user for input file is only useful in command-line

inline_tags = ["b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "dfn",
               "em", "kbd", "strong", "samp", "var", "bdo", "map", "object", "q",
               "span", "sub", "sup"] # this list helps with eliminating junk tags when parsing HTML

In [86]:
# ### Set directories

if workstation and notebook:
    dir_prefix = "C:\\Users\\Jaren\\Documents\\" # One level further down than the others
elif notebook:
    dir_prefix = "/home/jovyan/work/"
else:
    dir_prefix = "/vol_b/data/"

example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"

save_dir = dir_prefix + "Charter-school-identities" + os.sep + "data" + os.sep # Directory in which to save data files
dicts_dir = dir_prefix + "Charter-school-identities" + os.sep + "dicts" + os.sep # Directory in which to find & save dictionary files
temp_dir = save_dir + "temp" + os.sep # Directory in which to save temporary data files

micro_sample13 = save_dir + "micro-sample13_coded.csv" # Random micro-sample of 300 US charter schools
URL_schooldata = save_dir + "charter_URLs_2014.csv" # 2014 population of 6,973 US charter schools
full_schooldata = save_dir + "charter_merged_2014.csv" # Above merged with PVI, EdFacts, year opened/closed
temp_data = save_dir + "school_parser_temp.json" # Full_schooldata dict with output for some schools
example_file = save_dir + "example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"

In [87]:
# Set logging options
log_file = temp_dir + "dict_parsing_" + str(datetime.today()) + ".log"
logging.basicConfig(filename=log_file,level=logging.INFO)

In [88]:
# Set input file, if any
if usefile and not notebook:
    print("\nWould you like to load from file a list of dictionaries to add to? (Y/N)")
    answer = input()
    if answer == "Y":
        print("Please indicate file path for dictionary list file.")
        answer2 = input()
        if os.path.exists(answer2):
            input_file = answer2
            usefile = True
        else:
            print("Invalid file path. Aborting script.")
            sys.exit()

    elif answer == "N":
        print("OK! This script will create a new file for this list of dictionaries.")
        usefile = False
    
    else:
        print("Response not interpretable. Aborting script.")
        sys.exit()

In [89]:
# ### Define (non-parsing) helper functions

def get_vars(data):
    """Defines variable names based on the data source called."""
    
    if data==URL_schooldata:
        URL_variable = "TRUE_URL"
        NAME_variable = "SCH_NAME"
        ADDR_variable = "ADDRESS"
        
    elif data==full_schooldata:
        URL_variable = "SCH_NAME" # Work-around until URLs merged into full data file
        NAME_variable = "SCH_NAME"
        ADDR_variable = "ADDRESS14"
    
    elif data==micro_sample13:
        URL_variable = "URL"
        NAME_variable = "SCHNAM"
        ADDR_variable = "ADDRESS"
    
    else:
        try:
            print("Error processing variables from data file " + str(data) + "!")
        except Exception as e:
            print("ERROR: No data source established!\n")
            print(e)
    
    return(URL_variable,NAME_variable,ADDR_variable)


def tag_visible(element):
    """Returns false if a web element has a non-visible tag, 
    i.e. one site visitors wouldn't actually read--and thus one we don't want to parse"""
    
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def webtext_from_files(datalocation):
    """Concatenate and return a single string from all webtext (with .txt format) in datalocation"""
    
    string = ""
    for root, dirs, files in os.walk(datalocation):
        for file in files:
            if file.endswith(".txt"):
                fileloc = open(datalocation+file, "r")
                string = string + (fileloc.read())
    return string


def remove_spaces(file_path):
    """Remove spaces from text file at file_path"""
    
    words = [x for x in open(file_path).read().split() if x != ""]
    text = ""
    for word in words:
        text += word + " "
    return text


def write_errors(error_file, error1, error2, error3, file_count):
    """Writes to error_file three binary error flags derived from parse_school(): 
    duplicate_flag, parse_error_flag, wget_fail_flag, and file_count."""
    
    with open(error_file, 'w') as file_handler:
        file_handler.write("duplicate_flag {}\n".format(int(error1)))
        file_handler.write("parse_error_flag {}\n".format(int(error2)))
        file_handler.write("wget_fail_flag {}\n".format(int(error3)))
        file_handler.write("file_count {}".format(int(file_count)))
        return
    

def write_counts(file_path, names_list, counts_list):
    """Writes to file_path the input dict_count names (a list) and counts (another list).
    Assumes these two lists have same length and are in same order--
    e.g., names_list[0]="ess_count" and counts_list[0]=ess_count."""
    
    with open(file_path, 'w') as file_handler:
        for tup in zip(names_list,counts_list): # iterate over zipped list of tuples
            if tup != list(zip(names_list,counts_list))[-1]:
                file_handler.write("{} {}\n".format(tup[0],tup[1]))
            else:
                file_handler.write("{} {}".format(tup[0],tup[1]))
        return

    
def write_list(file_path, textlist):
    """Writes textlist to file_path. Useful for recording output of parse_school()."""
    
    with open(file_path, 'w') as file_handler:
        for elem in textlist:
            file_handler.write("{}\n".format(elem))
        return
    

def load_list(file_path):
    """Loads list into memory. Must be assigned to object."""
    
    textlist = []
    with open(file_path) as file_handler:
        line = file_handler.readline()
        while line:
            textlist.append(line)
            line = file_handler.readline()
    return textlist

        
def save_datafile(data, file, thismode):
    """BROKEN for saving to CSV Pandas DataFrames (only saves header) and lists of dicts (only saves keys).
    Saves data to file using JSON, pickle, or CSV format (whichever was specified).
    Works with Pandas DataFrames or other objects, e.g. a list of dictionaries.
    Deletes file first to reduce risk of data duplication."""
    
    file = str(file)
    thismode = str(thismode)
    
    try:
        if os.path.exists(file):
            os.remove(file) # Delete file first to reduce risk of data duplication
        else:
            pass
        
        if thismode.upper()=="JSON" or thismode.upper()==".JSON":
            if not file.endswith(".json"):
                file += ".json"
            
            if type(data)=="pandas.core.frame.DataFrame":
                data.to_json(file)
            
            else:
                with open(file, 'w') as outfile:
                    json.dump(data, outfile, encoding="utf-8")
            
            #print("Data saved to " + file + "!")

        elif thismode.lower()=="pickle" or thismode.lower()==".pickle":
            if not file.endswith(".pickle"):
                file += ".pickle"
                
            if type(data)=="pandas.core.frame.DataFrame":
                data.to_pickle(file, encoding="utf-8")
                
            else:
                with open(file, "wb") as outfile:
                    pickle.dump(data, outfile, encoding="utf-8")
                    
            #print("Data saved to " + file + "!")
                
        elif thismode.upper()=="CSV" or thismode.upper()==".CSV":
            if not file.endswith(".csv"):
                file += ".csv"
                
            if type(data)=="pandas.core.frame.DataFrame":
                if os.path.exists(file): # If file already exists, assume we are appending to it (with same column names)
                    data.to_csv(file,mode="a",index=False,sep="\t",header=False,encoding="utf-8")
                else: # If file doesn't exist, create it
                    data.to_csv(file,mode="w",index=False,sep="\t",header=data.columns.values,encoding="utf-8")
                
            else:
                with open(file, "w") as outfile:
                    wr = csv.writer(outfile)
                    wr.writerows(data)
                
            #print("Data saved to " + file + "!")

        else:
            print("ERROR! Improper arguments. Please include: data object to save (Pandas DataFrames OK), file path, and file format ('JSON', 'pickle', or 'CSV').")
    
    except Exception as e:
        print("Failed to save to " + str(file) + " into memory using " + str(thismode) + " format. Please check arguments (data, file, file format) and try again.")
        print(e)
    

def load_datafile(file):
    """Loads dicts_list (or whatever) from file, using either JSON or pickle format. 
    The created object should be assigned when called."""
    
    file = str(file)
    
    if file.lower().endswith(".json"):
        with open(file,'r') as infile:
            var = json.load(infile)
    
    if file.lower().endswith(".pickle"):
        with open(file,'rb') as infile:
            var = pickle.load(infile)
        
    print(file + " successfully loaded!")
    return var


def load_dict(custom_dict, file_path):
    """Loads in a dictionary. Adds each entry from the dict at file_path to the defined set custom_dict (the input), 
    which can also be an existing dictionary. This allows the creation of combined dictionaries!"""

    with open(file_path) as file_handler:
        line = file_handler.readline()
        while line:
            custom_dict.add(stemmer.stem(line.replace("\n", ""))) # Add line after stemming dictionary entries and eliminating newlines
            line = file_handler.readline() # Look for anything else in that line, add that too
    return custom_dict


def list_files(folder_path, extension):
    """Outputs a list of every file in folder_path or its subdirectories that has a specified extension.
    Prepends specified extension with '.' if it doesn't start with it already.
    If no extension is specified, it just returns all files in folder_path."""
    
    matches = []
    if extension:
        extension = str(extension) # Coerce to string, just in case
    
    if extension and not extension.startswith("."):
        extension = "." + extension
    
    for dirpath,dirnames,filenames in os.walk(folder_path):
        if extension:
            for filename in fnmatch.filter(filenames, "*" + extension): # Use extension to filter list of files
                matches.append(os.path.join(dirpath,filename))
        else:
                matches.append(os.path.join(dirpath,filename)) # If no extension, just take all files
    return matches


def has_html(folder_path):
    """Simple function that counts .html files and returns a binary:
    'True' if a specified folder has any .html files in it, 'False' otherwise."""
    
    html_list = []
    for dirpath,dirnames,filenames in os.walk(folder_path):
        for file in fnmatch.filter(filenames, "*.html"): # Check if any HTML files in folder_path
            html_list.append(file)
    
    if len(html_list)==0:
        return False
    else:
        return True


def convert_df(df):
    """Makes a Pandas DataFrame more memory-efficient through intelligent use of Pandas data types: 
    specifically, by storing columns with repetitive Python strings not with the object dtype for unique values 
    (entirely stored in memory) but as categoricals, which are represented by repeated integer values. This is a 
    net gain in memory when the reduced memory size of the category type outweighs the added memory cost of storing 
    one more thing. As such, this function checks the degree of redundancy for a given column before converting it."""
    
    converted_df = pd.DataFrame() # Initialize DF for memory-efficient storage of strings (object types)
    # TO DO: Infer dtypes of df
    df_obj = df.select_dtypes(include=['object']).copy() # Filter to only those columns of object data type

    for col in df.columns: 
        if col in df_obj: 
            num_unique_values = len(df_obj[col].unique())
            num_total_values = len(df_obj[col])
            if (num_unique_values / num_total_values) < 0.5: # Only convert data types if at least half of values are duplicates
                converted_df.loc[:,col] = df[col].astype('category') # Store these columns as dtype "category"
            else: 
                converted_df.loc[:,col] = df[col]
        else:    
            converted_df.loc[:,col] = df[col]
                      
    converted_df.select_dtypes(include=['float']).apply(pd.to_numeric,downcast='float')
    converted_df.select_dtypes(include=['int']).apply(pd.to_numeric,downcast='signed')
    
    return converted_df

In [90]:
os.getcwd()


Out[90]:
'/home/jovyan/work/Charter-school-identities/scripts'

In [91]:
#school_index = next((index for (index, d) in enumerate(dicts_list) if d["folder_name"] == "Natomas_Charter_CA"), None) # Find index of that school
#print(school_index)
#print(dicts_list[748]["folder_name"])

In [92]:
thispath = wget_dataloc + "Natomas_Charter_CA/"

html_list = []
for dirpath,dirnames,filenames in os.walk(thispath):
    for file in fnmatch.filter(filenames, "*.html"): # Check if any HTML files in folder_path
        html_list.append(file)

print(len(html_list))
#for dirpath,dirnames,filenames in os.walk(thispath):
#    print(len([file for file in fnmatch.filter(filenames, "*.html")]))


0

In [93]:
def set_fail_flag2(folder_name):
    """The web_fail_flag indicates whether the webcrawl/download operation failed to capture any .html for a particular folder_name.
    This function sets the web_fail_flag depending on two conditions: 
    (1) Whether or not there exists a web download folder corresponding to folder_name, and
    (2) Whether or not that folder contains at least one file with the .html extension."""
    
    global wget_dataloc,dicts_list # Need access to the dictionary file
    web_fail_flag = "" # make output a str to work with currently limited Pandas dtype conversion functionality
    
    folder_path = str(wget_dataloc) + folder_name + "/"
    if (not os.path.exists(folder_path)) or (has_html(folder_path)==False):
        web_fail_flag = str(1) # If folder doesn't exist, mark as fail and ignore when loading files
    else:
        web_fail_flag = str(0) # make str so can work with currently limited Pandas dtype conversion functionality
    
    match_index = next((index for (index, d) in enumerate(dicts_list) if d["folder_name"] == folder_name), None) # Find dict index of input/folder_name
    dicts_list[match_index]['wget_fail_flag'] = web_fail_flag # Assign output to dict entry for folder_name
    
    return

In [94]:
#print(dicts_list[748]["wget_fail_flag"])
#print(dicts_list[748]["folder_name"])

In [95]:
#set_fail_flag2("Natomas_Charter_CA")
#print(dicts_list[748]["wget_fail_flag"])

In [96]:
# ### Set parsing keywords

keywords = ['values', 'academics', 'skills', 'purpose',
                       'direction', 'mission', 'vision', 'vision', 'mission', 'our purpose',
                       'our ideals', 'ideals', 'our cause', 'curriculum','curricular',
                       'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system',
                       'structure','philosophy', 'philosophical', 'beliefs', 'believe',
                       'principles', 'creed', 'credo', 'values','moral', 'history', 'our story',
                       'the story', 'school story', 'background', 'founding', 'founded',
                       'established','establishment', 'our school began', 'we began',
                       'doors opened', 'school opened', 'about us', 'our school', 'who we are',
                       'our identity', 'profile', 'highlights']

mission_keywords = ['mission','vision', 'vision:', 'mission:', 'our purpose', 'our ideals', 'ideals:', 'our cause', 'cause:', 'goals', 'objective']
curriculum_keywords = ['curriculum', 'curricular', 'program', 'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system', 'structure']
philosophy_keywords = ['philosophy', 'philosophical', 'beliefs', 'believe', 'principles', 'creed', 'credo', 'value',  'moral']
history_keywords = ['history', 'story','our story', 'the story', 'school story', 'background', 'founding', 'founded', 'established', 'establishment', 'our school began', 'we began', 'doors opened', 'school opened']
about_keywords =  ['about us', 'our school', 'who we are', 'overview', 'general information', 'our identity', 'profile', 'highlights']

# Create sets for each aspect and one for all keywords
mission_keywords = set(stemmer.stem(word) for word in mission_keywords)
curriculum_keywords = set(stemmer.stem(word) for word in curriculum_keywords)
philosophy_keywords = set(stemmer.stem(word) for word in philosophy_keywords)
history_keywords = set(stemmer.stem(word) for word in history_keywords)
about_keywords =  set(stemmer.stem(word) for word in about_keywords)
all_keywords = set(stemmer.stem(key) for key in keywords)

if Debug:
    print("\nList of keywords:\n", list(all_keywords))

In [111]:
# ### Create dictionaries for each ideology and one for combined ideologies

ess_dict, prog_dict, rit_dict, all_ideol = set(), set(), set(), set()
all_ideol = load_dict(all_ideol, dicts_dir + "ess_dict.txt")
all_ideol = load_dict(all_ideol, dicts_dir + "prog_dict.txt")
ess_dict = load_dict(ess_dict, dicts_dir + "ess_dict.txt")
prog_dict = load_dict(prog_dict, dicts_dir + "prog_dict.txt")
rit_dict = load_dict(rit_dict, dicts_dir + "rit_dict.txt")

logging.info(str(len(all_ideol)) + "entries loaded into the combined ideology dictionary.")
list_dict = list(all_ideol)
list_dict.sort(key = lambda x: x.lower())
print("First 10 elements of combined ideology dictionary are:\n", list_dict[:10])


First 10 elements of combined ideology dictionary are:
 ['abstract think', 'abstract thought', 'account', 'achievement gain', 'achievement gap', 'activi', 'adapt', 'agricult', 'anim', "another's sho"]

In [17]:
# ### Define list of tuples: keywords lists and their titles, for dictionary analyses

titles_list = ("mission","curriculum","philosophy","history","about","ideology","keywords")
keysnames_tupzip = zip((mission_keywords,curriculum_keywords,philosophy_keywords,history_keywords,about_keywords,\
                              all_ideol,all_keywords), titles_list)

dictsnames_list = ("ess", "prog", "rit", "all_ideol")
dictsnames_tupzip = zip((ess_dict,prog_dict,rit_dict,all_ideol), dictsnames_list)

if Debug:
    print(list(keysnames_tupzip))
    print()
    print(list(dictsnames_tupzip))


[({'our caus', 'cause:', 'object', 'goal', 'our purpos', 'mission:', 'our id', 'vision', 'vision:', 'ideals:', 'mission'}, 'mission'), ({'method', 'approach', 'system', 'model', 'program', 'curricular', 'pedagog', 'structur', 'curriculum', 'pedagogi'}, 'curriculum'), ({'valu', 'believ', 'philosoph', 'belief', 'creed', 'principl', 'credo', 'philosophi', 'moral'}, 'philosophy'), ({'doors open', 'school open', 'histori', 'stori', 'we began', 'our school began', 'found', 'school stori', 'the stori', 'our stori', 'background', 'establish'}, 'history'), ({'overview', 'general inform', 'highlight', 'about u', 'who we ar', 'our school', 'profil', 'our ident'}, 'about'), ({'prompt', 'conform', 'joy', 'observation of teacher state averag', 'jung', 'memory-and-dril', 'spiritu', 'reading & math', 'comedi', 'rebel', 'prison', 'observation on teacher state standard', 'worker-be', 'habits of mind', 'craftspeopl', 'collaborat', 'pace of the child', 'slant', 'organ', 'handicap', 'college-complet', 'art and mus', 'social', 'recover credit', 'suspens', 'back to bas', 'incom', 'of color', 'child-cent', 'recycl', 'nurtur', 'essential knowledg', 'free think', 'emot', 'farm', 'liberal art', 'cold-cal', 'immigr', 'factori', 'achievement gain', 'disabl', 'sincer', 'rules of conduct', 'figur', 'relate to oth', 'renaiss', 'own pac', 'chastis', 'theater', 'democraci', 'self-rul', 'child*cent', 'personaliz', 'social/emot', 'pollut', 'comput', 'paper-and-pencil', 'no child left behind', 'castig', 'compass', 'zest', 'college-bound', 'common cor', 'self-guid', 'critical think', 'explor', 'wage earn', 'rote', 'lessons videotap', 'gift', 'industri', 'self-direct', 'drone', 'regalia', 'outcom', 'discuss', 'sculpt', 'teacher observ', 'drop out', 'stiff', 'datum', 'soul', 'immers', 'fact', 'appetit', 'commerci', 'code of conduct', 'problem solv', 'essentialist', 'weav', 'statist', 'agricult', 'enthusia', 'liabil', 'interperson', 'obey', 'social problem', 'lawyer provid', 'immateri', 'open-mind', 'recorded lesson', 'manifest', 'harsh', 'materiali', 'recondit', 'seminar', 'creativ', 'wast', 'at-wil', 'legal servic', 'untam', 'soft', 'montessori', 'drama', 'prepared for university tough', 'teamwork', 'social & emot', 'at-risk', 'three-fold way', 'whole-child', 'cooperat', 'occup', 'international school', 'cadenc', 'ethnic', 'wildlif', 'respect', 'racist', 'breath', "child's pac", 'provide lawy', "another's sho", 'contend', 'procedur', 'apprais', 'inner world', 'blacksmith', 'standard', 'conver', 'freir', 'well', 'lawyer servic', 'abstract think', 'team', 'college-prep', 'home-school', 'vulner', 'structur', 'shame', 'interact', 'trust', 'college prep', 'nod', 'workplac', 'medical servic', 'talent', 'well-round', 'drill', 'peril', 'social act', 'spoon-fe', 'constructivi', 'compost', 'kipp', 'account', 'traditional math', 'worker be', 'machin', 'plant', 'protocol', 'laptop', 'music', 'socioemot', 'relate to p', 'celebrat', 'basic educ', 'spin', 'corpor', 'social servic', 'nclb', 'achievement gap', 'invent', 'desir', 'free thought', 'whole child', 'consum', 'geniu', 'independen', 'teach for america', 'result', 'feed', 'danc', 'warm', 'resol', 'independent thought', 'cycl', 'order', 'student cent', 'dewey', 'inspir', 'steiner', 'indic', 'landscap', 'impoverish', 'brotherhood', 'origin', 'spoon fe', 'project-bas', 'offici', 'behavioral cod', 'adapt', 'constructed knowledg', 'knowledg', 'lectur', 'groupwork', 'PC', 'traditional educ', 'mind', 'work-rel', 'holist', 'nonconcret', 'credit recoveri', 'craftsperson', 'calculat', 'construction of', 'friend', 'offic', 'chef', 'gaps in incom', 'cue', 'compet', 'racial ident', 'home school', 'basic knowledg', 'legal help', 'reprocess', 'milieu', 'gap in achiev', 'social and emot', 'differentiat', 'ethnic ident', 'reading and math', 'excit', 'correct', 'penal', 'data', 'learner-cent', 'intuit', 'college exemplar', 'prepared for colleg', 'sentiment', 'mainfram', 'rhythm', 'craft', 'inquiry-bas', 'foreign languag', 'artist', 'inner lif', 'puls', 'measur', 'employment servic', 'car', 'performing art', 'express', 'outdoor', 'diagnost', 'individualiz', 'teacher review', 'intang', 'college-ori', 'democrat', 'excess', 'contest', 'rules of behavior', 'health servic', 'tfa', 'multicultur', 'passion', 'genuin', 'the art', 'fine art', 'national averag', 'justic', 'provide a lawy', 'natur', 'gap in incom', 'authent', 'enrich', 'implement', 'tolerat', 'activi', 'college icon', 'defer', 'heartbeat', 'cook', 'expectations for behavior', 'perform', 'inner self', 'disadvantag', 'embarrass', 'enlighten', 'essenti', 'cross-cultur', 'music & art', 'medical help', 'inquiry bas', 'uniform', 'university prep', 'dress cod', 'drop-out', 'villag', 'spoon-f', 'progressivi', 'gaps in achiev', 'discov', 'habits of heart', 'project-ori', 'salvag', 'global', 'dedicat', 'learning contract', 'learner cent', 'kinship', 'quantif', 'reus', 'parent contract', 'play', 'surround', 'test', 'attach', 'recovery of credit', 'self-relian', 'heart', 'unconsci', 'cuisin', 'reggio emilia', 'independent think', 'skeptic', 'job-rel', 'health', 'festiv', 'college enrol', 'graduation r', 'back-to-bas', 'unequ', 'waldorf', 'job market', 'group', 'art & mus', 'reduced-pric', 'critical thought', 'lust', 'assess', 'student-cent', 'bona fid', 'regul', 'hierarchi', 'illustr', 'duti', 'socioeconom', 'non-concret', 'feedback to teach', 'regimen', 'interest', 'sign', 'employe', 'integrat', 'tragedi', 'author', 'enforc', 'traditional read', 'trivium', 'nourish', 'rooted in bas', 'yarn', 'tame', 'socio-emot', 'draw', 'flora', 'review of teach', 'firm', 'experi', 'problem-solv', 'craftsman', 'code of moral conduct', 'self-motivat', 'threaten', 'commun', 'techniqu', 'culpab', 'rigid', 'crave', 'poverti', 'feder', 'credit-recoveri', 'low-incom', 'taped lesson', 'feel', 'expected behavior', 'liveri', 'social-emot', 'career', 'frpl', 'craftsmen', 'contextualiz', 'college bound', 'teacher feedback', 'sustain', 'grounded in bas', 'memory and dril', 'workstat', 'math and read', 'love', 'bound for colleg', 'vocat', 'lessons tap', 'equal', 'income gap', 'divers', 'se', 'embrac', 'confide in', 'regulatori', 'abstract thought', 'behavioral rul', 'demand', 'knowledge i', 'toler', 'music and art', 'math & read', 'exam', 'danger', 'processor', 'social work', 'inner truth', 'season', 'mechan', 'desperat', 'behavior cod', 'college-enrol', 'delicaci', 'personal learning needs technolog', 'free-think', 'embed', 'punit', 'portfolio', 'discriminat', 'meaning', 'punish', 'self-determin', 'miner', 'hard', 'communicat', 'college complet', 'meditat', 'worker', 'inequ', 'paint', 'behavioral expect', 'spoon f', 'anim', 'videotaped lesson', 'well-b', 'disciplin', 'fantasi', 'fauna', 'imagin', 'outspoken', 'progress', 'obedi', 'socrat', 'relationship', 'student contract', 'unorthodox', 'ingeni', 'militari', 'construct', 'classic', 'lessons record', 'college attain', 'garden'}, 'ideology'), ({'school open', 'histori', 'we began', 'our purpos', 'direct', 'our id', 'structur', 'our caus', 'academ', 'our ident', 'philosophi', 'doors open', 'method', 'highlight', 'model', 'pedagog', 'the stori', 'vision', 'our school', 'credo', 'our school began', 'ideal', 'belief', 'valu', 'approach', 'believ', 'creed', 'found', 'about u', 'who we ar', 'mission', 'background', 'profil', 'purpos', 'system', 'philosoph', 'school stori', 'principl', 'our stori', 'curricular', 'establish', 'skill', 'curriculum', 'pedagogi', 'moral'}, 'keywords')]

[({'prompt', 'prepared for colleg', 'racist', 'observation of teacher state averag', 'memory-and-dril', 'mainfram', 'provide lawy', 'reading & math', 'review of teach', 'firm', 'observation on teacher state standard', 'contend', 'code of moral conduct', 'slant', 'threaten', 'procedur', 'handicap', 'apprais', 'culpab', 'measur', 'college-complet', 'rigid', 'recover credit', 'suspens', 'employment servic', 'poverti', 'feder', 'back to bas', 'incom', 'credit-recoveri', 'low-incom', 'of color', 'taped lesson', 'essential knowledg', 'expected behavior', 'diagnost', 'liveri', 'individualiz', 'standard', 'teacher review', 'career', 'frpl', 'college-ori', 'cold-cal', 'contest', 'college bound', 'lawyer servic', 'rules of behavior', 'grounded in bas', 'health servic', 'immigr', 'teacher feedback', 'tfa', 'achievement gain', 'disabl', 'memory and dril', 'workstat', 'rules of conduct', 'college-prep', 'math and read', 'vulner', 'national averag', 'structur', 'bound for colleg', 'vocat', 'lessons tap', 'equal', 'personaliz', 'justic', 'college prep', 'income gap', 'nod', 'workplac', 'medical servic', 'divers', 'provide a lawy', 'gap in incom', 'se', 'implement', 'comput', 'drill', 'peril', 'social act', 'paper-and-pencil', 'regulatori', 'no child left behind', 'kipp', 'account', 'behavioral rul', 'college icon', 'demand', 'machin', 'traditional math', 'defer', 'college-bound', 'common cor', 'protocol', 'math & read', 'laptop', 'exam', 'basic educ', 'wage earn', 'danger', 'expectations for behavior', 'lessons videotap', 'perform', 'processor', 'rote', 'social work', 'disadvantag', 'corpor', 'regalia', 'outcom', 'social servic', 'nclb', 'achievement gap', 'essenti', 'teacher observ', 'drop out', 'medical help', 'desperat', 'behavior cod', 'college-enrol', 'stiff', 'dress cod', 'drop-out', 'personal learning needs technolog', 'uniform', 'university prep', 'datum', 'gaps in achiev', 'teach for america', 'result', 'dedicat', 'learning contract', 'fact', 'discriminat', 'quantif', 'parent contract', 'code of conduct', 'order', 'test', 'recovery of credit', 'punish', 'essentialist', 'indic', 'hard', 'job-rel', 'impoverish', 'college enrol', 'statist', 'college complet', 'back-to-bas', 'graduation r', 'liabil', 'obey', 'inequ', 'job market', 'social problem', 'unequ', 'worker', 'offici', 'behavioral cod', 'behavioral expect', 'lawyer provid', 'lectur', 'reduced-pric', 'recorded lesson', 'PC', 'traditional educ', 'assess', 'regul', 'videotaped lesson', 'work-rel', 'seminar', 'hierarchi', 'duti', 'socioeconom', 'disciplin', 'credit recoveri', 'at-wil', 'legal servic', 'calculat', 'offic', 'obedi', 'feedback to teach', 'regimen', 'gaps in incom', 'cue', 'sign', 'respect', 'prepared for university tough', 'compet', 'employe', 'racial ident', 'basic knowledg', 'legal help', 'author', 'enforc', 'gap in achiev', 'at-risk', 'student contract', 'differentiat', 'ethnic ident', 'reading and math', 'militari', 'traditional read', 'correct', 'rooted in bas', 'occup', 'lessons record', 'penal', 'tame', 'data', 'college attain', 'ethnic', 'college exemplar'}, 'ess'), ({'conform', 'draw', 'joy', 'jung', 'breath', "child's pac", 'flora', 'sentiment', 'spiritu', 'rhythm', "another's sho", 'comedi', 'craft', 'experi', 'inquiry-bas', 'prison', 'rebel', 'habits of mind', 'problem-solv', 'craftsman', 'foreign languag', 'worker-be', 'craftspeopl', 'self-motivat', 'collaborat', 'artist', 'pace of the child', 'inner lif', 'organ', 'commun', 'techniqu', 'puls', 'art and mus', 'social', 'inner world', 'crave', 'car', 'performing art', 'express', 'outdoor', 'child-cent', 'recycl', 'feel', 'nurtur', 'free think', 'blacksmith', 'emot', 'social-emot', 'conver', 'craftsmen', 'farm', 'freir', 'intang', 'democrat', 'liberal art', 'contextualiz', 'excess', 'well', 'abstract think', 'sustain', 'factori', 'multicultur', 'sincer', 'passion', 'genuin', 'team', 'figur', 'the art', 'home-school', 'wildlif', 'love', 'fine art', 'shame', 'relate to oth', 'renaiss', 'interact', 'own pac', 'chastis', 'theater', 'democraci', 'self-rul', 'trust', 'child*cent', 'social/emot', 'natur', 'talent', 'authent', 'well-round', 'enrich', 'pollut', 'spoon-fe', 'embrac', 'activi', 'tolerat', 'confide in', 'constructivi', 'abstract thought', 'compost', 'castig', 'knowledge i', 'worker be', 'compass', 'toler', 'zest', 'heartbeat', 'music and art', 'plant', 'cook', 'music', 'critical think', 'self-guid', 'socioemot', 'relate to p', 'celebrat', 'explor', 'inner self', 'gift', 'industri', 'spin', 'self-direct', 'drone', 'inner truth', 'embarrass', 'discuss', 'season', 'enlighten', 'invent', 'mechan', 'sculpt', 'cross-cultur', 'desir', 'music & art', 'free thought', 'inquiry bas', 'whole child', 'consum', 'villag', 'delicaci', 'geniu', 'spoon-f', 'independen', 'habits of heart', 'discov', 'progressivi', 'project-ori', 'free-think', 'feed', 'embed', 'danc', 'salvag', 'global', 'punit', 'soul', 'immers', 'learner cent', 'portfolio', 'warm', 'resol', 'independent thought', 'appetit', 'cycl', 'kinship', 'meaning', 'reus', 'commerci', 'play', 'problem solv', 'surround', 'attach', 'student cent', 'self-determin', 'dewey', 'inspir', 'self-relian', 'steiner', 'heart', 'unconsci', 'miner', 'cuisin', 'reggio emilia', 'independent think', 'skeptic', 'landscap', 'communicat', 'brotherhood', 'weav', 'health', 'festiv', 'agricult', 'enthusia', 'origin', 'spoon fe', 'meditat', 'interperson', 'project-bas', 'waldorf', 'group', 'paint', 'adapt', 'constructed knowledg', 'knowledg', 'art & mus', 'spoon f', 'critical thought', 'immateri', 'open-mind', 'lust', 'groupwork', 'manifest', 'mind', 'anim', 'harsh', 'materiali', 'student-cent', 'bona fid', 'recondit', 'creativ', 'holist', 'well-b', 'illustr', 'nonconcret', 'fantasi', 'non-concret', 'wast', 'fauna', 'craftsperson', 'untam', 'construction of', 'friend', 'imagin', 'outspoken', 'progress', 'soft', 'socrat', 'interest', 'relationship', 'montessori', 'chef', 'drama', 'reprocess', 'home school', 'integrat', 'tragedi', 'milieu', 'teamwork', 'social & emot', 'social and emot', 'three-fold way', 'ingeni', 'unorthodox', 'whole-child', 'excit', 'nourish', 'trivium', 'cooperat', 'construct', 'yarn', 'classic', 'international school', 'cadenc', 'learner-cent', 'socio-emot', 'garden', 'intuit'}, 'prog'), ({'year', 'stori', 'cultur', 'upgrad', 'behavior', 'practic', 'involv', 'research', 'charact', 'academ', 'smart', 'ambitio', 'product', 'encourag', 'organ', 'autonom', 'digniti', 'special educ', 'commun', 'infu', 'person', 'develop', 'emphasi', 'numeraci', 'believ', 'real-world', 'world', 'need', 'solid', 'train', 'esl', 'participat', 'posit', 'complicat', 'mission', 'after-car', 'math', 'empow', 'literaci', 'goal', 'persev', 'public', 'real world', 'teach', 'principl', 'today', 'tutor', 'attend', 'life', 'confiden', 'teacher', 'citizen', 'excel', 'studi', 'lesson', 'chang', 'student', 'blend', 'responsib', 'process', 'virtual', 'innovat', 'perspect', 'complet', 'except', 'grow', 'foundat', 'uniqu', 'loyalti', 'imag', 'masteri', 'school', 'steadfast', 'motivat', 'children', 'language art', 'belief', 'before-car', 'groundwork', 'pattern', 'middl', 'single-sex', 'mark', 'strive', 'one', 'cultiv', 'civic', 'inform', 'support', 'striv', 'professional develop', 'elementari', 'scienc', 'rigor', 'curricul', 'deep', 'complex', 'surpass', 'quest', 'neg', 'restraint', 'histori', 'task', 'educ', 'direct', 'intellect', 'dream', 'substan', 'fact-find', 'method', 'leader', 'co-e', 'self', 'succ', 'inten', 'focu', 'before car', 'challeng', 'formal', 'valu', 'special ne', 'exercis', 'graduat', 'prep', 'assign', 'achiev', 'elect', 'beyond', 'strengthen', 'hybrid', 'simpl', 'significan', 'decid', 'diligen', 'individu', 'instruct', 'track', 'proud', 'instil', 'read', 'capab', 'meet', 'acceler', 'aptitud', 'philosophi', 'attitud', 'environ', 'reward', 'intellig', 'determin', 'courag', 'after car', 'model', 'modern', 'charter', 'depth', 'pedagog', 'program', 'single sex', 'tough', 'cours', 'ideal', 'progress', 'activ', 'aspir', 'number', 'approach', 'strength', 'patriot', 'staff', 'facil', 'learn', 'high', 'compet', 'creed', 'public polici', 'balanc', 'strong', 'author', 'research-bas', 'english as second languag', 'build', 'system', 'indicat', 'improv', 'repres', 'decis', 'emphas', 'tenaci', 'provid', 'opportunit', 'skill', 'pride', 'extend'}, 'rit'), ({'prompt', 'conform', 'joy', 'observation of teacher state averag', 'jung', 'memory-and-dril', 'spiritu', 'reading & math', 'comedi', 'rebel', 'prison', 'observation on teacher state standard', 'worker-be', 'habits of mind', 'craftspeopl', 'collaborat', 'pace of the child', 'slant', 'organ', 'handicap', 'college-complet', 'art and mus', 'social', 'recover credit', 'suspens', 'back to bas', 'incom', 'of color', 'child-cent', 'recycl', 'nurtur', 'essential knowledg', 'free think', 'emot', 'farm', 'liberal art', 'cold-cal', 'immigr', 'factori', 'achievement gain', 'disabl', 'sincer', 'rules of conduct', 'figur', 'relate to oth', 'renaiss', 'own pac', 'chastis', 'theater', 'democraci', 'self-rul', 'child*cent', 'personaliz', 'social/emot', 'pollut', 'comput', 'paper-and-pencil', 'no child left behind', 'castig', 'compass', 'zest', 'college-bound', 'common cor', 'self-guid', 'critical think', 'explor', 'wage earn', 'rote', 'lessons videotap', 'gift', 'industri', 'self-direct', 'drone', 'regalia', 'outcom', 'discuss', 'sculpt', 'teacher observ', 'drop out', 'stiff', 'datum', 'soul', 'immers', 'fact', 'appetit', 'commerci', 'code of conduct', 'problem solv', 'essentialist', 'weav', 'statist', 'agricult', 'enthusia', 'liabil', 'interperson', 'obey', 'social problem', 'lawyer provid', 'immateri', 'open-mind', 'recorded lesson', 'manifest', 'harsh', 'materiali', 'recondit', 'seminar', 'creativ', 'wast', 'at-wil', 'legal servic', 'untam', 'soft', 'montessori', 'drama', 'prepared for university tough', 'teamwork', 'social & emot', 'at-risk', 'three-fold way', 'whole-child', 'cooperat', 'occup', 'international school', 'cadenc', 'ethnic', 'wildlif', 'respect', 'racist', 'breath', "child's pac", 'provide lawy', "another's sho", 'contend', 'procedur', 'apprais', 'inner world', 'blacksmith', 'standard', 'conver', 'freir', 'well', 'lawyer servic', 'abstract think', 'team', 'college-prep', 'home-school', 'vulner', 'structur', 'shame', 'interact', 'trust', 'college prep', 'nod', 'workplac', 'medical servic', 'talent', 'well-round', 'drill', 'peril', 'social act', 'spoon-fe', 'constructivi', 'compost', 'kipp', 'account', 'traditional math', 'worker be', 'machin', 'plant', 'protocol', 'laptop', 'music', 'socioemot', 'relate to p', 'celebrat', 'basic educ', 'spin', 'corpor', 'social servic', 'nclb', 'achievement gap', 'invent', 'desir', 'free thought', 'whole child', 'consum', 'geniu', 'independen', 'teach for america', 'result', 'feed', 'danc', 'warm', 'resol', 'independent thought', 'cycl', 'order', 'student cent', 'dewey', 'inspir', 'steiner', 'indic', 'landscap', 'impoverish', 'brotherhood', 'origin', 'spoon fe', 'project-bas', 'offici', 'behavioral cod', 'adapt', 'constructed knowledg', 'knowledg', 'lectur', 'groupwork', 'PC', 'traditional educ', 'mind', 'work-rel', 'holist', 'nonconcret', 'credit recoveri', 'craftsperson', 'calculat', 'construction of', 'friend', 'offic', 'chef', 'gaps in incom', 'cue', 'compet', 'racial ident', 'home school', 'basic knowledg', 'legal help', 'reprocess', 'milieu', 'gap in achiev', 'social and emot', 'differentiat', 'ethnic ident', 'reading and math', 'excit', 'correct', 'penal', 'data', 'learner-cent', 'intuit', 'college exemplar', 'prepared for colleg', 'sentiment', 'mainfram', 'rhythm', 'craft', 'inquiry-bas', 'foreign languag', 'artist', 'inner lif', 'puls', 'measur', 'employment servic', 'car', 'performing art', 'express', 'outdoor', 'diagnost', 'individualiz', 'teacher review', 'intang', 'college-ori', 'democrat', 'excess', 'contest', 'rules of behavior', 'health servic', 'tfa', 'multicultur', 'passion', 'genuin', 'the art', 'fine art', 'national averag', 'justic', 'provide a lawy', 'natur', 'gap in incom', 'authent', 'enrich', 'implement', 'tolerat', 'activi', 'college icon', 'defer', 'heartbeat', 'cook', 'expectations for behavior', 'perform', 'inner self', 'disadvantag', 'embarrass', 'enlighten', 'essenti', 'cross-cultur', 'music & art', 'medical help', 'inquiry bas', 'uniform', 'university prep', 'dress cod', 'drop-out', 'villag', 'spoon-f', 'progressivi', 'gaps in achiev', 'discov', 'habits of heart', 'project-ori', 'salvag', 'global', 'dedicat', 'learning contract', 'learner cent', 'kinship', 'quantif', 'reus', 'parent contract', 'play', 'surround', 'test', 'attach', 'recovery of credit', 'self-relian', 'heart', 'unconsci', 'cuisin', 'reggio emilia', 'independent think', 'skeptic', 'job-rel', 'health', 'festiv', 'college enrol', 'graduation r', 'back-to-bas', 'unequ', 'waldorf', 'job market', 'group', 'art & mus', 'reduced-pric', 'critical thought', 'lust', 'assess', 'student-cent', 'bona fid', 'regul', 'hierarchi', 'illustr', 'duti', 'socioeconom', 'non-concret', 'feedback to teach', 'regimen', 'interest', 'sign', 'employe', 'integrat', 'tragedi', 'author', 'enforc', 'traditional read', 'trivium', 'nourish', 'rooted in bas', 'yarn', 'tame', 'socio-emot', 'draw', 'flora', 'review of teach', 'firm', 'experi', 'problem-solv', 'craftsman', 'code of moral conduct', 'self-motivat', 'threaten', 'commun', 'techniqu', 'culpab', 'rigid', 'crave', 'poverti', 'feder', 'credit-recoveri', 'low-incom', 'taped lesson', 'feel', 'expected behavior', 'liveri', 'social-emot', 'career', 'frpl', 'craftsmen', 'contextualiz', 'college bound', 'teacher feedback', 'sustain', 'grounded in bas', 'memory and dril', 'workstat', 'math and read', 'love', 'bound for colleg', 'vocat', 'lessons tap', 'equal', 'income gap', 'divers', 'se', 'embrac', 'confide in', 'regulatori', 'abstract thought', 'behavioral rul', 'demand', 'knowledge i', 'toler', 'music and art', 'math & read', 'exam', 'danger', 'processor', 'social work', 'inner truth', 'season', 'mechan', 'desperat', 'behavior cod', 'college-enrol', 'delicaci', 'personal learning needs technolog', 'free-think', 'embed', 'punit', 'portfolio', 'discriminat', 'meaning', 'punish', 'self-determin', 'miner', 'hard', 'communicat', 'college complet', 'meditat', 'worker', 'inequ', 'paint', 'behavioral expect', 'spoon f', 'anim', 'videotaped lesson', 'well-b', 'disciplin', 'fantasi', 'fauna', 'imagin', 'outspoken', 'progress', 'obedi', 'socrat', 'relationship', 'student contract', 'unorthodox', 'ingeni', 'militari', 'construct', 'classic', 'lessons record', 'college attain', 'garden'}, 'all_ideol')]

In [45]:
# ### Define parsing helper functions

def parsefile_by_tags(HTML_file):
    
    """Cleans HTML by removing inline tags, ripping out non-visible tags, 
    replacing paragraph tags with a random string, and finally using this to separate HTML into chunks.
    Reads in HTML from storage using a given filename, HTML_file."""

    random_string = "".join(map(chr, os.urandom(75))) # Create random string for tag delimiter
    soup = BeautifulSoup(open(HTML_file), "html5lib")
    
    [s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])] # Remove non-visible tags
    for it in inline_tags:
        [s.extract() for s in soup("</" + it + ">")] # Remove inline tags
    
    visible_text = soup.getText(random_string).replace("\n", "") # Replace "p" tags with random string, eliminate newlines
    # Split text into list using random string while also eliminating tabs and converting unicode to readable text:
    visible_text = list(normalize("NFKC",elem.replace("\t","")) for elem in visible_text.split(random_string))
    # TO DO: Eliminate anything with a '\x' in it (after splitting by punctuation)
    visible_text = list(filter(lambda vt: vt.split() != [], visible_text)) # Eliminate empty elements
    # Consider joining list elements together with newline in between by prepending with: "\n".join

    return(visible_text)

In [46]:
if Debug:
    example_textlist = parsefile_by_tags(example_file)
    print("Output of parsefile_by_tags:\n\n", example_textlist, "\n\n")


Output of parsefile_by_tags:

 ['About', 'Administration', 'Admissions', 'News', 'Charter School Information', 'Location', 'Frequently Asked Questions', 'Photos/Videos', 'School Facebook Page', 'Financial Reports', 'Nondiscrimination Policy', 'Academics', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Associated Arts', 'Summer Reading >>', '5th Grade Reading List', '6th Grade Reading List', '7th Grade Reading List', '8th Grade Reading List', 'Parents', 'General Information', 'School Calendar >>', 'Download Calendar', 'PlusPortals', 'Before & After School Care', 'Forms >>', 'New Student Registration Packet', 'Free and Reduced Lunch', 'Student Handbook', 'School Uniform Order Form', 'School Supplies >>', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Food Menu', 'PARCC', 'Inclement Weather Schedule', 'West Side Bus Routes', 'Athletics', 'Coach Contact Info', 'Athletics Schedule', 'Sports News', 'Sports Release', 'Physical Form', 'Student Athlete Contract', 'Student Athlete Grade Check', 'Committees', 'Parent Teacher Association >>', 'Contact Info', 'Agendas and Minutes', 'Governance Council >>', 'Contact Info', 'Agendas and Minutes', 'Foundation >>', 'Contact Info', 'Agendas and Minutes', 'Search', 'About', 'You are here:', 'Home', '/', 'About', 'About 21st Century', '21st Century is a charter middle school. We have been a school since 2000. We serve a diverse population of nearly 70 students per grade. All staff bring years of teaching experience into our classrooms, and many have worked together in other settings. We emphasize the core curriculum of Math, Science, Social Studies, and Language Arts, as well as learning experiences in the community, city, and state. Two Associated Arts courses are offered to each student every semester, including music and media programs.', 'History', '21st Century Public Academy was declared an official charter in 1999 by the Board of the Albuquerque Public Schools and State Department of Education. 40 6th grade students were permitted to enter the doors for the first time in September, 2000, making it officially the first charter middle school in Albuquerque, New Mexico. The school’s first 8th grade graduation was held in May, 2003.', 'The school was started by teachers who had worked together under a charter at Taylor Middle School: Art Silva, Math; Kitty Krivitzky, Science; Darlene Arias, Social Studies; Heather Sickenger, Language Arts. Donna Eldredge joined the team as a Special Ed teacher and principal.', '21st Century is still going strong to this day.', 'Mission', 'It is the mission of 21st Century Public Academy to continually search for positive learning experiences that enrich students and staff. Whenever possible, these lessons will take place in the arena in which they are practiced.', 'Vision', '21st Century Public Academy will provide experiences, situations, and opportunities for students to develop talents and to understand their role in the community. The body, mind, and spirit of each person will grow through lessons learned at school. Students will acquire a sense of personal responsibility, independence, and community interdependence.', 'School Hours', 'Regular School Hours:', '8:15-3:40 Monday, Tuesday, Thursday, Friday', '8:15-3:00 Wednesday', 'Students may not be dropped off prior to 8:00.', 'Recent News', '6th Grade OSI to the Petroglyphs', '(December 13, 2017)', 'Spelling Bee', '(December 8, 2017)', 'Science Bowl Competition', '(December 7, 2017)', 'Boys Basketball', '(November 21, 2017)', '5th Grade OSI to US Eagle Federal Credit Union', '(November 15, 2017)', 'New West Side Bus Routes for 21stCPA', '(November 3, 2017)', '21st Century Girl’s Basketball starts October 25', '(October 23, 2017)', '7th Grade OSI to El Rancho de las Golondrinas', '(October 11, 2017)', 'Girls Basketball Season', '(October 9, 2017)', 'Cross Country Photos', '(October 5, 2017)', '21st Century Public Academy – APS Charter Middle School', '4300 Cutler Ave NE', 'Albuquerque, NM 87110', 'Phone: (505)254-0280', 'Fax: (505)254-8507', 'Scroll to top'] 



In [64]:
# ### Define dictionary matching helper functions

def dict_count(text_list, custom_dict):
    
    """Performs dictionary analysis, returning number of dictionary hits found.
    Removes punctuation and stems the phrase being analyzed. 
    Compatible with multiple-word dictionary elements."""
    
    counts = 0 # number of matches between text_list and custom_dict
    dictless_list = [] # Updated text_list with dictionary hits removed
    max_entry_length = max([len(entry.split()) for entry in custom_dict]) # Get length (in words) of longest entry in combined dictionary
    
    for chunk in text_list: # chunk may be several sentences or possibly paragraphs long
        chunk = re.sub(r'[^\w\s]', '', chunk) # Remove punctuation with regex that keeps only letters and spaces

        # Do dictionary analysis for word chunks of lengths max_entry_length down to 1, removing matches each time.
        # This means longer dict entries will get removed first, useful in case they contain smaller entries.
        for length in range(max_entry_length, 0, -1):
            dictless_chunk,len_counts = dict_match_len(chunk,custom_dict,length)
            dictless_list.append(dictless_chunk)
            counts += len_counts
    
    return dictless_list,int(counts)


def dict_match_len(phrase, custom_dict, length):
    
    """Helper function to dict_match. 
    Returns # dictionary hits and updated copy of phrase with dictionary hits removed. 
    Stems phrases before checking for matches."""
    
    hits_indices, counts = [], 0
    splitted_phrase = phrase.split()
    if len(splitted_phrase) < length:
        return phrase, 0 # If text chunk is shorter than length of dict entries being matched, don't continue.
    
    for i in range(len(splitted_phrase) - length + 1):
        to_stem = ""
        for j in range(length):
            to_stem += splitted_phrase[i+j] + " " # Builds chunk of 'length' words
        stemmed_word = stemmer.stem(to_stem[:-1]) # stem chunk
        if stemmed_word in custom_dict:
            hits_indices.append(i) # Store the index of the word that has a dictionary hit
            counts += 1
            #print(stemmed_word)
                
    # Iterate through list of matching word indices and remove the matches
    for i in range(len(hits_indices)-1, -1, -1):
        splitted_phrase = splitted_phrase[:hits_indices[i]] + \
        splitted_phrase[hits_indices[i] + length:]
    modified_phrase = ""
    for sp in splitted_phrase: # Rebuild the modified phrase, with matches removed
        modified_phrase += sp + " "
    return modified_phrase[:-1], counts

                  
# @timeout_decorator.timeout(20, use_signals=False)
def dictmatch_file_helper(file, listlists, allmatch_count):
    """Counts number of matches in file for each list of terms given, and also collects the terms not matched.
    listlists is a list of lists, each list containing:
    a list of key terms--e.g., for dictsnames_biglist, currently essentialism, progressivism, ritualism, and all three combined (ess_dict, prog_dict, rit_dict, all_dicts);
    the variables used to store the number of matches for each term lit (e.g., ess_count, prog_count, rit_count, alldict_count); 
    and the not-matches--that is, the list of words leftover from the file after all matches are removed (e.g., ess_dictless, prog_dictless, rit_dictless, alldict_dictless). """         
    
    for i in range(len(dictsnames_biglist)): # Iterate over dicts to find matches with parsed text of file
        # For dictsnames_list, dicts are: (ess_dict, prog_dict, rit_dict, alldict_count); count_names are: (ess_count, prog_count, rit_count, alldict_count); dictless_names are: (ess_dictless, prog_dictless, rit_dictless, alldict_dictless)
        # adict,count_name,dictless_name = dictsnames_tupzip[i]
        dictless_add,count_add = dict_count(parsed_pagetext,listlists[i][0])
        listlists[i][1] += count_add
        listlists[i][2] += dictless_add
        allmatch_count += count_add
        
        print("Discovered " + str(count_add) + " matches for " + str(file) + \
                     ", a total thus far of " + str(allmatch_count) + " matches...")
                  
    return listlists,allmatch_count

In [48]:
if Debug:
    print("\nOutput of dict_count with ideology dict:\n\n", dict_count(example_textlist,all_ideol), "\n\n")


uniform
order
divers
experi
social
well
experi
commun
music
offici
offici
social
team
experi
enrich
experi
talent
commun
mind
commun
feder
season

Output of dict_count with ideology dict:

 (['About', 'About', 'About', 'About', 'About', 'Administration', 'Administration', 'Administration', 'Administration', 'Administration', 'Admissions', 'Admissions', 'Admissions', 'Admissions', 'Admissions', 'News', 'News', 'News', 'News', 'News', 'Charter School Information', 'Charter School Information', 'Charter School Information', 'Charter School Information', 'Charter School Information', 'Location', 'Location', 'Location', 'Location', 'Location', 'Frequently Asked Questions', 'Frequently Asked Questions', 'Frequently Asked Questions', 'Frequently Asked Questions', 'Frequently Asked Questions', 'PhotosVideos', 'PhotosVideos', 'PhotosVideos', 'PhotosVideos', 'PhotosVideos', 'School Facebook Page', 'School Facebook Page', 'School Facebook Page', 'School Facebook Page', 'School Facebook Page', 'Financial Reports', 'Financial Reports', 'Financial Reports', 'Financial Reports', 'Financial Reports', 'Nondiscrimination Policy', 'Nondiscrimination Policy', 'Nondiscrimination Policy', 'Nondiscrimination Policy', 'Nondiscrimination Policy', 'Academics', 'Academics', 'Academics', 'Academics', 'Academics', '5th Grade', '5th Grade', '5th Grade', '5th Grade', '5th Grade', '6th Grade', '6th Grade', '6th Grade', '6th Grade', '6th Grade', '7th Grade', '7th Grade', '7th Grade', '7th Grade', '7th Grade', '8th Grade', '8th Grade', '8th Grade', '8th Grade', '8th Grade', 'Associated Arts', 'Associated Arts', 'Associated Arts', 'Associated Arts', 'Associated Arts', 'Summer Reading ', 'Summer Reading ', 'Summer Reading ', 'Summer Reading', 'Summer Reading', '5th Grade Reading List', '5th Grade Reading List', '5th Grade Reading List', '5th Grade Reading List', '5th Grade Reading List', '6th Grade Reading List', '6th Grade Reading List', '6th Grade Reading List', '6th Grade Reading List', '6th Grade Reading List', '7th Grade Reading List', '7th Grade Reading List', '7th Grade Reading List', '7th Grade Reading List', '7th Grade Reading List', '8th Grade Reading List', '8th Grade Reading List', '8th Grade Reading List', '8th Grade Reading List', '8th Grade Reading List', 'Parents', 'Parents', 'Parents', 'Parents', 'Parents', 'General Information', 'General Information', 'General Information', 'General Information', 'General Information', 'School Calendar ', 'School Calendar ', 'School Calendar ', 'School Calendar', 'School Calendar', 'Download Calendar', 'Download Calendar', 'Download Calendar', 'Download Calendar', 'Download Calendar', 'PlusPortals', 'PlusPortals', 'PlusPortals', 'PlusPortals', 'PlusPortals', 'Before  After School Care', 'Before After School Care', 'Before After School Care', 'Before After School Care', 'Before After School Care', 'Forms ', 'Forms ', 'Forms ', 'Forms ', 'Forms', 'New Student Registration Packet', 'New Student Registration Packet', 'New Student Registration Packet', 'New Student Registration Packet', 'New Student Registration Packet', 'Free and Reduced Lunch', 'Free and Reduced Lunch', 'Free and Reduced Lunch', 'Free and Reduced Lunch', 'Free and Reduced Lunch', 'Student Handbook', 'Student Handbook', 'Student Handbook', 'Student Handbook', 'Student Handbook', 'School Uniform Order Form', 'School Uniform Order Form', 'School Uniform Order Form', 'School Uniform Order Form', 'School Form', 'School Supplies ', 'School Supplies ', 'School Supplies ', 'School Supplies', 'School Supplies', '5th Grade', '5th Grade', '5th Grade', '5th Grade', '5th Grade', '6th Grade', '6th Grade', '6th Grade', '6th Grade', '6th Grade', '7th Grade', '7th Grade', '7th Grade', '7th Grade', '7th Grade', '8th Grade', '8th Grade', '8th Grade', '8th Grade', '8th Grade', 'Food Menu', 'Food Menu', 'Food Menu', 'Food Menu', 'Food Menu', 'PARCC', 'PARCC', 'PARCC', 'PARCC', 'PARCC', 'Inclement Weather Schedule', 'Inclement Weather Schedule', 'Inclement Weather Schedule', 'Inclement Weather Schedule', 'Inclement Weather Schedule', 'West Side Bus Routes', 'West Side Bus Routes', 'West Side Bus Routes', 'West Side Bus Routes', 'West Side Bus Routes', 'Athletics', 'Athletics', 'Athletics', 'Athletics', 'Athletics', 'Coach Contact Info', 'Coach Contact Info', 'Coach Contact Info', 'Coach Contact Info', 'Coach Contact Info', 'Athletics Schedule', 'Athletics Schedule', 'Athletics Schedule', 'Athletics Schedule', 'Athletics Schedule', 'Sports News', 'Sports News', 'Sports News', 'Sports News', 'Sports News', 'Sports Release', 'Sports Release', 'Sports Release', 'Sports Release', 'Sports Release', 'Physical Form', 'Physical Form', 'Physical Form', 'Physical Form', 'Physical Form', 'Student Athlete Contract', 'Student Athlete Contract', 'Student Athlete Contract', 'Student Athlete Contract', 'Student Athlete Contract', 'Student Athlete Grade Check', 'Student Athlete Grade Check', 'Student Athlete Grade Check', 'Student Athlete Grade Check', 'Student Athlete Grade Check', 'Committees', 'Committees', 'Committees', 'Committees', 'Committees', 'Parent Teacher Association ', 'Parent Teacher Association ', 'Parent Teacher Association', 'Parent Teacher Association', 'Parent Teacher Association', 'Contact Info', 'Contact Info', 'Contact Info', 'Contact Info', 'Contact Info', 'Agendas and Minutes', 'Agendas and Minutes', 'Agendas and Minutes', 'Agendas and Minutes', 'Agendas and Minutes', 'Governance Council ', 'Governance Council ', 'Governance Council ', 'Governance Council', 'Governance Council', 'Contact Info', 'Contact Info', 'Contact Info', 'Contact Info', 'Contact Info', 'Agendas and Minutes', 'Agendas and Minutes', 'Agendas and Minutes', 'Agendas and Minutes', 'Agendas and Minutes', 'Foundation ', 'Foundation ', 'Foundation ', 'Foundation ', 'Foundation', 'Contact Info', 'Contact Info', 'Contact Info', 'Contact Info', 'Contact Info', 'Agendas and Minutes', 'Agendas and Minutes', 'Agendas and Minutes', 'Agendas and Minutes', 'Agendas and Minutes', 'Search', 'Search', 'Search', 'Search', 'Search', 'About', 'About', 'About', 'About', 'About', 'You are here', 'You are here', 'You are here', 'You are here', 'You are here', 'Home', 'Home', 'Home', 'Home', 'Home', '', '', '', '', '', 'About', 'About', 'About', 'About', 'About', 'About 21st Century', 'About 21st Century', 'About 21st Century', 'About 21st Century', 'About 21st Century', '21st Century is a charter middle school We have been a school since 2000 We serve a diverse population of nearly 70 students per grade All staff bring years of teaching experience into our classrooms and many have worked together in other settings We emphasize the core curriculum of Math Science Social Studies and Language Arts as well as learning experiences in the community city and state Two Associated Arts courses are offered to each student every semester including music and media programs', '21st Century is a charter middle school We have been a school since 2000 We serve a diverse population of nearly 70 students per grade All staff bring years of teaching experience into our classrooms and many have worked together in other settings We emphasize the core curriculum of Math Science Social Studies and Language Arts as well as learning experiences in the community city and state Two Associated Arts courses are offered to each student every semester including music and media programs', '21st Century is a charter middle school We have been a school since 2000 We serve a diverse population of nearly 70 students per grade All staff bring years of teaching experience into our classrooms and many have worked together in other settings We emphasize the core curriculum of Math Science Social Studies and Language Arts as well as learning experiences in the community city and state Two Associated Arts courses are offered to each student every semester including music and media programs', '21st Century is a charter middle school We have been a school since 2000 We serve a diverse population of nearly 70 students per grade All staff bring years of teaching experience into our classrooms and many have worked together in other settings We emphasize the core curriculum of Math Science Social Studies and Language Arts as well as learning experiences in the community city and state Two Associated Arts courses are offered to each student every semester including music and media programs', '21st Century is a charter middle school We have been a school since 2000 We serve a population of nearly 70 students per grade All staff bring years of teaching into our classrooms and many have worked together in other settings We emphasize the core curriculum of Math Science Studies and Language Arts as as learning in the city and state Two Associated Arts courses are offered to each student every semester including and media programs', 'History', 'History', 'History', 'History', 'History', '21st Century Public Academy was declared an official charter in 1999 by the Board of the Albuquerque Public Schools and State Department of Education 40 6th grade students were permitted to enter the doors for the first time in September 2000 making it officially the first charter middle school in Albuquerque New Mexico The schools first 8th grade graduation was held in May 2003', '21st Century Public Academy was declared an official charter in 1999 by the Board of the Albuquerque Public Schools and State Department of Education 40 6th grade students were permitted to enter the doors for the first time in September 2000 making it officially the first charter middle school in Albuquerque New Mexico The schools first 8th grade graduation was held in May 2003', '21st Century Public Academy was declared an official charter in 1999 by the Board of the Albuquerque Public Schools and State Department of Education 40 6th grade students were permitted to enter the doors for the first time in September 2000 making it officially the first charter middle school in Albuquerque New Mexico The schools first 8th grade graduation was held in May 2003', '21st Century Public Academy was declared an official charter in 1999 by the Board of the Albuquerque Public Schools and State Department of Education 40 6th grade students were permitted to enter the doors for the first time in September 2000 making it officially the first charter middle school in Albuquerque New Mexico The schools first 8th grade graduation was held in May 2003', '21st Century Public Academy was declared an charter in 1999 by the Board of the Albuquerque Public Schools and State Department of Education 40 6th grade students were permitted to enter the doors for the first time in September 2000 making it the first charter middle school in Albuquerque New Mexico The schools first 8th grade graduation was held in May 2003', 'The school was started by teachers who had worked together under a charter at Taylor Middle School Art Silva Math Kitty Krivitzky Science Darlene Arias Social Studies Heather Sickenger Language Arts Donna Eldredge joined the team as a Special Ed teacher and principal', 'The school was started by teachers who had worked together under a charter at Taylor Middle School Art Silva Math Kitty Krivitzky Science Darlene Arias Social Studies Heather Sickenger Language Arts Donna Eldredge joined the team as a Special Ed teacher and principal', 'The school was started by teachers who had worked together under a charter at Taylor Middle School Art Silva Math Kitty Krivitzky Science Darlene Arias Social Studies Heather Sickenger Language Arts Donna Eldredge joined the team as a Special Ed teacher and principal', 'The school was started by teachers who had worked together under a charter at Taylor Middle School Art Silva Math Kitty Krivitzky Science Darlene Arias Social Studies Heather Sickenger Language Arts Donna Eldredge joined the team as a Special Ed teacher and principal', 'The school was started by teachers who had worked together under a charter at Taylor Middle School Art Silva Math Kitty Krivitzky Science Darlene Arias Studies Heather Sickenger Language Arts Donna Eldredge joined the as a Special Ed teacher and principal', '21st Century is still going strong to this day', '21st Century is still going strong to this day', '21st Century is still going strong to this day', '21st Century is still going strong to this day', '21st Century is still going strong to this day', 'Mission', 'Mission', 'Mission', 'Mission', 'Mission', 'It is the mission of 21st Century Public Academy to continually search for positive learning experiences that enrich students and staff Whenever possible these lessons will take place in the arena in which they are practiced', 'It is the mission of 21st Century Public Academy to continually search for positive learning experiences that enrich students and staff Whenever possible these lessons will take place in the arena in which they are practiced', 'It is the mission of 21st Century Public Academy to continually search for positive learning experiences that enrich students and staff Whenever possible these lessons will take place in the arena in which they are practiced', 'It is the mission of 21st Century Public Academy to continually search for positive learning experiences that enrich students and staff Whenever possible these lessons will take place in the arena in which they are practiced', 'It is the mission of 21st Century Public Academy to continually search for positive learning that students and staff Whenever possible these lessons will take place in the arena in which they are practiced', 'Vision', 'Vision', 'Vision', 'Vision', 'Vision', '21st Century Public Academy will provide experiences situations and opportunities for students to develop talents and to understand their role in the community The body mind and spirit of each person will grow through lessons learned at school Students will acquire a sense of personal responsibility independence and community interdependence', '21st Century Public Academy will provide experiences situations and opportunities for students to develop talents and to understand their role in the community The body mind and spirit of each person will grow through lessons learned at school Students will acquire a sense of personal responsibility independence and community interdependence', '21st Century Public Academy will provide experiences situations and opportunities for students to develop talents and to understand their role in the community The body mind and spirit of each person will grow through lessons learned at school Students will acquire a sense of personal responsibility independence and community interdependence', '21st Century Public Academy will provide experiences situations and opportunities for students to develop talents and to understand their role in the community The body mind and spirit of each person will grow through lessons learned at school Students will acquire a sense of personal responsibility independence and community interdependence', '21st Century Public Academy will provide situations and opportunities for students to develop and to understand their role in the The body and spirit of each person will grow through lessons learned at school Students will acquire a sense of personal responsibility independence and interdependence', 'School Hours', 'School Hours', 'School Hours', 'School Hours', 'School Hours', 'Regular School Hours', 'Regular School Hours', 'Regular School Hours', 'Regular School Hours', 'Regular School Hours', '815340 Monday Tuesday Thursday Friday', '815340 Monday Tuesday Thursday Friday', '815340 Monday Tuesday Thursday Friday', '815340 Monday Tuesday Thursday Friday', '815340 Monday Tuesday Thursday Friday', '815300 Wednesday', '815300 Wednesday', '815300 Wednesday', '815300 Wednesday', '815300 Wednesday', 'Students may not be dropped off prior to 800', 'Students may not be dropped off prior to 800', 'Students may not be dropped off prior to 800', 'Students may not be dropped off prior to 800', 'Students may not be dropped off prior to 800', 'Recent News', 'Recent News', 'Recent News', 'Recent News', 'Recent News', '6th Grade OSI to the Petroglyphs', '6th Grade OSI to the Petroglyphs', '6th Grade OSI to the Petroglyphs', '6th Grade OSI to the Petroglyphs', '6th Grade OSI to the Petroglyphs', 'December 13 2017', 'December 13 2017', 'December 13 2017', 'December 13 2017', 'December 13 2017', 'Spelling Bee', 'Spelling Bee', 'Spelling Bee', 'Spelling Bee', 'Spelling Bee', 'December 8 2017', 'December 8 2017', 'December 8 2017', 'December 8 2017', 'December 8 2017', 'Science Bowl Competition', 'Science Bowl Competition', 'Science Bowl Competition', 'Science Bowl Competition', 'Science Bowl Competition', 'December 7 2017', 'December 7 2017', 'December 7 2017', 'December 7 2017', 'December 7 2017', 'Boys Basketball', 'Boys Basketball', 'Boys Basketball', 'Boys Basketball', 'Boys Basketball', 'November 21 2017', 'November 21 2017', 'November 21 2017', 'November 21 2017', 'November 21 2017', '5th Grade OSI to US Eagle Federal Credit Union', '5th Grade OSI to US Eagle Federal Credit Union', '5th Grade OSI to US Eagle Federal Credit Union', '5th Grade OSI to US Eagle Federal Credit Union', '5th Grade OSI to US Eagle Credit Union', 'November 15 2017', 'November 15 2017', 'November 15 2017', 'November 15 2017', 'November 15 2017', 'New West Side Bus Routes for 21stCPA', 'New West Side Bus Routes for 21stCPA', 'New West Side Bus Routes for 21stCPA', 'New West Side Bus Routes for 21stCPA', 'New West Side Bus Routes for 21stCPA', 'November 3 2017', 'November 3 2017', 'November 3 2017', 'November 3 2017', 'November 3 2017', '21st Century Girls Basketball starts October 25', '21st Century Girls Basketball starts October 25', '21st Century Girls Basketball starts October 25', '21st Century Girls Basketball starts October 25', '21st Century Girls Basketball starts October 25', 'October 23 2017', 'October 23 2017', 'October 23 2017', 'October 23 2017', 'October 23 2017', '7th Grade OSI to El Rancho de las Golondrinas', '7th Grade OSI to El Rancho de las Golondrinas', '7th Grade OSI to El Rancho de las Golondrinas', '7th Grade OSI to El Rancho de las Golondrinas', '7th Grade OSI to El Rancho de las Golondrinas', 'October 11 2017', 'October 11 2017', 'October 11 2017', 'October 11 2017', 'October 11 2017', 'Girls Basketball Season', 'Girls Basketball Season', 'Girls Basketball Season', 'Girls Basketball Season', 'Girls Basketball', 'October 9 2017', 'October 9 2017', 'October 9 2017', 'October 9 2017', 'October 9 2017', 'Cross Country Photos', 'Cross Country Photos', 'Cross Country Photos', 'Cross Country Photos', 'Cross Country Photos', 'October 5 2017', 'October 5 2017', 'October 5 2017', 'October 5 2017', 'October 5 2017', '21st Century Public Academy APS Charter Middle School', '21st Century Public Academy APS Charter Middle School', '21st Century Public Academy APS Charter Middle School', '21st Century Public Academy APS Charter Middle School', '21st Century Public Academy APS Charter Middle School', '4300 Cutler Ave NE', '4300 Cutler Ave NE', '4300 Cutler Ave NE', '4300 Cutler Ave NE', '4300 Cutler Ave NE', 'Albuquerque NM 87110', 'Albuquerque NM 87110', 'Albuquerque NM 87110', 'Albuquerque NM 87110', 'Albuquerque NM 87110', 'Phone 5052540280', 'Phone 5052540280', 'Phone 5052540280', 'Phone 5052540280', 'Phone 5052540280', 'Fax 5052548507', 'Fax 5052548507', 'Fax 5052548507', 'Fax 5052548507', 'Fax 5052548507', 'Scroll to top', 'Scroll to top', 'Scroll to top', 'Scroll to top', 'Scroll to top'], 22) 



In [49]:
def filter_dict_page(pagetext_list, keyslist):
    
    """Filters webtext of a given .html page, which is parsed and in list format, to only those strings 
    within pagetext_list containing an element (word or words) of inputted keyslist. 
    Returns list filteredtext wherein each element has original case (not coerced to lower-case)."""
    
    filteredtext = [] # Initialize empty list to hold strings of page
    
    for string in pagetext_list:
        lowercasestring = str(string).lower() # lower-case string...
        dict_list = [key.lower() for key in list(keyslist)] # ...compared with lower-case element of keyslist
        for key in dict_list:
            if key in lowercasestring and key in lowercasestring.split(' '): # Check that the word is the whole word not part of another one
                filteredtext.append(string)

    return filteredtext

In [50]:
if Debug:
    print("Output of filter_dict_page:\n\n", filter_dict_page(example_textlist, all_keywords), "\n\n")


Output of filter_dict_page:

 ['21st Century is a charter middle school. We have been a school since 2000. We serve a diverse population of nearly 70 students per grade. All staff bring years of teaching experience into our classrooms, and many have worked together in other settings. We emphasize the core curriculum of Math, Science, Social Studies, and Language Arts, as well as learning experiences in the community, city, and state. Two Associated Arts courses are offered to each student every semester, including music and media programs.', 'Mission', 'It is the mission of 21st Century Public Academy to continually search for positive learning experiences that enrich students and staff. Whenever possible, these lessons will take place in the arena in which they are practiced.', 'Vision'] 



In [79]:
def filter_by_keycount(folder_path): 
    
    """NOT USED.
    Filters webtext for a given school to only those text chunks containing specified keywords.
    Categorizes each block of text by scoring based on keyword count, using already-defined lists of keywords per category:
    mission, philosophy, curriculum, history, "about"/general self-description, combined ideology, and all keywords."""
    
    # TO DO: Fix this function! And compare speed with that of filter_dict_page() above, especially for longer pages.
    
    # Initialize keyword lists to count over (must be defined outside function)
    global mission_keywords,curriculum_keywords,philosophy_keywords,history_keywords,about_keywords,all_ideol,all_keywords
    mission_list,curriculum_list,philosophy_list,history_list,about_list,ideol_list,keys_list, = [],[],[],[],[],[],[]
    
    file_list = list_files(folder_path, ".html")

    for file in tqdm(file_list, desc="Filtering by keys:"):
        try:
            pagetext_list = parsefile_by_tags(file)

            for string in pagetext_list:
                mission_score, curriculum_score, philosophy_score, history_score, about_score, ideol_score, keys_score = 0, 0, 0, 0, 0, 0, 0
                for word in mission_keywords:
                    mission_score+=string.count(word)
                    if 'mission' in string.lower():
                        mission_score = 2

                for word in curriculum_keywords:
                    curriculum_score+=string.count(word)
                    if 'curriculum' in string.lower():
                        curriculum_score = 2

                for word in philosophy_keywords:
                    philosophy_score+=string.count(word)
                    if 'philosophy' in string.lower() or 'value' in string.lower():
                        philosophy_score = 2

                for word in history_keywords:
                    history_score+=string.count(word)
                    if 'history' in string.lower():
                        history_score = 2

                for word in about_keywords:
                    about_score+=string.count(word)
                    if 'about us' in string.lower() or "about-us" in string.lower():
                        about_score = 2

                for word in all_ideol:
                    ideol_score+=string.count(word)

                if mission_score>=2:
                    mission_list.append(string)
                if curriculum_score>=2:
                    curriculum_list.append(string)
                if philosophy_score>=2:
                    philosophy_list.append(string)
                if history_score>=2:
                    history_list.append(string)
                if about_score>=2:
                    about_list.append(string)
                if ideol_score>=2:
                    ideol_list.append(string)
                if ((mission_score + curriculum_score + philosophy_score + about_score) >=2): 
                    keys_list.append(string) # Impute keywords counting using its ideological constitutent elements--which excludes history_score

        except Exception as e:
            if Debug:
                print("    ERROR categorizing " + str(file))
                print(e)
            continue
                    
    return mission_list, curriculum_list, philosophy_list, history_list, about_list, ideol_list, keys_list

In [81]:
print("Output of filter_by_keycount:\n\n", filter_by_keycount(example_folder), "\n\n")


Filtering by keys:: 100%|██████████| 879/879 [01:30<00:00,  9.75it/s]
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)


In [118]:
def dict_bestmatch(folder_path, custom_dict):
    """Parse through all .html files in folder_path, detecting matches with custom_dict,
    to find and return the full text from the html page that has the most matches with that dictionary."""
    
    # Initialization
    file_list = list_files(folder_path, ".html") # Get full list of file paths
    num_pages = len(file_list) # Number of pages in school's folder
    max_page_hits = (-1,-1) # Initialize tuple holding #hits, page number for HTML file with greatest # matches with custom_dict 
    max_weighted_score = (-1,-1) # Same as previous, but weighted by page length
    max_hit_text,max_score_text = [],[] # Empty lists for each best matching pages
    
    # Parse through pages to find maximum number of hits of custom_dict on any page
    for pagenum in tqdm(range(num_pages), desc="Finding best match:"):
        try:
            page_dict_count,page_weighted_score = -1,-1
            page_textlist = parsefile_by_tags(file_list[pagenum]) # Parse page with index pagenum into text list
            
            if len(page_textlist)==0: # If page is empty, don't bother with it
                continue

            dictless_text, page_dict_hits = dict_count(page_textlist, custom_dict) # Count matches between custom_dict and page_textlist using dict_count
            numwords = len('\n'.join(page_textlist).split())
            page_weighted_score = page_dict_hits / numwords # Weight score by number of words on page
            logging.info("Found" + str(page_dict_hits) + "for page #" + str(pagenum) + "and " + str(page_dict_hits) + "weighting for the " + numwords + " words on that page.")

            if page_dict_hits > max_page_hits[0]: # Compare matches for this page with overall max
                max_page_hits = (page_dict_hits, pagenum) # If its greater, then make new page the max
            if page_weighted_score > max_weighted_score[0]: # Same as previous two lines, but weighted by page length
                max_weighted_score = (page_weighted_score, pagenum)

        except Exception as e:
            logging.debug("    ERROR counting dict matches in page #" + str(pagenum))
            logging.debug(str(e))
            continue
                    
    logging.info("Number matches and index of best matching page: " + str(max_page_hits[0]) + " " + str(max_page_hits[1]))
    logging.info("Number matches and index of best WEIGHTED matching page: " + str(max_weighted_score[0]) + " " + str(max_weighted_score[1]))
    
    # Use pagenum to get text for page with highest number of hits and weighted score:
    max_hit_text = parsefile_by_tags(file_list[max_page_hits[1]])
    max_score_text = parsefile_by_tags(file_list[max_weighted_score[1]])
    
    logging.info("Page with the highest number of dictionary hits:\n\n" + str(max_hit_text))
    logging.info("Page with the highest weighted score:\n\n" + str(max_score_text))
    
    return max_hit_text,max_score_text

In [119]:
print("Output of dict_bestmatch for all ideologies:\n", dict_bestmatch(example_folder, mission_keywords), "\n\n" )


Finding best match:: 100%|██████████| 879/879 [01:48<00:00,  8.14it/s]
Output of dict_bestmatch for all ideologies:
 (['About', 'Administration', 'Admissions', 'News', 'Charter School Information', 'Location', 'Frequently Asked Questions', 'Photos/Videos', 'School Facebook Page', 'Financial Reports', 'Nondiscrimination Policy', 'Academics', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Associated Arts', 'Summer Reading >>', '5th Grade Reading List', '6th Grade Reading List', '7th Grade Reading List', '8th Grade Reading List', 'Parents', 'General Information', 'School Calendar >>', 'Download Calendar', 'PlusPortals', 'Before & After School Care', 'Forms >>', 'New Student Registration Packet', 'Free and Reduced Lunch', 'Student Handbook', 'School Uniform Order Form', 'School Supplies >>', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Food Menu', 'PARCC', 'Inclement Weather Schedule', 'West Side Bus Routes', 'Athletics', 'Coach Contact Info', 'Athletics Schedule', 'Sports News', 'Sports Release', 'Physical Form', 'Student Athlete Contract', 'Student Athlete Grade Check', 'Committees', 'Parent Teacher Association >>', 'Contact Info', 'Agendas and Minutes', 'Governance Council >>', 'Contact Info', 'Agendas and Minutes', 'Foundation >>', 'Contact Info', 'Agendas and Minutes', 'Search', 'Scrip and Amazon Smiles Fundraisers!', '21st Century is implementing the Shop with Scrips as well as Amazon Smiles fundraiser programs, which enables families to help us raise funds without having to sell anything, or buy anything, other than what you would buy day to day on a regular basis!  Click here for more information!', 'Welcome to 21st Century!', '21st Century is a charter middle school in Albuquerque, New Mexico. We emphasize the core curriculum of Math, Science, Social Studies, and Language Arts, as well as learning experiences in the community, city, and state.', 'New Location!', 'Our permanent home is opening up at 4300 Cutler NE; Albuquerque, NM 87110.', 'Out of School Instruction', '21st Century is not a typical public school.  We strive to give students educational experiences outside of the school walls, in the community wherein those skills are practiced.  Each grade level attends a minimum of twenty OSIs per year, taking them to locations such as the Natural History Museum, the Roadrunner Food Bank, the Pecos Natural History Park, and many more!  These experiences are designed to support student core learning while also teaching the importance of our community.', 'Smaller Class Sizes', 'As a charter school, 21st CPA is able to offer smaller class sizes, which allows for more individualized instruction for each student.  Each teacher works in both department and grade level teams to design curriculum and plan OSIs that support student learning.', 'Register Here', 'Interested in sending your student to 21st Century?  Click here for more information.', 'Previous', 'Next', '1', '2', '3', '4', '5', '6', 'A Private School Education at a Public School Price', 'About', '21st Century is a charter middle school that has been in place in Albuquerque, New Mexico since 2000. We serve a diverse population of nearly 80 students per grade. All staff bring years of teaching experience into our classrooms and many have worked together in other settings.', 'Academics', 'At 21st Century, coordination and collaboration among teachers and staff is central to our educational philosophy.  Teachers meet in grade level teams every day to coordinate with one another and to prepare collaborative lessons. Departments meet every week to chart progress and address needs within their subject area.', 'Online Gradebook', 'Keep up to date with your student’s grades and coursework by signing in to the ', 'Rediker PlusPortals website', '.  Student grades are updated every two weeks.  If you do not have an account set up yet, please contact the school to get started!', 'School Calendar', 'Latest News', '6th Grade OSI to the Petroglyphs', 'December 13, 2017', '/', 'in ', 'Photos/Videos', '/', 'by ', '21stadmin', 'Read more', '  →', 'Spelling Bee', 'December 8, 2017', '/', 'in ', 'Photos/Videos', ', ', 'Uncategorized', '/', 'by ', '21stadmin', 'Read more', '  →', 'Science Bowl Competition', 'December 7, 2017', '/', 'in ', 'Photos/Videos', '/', 'by ', '21stadmin', 'Read more', '  →', 'Page 9 of 68', '«', '‹', '7', '8', '9', '10', '11', '›', '»', '21st Century Public Academy – APS Charter Middle School', '4300 Cutler Ave NE', 'Albuquerque, NM 87110', 'Phone: (505)254-0280', 'Fax: (505)254-8507', 'Scroll to top'], ['About', 'Administration', 'Admissions', 'News', 'Charter School Information', 'Location', 'Frequently Asked Questions', 'Photos/Videos', 'School Facebook Page', 'Financial Reports', 'Nondiscrimination Policy', 'Academics', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Associated Arts', 'Summer Reading >>', '5th Grade Reading List', '6th Grade Reading List', '7th Grade Reading List', '8th Grade Reading List', 'Parents', 'General Information', 'School Calendar >>', 'Download Calendar', 'PlusPortals', 'Before & After School Care', 'Forms >>', 'New Student Registration Packet', 'Free and Reduced Lunch', 'Student Handbook', 'School Uniform Order Form', 'School Supplies >>', '5th Grade', '6th Grade', '7th Grade', '8th Grade', 'Food Menu', 'PARCC', 'Inclement Weather Schedule', 'West Side Bus Routes', 'Athletics', 'Coach Contact Info', 'Athletics Schedule', 'Sports News', 'Sports Release', 'Physical Form', 'Student Athlete Contract', 'Student Athlete Grade Check', 'Committees', 'Parent Teacher Association >>', 'Contact Info', 'Agendas and Minutes', 'Governance Council >>', 'Contact Info', 'Agendas and Minutes', 'Foundation >>', 'Contact Info', 'Agendas and Minutes', 'Search', 'Scrip and Amazon Smiles Fundraisers!', '21st Century is implementing the Shop with Scrips as well as Amazon Smiles fundraiser programs, which enables families to help us raise funds without having to sell anything, or buy anything, other than what you would buy day to day on a regular basis!  Click here for more information!', 'Welcome to 21st Century!', '21st Century is a charter middle school in Albuquerque, New Mexico. We emphasize the core curriculum of Math, Science, Social Studies, and Language Arts, as well as learning experiences in the community, city, and state.', 'New Location!', 'Our permanent home is opening up at 4300 Cutler NE; Albuquerque, NM 87110.', 'Out of School Instruction', '21st Century is not a typical public school.  We strive to give students educational experiences outside of the school walls, in the community wherein those skills are practiced.  Each grade level attends a minimum of twenty OSIs per year, taking them to locations such as the Natural History Museum, the Roadrunner Food Bank, the Pecos Natural History Park, and many more!  These experiences are designed to support student core learning while also teaching the importance of our community.', 'Smaller Class Sizes', 'As a charter school, 21st CPA is able to offer smaller class sizes, which allows for more individualized instruction for each student.  Each teacher works in both department and grade level teams to design curriculum and plan OSIs that support student learning.', 'Register Here', 'Interested in sending your student to 21st Century?  Click here for more information.', 'Previous', 'Next', '1', '2', '3', '4', '5', '6', 'A Private School Education at a Public School Price', 'About', '21st Century is a charter middle school that has been in place in Albuquerque, New Mexico since 2000. We serve a diverse population of nearly 80 students per grade. All staff bring years of teaching experience into our classrooms and many have worked together in other settings.', 'Academics', 'At 21st Century, coordination and collaboration among teachers and staff is central to our educational philosophy.  Teachers meet in grade level teams every day to coordinate with one another and to prepare collaborative lessons. Departments meet every week to chart progress and address needs within their subject area.', 'Online Gradebook', 'Keep up to date with your student’s grades and coursework by signing in to the ', 'Rediker PlusPortals website', '.  Student grades are updated every two weeks.  If you do not have an account set up yet, please contact the school to get started!', 'School Calendar', 'Latest News', '6th Grade OSI to the Petroglyphs', 'December 13, 2017', '/', 'in ', 'Photos/Videos', '/', 'by ', '21stadmin', 'Read more', '  →', 'Spelling Bee', 'December 8, 2017', '/', 'in ', 'Photos/Videos', ', ', 'Uncategorized', '/', 'by ', '21stadmin', 'Read more', '  →', 'Science Bowl Competition', 'December 7, 2017', '/', 'in ', 'Photos/Videos', '/', 'by ', '21stadmin', 'Read more', '  →', 'Page 9 of 68', '«', '‹', '7', '8', '9', '10', '11', '›', '»', '21st Century Public Academy – APS Charter Middle School', '4300 Cutler Ave NE', 'Albuquerque, NM 87110', 'Phone: (505)254-0280', 'Fax: (505)254-8507', 'Scroll to top']) 



In [15]:
def parse_school(school_dict):
    
    """This core function parses webtext for a given school, using helper functions to run analyses and then saving multiple outputs to school_dict:
    counts of the number of matches between all text from a school's html pages and keywords from a defined keyword list, find dict_count();
    and text contents of those individual pages best matching such keywords, via find_best_categories (in development).
    
    For the sake of parsimony and manageable script calls, OTHER similar functions/scripts collect these additional outputs: 
    full (partially cleaned) webtext, by parsing webtext of each .html file (removing inline tags, etc.) within school's folder, via parsefile_by_tags();
    filtered webtext, by keeping only those parsed text elements containing a keyword in previously defined keywords list, via filter_keywords_page();
    and parsed webtext, having removed overlapping headers/footers common to multiple pages, via remove_overlaps()."""
    
    # Allow function to access these variables already defined outside the function (globally)
    global itervar,numschools,parsed,wget_dataloc,URL_var,NAME_var,ADDR_var
    
    datalocation = wget_dataloc # Define path to local data storage
    school_name, school_address, school_URL = school[NAME_var], school[ADDR_var], school[URL_var] # Define varnames
    itervar+=1 # Count this school
    
    print("Parsing " + str(school_name) + ", which is school #" + str(itervar) + " of " + str(numschools) + "...")
    
    # Initialize variables
    school_dict['ess_strength'],school_dict['prog_strength'] = 0.0,0.0
    if not usefile:
        school_dict["duplicate_flag"], school_dict["parse_error_flag"] = 0, 0
    
    # Assign folder names
    folder_name = re.sub(" ","_",(school_name+" "+school_address[-8:-6]))
    school_dict["folder_name"] = folder_name
    school_folder = datalocation + folder_name + "/"
    if school_URL==school_name:
        school_URL = folder_name # Workaround for full_schooldata, which doesn't yet have URLs
    
    # Check if folder exists. If not, exit function
    if not (os.path.exists(school_folder) or os.path.exists(school_folder.lower()) or os.path.exists(school_folder.upper())):
        print("!! NO DIRECTORY FOUND matching " + str(school_folder) + ".\n  Aborting parsing function...\n\n")
        school_dict['wget_fail_flag'] = 1
        return
    
    
    """ # Commented out until dict_bestmatch() works
    try:
        for keylist,title in list(keysnames_tupzip): # Names are: ("mission","curriculum","philosophy","history","about","ideology","keywords")
            bestvar_name = title + "_best" # assign varname to use as dict key

            school_dict[bestvar_name],school_dict[bestvar_name+"_weighted"] = [],[] # initialize dict key/value pair as empty string
            best_page,best_page_weighted = dict_bestmatch(school_folder,keylist) # Find pages best corresponding to keyword category for each in keysnames_tupzip
            school_dict[bestvar_name].extend(best_page)
            school_dict[bestvar_name+"_weighted"].extend(best_page_weighted)
            
    except Exception as e:
        print("    ERROR! Failed to find best pages while parsing webtext of " + str(school_name))
        print("    ",e)
        """
    
    
    try:
        for adict,name in list(dictsnames_tupzip): # Names are: ("ess", "prog", "rit", "all_ideol")
            dict_name = name + "_count"
            school_dict[dict_name] = dict_count(school_folder,adict)[1]
            
        school_dict['ess_strength'] = float(school_dict['ess_count'])/float(school_dict['rit_count'])
        school_dict['prog_strength'] = float(school_dict['prog_count'])/float(school_dict['rit_count'])
            
        print("  SUCCESS! Counted dictionary matches for " + str(school_name) + "...")
        save_to_file(dicts_list, save_dir+"school_dictcounts_temp", "JSON") # Save output so we can pick up where left off, in case something breaks before able to save final output
        return
        
    except:
        print("    ERROR! Failed to count number of dict matches while parsing webtext of " + str(school_name))
        print("    ",e)
        school_dict["parse_error_flag"] = 1
        return

In [16]:
# ### Preparing data to be parsed

itervar = 0 # initialize iterator that counts number of schools already parsed
parsed = [] # initialize list of URLs that have already been parsed
dicts_list = [] # initialize list of dictionaries to hold school data

# If input_file was defined by user input in beginning of script, use that to load list of dictionaries. We'll add to it!
if usefile and not dicts_list:
    dicts_list = load_datafile(input_file)
    data_loc = full_schooldata # If loading data, assume we're running on full charter population

else:
    # set charter school data file and corresponding varnames:
    
    data_loc = full_schooldata # Run at scale using URL list of full charter population
    # data_loc = micro_sample13 # This seems nice for debugging--except directories don't match because different data source
        
    # Create dict list from CSV on file, with one dict per school
    with open(data_loc, 'r', encoding = 'Latin1') as csvfile: # open data file
        reader = csv.DictReader(csvfile) # create a reader
        for row in reader: # loop through rows
            dicts_list.append(row) # append each row to the list
        
URL_var,NAME_var,ADDR_var = get_vars(data_loc) # get varnames depending on data source
numschools = len(dicts_list) # Count number of schools in list of dictionaries
        
# Note on data structures: each row, dicts_list[i] is a dictionary with keys as column name and value as info.
# This will be translated into pandas data frame once (rather messy) website text is parsed into consistent variables

In [80]:
for school in dicts_list:
    school["folder_name"] = re.sub(" ","_",(school[NAME_var]+" "+school[ADDR_var][-8:-6])) # This gives name and state separated by "_"
    
    school["folder_path"] = str(wget_dataloc) + school["folder_name"] + "/" # This temporary variable simplifies next line of code
    
    if (has_html(school["folder_path"])==False) or not os.path.exists(school["folder_path"]):
        school['wget_fail_flag'] = str(1) # If folder doesn't exist, mark as fail and ignore when loading files
    else:
        school['wget_fail_flag'] = str(0) # make str so can work with currently limited Pandas dtype conversion functionality

In [88]:
print(schooldf[schooldf["folder_name"]=="Effie_Kokrine_Charter_School_AK"][["wget_fail_flag","folder_path"]])


   wget_fail_flag                                        folder_path
25              1  /home/jovyan/work/wget/parll_wget/Effie_Kokrin...

In [90]:
print(schooldf[schooldf["folder_name"]=="Natomas_Charter_CA"][["wget_fail_flag","folder_path"]])


    wget_fail_flag                                        folder_path
748              0  /home/jovyan/work/wget/parll_wget/Natomas_Char...

In [84]:
schooldf = pd.DataFrame.from_dict(dicts_list) # Convert dicts_list into a DataFrame
schooldf.info()
schooldf.head(4)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6972 entries, 0 to 6971
Columns: 183 entries, SURVYEAR to folder_path
dtypes: object(183)
memory usage: 9.7+ MB
Out[84]:
SURVYEAR FIPST STABR STATENAME SEANAME LEAID ST_LEAID LEA_NAME SCHID ST_SCHID ... LSTREET116 LSTREET216 LSTREET316 LCITY16 LSTATE16 LZIP16 ADDRESS16 folder_name wget_fail_flag folder_path
0 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200001.0 31 Lower Kuskokwim School District 329.0 319010 ... 1010 Fourth Avenue Bethel AK 99559.0 1010 Fourth Avenue, Bethel AK 99559 Ayaprun_Elitnaurvik_AK 0 /home/jovyan/work/wget/parll_wget/Ayaprun_Elit...
1 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200150.0 25 Ketchikan Gateway Borough School District 306.0 259010 ... 410 Schoenbar Ketchikan AK 99901.0 410 Schoenbar, Ketchikan AK 99901 Ketchikan_Charter_School_AK 0 /home/jovyan/work/wget/parll_wget/Ketchikan_Ch...
2 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200150.0 25 Ketchikan Gateway Borough School District 523.0 259020 ... 410 Schoenbar Road Ketchikan AK 99901.0 410 Schoenbar Road, Ketchikan AK 99901 Tongass_School_of_Arts_and_Sciences_Charter_Sc... 0 /home/jovyan/work/wget/parll_wget/Tongass_Scho...
3 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200180.0 5 Anchorage School District 172.0 59010 ... 1705 W 32nd Ave Anchorage AK 99517.0 1705 W 32nd Ave, Anchorage AK 99517 Aquarian_Charter_School_AK 0 /home/jovyan/work/wget/parll_wget/Aquarian_Cha...

4 rows × 183 columns


In [43]:
schooldf["wget_fail_flag"] = schooldf["wget_fail_flag"].map({"1":True,1:True,"0":False,0:False}) # Convert to binary to use as conditional

In [49]:
schooldf[schooldf["wget_fail_flag"]==True][["folder_name","wget_fail_flag"]]


Out[49]:
folder_name wget_fail_flag
25 Effie_Kokrine_Charter_School_AK True
26 Watershed_Charter_School_AK True
27 Desert_Sun_Academy_AZ True
30 Horseshoe_Trails_Elementary_School_AZ True
31 Center_for_Academic_Success__The_#1_AZ True
32 Center_for_Academic_Success__The_#2_AZ True
33 Center_for_Academic_Success__The_#3_AZ True
40 Ridgeline_Academy-A_Challenge_Foundation_Acade... True
41 Mesa_Arts_Academy__A True
46 Bennett_Academy_-_Venture_Site__A True
52 NFL_YET_College_Prep_Academy__A True
53 Heritage_Academy__A True
61 Benjamin_Franklin_Charter_School_-_Crismon_AZ True
62 Benjamin_Franklin_Charter_School_-_Gilbert_AZ True
63 Benjamin_Franklin_Charter_School_-_Power_AZ True
64 Benjamin_Franklin_High_School_AZ True
67 Montessori_Day_Public_Schools_Chartered_-_Moun... True
89 PPEP_TEC_-_Jose_Yepez_Learning_Center__A True
91 PPEP_TEC_-_Victor_Soltero_Learning_Center__A True
102 Hearn_Academy__The_-_A_Ball_Charter_School_AZ True
121 ALA_QC__Elem_AZ True
129 Luz-Guerrero_Early_College_High_School_AZ True
133 Academy_of_Excellence_-_Central_Arizona_AZ True
145 Discovery_Plus_Academy_AZ True
150 Glenview_College_Preparatory_High_School__A True
154 Learning_Institute__The_AZ True
167 Dobson_Academy__The_-_A_Ball_Charter_School_AZ True
168 SABIS_International__A True
174 Champion_Chandler_AZ True
178 Academy_of_Tucson_Middle_School__A True
... ... ...
6818 Kenosha_eSchool_K-12__W True
6820 Meeme_LEADS_Charter_WI True
6821 Kornerstone_School_WI True
6829 Little_Chute_Career_Pathways_Academy__W True
6830 Flex_Academy__W True
6835 McKinley_Academy__W True
6836 Manitowoc_County_Comprehensive_Charter_School__W True
6840 Mauston_Montessori_Charter_School_WI True
6844 Merrill_Adult_Diploma_Academy_WI True
6850 Honey_Creek_Elementary__W True
6853 Next_Door_Charter__W True
6860 Milwaukee_College_Preparatory_School_--_38th_S... True
6865 Universal_Academy_for_the_College_Bound_WI True
6871 Woodland_Progressive_School_for_21st_Century_C... True
6874 Northwood_High/Middle_WI True
6876 Northwood_Virtual_Charter_School_WI True
6879 Montello_Junior/Senior_High_WI True
6885 Central_Wisconsin_STEM_Academy_WI True
6886 Juneau_County_Charter_School_WI True
6890 Oconto_Literacy_Charter_School_WI True
6894 Osceola_Charter_Preschool__W True
6901 Racine_Civil_Leaders_Academy_WI True
6935 CARE_WI True
6936 Point_of_Discovery_School_WI True
6937 Tomah_Area_Montessori_School__W True
6939 Laker_Online_Virtual_Charter_School_WI True
6943 Exploration_Academy_WI True
6964 Lincoln_Inquiry_Charter_School__W True
6965 JEDI_Virtual_K-12__W True
6968 Snowy_Range_Academy__W True

1571 rows × 2 columns


In [39]:
schooldf.head(26)


Out[39]:
SURVYEAR FIPST STABR STATENAME SEANAME LEAID ST_LEAID LEA_NAME SCHID ST_SCHID ... ADDRESS14 LSTREET116 LSTREET216 LSTREET316 LCITY16 LSTATE16 LZIP16 ADDRESS16 folder_name wget_fail_flag
0 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200001.0 31 Lower Kuskokwim School District 329.0 319010 ... 1010 Fourth Avenue, Bethel AK 99559 1010 Fourth Avenue Bethel AK 99559.0 1010 Fourth Avenue, Bethel AK 99559 Ayaprun_Elitnaurvik_AK 0
1 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200150.0 25 Ketchikan Gateway Borough School District 306.0 259010 ... 410 Schoenbar, Ketchikan AK 99901 410 Schoenbar Ketchikan AK 99901.0 410 Schoenbar, Ketchikan AK 99901 Ketchikan_Charter_School_AK 0
2 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200150.0 25 Ketchikan Gateway Borough School District 523.0 259020 ... 410 Schoenbar Road, Ketchikan AK 99901 410 Schoenbar Road Ketchikan AK 99901.0 410 Schoenbar Road, Ketchikan AK 99901 Tongass_School_of_Arts_and_Sciences_Charter_Sc... 0
3 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200180.0 5 Anchorage School District 172.0 59010 ... 1705 W 32nd Ave, Anchorage AK 99517 1705 W 32nd Ave Anchorage AK 99517.0 1705 W 32nd Ave, Anchorage AK 99517 Aquarian_Charter_School_AK 0
4 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200180.0 5 Anchorage School District 178.0 56010 ... 401 E Fireweed Lane Suite 100, Anchorage AK 99503 401 E Fireweed Lane Suite 100 Anchorage AK 99503.0 401 E Fireweed Lane Suite 100, Anchorage AK 99503 Family_Partnership_Charter_School_AK 0
5 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200180.0 5 Anchorage School District 459.0 59070 ... 4802 Bryn MAWR Court, Anchorage AK 99508 4802 Bryn MAWR Court Anchorage AK 99508.0 4802 Bryn MAWR Court, Anchorage AK 99508 Winterberry_School_AK 0
6 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200180.0 5 Anchorage School District 460.0 59080 ... 10901 Mausel St Suite 101, Eagle River AK 99577 10901 Mausel St Suite 101 Eagle River AK 99577.0 10901 Mausel St Suite 101, Eagle River AK 99577 Eagle_Academy_Charter_School_AK 0
7 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200180.0 5 Anchorage School District 530.0 59050 ... 400 W Northern Lights Blvd, #9, Anchorage AK 9... 400 W Northern Lights Blvd #9 Anchorage AK 99503.0 400 W Northern Lights Blvd #9, Anchorage AK 99503 Frontier_Charter_School_AK 0
8 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200180.0 5 Anchorage School District 555.0 59060 ... 5530 E Northern Lights Suite 1, Anchorage AK 9... 5530 E Northern Lights Suite 1 Anchorage AK 99504.0 5530 E Northern Lights Suite 1, Anchorage AK 9... Highland_Tech_High_Charter_School_AK 0
9 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200180.0 5 Anchorage School District 732.0 59090 ... 650 W International Airport Rd, Anchorage AK 9... 650 W International Airport Rd Anchorage AK 99507.0 650 W International Airport Rd, Anchorage AK 9... Rilke_Schule_Charter_School_AK 0
10 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200180.0 5 Anchorage School District 736.0 59100 ... 550 Bragaw Street, Anchorage AK 99504 550 Bragaw Street Anchorage AK 99504.0 550 Bragaw Street, Anchorage AK 99504 Alaska_Native_Cultural_Charter_School_AK 0
11 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200210.0 22 Juneau Borough School District 268.0 229010 ... 430 Fourth Street, Juneau AK 99801 430 Fourth Street Juneau AK 99801.0 430 Fourth Street, Juneau AK 99801 Juneau_Community_Charter_School_AK 0
12 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200390.0 24 Kenai Peninsula Borough School District 274.0 249010 ... 705 Frontage Rd Suite A, Kenai AK 99611 705 Frontage Rd Suite A Kenai AK 99611.0 705 Frontage Rd Suite A, Kenai AK 99611 Aurora_Borealis_Charter_School_AK 0
13 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200390.0 24 Kenai Peninsula Borough School District 296.0 249030 ... 995 Soundview Ave, Homer AK 99603 995 Soundview Ave Homer AK 99603.0 995 Soundview Ave, Homer AK 99603 Fireweed_Academy_AK 0
14 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200390.0 24 Kenai Peninsula Borough School District 448.0 249040 ... 162 E Park Street, Soldotna AK 99669 162 E Park Street Soldotna AK 99669.0 162 E Park Street, Soldotna AK 99669 Soldotna_Montessori_Charter_School_AK 0
15 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200390.0 24 Kenai Peninsula Borough School District 463.0 249050 ... 549 N Forest Dr, Kenai AK 99611 549 N Forest Dr Kenai AK 99611.0 549 N Forest Dr, Kenai AK 99611 Kaleidoscope_School_of_Arts_&_Sciences_AK 0
16 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200510.0 33 Matanuska-Susitna Borough School District 311.0 339010 ... 801 East Arctic, Palmer AK 99645 801 East Arctic Palmer AK 99645.0 801 East Arctic, Palmer AK 99645 Academy_Charter_School_AK 0
17 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200510.0 33 Matanuska-Susitna Borough School District 312.0 339020 ... 7362 W Parks #714, Wasilla AK 99623 7362 W Parks #714 Wasilla AK 99623.0 7362 W Parks #714, Wasilla AK 99623 Midnight_Sun_Family_Learning_Center_AK 0
18 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200510.0 33 Matanuska-Susitna Borough School District 452.0 337050 ... 7362 W. Parks Hwy #723, Wasilla AK 99623 7362 W Parks Hwy #723 Wasilla AK 99623.0 7362 W Parks Hwy #723, Wasilla AK 99623 American_Charter_Academy_AK 0
19 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200510.0 33 Matanuska-Susitna Borough School District 469.0 339030 ... 141 E Seldon Road Suite C, Wasilla AK 99654 141 E Seldon Road Suite C Wasilla AK 99654.0 141 E Seldon Road Suite C, Wasilla AK 99654 Twindly_Bridge_Charter_School_AK 0
20 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200510.0 33 Matanuska-Susitna Borough School District 740.0 339040 ... 7010 E Bogard Road, Wasilla AK 99654 7010 E Bogard Road Wasilla AK 99654.0 7010 E Bogard Road, Wasilla AK 99654 Fronteras_Charter_School_AK 0
21 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200510.0 33 Matanuska-Susitna Borough School District 744.0 339050 ... 7101 E. Palmer Wasilla Highway, Palmer AK 99645 7101 E Palmer Wasilla Highway Palmer AK 99645.0 7101 E Palmer Wasilla Highway, Palmer AK 99645 Birchtree_Charter_School_AK 0
22 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200570.0 35 Nome Public Schools 323.0 359010 ... Mile 3.5 Nome-Teller Highway, Nome AK 99762 Mile 3.5 Nome-Teller Highway Nome AK 99762.0 Mile 3.5 Nome-Teller Highway, Nome AK 99762 Anvil_City_Science_Academy_AK 0
23 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200600.0 16 Fairbanks North Star Borough School District 162.0 169010 ... 3002 International Street, Fairbanks AK 99701 3002 International Street Fairbanks AK 99701.0 3002 International Street, Fairbanks AK 99701 Chinook_Montessori_Charter_School_AK 0
24 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200600.0 16 Fairbanks North Star Borough School District 462.0 169030 ... 2945 Monk Court, North Pole AK 99705 2945 Monk Court North Pole AK 99705.0 2945 Monk Court, North Pole AK 99705 Star_of_the_North_Secondary_School_AK 0
25 2014-2015 2.0 AK ALASKA Alaska Department of Education and Early Devel... 200600.0 16 Fairbanks North Star Borough School District 464.0 169040 ... 601 Loftus Road, Fairbanks AK 99709 601 Loftus Road Fairbanks AK 99709.0 601 Loftus Road, Fairbanks AK 99709 Effie_Kokrine_Charter_School_AK 1

26 rows × 182 columns


In [32]:
schooldf.wget_fail_flag.sum()


Out[32]:
'000000000000000000000000011100111100000011000010000011000000011110010000000000000000000001010000000000100000000000000000010000000100010000000000010000100010000000000001100000100010010000000001111010001000000011000110000000100100100000000000000000110000101010000000000001101011010001000100000001000000010010000010000010000010000000000000000000000000000000000101000010000000000000000000000000101110000000010000000000000000100000000000000000001000000010001100000110000010000000010000000000000000001000000001010000010100001000000001000000000001000100000010000010000100000000010000000001011000010000000100000010001000000000000001000010100000000000000000110000100000000000000000000000000000000001000000000000000100100111000010110111110000000000100010010000000000010000000000000100000000000010010010010110000011100111100111100010000100000000010000000000000100000011000011000001000000011000000001010100000000010110000000000001111000100010010000000110000011111000010000110000000100000101000000010000000001001000100000000100010000001100000010000111000000000001000001000100111101010100000010000111000000110000000000000000000000000000000000010000000000001010010000000001000001010110000000000000000001000000000000010100000000000001001011011000010010100100011100000010011110100010000000000000000010011000010000001001000000010000000000000000000000000000001001000000101100000011111111111111100100000000000000011000000000000000111110000000100000001100010110001000000010001000000000000000100000100000001111000000000000000001100010000111000000011000110000010000111011011000000000011000001111001100010110000010010010100001000000001001111111100000000000000000000000001101000000001000000000001100100000000000010010000000011100001000000000000100010000000000000000100010100000010001100010000000001100001000000111100000000110000000100000111100000011000001000000000000010000000111011000000000010010000001000000010000100000100000111100110110000110010001001000001100000001000000100101001000000100010010000000000000000000010010000000010000000000000100011000000000000100000000000010100000101000000000000100000000000000000001000010000000000000100000000000000000110000000110100000000000010110010000010111000000000100000000010000000000010101010000111111100010000000100100111001000000000000111111111111111111010001111111001111000000000100000110000000011000010101100000000111110000000000000000000000001011000101111101101011111101101111110101000000101111010000110111000101101101100001100110101101110100000111111010011010110000010110111000110101010101111111101100101001011011111101011111111101111110101000110010111111011111000100011110001100011111011101111101111111010000000011000000001101000101100000110000010100100110000010001000110010001010000000000000010000000100100000000000000100000000000001000001000000001001110000000000000000010011100001000000001001000011000010000010011000000000001110110000100011011011000000100100000001101100111111001001100000000001000100000001010000000011000000100100000000000000000011101000000011100100110100000000000000100000010011101111000010000000000010000001000100000001000010000000100100000000000000000100000000010010000100001100001010010000110000000001000010000000000000000000000000000100010000100001000001000101000100011000100000010110000100001001000000100000010000000000000000000000010000010000000101000001000001000000000000001000000000000010000100000001011000100110000000000000100001000000010000000001000111010111010001000100000010011100001110000100001100110110101000001000100100000000100100000001000000010001011000000000000000000000000000000000001000000000110000000011100110000101000100001001010100100000100000000110000001000100100110000000000000000010000100010101000000100110000100001000000000000000000000110000000001000000000000000010000010000000000011000000101000010000000001010000000000010011010000000000000000000010000000000100110011001110000000000011110111100100001101001000101000000000000001001000000010100000000010000000000100000011000000010100000100011001001000001111000000001101000011000111100111011110000000110000100000100110000000000000100000000000000000000001000100000000000000000000000000000000001110000000010000010010100000010110000000000000000000000000000000000000000000100100000010010010000011111001001011111111111100000001010011100000011100001000111000001111111100001000010000100110000010000100111010110100100001001000000000000000100000000000010000000000000000000000111000001100010000000010000001010010000001001001010000000000000000000110000000010000001010011000100000010010000000000100001000000010101100000010100001000111100100000001000010000000100001001100000000000000010000000010000000101000000000000000001000001001000000100000100000101000000100010110101000010100010000110000100000000100000000000000000000010001000000000000000010001000000000000001000000000000000100001000000000000000000000000000000100001000000010000100001000100000000000100111110000000000000001001000001000000010010000000000000000000000001000000000000000000000111101110111111111111110000000000000001000000000000000000000000000000000100000000000001010001000000000000110000000000000000001000001100100000000000001100000000001010000011000100000000000000101101000000000000111001000001100100100100100000100000000000000000000000000000011000000000010000000000000011111000101000100011000000000110000010010000000000010000000100010000100011000001100010011111100100111000000000001001010000000000010110111000000000000000000000110000000001000000111010000000000001000110100000000000110000000000000000000010001010000000001100100001000000010010000001010000000101000001000010010100000010000000011010000001001111010000100000001100000010000100001010110100000100000000001010000010000000010000000001010010000000100000011001001100100000010000001000000000000000010110000010000000001000000001000001000001110110000100000010100001000110011100000111010001101010001100000000000001010011001111101000000100011010000000000001000110000000000100000011001001111110000000011000000000000000000000110010000000010000000000000100000000000010001000000010001000010110011011001011010000101110000000000000001000001111110010100010000010000000000000101000000010000000000010111110000000011000100000100000111100011001000100000011111010001001001100000100000110001000100000000000000001110000001110100000000000101000000000000000000000000000000000000000000000100000000000000010000001000010000010000010000000000000100000100100000000010010100000000000001000010000000000000111100000110000000000000000000000000000010000000100011000000000000000000010000000010000000000000011110110000100000100111010000000000000000000000100000001000100001001001001001011100000000000000000001000110010000001010010000000100010000000000000000010100000000000000000001000000010000001001111111111100000000010000010000001000010000101000000000000000000001000001110000010000011100011001010010000000000000001100101011000000011000011000100010000010010000001000010000010010100100000110001000100000010000000000000000000000000000000001110100010000000000000000000011001000'

In [79]:
tqdm.pandas(desc="Rocking pandas!")




In [ ]:
# ### Run parsing algorithm on schools (requires access to webcrawl output)

test_dicts = dicts_list[0] # Limit number of schools to analyze, in order to refine methods

if Debug:
    for school in test_dicts:
        parse_school(school)
        
else:
    for school in dicts_list:
        parse_school(school)

In [ ]:
# Check out results:
if Debug:
    print(test_dicts[0])
else:
    print(dicts_list[0])

In [ ]:
# Save output:
if Debug:
    dictfile = "testing_dicts_" + str(datetime.today().strftime("%Y-%m-%d"))
    save_to_file(test_dicts, save_dir+dictfile, "JSON")
else:
    dictfile = "school_dicts_" + str(datetime.today().strftime("%Y-%m-%d"))
    save_to_file(dicts_list, save_dir+dictfile, "JSON")