In [1]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# # Parsing & Categorizing HTML from `wget` run with multiprocessing


"""This script parses .html files previously downloaded into local folders for those schools (or organizations, generally) listed in a .csv directory file. It uses the BeautifulSoup, multiprocessing, and pandas modules to efficiently clean, filter, and merge webtext into various lists; it also uses dictionary methods to count the number of times any word from the provided dictionaries (essentialist and progressivist school ideologies, in this case) occurs in any page for a given school. The script then stores these lists to each school's folder as text files; incorporates them into a large pandas DataFrame; and then finally saves this as an analysis-ready pickle-formatted file.

Author: Jaren Haber, PhD Candidate in UC Berkeley Sociology. 
Date: January 7th, 2018."""



In [2]:
# ## Initializing

# import necessary libraries
import os, re, fnmatch # for navigating file trees and working with strings
import csv # for reading in and writing CSV files
#from glob import glob,iglob # for finding files within nested folders--compare with os.walk
import json, pickle # For saving and loading dictionaries, DataFrames, lists, etc. in JSON and pickle formats
from datetime import datetime # For timestamping files
import time, timeout_decorator # To prevent troublesome files from bottlenecking the parsing process, use timeouts
import sys # For working with user input
import logging # for logging output, to help with troubleshooting
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer()
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import urllib, urllib.request # for testing pages
from unicodedata import normalize # for cleaning text by converting unicode character encodings into readable format
from multiprocessing import Pool # for multiprocessing, to increase parsing speed
import pandas as pd # modifies data more efficiently than with a list of dicts
from tqdm import tqdm # For progress information over iterations, including with Pandas operations via "progress_apply"

# Import parser
from bs4 import BeautifulSoup # BS reads and parses even poorly/unreliably coded HTML 
from bs4.element import Comment # helps with detecting inline/junk tags when parsing with BS
import lxml # for fast HTML parsing with BS
bsparser = "lxml"

In [3]:
# ### Set script options

Debug = False # Set to "True" for extra progress reports while algorithms run
notebook = True # Use different file paths depending on whether files are being accessed from shell (False) or within a Jupyter notebook (True)
usefile = False # Set to "True" if loading from file a dicts_list to add to. Confirms with user input first!
workstation = False # If working from office PC
numcpus = 6 # For multiprocessing

if notebook:
    usefile = False # Prompting user for input file is only useful in command-line

inline_tags = ["b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "dfn",
               "em", "kbd", "strong", "samp", "var", "bdo", "map", "object", "q",
               "span", "sub", "sup"] # this list helps with eliminating junk tags when parsing HTML

In [4]:
# ### Set directories

if workstation and notebook:
    dir_prefix = "C:\\Users\\Jaren\\Documents\\Charter-school-identities\\" # One level further down than the others
elif notebook:
    dir_prefix = "/home/jovyan/work/"
else:
    dir_prefix = "/vol_b/data/"

example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"

if workstation and notebook:
    micro_sample13 = dir_prefix + "data\\micro-sample13_coded.csv" # Random micro-sample of 300 US charter schools
    URL_schooldata = dir_prefix + "data\\charter_URLs_2014.csv" # 2014 population of 6,973 US charter schools
    full_schooldata = dir_prefix + "data\\charter_merged_2014.csv" # Above merged with PVI, EdFacts, year opened/closed
    temp_data = dir_prefix + "data\\school_parser_temp.json" # Full_schooldata dict with output for some schools
    example_file = dir_prefix + "data\\example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"
    dicts_dir = dir_prefix + "dicts\\" # Directory in which to find & save dictionary files
    save_dir = dir_prefix + "data\\" # Directory in which to save data files
    temp_dir = dir_prefix + "data\\temp\\" # Directory in which to save temporary data files

else:
    wget_dataloc = dir_prefix + "wget/parll_wget/" #data location for schools downloaded with wget in parallel (requires server access)
    example_folder = wget_dataloc + "TWENTY-FIRST_CENTURY_NM/" # Random charter school folder
    example_file = dir_prefix + "wget/example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"

    micro_sample13 = dir_prefix + "Charter-school-identities/data/micro-sample13_coded.csv" #data location for random micro-sample of 300 US charter schools
    URL_schooldata = dir_prefix + "Charter-school-identities/data/charter_URLs_2014.csv" #data location for 2014 population of US charter schools
    full_schooldata = dir_prefix + "Charter-school-identities/data/charter_merged_2014.csv" # Above merged with PVI, EdFacts, year opened/closed
    temp_data = dir_prefix + "Charter-school-identities/data/school_parser_temp.json" # Full_schooldata dict with output for some schools
    dicts_dir = dir_prefix + "Charter-school-identities/dicts/" # Directory in which to find & save dictionary files
    save_dir = dir_prefix + "Charter-school-identities/data/" # Directory in which to save data files
    temp_dir = dir_prefix + "Charter-school-identities/data/temp/" # Directory in which to save temporary data files

In [5]:
# Set logging options
log_file = temp_dir + "logfile_" + str(datetime.today()) + ".log"
logging.basicConfig(filename=log_file,level=logging.INFO)

In [6]:
# Set input file, if any
if usefile and not notebook:
    print("\nWould you like to load from file a list of dictionaries to add to? (Y/N)")
    answer = input()
    if answer == "Y":
        print("Please indicate file path for dictionary list file.")
        answer2 = input()
        if os.path.exists(answer2):
            input_file = answer2
            usefile = True
        else:
            print("Invalid file path" + str(answer2) + " \nAborting script.")
            sys.exit()

    elif answer == "N":
        print("OK! This script will create a new data file at " + str(save_dir) + ".")
        usefile = False
    
    else:
        print("Error: " + str(answer) + " not an interpretable response. Aborting script.")
        sys.exit()

In [7]:
# ### Define (non-parsing) helper functions

def get_vars(data):
    """Defines variable names based on the data source called."""
    
    if data==URL_schooldata:
        URL_variable = "TRUE_URL"
        NAME_variable = "SCH_NAME"
        ADDR_variable = "ADDRESS"
        
    elif data==full_schooldata:
        URL_variable = "SCH_NAME" # Work-around until URLs merged into full data file
        NAME_variable = "SCH_NAME"
        ADDR_variable = "ADDRESS14"
    
    elif data==micro_sample13:
        URL_variable = "URL"
        NAME_variable = "SCHNAM"
        ADDR_variable = "ADDRESS"
    
    else:
        print("ERROR: No data source established! Could not identify variables for data file " + str(data) + ". Aborting script.")
        sys.exit()
    
    return(URL_variable,NAME_variable,ADDR_variable)


def tag_visible(element):
    """Returns false if a web element has a non-visible tag, 
    i.e. one site visitors wouldn't actually read--and thus one we don't want to parse"""
    
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def webtext_from_files(datalocation):
    """Concatenate and return a single string from all webtext (with .txt format) in datalocation"""
    
    string = ""
    for root, dirs, files in os.walk(datalocation):
        for file in files:
            if file.endswith(".txt"):
                with open(os.path.join(root, file), "r") as fileloc: # Join with root so files in nested subfolders resolve correctly
                    string = string + (fileloc.read())
    return string


def remove_spaces(file_path):
    """Collapses all whitespace in the text file at file_path into single spaces and returns the resulting string."""
    
    words = open(file_path).read().split() # split() drops empty strings and runs of whitespace
    return " ".join(words)


def write_errors(error_file, error1, error2, error3, file_count):
    """Writes to error_file three binary error flags derived from parse_school(): 
    duplicate_flag, parse_error_flag, wget_fail_flag, and file_count."""
    
    with open(error_file, 'w') as file_handler:
        file_handler.write("duplicate_flag {}\n".format(int(error1)))
        file_handler.write("parse_error_flag {}\n".format(int(error2)))
        file_handler.write("wget_fail_flag {}\n".format(int(error3)))
        file_handler.write("file_count {}".format(int(file_count)))
        return
    

def write_counts(file_path, names_list, counts_list):
    """Writes to file_path the input dict_count names (a list) and counts (another list).
    Assumes these two lists have same length and are in same order--
    e.g., names_list[0]="ess_count" and counts_list[0]=ess_count."""
    
    with open(file_path, 'w') as file_handler:
        lines = ["{} {}".format(name, count) for name, count in zip(names_list, counts_list)]
        file_handler.write("\n".join(lines)) # Newline between entries, but none after the last one
        return

    
def write_list(file_path, textlist):
    """Writes textlist to file_path. Useful for recording output of parse_school()."""
    
    with open(file_path, 'w') as file_handler:
        for elem in textlist:
            file_handler.write("{}\n".format(elem))
        return
    

def load_list(file_path):
    """Loads list into memory. Must be assigned to object."""
    
    textlist = []
    with open(file_path) as file_handler:
        line = file_handler.readline()
        while line:
            textlist.append(line)
            line = file_handler.readline()
    return textlist
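
# Example round trip with write_list() and load_list() (hypothetical file path under temp_dir);
# note that load_list() keeps the trailing newline on each element:
#   write_list(temp_dir + "example_list.txt", ["alpha", "beta"])
#   example_back = load_list(temp_dir + "example_list.txt")  # -> ["alpha\n", "beta\n"]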

        
def save_datafile(data, file, thismode):
    """Saves data to file using JSON, pickle, or CSV format (whichever was specified).
    Works with Pandas DataFrames or other objects, e.g. a list of dictionaries.
    Deletes any existing file first to reduce risk of data duplication."""
    
    file = str(file)
    thismode = str(thismode).lower().lstrip(".") # Normalize format argument, e.g. "JSON" or ".json" -> "json"
    
    try:
        if os.path.exists(file):
            os.remove(file) # Delete file first to reduce risk of data duplication
        
        if thismode=="json":
            if not file.endswith(".json"):
                file += ".json"
            if isinstance(data, pd.DataFrame): # Check for a DataFrame with isinstance, then use its own writer
                data.to_json(file)
            else:
                with open(file, 'w') as outfile:
                    json.dump(data, outfile)
            print("Data saved to " + file + "!")

        elif thismode=="pickle":
            if not file.endswith(".pickle"):
                file += ".pickle"
            if isinstance(data, pd.DataFrame):
                data.to_pickle(file)
            else:
                with open(file, "wb") as outfile:
                    pickle.dump(data, outfile)
            print("Data saved to " + file + "!")
                
        elif thismode=="csv":
            if not file.endswith(".csv"):
                file += ".csv"
            if isinstance(data, pd.DataFrame):
                data.to_csv(file, mode="w", index=False) # ,header=data.columns.values
            else:
                with open(file, "w") as outfile:
                    wr = csv.writer(outfile)
                    wr.writerows(data)
            print("Data saved to " + file + "!")

        else:
            print("ERROR! Improper arguments. Please include: data object to save (Pandas DataFrames OK), file path, and file format ('JSON', 'pickle', or 'CSV').")
    
    except Exception as e:
        print("Failed to save data to " + str(file) + " using " + str(thismode) + " format. Please check arguments (data, file, file format) and try again.")
        print(e)
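
# Illustrative call (hypothetical file name, mirroring how the script saves temporary output elsewhere):
#   save_datafile(dicts_list, save_dir + "school_parser_temp", "JSON")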
    

def load_datafile(file):
    """Loads dicts_list (or whatever) from file, using either JSON or pickle format. 
    The created object should be assigned when called."""
    
    file = str(file)
    
    if file.lower().endswith(".json"):
        with open(file,'r') as infile:
            var = json.load(infile)
    
    elif file.lower().endswith(".pickle"):
        with open(file,'rb') as infile:
            var = pickle.load(infile)
    
    else:
        print("ERROR! Unrecognized file extension for " + file + ". Use a .json or .pickle file.")
        return None
        
    print(file + " successfully loaded!")
    return var


def load_dict(custom_dict, file_path):
    """Loads in a dictionary. Adds each entry from the dict at file_path to the defined set custom_dict (the input), 
    which can also be an existing dictionary. This allows the creation of combined dictionaries!"""

    with open(file_path) as file_handler:
        line = file_handler.readline()
        while line:
            custom_dict.add(stemmer.stem(line.replace("\n", ""))) # Add line after stemming dictionary entries and eliminating newlines
            line = file_handler.readline() # Read the next line of the dictionary file
    return custom_dict


def list_files(folder_path, extension):
    """Outputs a list of every file in folder_path or its subdirectories that has a specified extension.
    Prepends specified extension with '.' if it doesn't start with it already.
    If no extension is specified, it just returns all files in folder_path."""
    
    matches = []
    if extension:
        extension = str(extension) # Coerce to string, just in case
    
    if extension and not extension.startswith("."):
        extension = "." + extension
    
    for dirpath,dirnames,filenames in os.walk(folder_path):
        if extension:
            for filename in fnmatch.filter(filenames, "*" + extension): # Use extension to filter list of files
                matches.append(os.path.join(dirpath,filename))
        else:
            for filename in filenames: # If no extension, just take all files
                matches.append(os.path.join(dirpath,filename))
    return matches


def has_html(folder_path):
    """Simple function that counts .html files and returns a binary:
    'True' if a specified folder has any .html files in it, 'False' otherwise."""
    
    html_list = []
    for dirpath,dirnames,filenames in os.walk(folder_path):
        for file in fnmatch.filter(filenames, "*.html"): # Check if any HTML files in folder_path
            html_list.append(file)
    
    if len(html_list)==0:
        return False
    else:
        return True

    
def set_failflag(folder_name):
    """The web_fail_flag indicates whether the webcrawl/download operation failed to capture any .html for a particular folder_name.
    This function sets the web_fail_flag depending on two conditions: 
    (1) Whether or not there exists a web download folder corresponding to folder_name, and
    (2) Whether or not that folder contains at least one file with the .html extension."""
    
    global wget_dataloc #,dicts_list # Need access to the list of dictionaries
    web_fail_flag = "" # make output a str to work with currently limited Pandas dtype conversion functionality
    
    folder_path = str(wget_dataloc) + folder_name + "/"
    if (not os.path.exists(folder_path)) or (has_html(folder_path)==False):
        web_fail_flag = str(1) # If folder doesn't exist, mark as fail and ignore when loading files
    else:
        web_fail_flag = str(0) # make str so can work with currently limited Pandas dtype conversion functionality
    
    #match_index = next((index for (index, d) in enumerate(dicts_list) if d["folder_name"] == folder_name), None) # Find dict index of input/folder_name
    #dicts_list[match_index]['wget_fail_flag'] = web_fail_flag # Assign output to dict entry for folder_name
    
    return web_fail_flag


def convert_df(df):
    """Makes a Pandas DataFrame more memory-efficient through intelligent use of Pandas data types: 
    specifically, by storing columns with repetitive Python strings not with the object dtype for unique values 
    (entirely stored in memory) but as categoricals, which are represented by repeated integer values. This is a 
    net gain in memory when the reduced memory size of the category type outweighs the added memory cost of storing 
    one more thing. As such, this function checks the degree of redundancy for a given column before converting it.
    
    # TO DO: Filter out non-object columns, make that more efficient by downcasting numeric types using pd.to_numeric(), 
    merge  that with the converted object columns (see https://www.dataquest.io/blog/pandas-big-data/). 
    For now, since the current DF is ENTIRELY composed of object types, code is left as is. 
    But note that the current code will eliminate any non-object type columns."""
    
    converted_df = pd.DataFrame() # Initialize DF for memory-efficient storage of strings (object types)
    df_obj = df.select_dtypes(include=['object']).copy() # Filter to only those columns of object data type

    for col in df.columns: 
        if col in df_obj: 
            num_unique_values = len(df_obj[col].unique())
            num_total_values = len(df_obj[col])
            if (num_unique_values / num_total_values) < 0.5: # Only convert data types if at least half of values are duplicates
                converted_df.loc[:,col] = df[col].astype('category') # Store these columns as dtype "category"
            else: 
                converted_df.loc[:,col] = df[col]
        else:    
            converted_df.loc[:,col] = df[col]
                      
    # Downcast any numeric columns to more memory-efficient dtypes (assign results back, since .apply() returns new columns):
    float_cols = converted_df.select_dtypes(include=['float']).columns
    converted_df[float_cols] = converted_df[float_cols].apply(pd.to_numeric,downcast='float')
    int_cols = converted_df.select_dtypes(include=['int']).columns
    converted_df[int_cols] = converted_df[int_cols].apply(pd.to_numeric,downcast='signed')
    
    return converted_df
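
# Illustrative (hypothetical) usage once schooldf is loaded below: compare memory footprints before and after conversion.
#   print(schooldf.memory_usage(deep=True).sum()) # bytes used by the original object-dtype DF
#   schooldf = convert_df(schooldf)
#   print(schooldf.memory_usage(deep=True).sum()) # bytes used after repetitive strings become categoricals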

In [8]:
# ### Set parsing keywords

keywords = ['values', 'academics', 'skills', 'purpose',
            'direction', 'mission', 'vision', 'our purpose',
            'our ideals', 'ideals', 'our cause', 'curriculum', 'curricular',
            'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system',
            'structure', 'philosophy', 'philosophical', 'beliefs', 'believe',
            'principles', 'creed', 'credo', 'moral', 'history', 'our story',
            'the story', 'school story', 'background', 'founding', 'founded',
            'established', 'establishment', 'our school began', 'we began',
            'doors opened', 'school opened', 'about us', 'our school', 'who we are',
            'our identity', 'profile', 'highlights']

mission_keywords = ['mission','vision', 'vision:', 'mission:', 'our purpose', 'our ideals', 'ideals:', 'our cause', 'cause:', 'goals', 'objective']
curriculum_keywords = ['curriculum', 'curricular', 'program', 'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system', 'structure']
philosophy_keywords = ['philosophy', 'philosophical', 'beliefs', 'believe', 'principles', 'creed', 'credo', 'value',  'moral']
history_keywords = ['history', 'story','our story', 'the story', 'school story', 'background', 'founding', 'founded', 'established', 'establishment', 'our school began', 'we began', 'doors opened', 'school opened']
about_keywords =  ['about us', 'our school', 'who we are', 'overview', 'general information', 'our identity', 'profile', 'highlights']

mission_keywords = set(stemmer.stem(word) for word in mission_keywords)
curriculum_keywords = set(stemmer.stem(word) for word in curriculum_keywords)
philosophy_keywords = set(stemmer.stem(word) for word in philosophy_keywords)
history_keywords = set(stemmer.stem(word) for word in history_keywords)
about_keywords =  set(stemmer.stem(word) for word in about_keywords)
all_keywords = set(stemmer.stem(key) for key in keywords)

logging.info("List of keywords:\n" + str(list(all_keywords)))

In [9]:
# ### Create dictionaries for each ideology and one for combined ideologies

ess_dict, prog_dict, rit_dict, all_ideol, all_dicts = set(), set(), set(), set(), set()
all_ideol = load_dict(all_ideol, dicts_dir + "ess_dict.txt")
all_ideol = load_dict(all_ideol, dicts_dir + "prog_dict.txt") # For complete ideological list, append second ideological dict
all_dicts = load_dict(set(all_ideol), dicts_dir + "rit_dict.txt") # For complete dict list, append ritual dict terms to a copy so all_ideol itself keeps only ideology terms
ess_dict = load_dict(ess_dict, dicts_dir + "ess_dict.txt")
prog_dict = load_dict(prog_dict, dicts_dir + "prog_dict.txt")
rit_dict = load_dict(rit_dict, dicts_dir + "rit_dict.txt")

logging.info(str(len(all_ideol)) + " entries loaded into the combined ideology dictionary.")
list_dict = list(all_ideol)
list_dict.sort(key = lambda x: x.lower())
logging.info("First 10 elements of combined ideology dictionary are:\n" + str(list_dict[:10]))
    

# Create tuples for keyword lists and dictionary terms:
keys_tuple = tuple([mission_keywords,curriculum_keywords,philosophy_keywords,history_keywords,about_keywords,\
                        all_ideol,all_keywords])
dicts_tuple = tuple([ess_dict,prog_dict,rit_dict,all_dicts])
    
logging.info(str(list(keys_tuple)))
logging.info(str(list(dicts_tuple)))

In [10]:
# ### Define dictionary matching helper functions

def dict_count(text_list, custom_dict):
    
    """Performs dictionary analysis, returning number of dictionary hits found.
    Removes punctuation and stems the phrase being analyzed. 
    Compatible with multiple-word dictionary elements."""
    
    counts = 0 # number of matches between text_list and custom_dict
    dictless_list = [] # Updated text_list with dictionary hits removed
    max_entry_length = max([len(entry.split()) for entry in custom_dict]) # Get length (in words) of longest entry in combined dictionary
    
    for chunk in text_list: # chunk may be several sentences or possibly paragraphs long
        chunk = re.sub(r'[^\w\s]', '', chunk) # Remove punctuation with regex that keeps only word characters and spaces

        # Do dictionary analysis for word chunks of lengths max_entry_length down to 1, removing matches each time.
        # This means longer dict entries will get removed first, useful in case they contain smaller entries.
        for length in range(max_entry_length, 0, -1):
            chunk,len_counts = dict_match_len(chunk,custom_dict,length) # Carry forward the chunk with each round of matches removed
            counts += len_counts
        dictless_list.append(chunk) # Keep the fully match-stripped version of this chunk
    
    return dictless_list,int(counts)

def dict_match_len(phrase, custom_dict, length):
    
    """Helper function to dict_match. 
    Returns # dictionary hits and updated copy of phrase with dictionary hits removed. 
    Stems phrases before checking for matches."""
    
    hits_indices, counts = [], 0
    splitted_phrase = phrase.split()
    if len(splitted_phrase) < length:
        return phrase, 0 # If text chunk is shorter than length of dict entries being matched, don't continue.
    
    for i in range(len(splitted_phrase) - length + 1):
        to_stem = ""
        for j in range(length):
            to_stem += splitted_phrase[i+j] + " " # Builds chunk of 'length' words
        stemmed_word = stemmer.stem(to_stem[:-1]) # stem chunk
        if stemmed_word in custom_dict:
            hits_indices.append(i) # Store the index of the word that has a dictionary hit
            counts += 1
            logging.info(stemmed_word)
                
    # Iterate through list of matching word indices and remove the matches
    for i in range(len(hits_indices)-1, -1, -1):
        splitted_phrase = splitted_phrase[:hits_indices[i]] + \
        splitted_phrase[hits_indices[i] + length:]
    modified_phrase = ""
    for sp in splitted_phrase: # Rebuild the modified phrase, with matches removed
        modified_phrase += sp + " "
    return modified_phrase[:-1], counts
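
# Quick sanity check of dict_count() on toy data (hypothetical terms, NOT the real ideology dictionaries).
# Terms are stemmed here the same way load_dict() stems dictionary entries, so matching lines up.
toy_dict = set(stemmer.stem(term) for term in ["discipline", "college preparatory"])
toy_dictless, toy_hits = dict_count(["Our college preparatory model builds discipline and character."], toy_dict)
logging.info("Toy dict_count example: " + str(toy_hits) + " matches; leftover text: " + str(toy_dictless))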

                  
@timeout_decorator.timeout(20, use_signals=False)
def dictmatch_file_helper(file,dictsnames_biglist,all_keywords,all_ideol,all_matches):
    """Counts number of matches in file for each list of terms given, and also collects the terms not matched.
    Dictsnames_biglist is a list of lists, each list containing:
    a list of key terms, currently essentialism, progressivism, ritualism, and all three combined (ess_dict, prog_dict, rit_dict, all_dicts);
    the variables used to store the number of matches for each term list (ess_count, prog_count, rit_count, alldict_count);
    and the not-matches--that is, the list of words leftover from the file after all matches are removed (ess_dictless, prog_dictless, rit_dictless, alldict_dictless). """
    
    """OLD METHOD:
    for adict,count_name,dictless_name in dictsnames_tupzip: # Iterate over dicts to find matches with parsed text of file
        count_add = 0 # Initialize iterator for dict-specific count matches
        dictless_add,count_add = dict_count(parsed_pagetext,adict)
        count_name += count_add
        dictless_name += dictless_add
        all_matches += count_add
                    
        # Now use this count to update zipped list of tuples, so each iteration adds to the total dict_count:
        dictsnames_list = [ess_count, prog_count, rit_count, alldict_count] 
        dictsnames_tupzip = zip(dicts_tuple, dictsnames_list, dictlessnames_list) 
                    
        logging.info("  Discovered " + str(count_add) + " matches for " + str(file) + ", a total thus far of " + str(count_name) + " matches...")"""
                  
    parsed_pagetext = parsefile_by_tags(file) # Parse the page text here, since the parsed text inside parse_file_helper() is local to that function
    
    for i in range(len(dictsnames_biglist)): # Iterate over dicts to find matches with parsed text of file
        # Dicts are: (ess_dict, prog_dict, rit_dict, all_dicts); count_names are: (ess_count, prog_count, rit_count, alldict_count); dictless_names are: (ess_dictless, prog_dictless, rit_dictless, alldict_dictless)
        # adict,count_name,dictless_name = dictsnames_tupzip[i]
        dictless_add,count_add = dict_count(parsed_pagetext,dictsnames_biglist[i][0])
        dictsnames_biglist[i][1] += count_add
        dictsnames_biglist[i][2] += dictless_add
        all_matches += count_add
                    
        logging.info("Discovered " + str(count_add) + " matches for " + str(file) + ", a total thus far of " + str(dictsnames_biglist[i][1]) + " matches...")
                  
    return dictsnames_biglist,all_matches
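
# For reference, parse_school() below builds dictsnames_biglist as a list of [terms, match count, leftover words] triples:
#   [[ess_dict, ess_count, ess_dictless],
#    [prog_dict, prog_count, prog_dictless],
#    [rit_dict, rit_count, rit_dictless],
#    [all_dicts, alldict_count, alldict_dictless]]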

In [11]:
# ### Define parsing helper functions

def parsefile_by_tags(HTML_file):
    
    """Cleans HTML by removing inline tags, ripping out non-visible tags, 
    replacing paragraph tags with a random string, and finally using this to separate HTML into chunks.
    Reads in HTML from storage using a given filename, HTML_file."""

    random_string = "".join(map(chr, os.urandom(75))) # Create random string for tag delimiter
    soup = BeautifulSoup(open(HTML_file), bsparser)
    
    [s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])] # Remove non-visible tags
    for it in inline_tags:
        [s.unwrap() for s in soup(it)] # Unwrap inline tags so their text stays attached to the surrounding text
    soup.smooth() # Merge adjacent strings left behind by unwrapping (requires BeautifulSoup >= 4.8)
    
    visible_text = soup.getText(random_string).replace("\n", "") # Replace "p" tags with random string, eliminate newlines
    # Split text into list using random string while also eliminating tabs and converting unicode to readable text:
    visible_text = list(normalize("NFKC",elem.replace("\t","")) for elem in visible_text.split(random_string))
    visible_text = list(filter(lambda vt: vt.split() != [], visible_text)) # Eliminate empty elements
    # Consider joining list elements together with newline in between by prepending with: "\n".join

    return(visible_text)


example_textlist = parsefile_by_tags(example_file)
logging.info("Output of parsefile_by_tags:\n" + str(example_textlist))

    
@timeout_decorator.timeout(20, use_signals=False)
def parse_file_helper(file,webtext,keywords_text,ideology_text):
    """Parses file into (visible) webtext, both complete and filtered by terms in 'keywords' and 'ideology' lists."""
    
    parsed_pagetext = []
    parsed_pagetext = parsefile_by_tags(file) # Parse page text

    if len(parsed_pagetext) == 0: # Don't waste time adding empty pages
        logging.warning("    Nothing to parse in " + str(file) + "!")
    
    else:
        webtext.extend(parsed_pagetext) # Add new parsed text to long list
        keywords_text.extend(filter_dict_page(parsed_pagetext, all_keywords)) # Filter using keywords
        ideology_text.extend(filter_dict_page(parsed_pagetext, all_ideol)) # Filter using ideology words

        logging.info("Successfully parsed and filtered file " + str(file) + "...")
        
    return webtext,keywords_text,ideology_text
                  

def filter_dict_page(pagetext_list, keyslist):
    
    """Filters webtext of a given .html page, which is parsed and in list format, to only those strings 
    within pagetext_list containing an element (word or words) of inputted keyslist. 
    Returns list filteredtext wherein each element has original case (not coerced to lower-case)."""
    
    filteredtext = [] # Initialize empty list to hold strings of page
    dict_list = [key.lower() for key in list(keyslist)] # Lower-case the keys once, for comparison with lower-cased strings
    
    for string in pagetext_list:
        lowercasestring = str(string).lower() # lower-case string for comparison with keys
        for key in dict_list:
            if key in lowercasestring and key in lowercasestring.split(' '): # Check that the word is the whole word not part of another one
                filteredtext.append(string)
                break # Don't append the same string more than once if multiple keys match

    return filteredtext


logging.info("Output of filter_keywords_page with keywords:\n" + str(filter_dict_page(example_textlist, all_keywords)))   
logging.info("Output of filter_keywords_page with ideology words:\n\n" + str(filter_dict_page(example_textlist, all_ideol)))

In [12]:
def parse_school(schooltup):
    
    """This core function parses webtext for a given school. Input is tuple: (name, address, url).
    It uses helper functions to run analyses and then returns multiple outputs:
    full (partially cleaned) webtext, by parsing webtext of each .html file (removing inline tags, etc.) within school's folder, via parsefile_by_tags();
    all text associated with specific categories, by filtering webtext to those strings containing elements of a defined keyword list, via filter_dict_page();
    and counts of dictionary matches (essentialist, progressivist, and ritual), via dictmatch_file_helper().
    
    For the sake of parsimony and manageable script calls, OTHER similar functions/scripts return these additional outputs: 
    parsed webtext, having removed overlapping headers/footers common to multiple pages, via remove_overlaps();
    all text associated with specific categories by filtering webtext according to keywords for 
    mission, curriculum, philosophy, history, and about/general self-description, via categorize_page(); and
    contents of those individual pages best matching each of these categories, via find_best_categories."""
    
    global itervar,numschools,parsed,wget_dataloc,dicts_list,keys_tuple,dicts_tuple # Access variables defined outside function (globally)
        
    itervar +=1 # Count school
    datalocation = wget_dataloc # Define path to local data storage
    school_name,school_address,school_URL,folder_name = schooltup[0],schooltup[1],schooltup[2],schooltup[3] # Assign variables from input tuple (safe because element order for a tuple is immutable)
    
    logging.info("Parsing " + str(school_name) + " at " + str(school_address) + " in folder <" + datalocation + str(folder_name) + "/>, which is ROUGHLY #" + str(6*itervar) + " / " + str(numschools) + " schools...")
    
    school_folder = datalocation + folder_name + "/"
    error_file = school_folder + "error_flags.txt" # Define file path for error text log
    counts_file = school_folder + "dict_counts.txt" # File path for dictionary counts output
    
    if school_URL==school_name:
        school_URL = folder_name # Workaround for full_schooldata, which doesn't yet have URLs
    
    # PRELIMINARY TEST 1: Check if parsing is already done. If so, no need to parse--stop function!
    if os.path.exists(error_file) and os.path.exists(counts_file):
        logging.info("Parsing output already detected in " + str(school_folder) + ", aborting parser...")
        return
    
    # PRELIMINARY TEST 2: Check if folder exists. If not, nothing to parse. Thus, do not pass go; do not continue function.
    duplicate_flag,parse_error_flag,wget_fail_flag,file_count = 0,0,0,0 # initialize error flags
    
    if not (os.path.exists(school_folder) or os.path.exists(school_folder.lower()) or os.path.exists(school_folder.upper())):
        logging.warning("NO DIRECTORY FOUND, creating " + str(school_folder) + " for 'error_flags.txt' and aborting...")
        wget_fail_flag = 1
        try:
            os.makedirs(school_folder) # Create empty folder for school to hold error_flags.txt (and nothing else)
            write_errors(error_file, duplicate_flag, parse_error_flag, wget_fail_flag, file_count)
            write_counts(counts_file, ["ess_count","prog_count","rit_count"], [0,0,0]) # empty counts file simplifies parsing
            return
        except Exception as e:
            logging.debug("Uh-oh! Failed to log error flags for " + str(school_name) + ".\n" + e)
            return
    
    # PRELIMINARY TEST 3: Check if this school has already been parsed--via its unique school_URL. If so, skip this school to avoid duplication bias.
    if school_URL in parsed: 
        logging.error("DUPLICATE URL DETECTED. Skipping " + str(school_folder) + "...")
        duplicate_flag = 1
        write_errors(error_file, duplicate_flag, parse_error_flag, wget_fail_flag, file_count)
        write_counts(counts_file, ["ess_count","prog_count","rit_count"], [0,0,0]) # empty counts file simplifies parsing
        return
    
    logging.info("Preliminary tests passed. Parsing data in " + str(school_folder) + "...")
    
    # Next, initialize local (within-function) variables for text output
    webtext,keywords_text,ideology_text,dictless_words = [],[],[],[] # text category lists
    file_list = [] # list of HTML files in school_folder
    
    mission,curriculum,philosophy,history,about,ideology,keywords = [],[],[],[],[],[],[] # matched keyword lists
    ess_count, prog_count, rit_count, alldict_count, all_matches = 0,0,0,0,0 # dict match counts
    ess_dictless, prog_dictless, rit_dictless, alldict_dictless = [],[],[],[] # lists of unmatched words, kept so we can
    # later revise the dictionaries by looking at what content words the current dictionaries failed to count.

    titles_list = [mission,curriculum,philosophy,history,about,ideology,keywords] # list of matched keyword lists
    dictsnames_list = [ess_count, prog_count, rit_count, alldict_count] # list of dict match counts
    dictlessnames_list = [ess_dictless, prog_dictless, rit_dictless, alldict_dictless] # list of unmatched word lists

    # Now link together dict terms lists with variables holding their matches and their not-matches:
    keysnames_tupzip = zip(keys_tuple, titles_list) # zips together keyword lists with the variables holding their matches
    #dictsnames_tuplist = zip(dicts_tuple, dictsnames_list, dictlessnames_list)
    dictsnames_biglist = [[dicts_tuple[i],dictsnames_list[i],dictlessnames_list[i]] for i in range(len(dicts_tuple))]

    logging.info(str(list(keysnames_tupzip)))
    logging.info(str(dictsnames_biglist))
    
    # Now to parsing:
    try:
        # Parse file only if it contains HTML. This is easy: use the "*.html" wildcard pattern--
        # also wget gave the ".html" file extension to appropriate files when downloading (`--adjust-extension` option)
        # Less efficient ways to check if files contain HTML (e.g., for data not downloaded by wget):
        # if bool(BeautifulSoup(open(fname), bsparser).find())==True: # if file.endswith(".html"):
        # Another way to do this, maybe faster but broken: files_iter = iglob(school_folder + "**/*.html", recursive=True)
            
        file_list = list_files(school_folder, ".html") # Get list of HTML files in school_folder
            
        if not file_list: # No .html files found to parse
            logging.info("No .html files found. Aborting parser for " + str(school_name) + "...")
            wget_fail_flag = 1
            write_errors(error_file, duplicate_flag, parse_error_flag, wget_fail_flag, file_count)
            write_counts(counts_file, ["ess_count","prog_count","rit_count"], [0,0,0]) # empty counts file simplifies parsing
            return
        
        for file in tqdm(file_list, desc=("Parsing files")):
                
            logging.info("Parsing HTML in " + str(file) + "...")
                    
            # Parse and categorize page text:
            try:                    
                webtext,keywords_text,ideology_text = parse_file_helper(file, webtext, keywords_text, ideology_text)
                        
                file_count+=1 # add to count of parsed files

            except Exception as e:
                logging.error("ERROR! Failed to parse file...\n" + e)
                        
            # Count dict matches:
            try:
                dictsnames_biglist,all_matches = dictmatch_file_helper(file,dictsnames_biglist, all_keywords, all_ideol, all_matches)

            except Exception as e:
                logging.info("ERROR! Failed to count number of dict matches while parsing " + str(file) + "...\n" + e)
                    
        # Report and save output to disk:
        print("Got here 1")
        parsed.append(school_URL)
        file_count = int(file_count-1)
        print("  PARSED " + str(file_count) + " .html file(s) from website of " + str(school_name) + "...")
            
        write_list(school_folder + "webtext.txt", webtext)
        write_list(school_folder + "keywords_text.txt", keywords_text)
        write_list(school_folder + "ideology_text.txt", ideology_text)
            
        print("  Found " + str(all_matches) + " total dictionary matches and " + str(len(dictsnames_biglist[3][2])) + " uncounted words for " + str(school_name) + "...")

        write_counts(counts_file, ["ess_count","prog_count","rit_count"], [dictsnames_biglist[0][1], dictsnames_biglist[1][1], dictsnames_biglist[2][1]])
        print("Got here 2")
        write_list(school_folder + "dictless_words.txt", dictsnames_biglist[3][2])
        print("Got here 3")
                    
        write_errors(error_file, duplicate_flag, parse_error_flag, wget_fail_flag, file_count)
        print("Got here 4")

    except Exception as e:
        logging.error("ERROR! Failed to parse, categorize, and get dict matches on webtext of " + str(school_name) + "...\n" + e)
        parse_error_flag = 1
        write_errors(error_file, duplicate_flag, parse_error_flag, wget_fail_flag, file_count)
        write_counts(counts_file, ["ess_count","prog_count","rit_count"], [0,0,0]) # empty counts file simplifies parsing

    return

In [13]:
'''def dictify_webtext(school_dict):
    """Reads parsing output from text files and saves to school_dict multiple parsing outputs:
    webtext, keywords_text, ideology_text, file_count, etc."""
    
    # Allow function to access these variables already defined outside the function (globally)
    global itervar,numschools,parsed,wget_dataloc,URL_var,NAME_var,ADDR_var,save_dir
    
    datalocation = wget_dataloc # Define path to local data storage
    school_name, school_address, school_URL = school_dict[NAME_var], school_dict[ADDR_var], school_dict[URL_var] # Define varnames
    itervar+=1 # Count this school
    
    print("Loading into dict parsing output for " + str(school_name) + ", which is school #" + str(itervar) + " of " + str(numschools) + "...")
    
    school_dict["webtext"], school_dict["keywords_text"], school_dict["ideology_text"] = [[] for _ in range(3)]
    school_dict["duplicate_flag"], school_dict["parse_error_flag"], school_dict["wget_fail_flag"] = [0 for _ in range(3)]
    school_dict['ess_strength'],school_dict['prog_strength'] = [0.0 for _ in range(2)]
    
    folder_name = school_dict["folder_name"]
    
    school_folder = datalocation + folder_name + "/"
    error_file = school_folder + "error_flags.txt" # Define file path for error text log
    
    if school_URL==school_name:
        school_URL = folder_name # Workaround for full_schooldata, which doesn't yet have URLs

    # Check if folder exists. If not, exit function
    if not (os.path.exists(school_folder) or os.path.exists(school_folder.lower()) or os.path.exists(school_folder.upper())):
        print("  !! NO DIRECTORY FOUND matching " + str(school_folder) + ". Aborting dictify function...")
        school_dict['wget_fail_flag'] = 1
        return
    
    try:
        # Load school parse output from disk into dictionary 
        school_dict["webtext"] = load_list(school_folder + "webtext.txt")
        school_dict["keywords_text"] = load_list(school_folder + "keywords_text.txt")
        school_dict["ideology_text"] = load_list(school_folder + "ideology_text.txt")                        
        
        """ # Comment out until dict_count is run
        school_dict["ess_count"] = load_list(school_folder + "ess_count.txt")
        school_dict["prog_count"] = load_list(school_folder + "prog_count.txt")
        school_dict["rit_count"] = load_list(school_folder + "rit_count.txt")
        school_dict['ess_strength'] = float(school_dict['ess_count'])/float(school_dict['rit_count'])
        school_dict['prog_strength'] = float(school_dict['prog_count'])/float(school_dict['rit_count'])
        """

        # load error_file as a list with four pieces, the last element of each of which is the flag value itself:
        error_text = load_list(error_file) 
        school_dict["duplicate_flag"] = error_text[0].split()[-1] # last element of first piece of error_text
        school_dict["parse_error_flag"] = error_text[1].split()[-1]
        school_dict["wget_fail_flag"] = error_text[2].split()[-1]
        school_dict["html_file_count"] = error_text[3].split()[-1]
        
        if int(school_dict["html_file_count"])==0:
            school_dict["wget_fail_flag"] = 1 # If no HTML, then web download failed!
        
        print("  LOADED " + school_dict["html_file_count"] + " .html file(s) from website of " + str(school_name) + "...")
        #save_datafile(dicts_list, save_dir+"school_parser_temp", "JSON") # Save output so we can pick up where left off, in case something breaks before able to save final output
        return school_dict
    
    except Exception as e:
        print("  ERROR! Failed to load into dict parsing output for " + str(school_name))
        print("  ",e)
        school_dict["parse_error_flag"] = 1
        return'''



In [57]:
def pandify_webtext(df):
    """Reads parsing output from text files and saves to DataFrame df multiple parsing outputs:
    webtext, keywords_text, ideology_text, file_count, dict_count outputs, etc."""
    
    # Allow function to access these variables already defined outside the function (globally)
    global numschools,parsed,wget_dataloc,URL_var,NAME_var,ADDR_var,save_dir
    
    datalocation = wget_dataloc # Define path to local data storage
    # VARNAMES ARE: school_name, school_address, school_URL = df[NAME_var], df[ADDR_var], df[URL_var]
    
    #print("Loading into DataFrame parsing output for " + str(len(df)) + " school websites out of a total of " + str(numschools) + "...")
    # df["folder_name"] = df[[NAME_var, ADDR_var]].apply(lambda x: re.sub(" ","_",("{} {}".format(str(x[0], x[1][-8:-6])))), axis=1) # This gives name and state separated by "_"  ## school["folder_name"] = re.sub(" ","_",(school[NAME_var]+" "+school[ADDR_var][-8:-6]))  ### Comment out while fixing parser
    df.loc[:,"school_folder"] = df.loc[:,"folder_name"].apply(lambda x: str(datalocation) + '{}/'.format(str(x)))
    df.loc[:,"error_file"] = df.loc[:,"school_folder"].apply(lambda x: '{}error_flags.txt'.format(str(x))) # Define file path for error text log
    df.loc[:,"counts_file"] = df.loc[:,"school_folder"].apply(lambda x: '{}dict_counts.txt'.format(str(x)))
    
    # Initialize text strings and counts as empty, then convert data types:
    empty = ["" for elem in range(len(df["NCESSCH"]))] # Create empty string column length of longest variable (NCESCCH used for matching)
    df = df.assign(webtext=empty, keywords_text=empty, ideology_text=empty, ess_count=empty, prog_count=empty, rit_count=empty) # Add empty columns to df
    df.loc[:,["webtext", "keywords_text", "ideology_text"]] = df.loc[:,["webtext", "keywords_text", "ideology_text"]].apply(lambda x: x.astype(object)) # Convert to object type--holds text
    df.loc[:,["ess_count", "prog_count", "rit_count"]] = df.loc[:,["ess_count", "prog_count", "rit_count"]].apply(pd.to_numeric, downcast="unsigned") # Convert to int dtype--holds positive numbers (no decimals)
    
    try:
        # load error_file as a list with four pieces, the last element of each of which is the flag value itself:
        df.loc[:,"error_text"] = df.loc[:,"error_file"].apply(lambda x: load_list('{}'.format(str(x))))
        df.loc[:,"duplicate_flag"] = df.loc[:,"error_text"].apply(lambda x: '{}'.format(str(x[0].split()[-1]))) # int(df.error_text[0].split()[-1]) # last element of first piece of error_text
        df.loc[:,"parse_error_flag"] = df.loc[:,"error_text"].apply(lambda x: '{}'.format(str(x[1].split()[-1]))) #int(df.error_text[1].split()[-1])
        df.loc[:,"wget_fail_flag"] = df.loc[:,"error_text"].apply(lambda x: '{}'.format(str(x[2].split()[-1]))) #int(df.error_text[2].split()[-1])
        df.loc[:,"html_file_count"] = df.loc[:,"error_text"].apply(lambda x: '{}'.format(str(x[3].split()[-1]))) #int(df.error_text[3].split()[-1])
        
        #if df["html_file_count"]==0:
        #    df["wget_fail_flag"] = 1 # If no HTML, then web download failed! ## REDUNDANT with parse_school()
        
        #df['wget_fail_flag'] = df.folder_name.progress_apply(lambda x: set_failflag(x)) # Comment out while fixing parser
        downloaded = df["wget_fail_flag"].map({"1":True,1:True,"0":False,0:False}) == False # This binary conditional filters df to only those rows with downloaded web content (where wget_fail_flag==False and thus does NOT signal download failure)
        
        print("Loading webtext from disk into DF...")
        
        # Load school parse output from disk into DataFrame:
        # df.loc[:,(downloaded,"keywords_text")] = df.loc[:,(downloaded,"school_folder")].progress_apply...
        df.loc[downloaded,"webtext"] = df.loc[downloaded,"school_folder"].progress_apply(lambda x: load_list("{}webtext.txt".format(str(x)))) # df["wget_fail_flag"]==False
        df.loc[downloaded,"keywords_text"] = df.loc[downloaded,"school_folder"].progress_apply(lambda x: load_list("{}keywords_text.txt".format(str(x))))
        df.loc[downloaded,"ideology_text"] = df.loc[downloaded,"school_folder"].progress_apply(lambda x: load_list("{}ideology_text.txt".format(str(x))))
        
        df["counts_text"] = df.counts_file.apply(lambda x: load_list("{}".format(str(x))))
        df.loc[downloaded,"ess_count"] = df.loc[downloaded,"counts_text"].apply(lambda x: "{}".format(str(x[0].split()[-1]))).apply(pd.to_numeric,downcast='unsigned') # 2nd element of 1st row in counts_text: take as uint dtype (no negatives)
        df.loc[downloaded,"prog_count"] = df.loc[downloaded,"counts_text"].apply(lambda x: "{}".format(str(x[1].split()[-1]))).apply(pd.to_numeric,downcast='unsigned') # 2nd element of 2nd row
        df.loc[downloaded,"rit_count"] = df.loc[downloaded,"counts_text"].apply(lambda x: "{}".format(str(x[2].split()[-1]))).apply(pd.to_numeric,downcast='unsigned') # 2nd element of 3nd row
        df.loc[downloaded,"ess_strength"] = (df.loc[downloaded,"ess_count"]/df.loc[downloaded, "rit_count"]).apply(pd.to_numeric, downcast='float') # calculate ideology ratio, use most memory-efficient float dtype
        df.loc[downloaded,"prog_strength"] = (df.loc[downloaded,"prog_count"]/df.loc[downloaded, "rit_count"]).apply(pd.to_numeric, downcast='float') 
        logging.info(str(df.loc[downloaded,'prog_strength']))
        
        print("Finished loading webtext into DF.")
        
        df.drop(["school_folder","error_text","error_file","counts_text"],axis=1) # Clean up temp variables
        
        #print("  LOADED " + df["html_file_count"].sum() + " .html files from into DataFrame!")
        #save_datafile(df, save_dir+"df_parser_temp", "pickle") # Save output so we can pick up where left off, in case something breaks before able to save final output
        return df
    
    except Exception as e:
        logging.critical("ERROR! Pandify function failed to load parsing output into DataFrame.\n" + str(e))
        print("    ERROR! Pandify function failed to load parsing output into DataFrame.")
        print("  ",str(e))
        sys.exit()

In [45]:
def slice_pandify(bigdf, numsplits, df_filepath):
    """This function uses pandify_webtext() to load the parsing output from local storage into a DataFrame.
    It gets around system memory limitations--which otherwise lead terminal to kill any attempts to pandify() all of bigdf--
    by splitting bigdf into numsplits smaller dfslices, parsing webtext into each slice, and recombining them
    by appending them to a big CSV on file. 
    The number of slices equals numsplits, and each slice holds roughly numschools/numsplits rows of bigdf."""
    
    global numschools # Access numschools from within function (this is roughly 7000)
    wheresplit = int(round(float(numschools)/numsplits)) # Get number on which to split (e.g., 1000) based on total number of schools data. This splitting number will be iterated over using numsplits
    
    for num in range(numsplits): # tqdm(range(numsplits), desc="Loading dfslices"): # Wrap iterator with tqdm to show progress bar
        try:
            dfslice = pd.DataFrame()
            startnum, endnum = wheresplit*int(num),wheresplit*int(num+1)
            dfslice = bigdf.iloc[startnum:endnum,:]
            print("Loading into DataFrame parsing output for schools from " + str(startnum) + " to " + str(endnum) + " out of a total of " + str(numschools) + " school websites...")
            dfslice = pandify_webtext(dfslice) # Load parsed output into the DF
            if num==0:
                save_datafile(dfslice,df_filepath,"CSV") # Save the first chunk of results to a new file, overwriting any existing file
            else:
                dfslice.to_csv(df_filepath,mode="a",index=False) # Append this next chunk of results to existing saved results
                print("Data saved to " + df_filepath + "!")
            del dfslice # Free memory by deleting this temporary, smaller slice
            
        except Exception as e:
            logging.critical("ERROR! Script failed to load parsing output into DataFrame slice #" + str(num) + " of " + str(numsplits) + ".\n" + e)
            print("  ERROR! Script failed to load parsing output into DataFrame slice #" + str(num) + " of " + str(numsplits) + ".")
            print("  ",e)
            sys.exit()
            
    return
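
# Illustrative call (hypothetical output file name; the actual call would come after the data-preparation cells below):
#   slice_pandify(schooldf, 10, save_dir + "charters_parsed.csv")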

In [15]:
# ### Preparing data to be parsed

itervar = 0 # initialize iterator that counts number of schools already parsed--useless when multiprocessing
parsed = [] # initialize list of URLs that have already been parsed
dicts_list = [] # initialize list of dictionaries to hold school data
schooldf = pd.DataFrame() # initialize DataFrame to hold school data

# If input_file was defined by user input in beginning of script, use that to load list of dictionaries. We'll add to it!
if usefile and not dicts_list:
    dicts_list = load_datafile(input_file)
    data_loc = full_schooldata # If loading data, assume we're running on full charter population

else:
    # set charter school data file and corresponding varnames:
    
    data_loc = full_schooldata # Run at scale using URL list of full charter population
    # data_loc = micro_sample13 # Handy for debugging--but its folder names don't match, since it comes from a different data source
        
    # Create dict list from CSV on file, with one dict per school
    with open(data_loc, 'r', encoding = 'Latin1') as csvfile: # open data file
        reader = csv.DictReader(csvfile) # create a reader
        for row in reader: # loop through rows
            dicts_list.append(row) # append each row to the list
        
URL_var,NAME_var,ADDR_var = get_vars(data_loc) # get varnames depending on data source
numschools = int(len(dicts_list)) # Count number of schools in list of dictionaries
names,addresses,urls,folder_names = [[] for _ in range(4)]


for school in dicts_list: # tqdm(dicts_list, desc="Setting web_fail_flags"): # Wrap iterator with tqdm to show progress bar
    names.append(school[NAME_var])
    addresses.append(school[ADDR_var])
    urls.append(school[URL_var])
    school["folder_name"] = re.sub(" ","_",(school[NAME_var]+" "+school[ADDR_var][-8:-6])) # This gives name and state separated by "_"
    folder_names.append(school["folder_name"])
    # school['wget_fail_flag'] = set_failflag(school["folder_name"]) # REDUNDANT with parse_school()
    # save_datafile(dicts_list, temp_dir+"school_parser_temp", "JSON") # Save output so we can pick up where left off, in case something breaks
    
tuplist_zip = zip(names, addresses, urls, folder_names) # Create list of tuples to pass to parser function
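
# Worked example of the folder_name rule above (the address value is illustrative): if school[ADDR_var]
# ends with "AK 99559", then school[ADDR_var][-8:-6] == "AK", so a school named "Ayaprun Elitnaurvik"
# gets folder_name "Ayaprun_Elitnaurvik_AK" once re.sub() replaces the spaces with "_".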

""" # REDUNDANT with parse_school
if __name__ == '__main__':
    with Pool(numcpus) as p: # Use multiprocessing.Pool(numcpus) to speed things up
        p.map(set_failflag, tqdm(folder_names, desc="Setting web_fail_flags"), chunksize=numcpus)
        """


Out[15]:
' # REDUNDANT with parse_school\nif __name__ == \'__main__\':\n    with Pool(numcpus) as p: # Use multiprocessing.Pool(numcpus) to speed things up\n        p.map(set_failflag, tqdm(folder_names, desc="Setting web_fail_flags"), chunksize=numcpus)\n        '

In [16]:
schooldf = pd.DataFrame(dicts_list)
thislist = load_list(wget_dataloc + 'Ayaprun_Elitnaurvik_AK/webtext.txt')
print(thislist)


['Skip to main content\n', 'State of Alaska\n', 'myAlaska\n', 'My Government\n', 'Resident\n', 'Business in Alaska\n', 'Visiting Alaska\n', 'State Employees\n', 'Department of Education & Early Development\n', 'EED Website\n', 'State of Alaska\n', 'Home\n', 'Parents & Students\n', 'Teaching & Learning\n', 'Forms & Grants\n', 'Finance & Facilities\n', 'Statistics & Reports\n', 'About EED\n', 'State of Alaska\n', ' > \n', 'DEED\n', ' > \n', 'Alaska Public Schools Database\n', '                    > Ayaprun Elitnaurvik            \n', 'Ayaprun Elitnaurvik\n', 'Lower Kuskokwim School District\n', 'School Calendar for 2017-2018\n', 'School ID\n', '319010\n', 'Address\n', 'PO Box 1468\n', 'Bethel, AK 99559\n', 'Physical Address\n', '1010 Fourth Ave\n', 'Bethel, AK 99559\n', 'Telephone\n', '(907) 543-1645\n', 'Fax\n', '         (907) 543-1647\n', 'School Website\n', 'Ayaprun Elitnaurvik\n', 'School Email\n', 'Contact Name\n', 'Sam Crow\n', ', Principal\n', 'Lowest Grade\n', 'PK\n', 'Highest Grade\n', '6\n', 'Enrollment\n', '2016-2017 Enrollment: 163\n', 'Home\n', 'School Details\n', 'Website Information\n', 'State of Alaska Homepage\n', 'Contact Information\n', '                    Alaska Dept. of Education                    \n', '                    & Early Development                \n', '                    801 West 10th Street, Suite 200                    \n', '                    PO Box 110500                    \n', '                    Juneau, AK 99811-0500                \n', '                    Telephone: \n', '(907) 465-2800\n', '                    Teacher Certification: \n', '(907) 465-2831\n', '                    TTY/TTD: \n', '(907) 465-2815\n', '                    Fax: (907) 465-4156                \n', 'eed.webmaster@alaska.gov\n', 'Teaching & Learning Support Program Contacts\n', 'More. . .\n', 'Department Links\n', 'Alaska State Council on the Arts\n', 'Alaska Commission on Postsecondary Education\n', 'Professional Teaching Practices Commission\n', 'Review and Comment on Proposed Regulation\n', 'State Board of Education & Early Development\n', 'Teacher Certification\n', 'State of Alaska\n', 'myAlaska\n', 'My Government\n', 'Resident\n', 'Business in Alaska\n', 'Visiting Alaska\n', 'State Employees\n', 'State of Alaska\n', '© 2013\n', 'Webmaster\n']

In [17]:
'''
empty = ["" for elem in range(len(schooldf["NCESSCH"]))] # Create empty column length of the longest variable
schooldf = schooldf.assign(webtext=empty, keywords_text=empty, ideology_text=empty, ess_count=empty, prog_count=empty, rit_count=empty)
print(len(empty_column))

schooldf = schooldf.assign(keywords_text = empty_column, webtext = empty_column)
schooldf["keywords_text"] = schooldf["keywords_text"].astype(object)
print(type(schooldf.loc[0,"keywords_text"]), type(schooldf), type(schooldf.keywords_text))
print()
print(schooldf)

#schooldf[["webtext", "keywords_text", "ideology_text"]] = schooldf[["webtext", "keywords_text", "ideology_text"]].astype(object)
#schooldf[["ess_count", "prog_count", "rit_count"]] = schooldf[["ess_count", "prog_count", "rit_count"]].apply(pd.to_numeric,downcast='float')

#schooldf.at[:,"keywords_text"] = schooldf["keywords_text"].astype(object)
#schooldf.at[:,"keywords_text"] = []
#schooldf.at[0,"keywords_text"] = schooldf["keywords_text"].astype(object)
# schooldf = schooldf.assign("ess_count")
#schooldf["ess_count"] = schooldf["ess_count"].apply(pd.to_numeric)

# converted_df.loc[:,col] = df[col].astype('category')
# converted_df.select_dtypes(include=['float']).apply(pd.to_numeric,downcast='float')

#.astype(object), df["keywords_text"].astype(object), df["ideology_text"].astype(object), df["ess_count"].astype(int), df["prog_count"].astype(int), df["rit_count"].astype(int)
'''


Out[17]:
'\nempty = ["" for elem in range(len(schooldf["NCESSCH"]))] # Create empty column length of the longest variable\nschooldf = schooldf.assign(webtext=empty, keywords_text=empty, ideology_text=empty, ess_count=empty, prog_count=empty, rit_count=empty)\nprint(len(empty_column))\n\nschooldf = schooldf.assign(keywords_text = empty_column, webtext = empty_column)\nschooldf["keywords_text"] = schooldf["keywords_text"].astype(object)\nprint(type(schooldf.loc[0,"keywords_text"]), type(schooldf), type(schooldf.keywords_text))\nprint()\nprint(schooldf)\n\n#schooldf[["webtext", "keywords_text", "ideology_text"]] = schooldf[["webtext", "keywords_text", "ideology_text"]].astype(object)\n#schooldf[["ess_count", "prog_count", "rit_count"]] = schooldf[["ess_count", "prog_count", "rit_count"]].apply(pd.to_numeric,downcast=\'float\')\n\n#schooldf.at[:,"keywords_text"] = schooldf["keywords_text"].astype(object)\n#schooldf.at[:,"keywords_text"] = []\n#schooldf.at[0,"keywords_text"] = schooldf["keywords_text"].astype(object)\n# schooldf = schooldf.assign("ess_count")\n#schooldf["ess_count"] = schooldf["ess_count"].apply(pd.to_numeric)\n\n# converted_df.loc[:,col] = df[col].astype(\'category\')\n# converted_df.select_dtypes(include=[\'float\']).apply(pd.to_numeric,downcast=\'float\')\n\n#.astype(object), df["keywords_text"].astype(object), df["ideology_text"].astype(object), df["ess_count"].astype(int), df["prog_count"].astype(int), df["rit_count"].astype(int)\n'

In [18]:
# Initialize text strings and counts as empty, then convert to more efficient data types:
empty = ["" for elem in range(len(schooldf["NCESSCH"]))] # Create empty string column length of longest variable (NCESCCH used for matching)
schooldf = schooldf.assign(webtext=empty, keywords_text=empty, ideology_text=empty, ess_count=empty, prog_count=empty, rit_count=empty) # Add empty columns to df
print(schooldf.info(memory_usage='deep'))

schooldf = convert_df(schooldf)
schooldf[["webtext", "keywords_text", "ideology_text"]] = schooldf[["webtext", "keywords_text", "ideology_text"]].apply(lambda x: x.astype(object)) # Convert to string type (to be safe)
schooldf[["ess_count", "prog_count", "rit_count"]] = schooldf[["ess_count", "prog_count", "rit_count"]].apply(pd.to_numeric,downcast='float')  # Convert to float type (if appropriate)

print()
print(schooldf.info(memory_usage='deep'))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6972 entries, 0 to 6971
Columns: 187 entries, SURVYEAR to webtext
dtypes: object(187)
memory usage: 79.3 MB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6972 entries, 0 to 6971
Columns: 187 entries, SURVYEAR to webtext
dtypes: category(155), float32(3), object(29)
memory usage: 19.6 MB
None
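
# The real convert_df() is defined earlier in this script; the sketch below is only meant to illustrate
# the kind of conversion the memory numbers above suggest (most object columns become category, a few
# numeric columns get downcast). The function name and the 50% threshold are illustrative, not the
# author's implementation; pd is the pandas alias imported at the top of the script.
def convert_df_sketch(df):
    out = df.copy()
    for col in out.columns:
        kind = out[col].dtype.kind
        if kind == "O": # object columns: store mostly-repeated values (e.g., state codes) as categories
            if out[col].nunique() < 0.5 * max(len(out), 1):
                out[col] = out[col].astype("category")
        elif kind == "f": # float columns: downcast to float32 where the values allow
            out[col] = pd.to_numeric(out[col], downcast="float")
        elif kind in "iu": # integer columns: downcast to the smallest type that fits
            out[col] = pd.to_numeric(out[col], downcast="unsigned")
    return out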

In [19]:
#schooldf.iloc[0]["webtext"] = load_list(wget_dataloc + 'Ayaprun_Elitnaurvik_AK/webtext.txt') #schooldf.iloc[0].apply(lambda x: load_list(wget_dataloc + "{}/webtext.txt".format(str(x.folder_name))))
#schooldf["webtext"] = schooldf["webtext"].astype(object)
#schooldf.at[0,"webtext"] = thislist
#schooldf.loc[0,"webtext"]
#schooldf

In [129]:
schooldf.loc[:,"folder_name"] = schooldf.loc[:,[NAME_var,ADDR_var]].apply(lambda x: re.sub(" ","_","{} {}".format(str(x[0]),str(x[1][-8:-6]))), axis=1) # This gives name and state separated by "_"

In [130]:
print(schooldf.loc[schooldf["folder_name"]=="NaN",NAME_var])


Series([], Name: SCH_NAME, dtype: object)

In [131]:
schooldf.loc[:17,"folder_name"]


Out[131]:
0                                Ayaprun_Elitnaurvik_AK
1                           Ketchikan_Charter_School_AK
2     Tongass_School_of_Arts_and_Sciences_Charter_Sc...
3                            Aquarian_Charter_School_AK
4                  Family_Partnership_Charter_School_AK
5                                 Winterberry_School_AK
6                       Eagle_Academy_Charter_School_AK
7                            Frontier_Charter_School_AK
8                  Highland_Tech_High_Charter_School_AK
9                        Rilke_Schule_Charter_School_AK
10             Alaska_Native_Cultural_Charter_School_AK
11                   Juneau_Community_Charter_School_AK
12                    Aurora_Borealis_Charter_School_AK
13                                  Fireweed_Academy_AK
14                Soldotna_Montessori_Charter_School_AK
15            Kaleidoscope_School_of_Arts_&_Sciences_AK
16                            Academy_Charter_School_AK
17               Midnight_Sun_Family_Learning_Center_AK
Name: folder_name, dtype: object

In [20]:
schooldf["school_folder"] = schooldf.folder_name.apply(lambda x: str(wget_dataloc) + '{}/'.format(str(x)))
schooldf["error_file"] = schooldf.school_folder.apply(lambda x: '{}error_flags.txt'.format(str(x))) # Define file path for error text log
schooldf["counts_file"] = schooldf.school_folder.apply(lambda x: '{}dict_counts.txt'.format(str(x)))

In [25]:
schooldf.loc[:20,["school_folder", "error_file", "counts_file"]]


Out[25]:
school_folder error_file counts_file
0 /home/jovyan/work/wget/parll_wget/Ayaprun_Elit... /home/jovyan/work/wget/parll_wget/Ayaprun_Elit... /home/jovyan/work/wget/parll_wget/Ayaprun_Elit...
1 /home/jovyan/work/wget/parll_wget/Ketchikan_Ch... /home/jovyan/work/wget/parll_wget/Ketchikan_Ch... /home/jovyan/work/wget/parll_wget/Ketchikan_Ch...
2 /home/jovyan/work/wget/parll_wget/Tongass_Scho... /home/jovyan/work/wget/parll_wget/Tongass_Scho... /home/jovyan/work/wget/parll_wget/Tongass_Scho...
3 /home/jovyan/work/wget/parll_wget/Aquarian_Cha... /home/jovyan/work/wget/parll_wget/Aquarian_Cha... /home/jovyan/work/wget/parll_wget/Aquarian_Cha...
4 /home/jovyan/work/wget/parll_wget/Family_Partn... /home/jovyan/work/wget/parll_wget/Family_Partn... /home/jovyan/work/wget/parll_wget/Family_Partn...
5 /home/jovyan/work/wget/parll_wget/Winterberry_... /home/jovyan/work/wget/parll_wget/Winterberry_... /home/jovyan/work/wget/parll_wget/Winterberry_...
6 /home/jovyan/work/wget/parll_wget/Eagle_Academ... /home/jovyan/work/wget/parll_wget/Eagle_Academ... /home/jovyan/work/wget/parll_wget/Eagle_Academ...
7 /home/jovyan/work/wget/parll_wget/Frontier_Cha... /home/jovyan/work/wget/parll_wget/Frontier_Cha... /home/jovyan/work/wget/parll_wget/Frontier_Cha...
8 /home/jovyan/work/wget/parll_wget/Highland_Tec... /home/jovyan/work/wget/parll_wget/Highland_Tec... /home/jovyan/work/wget/parll_wget/Highland_Tec...
9 /home/jovyan/work/wget/parll_wget/Rilke_Schule... /home/jovyan/work/wget/parll_wget/Rilke_Schule... /home/jovyan/work/wget/parll_wget/Rilke_Schule...
10 /home/jovyan/work/wget/parll_wget/Alaska_Nativ... /home/jovyan/work/wget/parll_wget/Alaska_Nativ... /home/jovyan/work/wget/parll_wget/Alaska_Nativ...
11 /home/jovyan/work/wget/parll_wget/Juneau_Commu... /home/jovyan/work/wget/parll_wget/Juneau_Commu... /home/jovyan/work/wget/parll_wget/Juneau_Commu...
12 /home/jovyan/work/wget/parll_wget/Aurora_Borea... /home/jovyan/work/wget/parll_wget/Aurora_Borea... /home/jovyan/work/wget/parll_wget/Aurora_Borea...
13 /home/jovyan/work/wget/parll_wget/Fireweed_Aca... /home/jovyan/work/wget/parll_wget/Fireweed_Aca... /home/jovyan/work/wget/parll_wget/Fireweed_Aca...
14 /home/jovyan/work/wget/parll_wget/Soldotna_Mon... /home/jovyan/work/wget/parll_wget/Soldotna_Mon... /home/jovyan/work/wget/parll_wget/Soldotna_Mon...
15 /home/jovyan/work/wget/parll_wget/Kaleidoscope... /home/jovyan/work/wget/parll_wget/Kaleidoscope... /home/jovyan/work/wget/parll_wget/Kaleidoscope...
16 /home/jovyan/work/wget/parll_wget/Academy_Char... /home/jovyan/work/wget/parll_wget/Academy_Char... /home/jovyan/work/wget/parll_wget/Academy_Char...
17 /home/jovyan/work/wget/parll_wget/Midnight_Sun... /home/jovyan/work/wget/parll_wget/Midnight_Sun... /home/jovyan/work/wget/parll_wget/Midnight_Sun...
18 /home/jovyan/work/wget/parll_wget/American_Cha... /home/jovyan/work/wget/parll_wget/American_Cha... /home/jovyan/work/wget/parll_wget/American_Cha...
19 /home/jovyan/work/wget/parll_wget/Twindly_Brid... /home/jovyan/work/wget/parll_wget/Twindly_Brid... /home/jovyan/work/wget/parll_wget/Twindly_Brid...
20 /home/jovyan/work/wget/parll_wget/Fronteras_Ch... /home/jovyan/work/wget/parll_wget/Fronteras_Ch... /home/jovyan/work/wget/parll_wget/Fronteras_Ch...

In [28]:
schooldf[schooldf["error_file"]=='nan']


Out[28]:
SURVYEAR FIPST STABR STATENAME SEANAME LEAID ST_LEAID LEA_NAME SCHID ST_SCHID ... folder_name ess_count ideology_text keywords_text prog_count rit_count webtext school_folder error_file counts_file

0 rows × 190 columns


In [34]:
tqdm.pandas(desc="Loading DF") # To show progress, create & register new `tqdm` instance with `pandas`
dfslice = schooldf.iloc[0:30,:]
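# Note: dfslice is taken as a slice of schooldf, so the column assignments in the next cells trigger
# pandas' SettingWithCopyWarning (visible in their output); taking an explicit copy, e.g.
# dfslice = schooldf.iloc[0:30,:].copy(), is the usual way to silence that warning.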

In [35]:
dfslice["error_text"] = dfslice.error_file.progress_apply(lambda x: load_list('{}'.format(str(x))))
dfslice["error_text"]



Loading DF:   0%|          | 0/30 [00:00<?, ?it/s]

Loading DF: 100%|██████████| 30/30 [00:00<00:00, 2532.84it/s]/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
Out[35]:
0     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
1     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
2     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
3     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
4     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
5     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
6     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
7     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
8     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
9     [duplicate_flag 0\n, parse_error_flag 0\n, wge...
10    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
11    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
12    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
13    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
14    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
15    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
16    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
17    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
18    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
19    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
20    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
21    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
22    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
23    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
24    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
25    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
26    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
27    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
28    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
29    [duplicate_flag 0\n, parse_error_flag 0\n, wge...
Name: error_text, dtype: object

In [51]:
dfslice["duplicate_flag"] = dfslice.error_text.progress_apply(lambda x: '{}'.format(str(x[0].split()[-1]))) # int(df.error_text[0].split()[-1]) # last element of first piece of error_text
dfslice["parse_error_flag"] = dfslice.error_text.progress_apply(lambda x: '{}'.format(str(x[1].split()[-1]))) #int(df.error_text[1].split()[-1])
dfslice["wget_fail_flag"] = dfslice.error_text.progress_apply(lambda x: '{}'.format(str(x[2].split()[-1]))) #int(df.error_text[2].split()[-1])
dfslice["html_file_count"] = dfslice.error_text.progress_apply(lambda x: '{}'.format(str(x[3].split()[-1]))) #int(df.error_text[3].split()[-1])

dfslice[["duplicate_flag", "parse_error_flag", "wget_fail_flag", "html_file_count"]]

#if df["html_file_count"]==0:
#    df["wget_fail_flag"] = 1 # If no HTML, then web download failed! ## REDUNDANT with parse_school()



Loading DF:   0%|          | 0/30 [00:00<?, ?it/s]

Loading DF: 100%|██████████| 30/30 [00:00<00:00, 14840.09it/s]/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Loading DF:   0%|          | 0/30 [00:00<?, ?it/s]

Loading DF: 100%|██████████| 30/30 [00:00<00:00, 13775.91it/s]/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Loading DF:   0%|          | 0/30 [00:00<?, ?it/s]

Loading DF: 100%|██████████| 30/30 [00:00<00:00, 17544.50it/s]/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Loading DF:   0%|          | 0/30 [00:00<?, ?it/s]

Loading DF: 100%|██████████| 30/30 [00:00<00:00, 16324.48it/s]/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
Out[51]:
duplicate_flag parse_error_flag wget_fail_flag html_file_count
0 0 0 0 1
1 0 0 0 61
2 0 0 0 1
3 0 0 0 49
4 0 0 0 1
5 0 0 0 1
6 0 0 0 5
7 0 0 0 25
8 0 0 0 33
9 0 0 0 1
10 0 0 0 35
11 0 0 0 11
12 0 0 0 1
13 0 0 0 273
14 0 0 0 23
15 0 0 0 87
16 0 0 0 61
17 0 0 0 3
18 0 0 0 1
19 0 0 0 1
20 0 0 0 73
21 0 0 0 85
22 0 0 0 31
23 0 0 0 105
24 0 0 0 177
25 0 0 1 0
26 0 0 1 0
27 0 0 1 0
28 0 0 0 53
29 0 0 0 1

In [37]:
#df['wget_fail_flag'] = df.folder_name.progress_apply(lambda x: set_failflag(x)) # Comment out while fixing parser
downloaded = dfslice["wget_fail_flag"].map({"1":True,1:True,"0":False,0:False}) == False # This binary conditional filters df to only those rows with downloaded web content (where wget_fail_flag==False and thus does NOT signal download failure)
len(dfslice[downloaded])
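# The .map() handles wget_fail_flag whether it was stored as a string or an int ({"1"/1 -> True, "0"/0 -> False});
# comparing to False keeps only the successfully downloaded schools--27 of the 30 rows in this slice.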


Out[37]:
27

In [43]:
# Load school parse output from disk into DataFrame:
# df.loc[:,(downloaded,"keywords_text")] = df.loc[:,(downloaded,"school_folder")].progress_apply...
dfslice.loc[downloaded,"webtext"] = dfslice[downloaded].school_folder.progress_apply(lambda x: load_list("{}webtext.txt".format(str(x)))) # df["wget_fail_flag"]==False
dfslice["webtext"]



Loading DF:   0%|          | 0/27 [00:00<?, ?it/s]

Loading DF: 100%|██████████| 27/27 [00:00<00:00, 258.55it/s]

/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py:477: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
Out[43]:
0     [Skip to main content\n, State of Alaska\n, my...
1     [Skip to Main Content\n, District Home\n, Sele...
2     [Tongass School of Arts and Sciences\n, Home\n...
3     [Main\n, Main\n, Anchorage school district\n, ...
4     [Skip to main content\n, State of Alaska\n, my...
5     [Menu\n, Home\n, About\n, Mission\n, Leadershi...
6     [Main\n, Anchorage School District\n, Educatin...
7     [Home\n, APC\n, Business\n, Forms\n, Handbooks...
8     [Main\n, Main\n, Anchorage school district\n, ...
9     [Skip to main content\n, State of Alaska\n, my...
10    [Main\n, Anchorage school district\n, ASD Depa...
11    [Juneau Community Charter School \n, JCCS \n, ...
12    [Skip to main content\n, State of Alaska\n, my...
13    [Fireweed Academy\n, 995 Soundview Ave. Homer,...
14    [Soldotna Elementary School\n, 162 Park Ave. S...
15    [Kaleidoscope School of Arts and Science\n, 54...
16    [Skip to Main Content\n, District Home\n, Sele...
17    [Home\n, Classroom Pages \n, ~ Mrs. MacDonald\...
18    [Skip to Main Content\n, District Home\n, Sele...
19    [
ŧJù ̈ï ́<Ö ̧(­ðíwö(Z.`ŸlÄa
Ã*âGz<ÏNŠQÐ...
20    [Skip to Main Content\n, District Home\n, Sele...
21    [Skip to content\n, About\n, Mission\n, Histor...
22    [Home\n, About Us\n, Program Description\n, Sc...
23    [Ø'TVÒ8“n°èM¤·ÿÍ1™C<1nsU0<‘Q
èæLÌëc-3Î...
24    [Skip to Main Content\n, District Home\n, Sele...
25                                                     
26                                                     
27                                                     
28    [ ̄ö-`ùÅ+KÝZ$QÙEP3⁄43Çœ1⁄2Z@ýa?éGoÌy6?~Š ́...
29    [Skip to Main Content\n, District Home\n, Sele...
Name: webtext, dtype: object

In [47]:
dfslice.loc[downloaded,"keywords_text"] = dfslice.loc[downloaded,"school_folder"].progress_apply(lambda x: load_list("{}keywords_text.txt".format(str(x))))
dfslice.loc[downloaded,"ideology_text"] = dfslice.loc[downloaded,"school_folder"].progress_apply(lambda x: load_list("{}ideology_text.txt".format(str(x))))
dfslice.loc[downloaded,["keywords_text","ideology_text"]]



Loading DF:   0%|          | 0/27 [00:00<?, ?it/s]

Loading DF: 100%|██████████| 27/27 [00:00<00:00, 691.35it/s]/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py:477: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Loading DF:   0%|          | 0/27 [00:00<?, ?it/s]

Loading DF:  44%|████▍     | 12/27 [00:00<00:00, 105.95it/s]

Loading DF:  67%|██████▋   | 18/27 [00:00<00:00, 85.41it/s] 

Loading DF:  89%|████████▉ | 24/27 [00:00<00:00, 72.31it/s]

Loading DF: 100%|██████████| 27/27 [00:00<00:00, 50.53it/s]
Out[47]:
keywords_text ideology_text
0 [] [Alaska Public Schools Database\n, Lower Kusko...
1 [KCS School Creed\n, KCS School Creed\n, KCS S... [Select a School\n, Ketchikan High School\n, K...
2 [Our Vision\n, mission statement\n, Our School... [Tongass School of Arts and Sciences\n, Daily ...
3 [Aquarian students explore topics outside of s... [Anchorage school district\n, School board\n, ...
4 [] [Alaska Public Schools Database\n, ...
5 [Mission\n, Curriculum\n] [Mission\n, Program\n, What Is Waldorf Educati...
6 [Eagle Academy provides students with an excel... [Anchorage School District\n, Educating All St...
7 [for home school students. Frontier is disting... [New parent orientation meetings are held on t...
8 [Curriculum\n, Curriculum\n, Curriculum\n, Cur... [Anchorage school district\n, School board\n, ...
9 [] [Alaska Public Schools Database\n, ...
10 [ANCCS Curriculum \n, Outdoor education, art, ... [Anchorage school district\n, School board\n, ...
11 [Our Charter allows us the flexibility to cust... [Juneau Community Charter School \n, Juneau Co...
12 [] [Alaska Public Schools Database\n, ...
13 [MISSION STATEMENT\n, The Kenai Peninsula Boro... [Prior Year APC Minutes\n, FWA- a slice of lif...
14 [Vision Statement\n, Vision Statement\n, Visio... [Soldotna Elementary School\n, School Newslett...
15 [ background check through the district websit... [Kaleidoscope School of Arts and Science\n, Ch...
16 [Do you believe in school choice??? Do you wan... [Select a School\n, Academy Charter\n, America...
17 [~ Mission Statement\n, ‚…¢8“™È•÷ôàçå\É~Ÿ!þþO... [About Our School\n, ~ Mission Stateme...
18 [Mrs. Brady is the designer and creator of the... [Select a School\n, Academy Charter\n, America...
19 [] [ŧJù ̈ï ́<Ö ̧(­ðíwö(Z.`ŸlÄaÃ*âGz<ÏNŠQÐ...
20 [other forms can be found on the Fronteras on... [Select a School\n, Academy Charter\n, America...
21 [Mission\n, Curriculum\n, Birchtree Charter Sc... [Mission\n, School Hours/Transportation\n, Pro...
22 [] [Program Description\n, School Goals\n, Progra...
23 [The Extended Learning Program (ELP), formerly... [Ø'TVÒ8“n°èM¤·ÿÍ1™C<1nsU0<‘Q èæLÌëc-3Î...
24 [The middle school program creates small learn... [Select a School\n, Ben Eielson Jr./Sr. High\n...
28 [Í©°šòI©uÆ£Ê)âô®«\n, Ce,Ö6à#l-èRvq4TEð"øûˆ... [ ̄ö-`ùÅ+KÝZ$QÙEP3⁄43Çœ1⁄2Z@ýa?éGoÌy6?~Š ́...
29 [Mission Statement\n] [Select a School\n, Cactus Shadows High School...

In [68]:
dfslice["counts_text"] = dfslice.counts_file.progress_apply(lambda x: load_list("{}".format(str(x))))
dfslice.loc[downloaded,"ess_count"] = dfslice.loc[downloaded,"counts_text"].progress_apply(lambda x: "{}".format(str(x[0].split()[-1]))).apply(pd.to_numeric,downcast='unsigned') # 2nd element of 1st row in counts_text: take as uint dtype
dfslice.loc[downloaded,"prog_count"] = dfslice.loc[downloaded,"counts_text"].progress_apply(lambda x: "{}".format(str(x[1].split()[-1]))).apply(pd.to_numeric,downcast='unsigned') # 2nd element of 2nd row
dfslice.loc[downloaded,"rit_count"] = dfslice.loc[downloaded,"counts_text"].progress_apply(lambda x: "{}".format(str(x[2].split()[-1]))).apply(pd.to_numeric,downcast='unsigned') # 2nd element of 3rd row
dfslice[["counts_text","ess_count","prog_count","rit_count"]]
#df[downloaded]["ess_strength"] = df[downloaded]["ess_count"]/df[downloaded].rit_count
#df[downloaded]["prog_strength"] = df[downloaded]["prog_count"]/df[downloaded].rit_count
#print(df[downloaded]['prog_strength'])
#print("Got Here M")


Loading DF: 100%|██████████| 30/30 [00:00<00:00, 4444.53it/s]
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
Loading DF: 100%|██████████| 27/27 [00:00<00:00, 49560.70it/s]
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py:477: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
Loading DF: 100%|██████████| 27/27 [00:00<00:00, 51428.80it/s]
Loading DF: 100%|██████████| 27/27 [00:00<00:00, 61714.55it/s]
Out[68]:
counts_text ess_count prog_count rit_count
0 [ess_count 4\n, prog_count 1\n, rit_count 29] 4 1 29
1 [ess_count 45\n, prog_count 116\n, rit_count 1... 45 116 1096
2 [ess_count 3\n, prog_count 7\n, rit_count 80] 3 7 80
3 [ess_count 93\n, prog_count 85\n, rit_count 975] 93 85 975
4 [ess_count 4\n, prog_count 1\n, rit_count 35] 4 1 35
5 [ess_count 1\n, prog_count 8\n, rit_count 25] 1 8 25
6 [ess_count 50\n, prog_count 17\n, rit_count 167] 50 17 167
7 [ess_count 129\n, prog_count 84\n, rit_count 763] 129 84 763
8 [ess_count 89\n, prog_count 60\n, rit_count 774] 89 60 774
9 [ess_count 4\n, prog_count 1\n, rit_count 35] 4 1 35
10 [ess_count 116\n, prog_count 158\n, rit_count ... 116 158 1067
11 [ess_count 31\n, prog_count 120\n, rit_count 527] 31 120 527
12 [ess_count 4\n, prog_count 1\n, rit_count 35] 4 1 35
13 [ess_count 36\n, prog_count 901\n, rit_count 2... 36 901 2406
14 [ess_count 1\n, prog_count 7\n, rit_count 192] 1 7 192
15 [ess_count 104\n, prog_count 391\n, rit_count ... 104 391 1520
16 [ess_count 112\n, prog_count 120\n, rit_count ... 112 120 2326
17 [ess_count 10\n, prog_count 8\n, rit_count 44] 10 8 44
18 [ess_count 5\n, prog_count 12\n, rit_count 162] 5 12 162
19 [ess_count 10\n, prog_count 0\n, rit_count 40] 10 0 40
20 [ess_count 131\n, prog_count 134\n, rit_count ... 131 134 2556
21 [ess_count 126\n, prog_count 507\n, rit_count ... 126 507 2200
22 [ess_count 47\n, prog_count 85\n, rit_count 300] 47 85 300
23 [ess_count 237\n, prog_count 276\n, rit_count ... 237 276 3233
24 [ess_count 653\n, prog_count 489\n, rit_count ... 653 489 5641
25 [ess_count 0\n, prog_count 0\n, rit_count 0] NaN NaN NaN
26 [ess_count 0\n, prog_count 0\n, rit_count 0] NaN NaN NaN
27 [ess_count 0\n, prog_count 0\n, rit_count 0] NaN NaN NaN
28 [ess_count 126\n, prog_count 36\n, rit_count 640] 126 36 640
29 [ess_count 4\n, prog_count 4\n, rit_count 50] 4 4 50

In [69]:
type(dfslice.loc[0,"rit_count"])


Out[69]:
int

In [52]:
dfslice.loc[:,['wget_fail_flag','webtext','ess_count','rit_count','prog_count','counts_text','html_file_count','error_text']]


Out[52]:
wget_fail_flag webtext ess_count rit_count prog_count counts_text html_file_count error_text
0 0 [Skip to main content\n, State of Alaska\n, my... 4 29 1 [ess_count 4\n, prog_count 1\n, rit_count 29] 1 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
1 0 [Skip to Main Content\n, District Home\n, Sele... 45 1096 116 [ess_count 45\n, prog_count 116\n, rit_count 1... 61 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
2 0 [Tongass School of Arts and Sciences\n, Home\n... 3 80 7 [ess_count 3\n, prog_count 7\n, rit_count 80] 1 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
3 0 [Main\n, Main\n, Anchorage school district\n, ... 93 975 85 [ess_count 93\n, prog_count 85\n, rit_count 975] 49 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
4 0 [Skip to main content\n, State of Alaska\n, my... 4 35 1 [ess_count 4\n, prog_count 1\n, rit_count 35] 1 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
5 0 [Menu\n, Home\n, About\n, Mission\n, Leadershi... 1 25 8 [ess_count 1\n, prog_count 8\n, rit_count 25] 1 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
6 0 [Main\n, Anchorage School District\n, Educatin... 50 167 17 [ess_count 50\n, prog_count 17\n, rit_count 167] 5 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
7 0 [Home\n, APC\n, Business\n, Forms\n, Handbooks... 129 763 84 [ess_count 129\n, prog_count 84\n, rit_count 763] 25 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
8 0 [Main\n, Main\n, Anchorage school district\n, ... 89 774 60 [ess_count 89\n, prog_count 60\n, rit_count 774] 33 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
9 0 [Skip to main content\n, State of Alaska\n, my... 4 35 1 [ess_count 4\n, prog_count 1\n, rit_count 35] 1 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
10 0 [Main\n, Anchorage school district\n, ASD Depa... 116 1067 158 [ess_count 116\n, prog_count 158\n, rit_count ... 35 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
11 0 [Juneau Community Charter School \n, JCCS \n, ... 31 527 120 [ess_count 31\n, prog_count 120\n, rit_count 527] 11 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
12 0 [Skip to main content\n, State of Alaska\n, my... 4 35 1 [ess_count 4\n, prog_count 1\n, rit_count 35] 1 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
13 0 [Fireweed Academy\n, 995 Soundview Ave. Homer,... 36 2406 901 [ess_count 36\n, prog_count 901\n, rit_count 2... 273 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
14 0 [Soldotna Elementary School\n, 162 Park Ave. S... 1 192 7 [ess_count 1\n, prog_count 7\n, rit_count 192] 23 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
15 0 [Kaleidoscope School of Arts and Science\n, 54... 104 1520 391 [ess_count 104\n, prog_count 391\n, rit_count ... 87 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
16 0 [Skip to Main Content\n, District Home\n, Sele... 112 2326 120 [ess_count 112\n, prog_count 120\n, rit_count ... 61 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
17 0 [Home\n, Classroom Pages \n, ~ Mrs. MacDonald\... 10 44 8 [ess_count 10\n, prog_count 8\n, rit_count 44] 3 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
18 0 [Skip to Main Content\n, District Home\n, Sele... 5 162 12 [ess_count 5\n, prog_count 12\n, rit_count 162] 1 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
19 0 [ŧJù ̈ï ́<Ö ̧(­ðíwö(Z.`ŸlÄaÃ*âGz<ÏNŠQÐ... 10 40 0 [ess_count 10\n, prog_count 0\n, rit_count 40] 1 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
20 0 [Skip to Main Content\n, District Home\n, Sele... 131 2556 134 [ess_count 131\n, prog_count 134\n, rit_count ... 73 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
21 0 [Skip to content\n, About\n, Mission\n, Histor... 126 2200 507 [ess_count 126\n, prog_count 507\n, rit_count ... 85 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
22 0 [Home\n, About Us\n, Program Description\n, Sc... 47 300 85 [ess_count 47\n, prog_count 85\n, rit_count 300] 31 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
23 0 [Ø'TVÒ8“n°èM¤·ÿÍ1™C<1nsU0<‘Q èæLÌëc-3Î... 237 3233 276 [ess_count 237\n, prog_count 276\n, rit_count ... 105 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
24 0 [Skip to Main Content\n, District Home\n, Sele... 653 5641 489 [ess_count 653\n, prog_count 489\n, rit_count ... 177 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
25 1 NaN NaN NaN [ess_count 0\n, prog_count 0\n, rit_count 0] 0 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
26 1 NaN NaN NaN [ess_count 0\n, prog_count 0\n, rit_count 0] 0 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
27 1 NaN NaN NaN [ess_count 0\n, prog_count 0\n, rit_count 0] 0 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
28 0 [ ̄ö-`ùÅ+KÝZ$QÙEP3⁄43Çœ1⁄2Z@ýa?éGoÌy6?~Š ́... 126 640 36 [ess_count 126\n, prog_count 36\n, rit_count 640] 53 [duplicate_flag 0\n, parse_error_flag 0\n, wge...
29 0 [Skip to Main Content\n, District Home\n, Sele... 4 50 4 [ess_count 4\n, prog_count 4\n, rit_count 50] 1 [duplicate_flag 0\n, parse_error_flag 0\n, wge...

In [79]:
dfslice.loc[downloaded,"ess_strength"] = (dfslice.loc[downloaded,"ess_count"]/dfslice.loc[downloaded,"rit_count"]).apply(pd.to_numeric,downcast='float')
dfslice.loc[downloaded,"prog_strength"] = (dfslice.loc[downloaded,"prog_count"]/dfslice.loc[downloaded,"rit_count"]).apply(pd.to_numeric,downcast='float')
print(type(dfslice.loc[0,"ess_strength"]), "\n", dfslice.loc[downloaded,["ess_strength", "prog_strength"]])


<class 'numpy.float64'> 
     ess_strength  prog_strength
0       0.137931       0.034483
1       0.041058       0.105839
2       0.037500       0.087500
3       0.095385       0.087179
4       0.114286       0.028571
5       0.040000       0.320000
6       0.299401       0.101796
7       0.169069       0.110092
8       0.114987       0.077519
9       0.114286       0.028571
10      0.108716       0.148079
11      0.058824       0.227704
12      0.114286       0.028571
13      0.014963       0.374480
14      0.005208       0.036458
15      0.068421       0.257237
16      0.048151       0.051591
17      0.227273       0.181818
18      0.030864       0.074074
19      0.250000       0.000000
20      0.051252       0.052426
21      0.057273       0.230455
22      0.156667       0.283333
23      0.073307       0.085370
24      0.115760       0.086687
28      0.196875       0.056250
29      0.080000       0.080000
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py:477: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s

In [56]:
thisslice = schooldf.iloc[0:10,:]
pandify_webtext(thisslice)
thisslice


/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
Loading DF: 100%|██████████| 10/10 [00:00<00:00, 11.26it/s]
Loading DF: 100%|██████████| 10/10 [00:00<00:00, 78.07it/s]
Got here 1
Loading DF: 100%|██████████| 10/10 [00:00<00:00, 924.98it/s]
Loading DF:  50%|█████     | 5/10 [00:00<00:00, 40.19it/s]
Got here 2
Got here 3
Loading DF: 100%|██████████| 10/10 [00:00<00:00, 44.93it/s]
Got here 4
Got here 5
Got here 6
Got here 7
Got Here 8
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
    651             result = expressions.evaluate(op, str_rep, x, y,
--> 652                                           raise_on_error=True, **eval_kwargs)
    653         except TypeError:

/opt/conda/lib/python3.6/site-packages/pandas/computation/expressions.py in evaluate(op, op_str, a, b, raise_on_error, use_numexpr, **eval_kwargs)
    209         return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error,
--> 210                          **eval_kwargs)
    211     return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error)

/opt/conda/lib/python3.6/site-packages/pandas/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b, raise_on_error, truediv, reversed, **eval_kwargs)
    120     if result is None:
--> 121         result = _evaluate_standard(op, op_str, a, b, raise_on_error)
    122 

/opt/conda/lib/python3.6/site-packages/pandas/computation/expressions.py in _evaluate_standard(op, op_str, a, b, raise_on_error, **eval_kwargs)
     62     with np.errstate(all='ignore'):
---> 63         return op(a, b)
     64 

TypeError: unsupported operand type(s) for /: 'str' and 'str'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in safe_na_op(lvalues, rvalues)
    675             with np.errstate(all='ignore'):
--> 676                 return na_op(lvalues, rvalues)
    677         except Exception:

/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
    657                 mask = notnull(x) & notnull(y)
--> 658                 result[mask] = op(x[mask], _values_from_object(y[mask]))
    659             elif isinstance(x, np.ndarray):

TypeError: unsupported operand type(s) for /: 'str' and 'str'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-54-ce314dcde17f> in pandify_webtext(df)
     57         print("Got Here 8")
---> 58         df.loc[downloaded,"ess_strength"] = df.loc[downloaded,"ess_count"]/df.loc[downloaded,"rit_count"]
     59         df.loc[downloaded,"prog_strength"] = df.loc[downloaded,"prog_count"]/df.loc[downloaded,"rit_count"]

/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in wrapper(left, right, name, na_op)
    714 
--> 715         result = wrap_results(safe_na_op(lvalues, rvalues))
    716         return construct_result(

/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in safe_na_op(lvalues, rvalues)
    685                     return _algos.arrmap_object(lvalues,
--> 686                                                 lambda x: op(x, rvalues))
    687             raise

pandas/src/algos_common_helper.pxi in pandas.algos.arrmap_object (pandas/algos.c:46681)()

/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in <lambda>(x)
    685                     return _algos.arrmap_object(lvalues,
--> 686                                                 lambda x: op(x, rvalues))
    687             raise

TypeError: unsupported operand type(s) for /: 'str' and 'str'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-56-28d47e39037e> in <module>()
      1 thisslice = schooldf.iloc[0:10,:]
----> 2 pandify_webtext(thisslice)
      3 thisslice

<ipython-input-54-ce314dcde17f> in pandify_webtext(df)
     68 
     69     except Exception as e:
---> 70         logging.critical("ERROR! Pandify function failed to load parsing output into DataFrame.\n" + e)
     71         print("    ERROR! Pandify function failed to load parsing output into DataFrame.")
     72         print("  ",e)

TypeError: must be str, not TypeError

In [ ]:
df = schooldf.iloc[0:10,:]
#df["webtext"], df["keywords_text"], df["ideology_text"], df["ess_count"], df["prog_count"], df["rit_count"] = df["webtext"].astype(object), df["keywords_text"].astype(object), df["ideology_text"].astype(object), df["ess_count"].astype(int), df["prog_count"].astype(int), df["rit_count"].astype(int)
df["webtext"], df["keywords_text"] = [], []
df["webtext"] = df["webtext"].astype(object)
df["keywords_text"] = df["keywords_text"].astype(object)
df

In [ ]:
# ### Run parsing algorithm on schools (requires access to webcrawl output)

if Debug:
    test_dicts = dicts_list[:1] # Limit number of schools to test/refine methods
    for school in test_dicts:
        parse_school(school)
    dictfile = "testing_dicts_" + str(datetime.today())
    save_datafile(test_dicts, temp_dir+dictfile, "JSON")
    sys.exit()
                
# Use multiprocessing.Pool(numcpus) to run parse_school(),
# which parses downloaded webtext and saves the results to local storage:
if __name__ == '__main__':
    with Pool(numcpus) as p:
        p.map(parse_school, tqdm(list(tuplist_zip), desc="Parsing folders"), chunksize=numcpus)
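        # Each item passed to parse_school() is one (name, address, url, folder_name) tuple from tuplist_zip;
        # list() materializes the zip so tqdm can report a total, and chunksize=numcpus hands each worker
        # a batch of schools at a time rather than one school per dispatch.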

In [ ]:
# ### Load parsing output from disk into analyzable Python object (DataFrame or dicts_list)
        
"""# Now use dictify_webtext to load the parsing output from local storage into the list of dictionaries:
for school in dicts_list:
    try:
        school = dictify_webtext(school)
    except Exception as e:
        print("  ERROR! Failed to load into dict parsing output for " + school[NAME_var])
        print("  ",e)
        school_dict["parse_error_flag"] = 1
        continue"""


"""
# To work with limited system memory (almost there!), split this df into chunks, compute on each, and recombine later.
# The number of chunks equals numcpus, and schooldf is split by numschools/numcpus. 
for num in range(numcpus):
    try:
        "splitdf{}".format(str(num)) = pd.DataFrame()
        dfslice = pd.DataFrame()
        dfslice = schooldf.iloc[(splitnum*int(num-1)):(splitnum*int(num)),:]
    except Exception as e:
        print("  ERROR! Script failed to split schooldf into smaller DataFrame #" + str(num) + " of " + str(numcpus) + ".")
        print("  ",e)
        sys.exit()

# Now use pandify_webtext to load the parsing output from local storage into the DataFrame:
splitnum = int(round(float(numschools)/numcpus)) # Get chunk number based on total number of schools data
names_list = ["" for _ in range(numcpus)]
for num in numcpus:
    try:
        names_list[num] = "splitdf{}".format(str(num))
        dfslice = pd.DataFrame()
        dfslice = schooldf.iloc[(splitnum*int(num-1)):(splitnum*int(num)),:]
        dfslice = pandify_webtext(dfslice) # Load parsed output into the DF
        if num==1:
            save_datafile(dfslice,merged_df_file,"CSV") # Save this first chunk of results to new file
        else:
            dfslice.to_csv(merged_df_file,mode="a",columns=False,index=False) # Append this next chunk of results to existing saved results
        del dfslice # Free memory by deleting smaller slice
        
        if num == numcpus-1:
            del schooldf # Free memory by deleting full df, now that all the slices have been taken out
        
        splitdf1,splitdf2,splitdf3 = [pd.DataFrame() for _ in range(3)] # Initialize DFs to split into
        splitnum = int(round(float(numschools)/3)) # Get chunk number based on total number of schools data
        splitdf1 = schooldf.iloc[:splitnum,:]
        splitdf2 = schooldf.iloc[splitnum:splitnum*2,:]
        splitdf3 = schooldf.iloc[splitnum*2:,:]

        splitdf1 = pandify_webtext(splitdf1)
        save_datafile(splitdf1,merged_df_file,"CSV") # Save this first half of results
        del splitdf1 # Free memory

        splitdf2 = pandify_webtext(splitdf2)
        splitdf2.to_csv(merged_df_file,mode="a",columns=False,index=False) # Append these results to existing saved results
        del splitdf2 # Free memory

        splitdf3 = pandify_webtext(splitdf3)
        splitdf3.to_csv(merged_df_file,mode="a",columns=False,index=False) # Append these results to existing saved results
        del splitdf3 # Free memory
                  
    except Exception as e:
        print("  ERROR! Script failed to load parsing output into DataFrame #" + str(num) + " of " + str(numcpus) + "."")
        print("  ",e)
        sys.exit()"""


# Now create a Pandas DataFrame (from dicts_list or from file) and store the data in a memory-efficient way:
schooldf = pd.DataFrame.from_dict(dicts_list) # Convert dicts_list into a DataFrame
#schooldf = pd.read_pickle(temp_dir+"school_dicts_temp.pickle") # Use existing file while debugging pandify_webtext()
#schooldf = pd.read_csv(data_loc) # Creating school_df from scratch
schooldf = convert_df(schooldf) # Make this DF memory-efficient by converting appropriate columns to category data type
tqdm.pandas(desc="Loading DF") # To show progress, create & register new `tqdm` instance with `pandas`


# Load parsing output into big pandas DataFrame through slices (to work with limited system memory):
if dicts_list is not None:
    del dicts_list # Free memory
    
merged_df_file = temp_dir+"mergedf_"+str(datetime.today().strftime("%Y-%m-%d"))+".csv" # Prepare file name
slice_pandify(schooldf, numcpus*5, merged_df_file)
print("Larger DF successfully split into " + str(numcpus*5) + " smaller DFs, parsed, combined, and saved to file!")

if schooldf is not None:
    del schooldf # Free memory
    
    
# Save final output:
print("\nSCHOOL PARSING COMPLETE!!!")
schooldf = pd.read_csv(merged_df_file,header=0) # Load full DF (the first row holds the 190 column names) so we can save it pickle-style
schooldf = schooldf[schooldf.ADDRESS14 != 'ADDRESS14'] # Clean out any cases of header being written as row
newfile = "charters_parsed_" + str(datetime.today().strftime("%Y-%m-%d"))
save_datafile(schooldf, save_dir+newfile, "csv")