In [6]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append("/usr/local/lib/python3.5/dist-packages")
Dictionary Analysis on HTML from wget
run!
In [84]:
# import necessary libraries
import os, re, fnmatch # for navigating file trees and working with strings
import csv # for reading in CSV files
#from glob import glob # for finding files within nested folders--compare with os.walk
import json, pickle # For saving and loading dictionaries, etc. to/from file in JSON and pickle formats
from datetime import datetime # For timestamping files
import sys # For working with user input
import logging # for logging output, to help with troubleshooting
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words
stemmer = PorterStemmer()
from nltk import word_tokenize, sent_tokenize # widely used text tokenizer
import urllib, urllib.request # for testing pages
from unicodedata import normalize # for cleaning text by converting unicode character encodings into readable format
#import shelve # For working with big dictionary files without having the whole file in memory at once
import pandas as pd # modifies data more efficiently than with a list of dicts
from tqdm import tqdm # For progress information during involved Pandas operations
# Import parser
from bs4 import BeautifulSoup # BS reads and parses even poorly/unreliably coded HTML
from bs4.element import Comment # helps with detecting inline/junk tags when parsing with BS
import lxml # for fast HTML parsing with BS, compared to "html.parser"
bsparser = "lxml"
In [85]:
# ### Set script options
Debug = False # Set to "True" for extra progress reports while algorithms run
notebook = True # Use different file paths depending on whether files are being accessed from shell (False) or within a Jupyter notebook (True)
usefile = False # Set to "True" if loading from file a dicts_list to add to. Confirms with user input first!
workstation = False # If working from office PC
if notebook:
usefile = False # Prompting user for input file is only useful in command-line
inline_tags = ["b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "dfn",
"em", "kbd", "strong", "samp", "var", "bdo", "map", "object", "q",
"span", "sub", "sup"] # this list helps with eliminating junk tags when parsing HTML
In [86]:
# ### Set directories
if workstation and notebook:
dir_prefix = "C:\\Users\\Jaren\\Documents\\" # One level further down than the others
elif notebook:
dir_prefix = "/home/jovyan/work/"
else:
dir_prefix = "/vol_b/data/"
example_page = "https://westlakecharter.com/about/"
example_schoolname = "TWENTY-FIRST_CENTURY_NM"
save_dir = dir_prefix + "Charter-school-identities" + os.sep + "data" + os.sep # Directory in which to save data files
dicts_dir = dir_prefix + "Charter-school-identities" + os.sep + "dicts" + os.sep # Directory in which to find & save dictionary files
temp_dir = save_dir + "temp" + os.sep # Directory in which to save temporary data files
micro_sample13 = save_dir + "micro-sample13_coded.csv" # Random micro-sample of 300 US charter schools
URL_schooldata = save_dir + "charter_URLs_2014.csv" # 2014 population of 6,973 US charter schools
full_schooldata = save_dir + "charter_merged_2014.csv" # Above merged with PVI, EdFacts, year opened/closed
temp_data = save_dir + "school_parser_temp.json" # Full_schooldata dict with output for some schools
example_file = save_dir + "example_file.html" #example_folder + "21stcenturypa.com/wp/default?page_id=27.tmp.html"
In [87]:
# Set logging options
log_file = temp_dir + "dict_parsing_" + str(datetime.today()) + ".log"
logging.basicConfig(filename=log_file,level=logging.INFO)
In [88]:
# Set input file, if any
if usefile and not notebook:
print("\nWould you like to load from file a list of dictionaries to add to? (Y/N)")
answer = input()
if answer == "Y":
print("Please indicate file path for dictionary list file.")
answer2 = input()
if os.path.exists(answer2):
input_file = answer2
usefile = True
else:
print("Invalid file path. Aborting script.")
sys.exit()
elif answer == "N":
print("OK! This script will create a new file for this list of dictionaries.")
usefile = False
else:
print("Response not interpretable. Aborting script.")
sys.exit()
In [89]:
# ### Define (non-parsing) helper functions
def get_vars(data):
"""Defines variable names based on the data source called."""
if data==URL_schooldata:
URL_variable = "TRUE_URL"
NAME_variable = "SCH_NAME"
ADDR_variable = "ADDRESS"
elif data==full_schooldata:
URL_variable = "SCH_NAME" # Work-around until URLs merged into full data file
NAME_variable = "SCH_NAME"
ADDR_variable = "ADDRESS14"
elif data==micro_sample13:
URL_variable = "URL"
NAME_variable = "SCHNAM"
ADDR_variable = "ADDRESS"
    else:
        print("ERROR: No data source established! Could not process variables from data file " + str(data))
        return None, None, None # Unrecognized data source, so there are no variable names to return
    return (URL_variable, NAME_variable, ADDR_variable)
def tag_visible(element):
"""Returns false if a web element has a non-visible tag,
i.e. one site visitors wouldn't actually read--and thus one we don't want to parse"""
if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
return False
if isinstance(element, Comment):
return False
return True
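# A minimal sketch (not part of the pipeline) of how tag_visible() could be combined with
# BeautifulSoup to pull only reader-visible text; the helper name here is illustrative.
def visible_text_sketch(html_string):
    """Hypothetical helper: return reader-visible strings from raw HTML,
    using tag_visible() to drop text inside <script>, <style>, comments, etc."""
    soup = BeautifulSoup(html_string, bsparser)
    texts = soup.findAll(text=True) # Every text string in the document
    visible = filter(tag_visible, texts) # Keep only strings whose parent tag is visible
    return " ".join(t.strip() for t in visible if t.strip())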
def webtext_from_files(datalocation):
"""Concatenate and return a single string from all webtext (with .txt format) in datalocation"""
string = ""
for root, dirs, files in os.walk(datalocation):
for file in files:
if file.endswith(".txt"):
                with open(os.path.join(root, file), "r") as fileloc: # Join with root so files in subfolders open correctly
                    string = string + fileloc.read()
return string
def remove_spaces(file_path):
"""Remove spaces from text file at file_path"""
words = [x for x in open(file_path).read().split() if x != ""]
text = ""
for word in words:
text += word + " "
return text
def write_errors(error_file, error1, error2, error3, file_count):
"""Writes to error_file three binary error flags derived from parse_school():
duplicate_flag, parse_error_flag, wget_fail_flag, and file_count."""
with open(error_file, 'w') as file_handler:
file_handler.write("duplicate_flag {}\n".format(int(error1)))
file_handler.write("parse_error_flag {}\n".format(int(error2)))
file_handler.write("wget_fail_flag {}\n".format(int(error3)))
file_handler.write("file_count {}".format(int(file_count)))
return
def write_counts(file_path, names_list, counts_list):
"""Writes to file_path the input dict_count names (a list) and counts (another list).
Assumes these two lists have same length and are in same order--
e.g., names_list[0]="ess_count" and counts_list[0]=ess_count."""
    with open(file_path, 'w') as file_handler:
        pairs = list(zip(names_list, counts_list)) # zipped list of (name, count) tuples
        for i, tup in enumerate(pairs):
            if i < len(pairs) - 1:
                file_handler.write("{} {}\n".format(tup[0], tup[1]))
            else:
                file_handler.write("{} {}".format(tup[0], tup[1])) # No trailing newline after the last pair
return
def write_list(file_path, textlist):
"""Writes textlist to file_path. Useful for recording output of parse_school()."""
with open(file_path, 'w') as file_handler:
for elem in textlist:
file_handler.write("{}\n".format(elem))
return
def load_list(file_path):
"""Loads list into memory. Must be assigned to object."""
textlist = []
with open(file_path) as file_handler:
line = file_handler.readline()
while line:
textlist.append(line)
line = file_handler.readline()
return textlist
def save_datafile(data, file, thismode):
"""BROKEN for saving to CSV Pandas DataFrames (only saves header) and lists of dicts (only saves keys).
Saves data to file using JSON, pickle, or CSV format (whichever was specified).
Works with Pandas DataFrames or other objects, e.g. a list of dictionaries.
Deletes file first to reduce risk of data duplication."""
file = str(file)
thismode = str(thismode)
try:
if os.path.exists(file):
os.remove(file) # Delete file first to reduce risk of data duplication
else:
pass
if thismode.upper()=="JSON" or thismode.upper()==".JSON":
if not file.endswith(".json"):
file += ".json"
if type(data)=="pandas.core.frame.DataFrame":
data.to_json(file)
else:
with open(file, 'w') as outfile:
json.dump(data, outfile, encoding="utf-8")
#print("Data saved to " + file + "!")
elif thismode.lower()=="pickle" or thismode.lower()==".pickle":
if not file.endswith(".pickle"):
file += ".pickle"
if type(data)=="pandas.core.frame.DataFrame":
data.to_pickle(file, encoding="utf-8")
else:
with open(file, "wb") as outfile:
pickle.dump(data, outfile, encoding="utf-8")
#print("Data saved to " + file + "!")
elif thismode.upper()=="CSV" or thismode.upper()==".CSV":
if not file.endswith(".csv"):
file += ".csv"
if type(data)=="pandas.core.frame.DataFrame":
if os.path.exists(file): # If file already exists, assume we are appending to it (with same column names)
data.to_csv(file,mode="a",index=False,sep="\t",header=False,encoding="utf-8")
else: # If file doesn't exist, create it
data.to_csv(file,mode="w",index=False,sep="\t",header=data.columns.values,encoding="utf-8")
else:
with open(file, "w") as outfile:
wr = csv.writer(outfile)
wr.writerows(data)
#print("Data saved to " + file + "!")
else:
print("ERROR! Improper arguments. Please include: data object to save (Pandas DataFrames OK), file path, and file format ('JSON', 'pickle', or 'CSV').")
except Exception as e:
print("Failed to save to " + str(file) + " into memory using " + str(thismode) + " format. Please check arguments (data, file, file format) and try again.")
print(e)
def load_datafile(file):
"""Loads dicts_list (or whatever) from file, using either JSON or pickle format.
The created object should be assigned when called."""
file = str(file)
if file.lower().endswith(".json"):
with open(file,'r') as infile:
var = json.load(infile)
if file.lower().endswith(".pickle"):
with open(file,'rb') as infile:
var = pickle.load(infile)
print(file + " successfully loaded!")
return var
def load_dict(custom_dict, file_path):
"""Loads in a dictionary. Adds each entry from the dict at file_path to the defined set custom_dict (the input),
which can also be an existing dictionary. This allows the creation of combined dictionaries!"""
with open(file_path) as file_handler:
line = file_handler.readline()
while line:
            custom_dict.add(stemmer.stem(line.replace("\n", ""))) # Stem each dictionary entry and strip the newline before adding
            line = file_handler.readline() # Move on to the next line
return custom_dict
def list_files(folder_path, extension):
"""Outputs a list of every file in folder_path or its subdirectories that has a specified extension.
Prepends specified extension with '.' if it doesn't start with it already.
If no extension is specified, it just returns all files in folder_path."""
matches = []
if extension:
extension = str(extension) # Coerce to string, just in case
if extension and not extension.startswith("."):
extension = "." + extension
    for dirpath,dirnames,filenames in os.walk(folder_path):
        if extension:
            for filename in fnmatch.filter(filenames, "*" + extension): # Use extension to filter list of files
                matches.append(os.path.join(dirpath,filename))
        else:
            for filename in filenames: # If no extension, just take all files
                matches.append(os.path.join(dirpath,filename))
return matches
def has_html(folder_path):
"""Simple function that counts .html files and returns a binary:
'True' if a specified folder has any .html files in it, 'False' otherwise."""
html_list = []
for dirpath,dirnames,filenames in os.walk(folder_path):
for file in fnmatch.filter(filenames, "*.html"): # Check if any HTML files in folder_path
html_list.append(file)
if len(html_list)==0:
return False
else:
return True
def convert_df(df):
"""Makes a Pandas DataFrame more memory-efficient through intelligent use of Pandas data types:
specifically, by storing columns with repetitive Python strings not with the object dtype for unique values
(entirely stored in memory) but as categoricals, which are represented by repeated integer values. This is a
net gain in memory when the reduced memory size of the category type outweighs the added memory cost of storing
one more thing. As such, this function checks the degree of redundancy for a given column before converting it."""
converted_df = pd.DataFrame() # Initialize DF for memory-efficient storage of strings (object types)
# TO DO: Infer dtypes of df
df_obj = df.select_dtypes(include=['object']).copy() # Filter to only those columns of object data type
for col in df.columns:
if col in df_obj:
num_unique_values = len(df_obj[col].unique())
num_total_values = len(df_obj[col])
if (num_unique_values / num_total_values) < 0.5: # Only convert data types if at least half of values are duplicates
converted_df.loc[:,col] = df[col].astype('category') # Store these columns as dtype "category"
else:
converted_df.loc[:,col] = df[col]
else:
converted_df.loc[:,col] = df[col]
    # Downcast numeric columns to smaller dtypes where possible (assign results back so the downcast sticks)
    for col in converted_df.select_dtypes(include=['float']).columns:
        converted_df[col] = pd.to_numeric(converted_df[col], downcast='float')
    for col in converted_df.select_dtypes(include=['int']).columns:
        converted_df[col] = pd.to_numeric(converted_df[col], downcast='signed')
    return converted_df
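A quick way to sanity-check the savings from convert_df (a sketch; assumes a DataFrame such as schooldf is already in memory, and uses memory_usage(deep=True) so the underlying strings are counted):
In [ ]:
def report_memory_savings(df):
    """Hypothetical check (not part of the pipeline): compare DataFrame memory before and after convert_df()."""
    before = df.memory_usage(deep=True).sum() # deep=True measures the Python strings themselves
    after = convert_df(df).memory_usage(deep=True).sum()
    print("Before: {:.1f} MB; after: {:.1f} MB".format(before / 1e6, after / 1e6))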
In [90]:
os.getcwd()
Out[90]:
In [91]:
#school_index = next((index for (index, d) in enumerate(dicts_list) if d["folder_name"] == "Natomas_Charter_CA"), None) # Find index of that school
#print(school_index)
#print(dicts_list[748]["folder_name"])
In [92]:
thispath = wget_dataloc + "Natomas_Charter_CA/"
html_list = []
for dirpath,dirnames,filenames in os.walk(thispath):
for file in fnmatch.filter(filenames, "*.html"): # Check if any HTML files in folder_path
html_list.append(file)
print(len(html_list))
#for dirpath,dirnames,filenames in os.walk(thispath):
# print(len([file for file in fnmatch.filter(filenames, "*.html")]))
In [93]:
def set_fail_flag2(folder_name):
"""The web_fail_flag indicates whether the webcrawl/download operation failed to capture any .html for a particular folder_name.
This function sets the web_fail_flag depending on two conditions:
(1) Whether or not there exists a web download folder corresponding to folder_name, and
(2) Whether or not that folder contains at least one file with the .html extension."""
global wget_dataloc,dicts_list # Need access to the dictionary file
web_fail_flag = "" # make output a str to work with currently limited Pandas dtype conversion functionality
folder_path = str(wget_dataloc) + folder_name + "/"
if (not os.path.exists(folder_path)) or (has_html(folder_path)==False):
web_fail_flag = str(1) # If folder doesn't exist, mark as fail and ignore when loading files
else:
web_fail_flag = str(0) # make str so can work with currently limited Pandas dtype conversion functionality
match_index = next((index for (index, d) in enumerate(dicts_list) if d["folder_name"] == folder_name), None) # Find dict index of input/folder_name
dicts_list[match_index]['wget_fail_flag'] = web_fail_flag # Assign output to dict entry for folder_name
return
In [94]:
#print(dicts_list[748]["wget_fail_flag"])
#print(dicts_list[748]["folder_name"])
In [95]:
#set_fail_flag2("Natomas_Charter_CA")
#print(dicts_list[748]["wget_fail_flag"])
In [96]:
# ### Set parsing keywords
keywords = ['values', 'academics', 'skills', 'purpose',
'direction', 'mission', 'vision', 'vision', 'mission', 'our purpose',
'our ideals', 'ideals', 'our cause', 'curriculum','curricular',
'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system',
'structure','philosophy', 'philosophical', 'beliefs', 'believe',
'principles', 'creed', 'credo', 'values','moral', 'history', 'our story',
'the story', 'school story', 'background', 'founding', 'founded',
'established','establishment', 'our school began', 'we began',
'doors opened', 'school opened', 'about us', 'our school', 'who we are',
'our identity', 'profile', 'highlights']
mission_keywords = ['mission','vision', 'vision:', 'mission:', 'our purpose', 'our ideals', 'ideals:', 'our cause', 'cause:', 'goals', 'objective']
curriculum_keywords = ['curriculum', 'curricular', 'program', 'method', 'pedagogy', 'pedagogical', 'approach', 'model', 'system', 'structure']
philosophy_keywords = ['philosophy', 'philosophical', 'beliefs', 'believe', 'principles', 'creed', 'credo', 'value', 'moral']
history_keywords = ['history', 'story','our story', 'the story', 'school story', 'background', 'founding', 'founded', 'established', 'establishment', 'our school began', 'we began', 'doors opened', 'school opened']
about_keywords = ['about us', 'our school', 'who we are', 'overview', 'general information', 'our identity', 'profile', 'highlights']
# Create sets for each aspect and one for all keywords
mission_keywords = set(stemmer.stem(word) for word in mission_keywords)
curriculum_keywords = set(stemmer.stem(word) for word in curriculum_keywords)
philosophy_keywords = set(stemmer.stem(word) for word in philosophy_keywords)
history_keywords = set(stemmer.stem(word) for word in history_keywords)
about_keywords = set(stemmer.stem(word) for word in about_keywords)
all_keywords = set(stemmer.stem(key) for key in keywords)
if Debug:
print("\nList of keywords:\n", list(all_keywords))
In [111]:
# ### Create dictionaries for each ideology and one for combined ideologies
ess_dict, prog_dict, rit_dict, all_ideol = set(), set(), set(), set()
all_ideol = load_dict(all_ideol, dicts_dir + "ess_dict.txt")
all_ideol = load_dict(all_ideol, dicts_dir + "prog_dict.txt")
ess_dict = load_dict(ess_dict, dicts_dir + "ess_dict.txt")
prog_dict = load_dict(prog_dict, dicts_dir + "prog_dict.txt")
rit_dict = load_dict(rit_dict, dicts_dir + "rit_dict.txt")
logging.info(str(len(all_ideol)) + " entries loaded into the combined ideology dictionary.")
list_dict = list(all_ideol)
list_dict.sort(key = lambda x: x.lower())
print("First 10 elements of combined ideology dictionary are:\n", list_dict[:10])
In [17]:
# ### Define list of tuples: keywords lists and their titles, for dictionary analyses
titles_list = ("mission","curriculum","philosophy","history","about","ideology","keywords")
keysnames_tupzip = list(zip((mission_keywords,curriculum_keywords,philosophy_keywords,history_keywords,about_keywords,\
                             all_ideol,all_keywords), titles_list)) # Wrap in list() so the pairs can be iterated more than once
dictsnames_list = ("ess", "prog", "rit", "all_ideol")
dictsnames_tupzip = list(zip((ess_dict,prog_dict,rit_dict,all_ideol), dictsnames_list)) # Same: zip objects are exhausted after one pass
if Debug:
print(list(keysnames_tupzip))
print()
print(list(dictsnames_tupzip))
In [45]:
# ### Define parsing helper functions
def parsefile_by_tags(HTML_file):
"""Cleans HTML by removing inline tags, ripping out non-visible tags,
replacing paragraph tags with a random string, and finally using this to separate HTML into chunks.
Reads in HTML from storage using a given filename, HTML_file."""
random_string = "".join(map(chr, os.urandom(75))) # Create random string for tag delimiter
soup = BeautifulSoup(open(HTML_file), "html5lib")
[s.extract() for s in soup(['style', 'script', 'head', 'title', 'meta', '[document]'])] # Remove non-visible tags
    for it in inline_tags:
        [s.unwrap() for s in soup(it)] # Unwrap inline tags (searching for "</b>" etc. matches nothing; pass the bare tag name)
    visible_text = soup.getText(random_string).replace("\n", "") # Insert random string between text chunks, eliminate newlines
# Split text into list using random string while also eliminating tabs and converting unicode to readable text:
visible_text = list(normalize("NFKC",elem.replace("\t","")) for elem in visible_text.split(random_string))
# TO DO: Eliminate anything with a '\x' in it (after splitting by punctuation)
visible_text = list(filter(lambda vt: vt.split() != [], visible_text)) # Eliminate empty elements
# Consider joining list elements together with newline in between by prepending with: "\n".join
return(visible_text)
In [46]:
if Debug:
example_textlist = parsefile_by_tags(example_file)
print("Output of parsefile_by_tags:\n\n", example_textlist, "\n\n")
In [64]:
# ### Define dictionary matching helper functions
def dict_count(text_list, custom_dict):
"""Performs dictionary analysis, returning number of dictionary hits found.
Removes punctuation and stems the phrase being analyzed.
Compatible with multiple-word dictionary elements."""
counts = 0 # number of matches between text_list and custom_dict
dictless_list = [] # Updated text_list with dictionary hits removed
max_entry_length = max([len(entry.split()) for entry in custom_dict]) # Get length (in words) of longest entry in combined dictionary
    for chunk in text_list: # chunk may be several sentences or possibly paragraphs long
        chunk = re.sub(r'[^\w\s]', '', chunk) # Remove punctuation with regex that keeps only letters and spaces
        # Do dictionary analysis for word chunks of lengths max_entry_length down to 1, removing matches each time.
        # This means longer dict entries will get removed first, useful in case they contain smaller entries.
        for length in range(max_entry_length, 0, -1):
            chunk, len_counts = dict_match_len(chunk, custom_dict, length) # Reassign chunk so matched words are removed before shorter lengths are tried
            counts += len_counts
        dictless_list.append(chunk) # Append each chunk once, after all match lengths have been checked
    return dictless_list, int(counts)
def dict_match_len(phrase, custom_dict, length):
"""Helper function to dict_match.
Returns # dictionary hits and updated copy of phrase with dictionary hits removed.
Stems phrases before checking for matches."""
hits_indices, counts = [], 0
splitted_phrase = phrase.split()
if len(splitted_phrase) < length:
return phrase, 0 # If text chunk is shorter than length of dict entries being matched, don't continue.
for i in range(len(splitted_phrase) - length + 1):
to_stem = ""
for j in range(length):
to_stem += splitted_phrase[i+j] + " " # Builds chunk of 'length' words
stemmed_word = stemmer.stem(to_stem[:-1]) # stem chunk
if stemmed_word in custom_dict:
hits_indices.append(i) # Store the index of the word that has a dictionary hit
counts += 1
#print(stemmed_word)
# Iterate through list of matching word indices and remove the matches
for i in range(len(hits_indices)-1, -1, -1):
splitted_phrase = splitted_phrase[:hits_indices[i]] + \
splitted_phrase[hits_indices[i] + length:]
modified_phrase = ""
for sp in splitted_phrase: # Rebuild the modified phrase, with matches removed
modified_phrase += sp + " "
return modified_phrase[:-1], counts
# @timeout_decorator.timeout(20, use_signals=False)
def dictmatch_file_helper(file, listlists, allmatch_count):
"""Counts number of matches in file for each list of terms given, and also collects the terms not matched.
listlists is a list of lists, each list containing:
a list of key terms--e.g., for dictsnames_biglist, currently essentialism, progressivism, ritualism, and all three combined (ess_dict, prog_dict, rit_dict, all_dicts);
    the variables used to store the number of matches for each term list (e.g., ess_count, prog_count, rit_count, alldict_count);
and the not-matches--that is, the list of words leftover from the file after all matches are removed (e.g., ess_dictless, prog_dictless, rit_dictless, alldict_dictless). """
    parsed_pagetext = parsefile_by_tags(file) # Parse the file once before comparing it against each dictionary
    for i in range(len(listlists)): # Iterate over dicts to find matches with parsed text of file
        # For dictsnames_list, dicts are: (ess_dict, prog_dict, rit_dict, all_ideol); count_names are: (ess_count, prog_count, rit_count, alldict_count); dictless_names are: (ess_dictless, prog_dictless, rit_dictless, alldict_dictless)
        # adict,count_name,dictless_name = dictsnames_tupzip[i]
        dictless_add,count_add = dict_count(parsed_pagetext,listlists[i][0])
listlists[i][1] += count_add
listlists[i][2] += dictless_add
allmatch_count += count_add
print("Discovered " + str(count_add) + " matches for " + str(file) + \
", a total thus far of " + str(allmatch_count) + " matches...")
return listlists,allmatch_count
In [48]:
if Debug:
print("\nOutput of dict_count with ideology dict:\n\n", dict_count(example_textlist,all_ideol), "\n\n")
In [49]:
def filter_dict_page(pagetext_list, keyslist):
"""Filters webtext of a given .html page, which is parsed and in list format, to only those strings
within pagetext_list containing an element (word or words) of inputted keyslist.
Returns list filteredtext wherein each element has original case (not coerced to lower-case)."""
    filteredtext = [] # Initialize empty list to hold strings of page
    keys_lower = [key.lower() for key in list(keyslist)] # Lower-case the keys once, for comparison with lower-cased strings
    for string in pagetext_list:
        lowercasestring = str(string).lower() # lower-case string for comparison
        for key in keys_lower:
            if key in lowercasestring and (" " in key or key in lowercasestring.split(' ')): # Whole-word check for single-word keys; substring check for multi-word keys
                filteredtext.append(string)
return filteredtext
In [50]:
if Debug:
print("Output of filter_dict_page:\n\n", filter_dict_page(example_textlist, all_keywords), "\n\n")
In [79]:
def filter_by_keycount(folder_path):
"""NOT USED.
Filters webtext for a given school to only those text chunks containing specified keywords.
Categorizes each block of text by scoring based on keyword count, using already-defined lists of keywords per category:
mission, philosophy, curriculum, history, "about"/general self-description, combined ideology, and all keywords."""
# TO DO: Fix this function! And compare speed with that of filter_dict_page() above, especially for longer pages.
# Initialize keyword lists to count over (must be defined outside function)
global mission_keywords,curriculum_keywords,philosophy_keywords,history_keywords,about_keywords,all_ideol,all_keywords
    mission_list,curriculum_list,philosophy_list,history_list,about_list,ideol_list,keys_list = [],[],[],[],[],[],[]
file_list = list_files(folder_path, ".html")
for file in tqdm(file_list, desc="Filtering by keys:"):
try:
pagetext_list = parsefile_by_tags(file)
for string in pagetext_list:
mission_score, curriculum_score, philosophy_score, history_score, about_score, ideol_score, keys_score = 0, 0, 0, 0, 0, 0, 0
for word in mission_keywords:
mission_score+=string.count(word)
if 'mission' in string.lower():
mission_score = 2
for word in curriculum_keywords:
curriculum_score+=string.count(word)
if 'curriculum' in string.lower():
curriculum_score = 2
for word in philosophy_keywords:
philosophy_score+=string.count(word)
if 'philosophy' in string.lower() or 'value' in string.lower():
philosophy_score = 2
for word in history_keywords:
history_score+=string.count(word)
if 'history' in string.lower():
history_score = 2
for word in about_keywords:
about_score+=string.count(word)
if 'about us' in string.lower() or "about-us" in string.lower():
about_score = 2
for word in all_ideol:
ideol_score+=string.count(word)
if mission_score>=2:
mission_list.append(string)
if curriculum_score>=2:
curriculum_list.append(string)
if philosophy_score>=2:
philosophy_list.append(string)
if history_score>=2:
history_list.append(string)
if about_score>=2:
about_list.append(string)
if ideol_score>=2:
ideol_list.append(string)
if ((mission_score + curriculum_score + philosophy_score + about_score) >=2):
                    keys_list.append(string) # Impute keyword counts using their ideological constituent elements--which excludes history_score
except Exception as e:
if Debug:
print(" ERROR categorizing " + str(file))
print(e)
continue
return mission_list, curriculum_list, philosophy_list, history_list, about_list, ideol_list, keys_list
In [81]:
print("Output of filter_by_keycount:\n\n", filter_by_keycount(example_folder), "\n\n")
In [118]:
def dict_bestmatch(folder_path, custom_dict):
"""Parse through all .html files in folder_path, detecting matches with custom_dict,
to find and return the full text from the html page that has the most matches with that dictionary."""
# Initialization
file_list = list_files(folder_path, ".html") # Get full list of file paths
num_pages = len(file_list) # Number of pages in school's folder
max_page_hits = (-1,-1) # Initialize tuple holding #hits, page number for HTML file with greatest # matches with custom_dict
max_weighted_score = (-1,-1) # Same as previous, but weighted by page length
max_hit_text,max_score_text = [],[] # Empty lists for each best matching pages
# Parse through pages to find maximum number of hits of custom_dict on any page
for pagenum in tqdm(range(num_pages), desc="Finding best match:"):
try:
            page_dict_hits,page_weighted_score = -1,-1 # Initialize per-page hit count and length-weighted score
page_textlist = parsefile_by_tags(file_list[pagenum]) # Parse page with index pagenum into text list
if len(page_textlist)==0: # If page is empty, don't bother with it
continue
dictless_text, page_dict_hits = dict_count(page_textlist, custom_dict) # Count matches between custom_dict and page_textlist using dict_count
numwords = len('\n'.join(page_textlist).split())
page_weighted_score = page_dict_hits / numwords # Weight score by number of words on page
logging.info("Found" + str(page_dict_hits) + "for page #" + str(pagenum) + "and " + str(page_dict_hits) + "weighting for the " + numwords + " words on that page.")
if page_dict_hits > max_page_hits[0]: # Compare matches for this page with overall max
max_page_hits = (page_dict_hits, pagenum) # If its greater, then make new page the max
if page_weighted_score > max_weighted_score[0]: # Same as previous two lines, but weighted by page length
max_weighted_score = (page_weighted_score, pagenum)
except Exception as e:
logging.debug(" ERROR counting dict matches in page #" + str(pagenum))
logging.debug(str(e))
continue
logging.info("Number matches and index of best matching page: " + str(max_page_hits[0]) + " " + str(max_page_hits[1]))
logging.info("Number matches and index of best WEIGHTED matching page: " + str(max_weighted_score[0]) + " " + str(max_weighted_score[1]))
# Use pagenum to get text for page with highest number of hits and weighted score:
max_hit_text = parsefile_by_tags(file_list[max_page_hits[1]])
max_score_text = parsefile_by_tags(file_list[max_weighted_score[1]])
logging.info("Page with the highest number of dictionary hits:\n\n" + str(max_hit_text))
logging.info("Page with the highest weighted score:\n\n" + str(max_score_text))
return max_hit_text,max_score_text
In [119]:
print("Output of dict_bestmatch for all ideologies:\n", dict_bestmatch(example_folder, mission_keywords), "\n\n" )
In [15]:
def parse_school(school_dict):
"""This core function parses webtext for a given school, using helper functions to run analyses and then saving multiple outputs to school_dict:
    counts of the number of matches between all text from a school's html pages and keywords from a defined keyword list, via dict_count();
and text contents of those individual pages best matching such keywords, via find_best_categories (in development).
For the sake of parsimony and manageable script calls, OTHER similar functions/scripts collect these additional outputs:
full (partially cleaned) webtext, by parsing webtext of each .html file (removing inline tags, etc.) within school's folder, via parsefile_by_tags();
filtered webtext, by keeping only those parsed text elements containing a keyword in previously defined keywords list, via filter_keywords_page();
and parsed webtext, having removed overlapping headers/footers common to multiple pages, via remove_overlaps()."""
# Allow function to access these variables already defined outside the function (globally)
global itervar,numschools,parsed,wget_dataloc,URL_var,NAME_var,ADDR_var
datalocation = wget_dataloc # Define path to local data storage
    school_name, school_address, school_URL = school_dict[NAME_var], school_dict[ADDR_var], school_dict[URL_var] # Define varnames from the input dict
itervar+=1 # Count this school
print("Parsing " + str(school_name) + ", which is school #" + str(itervar) + " of " + str(numschools) + "...")
# Initialize variables
school_dict['ess_strength'],school_dict['prog_strength'] = 0.0,0.0
if not usefile:
school_dict["duplicate_flag"], school_dict["parse_error_flag"] = 0, 0
# Assign folder names
folder_name = re.sub(" ","_",(school_name+" "+school_address[-8:-6]))
school_dict["folder_name"] = folder_name
school_folder = datalocation + folder_name + "/"
if school_URL==school_name:
school_URL = folder_name # Workaround for full_schooldata, which doesn't yet have URLs
# Check if folder exists. If not, exit function
if not (os.path.exists(school_folder) or os.path.exists(school_folder.lower()) or os.path.exists(school_folder.upper())):
print("!! NO DIRECTORY FOUND matching " + str(school_folder) + ".\n Aborting parsing function...\n\n")
school_dict['wget_fail_flag'] = 1
return
""" # Commented out until dict_bestmatch() works
try:
for keylist,title in list(keysnames_tupzip): # Names are: ("mission","curriculum","philosophy","history","about","ideology","keywords")
bestvar_name = title + "_best" # assign varname to use as dict key
school_dict[bestvar_name],school_dict[bestvar_name+"_weighted"] = [],[] # initialize dict key/value pair as empty string
best_page,best_page_weighted = dict_bestmatch(school_folder,keylist) # Find pages best corresponding to keyword category for each in keysnames_tupzip
school_dict[bestvar_name].extend(best_page)
school_dict[bestvar_name+"_weighted"].extend(best_page_weighted)
except Exception as e:
print(" ERROR! Failed to find best pages while parsing webtext of " + str(school_name))
print(" ",e)
"""
    try:
        school_textlist = [] # Parsed text from all .html files in the school's folder, gathered for dictionary counting
        for fname in list_files(school_folder, ".html"):
            school_textlist.extend(parsefile_by_tags(fname))
        for adict,name in list(dictsnames_tupzip): # Names are: ("ess", "prog", "rit", "all_ideol")
            dict_name = name + "_count"
            school_dict[dict_name] = dict_count(school_textlist,adict)[1] # dict_count() expects a list of text chunks, not a folder path
        school_dict['ess_strength'] = float(school_dict['ess_count'])/float(school_dict['rit_count'])
        school_dict['prog_strength'] = float(school_dict['prog_count'])/float(school_dict['rit_count'])
        print("    SUCCESS! Counted dictionary matches for " + str(school_name) + "...")
        save_datafile(dicts_list, save_dir+"school_dictcounts_temp", "JSON") # Save output so we can pick up where left off, in case something breaks before able to save final output
        return
    except Exception as e:
        print("    ERROR! Failed to count number of dict matches while parsing webtext of " + str(school_name))
        print("    ",e)
        school_dict["parse_error_flag"] = 1
        return
In [16]:
# ### Preparing data to be parsed
itervar = 0 # initialize iterator that counts number of schools already parsed
parsed = [] # initialize list of URLs that have already been parsed
dicts_list = [] # initialize list of dictionaries to hold school data
# If input_file was defined by user input in beginning of script, use that to load list of dictionaries. We'll add to it!
if usefile and not dicts_list:
dicts_list = load_datafile(input_file)
data_loc = full_schooldata # If loading data, assume we're running on full charter population
else:
# set charter school data file and corresponding varnames:
data_loc = full_schooldata # Run at scale using URL list of full charter population
# data_loc = micro_sample13 # This seems nice for debugging--except directories don't match because different data source
# Create dict list from CSV on file, with one dict per school
with open(data_loc, 'r', encoding = 'Latin1') as csvfile: # open data file
reader = csv.DictReader(csvfile) # create a reader
for row in reader: # loop through rows
dicts_list.append(row) # append each row to the list
URL_var,NAME_var,ADDR_var = get_vars(data_loc) # get varnames depending on data source
numschools = len(dicts_list) # Count number of schools in list of dictionaries
# Note on data structures: each row, dicts_list[i] is a dictionary with keys as column name and value as info.
# This will be translated into pandas data frame once (rather messy) website text is parsed into consistent variables
In [80]:
for school in dicts_list:
school["folder_name"] = re.sub(" ","_",(school[NAME_var]+" "+school[ADDR_var][-8:-6])) # This gives name and state separated by "_"
school["folder_path"] = str(wget_dataloc) + school["folder_name"] + "/" # This temporary variable simplifies next line of code
if (has_html(school["folder_path"])==False) or not os.path.exists(school["folder_path"]):
school['wget_fail_flag'] = str(1) # If folder doesn't exist, mark as fail and ignore when loading files
else:
school['wget_fail_flag'] = str(0) # make str so can work with currently limited Pandas dtype conversion functionality
In [88]:
print(schooldf[schooldf["folder_name"]=="Effie_Kokrine_Charter_School_AK"][["wget_fail_flag","folder_path"]])
In [90]:
print(schooldf[schooldf["folder_name"]=="Natomas_Charter_CA"][["wget_fail_flag","folder_path"]])
In [84]:
schooldf = pd.DataFrame.from_dict(dicts_list) # Convert dicts_list into a DataFrame
schooldf.info()
schooldf.head(4)
Out[84]:
In [43]:
schooldf["wget_fail_flag"] = schooldf["wget_fail_flag"].map({"1":True,1:True,"0":False,0:False}) # Convert to binary to use as conditional
In [49]:
schooldf[schooldf["wget_fail_flag"]==True][["folder_name","wget_fail_flag"]]
Out[49]:
In [39]:
schooldf.head(26)
Out[39]:
In [32]:
schooldf.wget_fail_flag.sum()
Out[32]:
In [79]:
tqdm.pandas(desc="Rocking pandas!")
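tqdm.pandas() registers progress_apply on DataFrames and Series; a sketch of how it could be used to re-check the download folders with a progress bar (the column name has_html_check is illustrative, not part of the pipeline):
In [ ]:
if Debug:
    # progress_apply behaves like apply but displays a tqdm progress bar during long row-wise operations
    schooldf["has_html_check"] = schooldf["folder_path"].progress_apply(has_html)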
In [ ]:
# ### Run parsing algorithm on schools (requires access to webcrawl output)
test_dicts = dicts_list[:1] # Limit number of schools to analyze (here, a one-school slice), in order to refine methods
if Debug:
for school in test_dicts:
parse_school(school)
else:
for school in dicts_list:
parse_school(school)
In [ ]:
# Check out results:
if Debug:
print(test_dicts[0])
else:
print(dicts_list[0])
In [ ]:
# Save output:
if Debug:
dictfile = "testing_dicts_" + str(datetime.today().strftime("%Y-%m-%d"))
    save_datafile(test_dicts, save_dir+dictfile, "JSON")
else:
dictfile = "school_dicts_" + str(datetime.today().strftime("%Y-%m-%d"))
    save_datafile(dicts_list, save_dir+dictfile, "JSON")