by David Taylor, www.prooffreader.com (blog), www.dtdata.io (hire me!)
This IPython notebook contains a Python script that downloads XML editions of Shakespeare's complete works, parses them, and loads them into a MySQL database (each table is also exported to CSV).
The SQL database contains eight tables; the schema is explained below. The tables are not fully normalized, as IMO full normalization would have been overkill.
There already are SQL dumps of Shakespeare on the Internet, but they are incomplete. They only contain plays, and only the 'main 38' plays of undisputed authorship. This contains all work reasonably attributed to Shakespeare, including the poems.
The XML files come from Ron Severdia's GitHub, https://github.com/severdia/PlayShakespeare.com-XML
Note that the SQL files are not totally normalized. I tried to find a compromise between thoroughness and usability.
I plan to do things with this data. I'll keep you posted.
The XML files are very good, but they're not perfect. There are a few errors and duplications. The script handles them individually.
In [2]:
from __future__ import (absolute_import, division,
print_function, unicode_literals)
# I did this all in Python 3.4, I have no reason to think it won't work
# in Python 2 with the above statements, but I haven't tried
import sys
print(sys.version)
import pandas as pd
import os.path
from collections import Counter
import xml.etree.cElementTree as ET
import mysql.connector
from sqlalchemy import create_engine
import ftfy
#ftfy is not part of the standard python, but it's great for making sure all
#characters are ascii-fied
In [2]:
# Every XML edition to process: the 38 undisputed plays, the apocrypha,
# and the poems.
filenames = ['alls_well_that_ends_well.xml', 'as_you_like_it.xml',
             'comedy_of_errors.xml', 'loves_labours_lost.xml',
             'measure_for_measure.xml', 'merchant_of_venice.xml',
             'merry_wives_of_windsor.xml', 'midsummer_nights_dream.xml',
             'much_ado_about_nothing.xml', 'pericles.xml',
             'taming_of_the_shrew.xml', 'tempest.xml',
             'troilus_and_cressida.xml', 'twelfth_night.xml',
             'two_gentlemen_of_verona.xml', 'two_noble_kinsmen.xml',
             'winters_tale.xml', 'henry_iv_pt1.xml', 'henry_iv_pt2.xml',
             'henry_v.xml', 'henry_vi_pt1.xml', 'henry_vi_pt2.xml',
             'henry_vi_pt3.xml', 'henry_viii.xml', 'king_lear.xml',
             'king_richard_ii.xml', 'richard_iii.xml',
             'antony_cleopatra.xml', 'coriolanus.xml', 'cymbeline.xml',
             'hamlet.xml', 'julius_caesar.xml', 'king_john.xml',
             'macbeth.xml', 'othello.xml', 'romeo_and_juliet.xml',
             'timon_of_athens.xml', 'titus_andronicus.xml',
             'double_falsehood.xml', 'edward_iii.xml',
             'lovers_complaint.xml', 'passionate_pilgrim.xml',
             'phoenix_and_turtle.xml', 'rape_of_lucrece.xml',
             'sir_thomas_more.xml', 'sonnets.xml', 'to_the_queen.xml',
             'venus_and_adonis.xml']
# The following two dicts are the only information added beyond what is in
# the XML files.  'categories' records whether a work is 'Canon' (attributed
# unreservedly to Shakespeare) or 'Non-canon'; 'abbrevs' gives the standard
# abbreviation for each canon work (abbreviations for the non-canon works
# were devised here).
# Only six works are non-canon; everything else in `filenames` is canon,
# so the mapping is derived rather than written out entry by entry.
_NON_CANON = {'double_falsehood.xml', 'passionate_pilgrim.xml',
              'to_the_queen.xml', 'edward_iii.xml',
              'sir_thomas_more.xml', 'two_noble_kinsmen.xml'}
categories = {fn: ('Non-canon' if fn in _NON_CANON else 'Canon')
              for fn in filenames}
abbrevs = {
    'alls_well_that_ends_well.xml': 'AWW',
    'antony_cleopatra.xml': 'Ant',
    'as_you_like_it.xml': 'AYL',
    'comedy_of_errors.xml': 'Err',
    'coriolanus.xml': 'Cor',
    'cymbeline.xml': 'Cym',
    'double_falsehood.xml': 'DF',
    'edward_iii.xml': 'Edw',
    'hamlet.xml': 'Ham',
    'henry_iv_pt1.xml': '1H4',
    'henry_iv_pt2.xml': '2H4',
    'henry_v.xml': 'H5',
    'henry_vi_pt1.xml': '1H6',
    'henry_vi_pt2.xml': '2H6',
    'henry_vi_pt3.xml': '3H6',
    'henry_viii.xml': 'H8',
    'julius_caesar.xml': 'JC',
    'king_john.xml': 'Jn',
    'king_lear.xml': 'Lr',
    'king_richard_ii.xml': 'R2',
    'lovers_complaint.xml': 'LC',
    'loves_labours_lost.xml': 'LLL',
    'macbeth.xml': 'Mac',
    'measure_for_measure.xml': 'MM',
    'merchant_of_venice.xml': 'MV',
    'merry_wives_of_windsor.xml': 'Wiv',
    'midsummer_nights_dream.xml': 'MND',
    'much_ado_about_nothing.xml': 'Ado',
    'othello.xml': 'Oth',
    'passionate_pilgrim.xml': 'PP',
    'pericles.xml': 'Per',
    'phoenix_and_turtle.xml': 'PhT',
    'rape_of_lucrece.xml': 'Luc',
    'richard_iii.xml': 'R3',
    'romeo_and_juliet.xml': 'Rom',
    'sir_thomas_more.xml': 'STM',
    'sonnets.xml': 'Son',
    'taming_of_the_shrew.xml': 'Shr',
    'tempest.xml': 'Tmp',
    'timon_of_athens.xml': 'Tim',
    'titus_andronicus.xml': 'Tit',
    'to_the_queen.xml': 'TTQ',
    'troilus_and_cressida.xml': 'Tro',
    'twelfth_night.xml': 'TN',
    'two_gentlemen_of_verona.xml': 'TGV',
    'two_noble_kinsmen.xml': 'TNK',
    'venus_and_adonis.xml': 'Ven',
    'winters_tale.xml': 'WT',
}
In [65]:
# Download each XML edition from the PlayShakespeare GitHub repository.
# Files that already exist in the working directory will be ignored, so the
# cell is safe to re-run.
# BUG FIX: `requests` was used below but never imported anywhere in the
# notebook; import it here, local to the cell that needs it.
import requests

url_base = "https://raw.githubusercontent.com/severdia/PlayShakespeare.com-XML/master/playshakespeare_editions/"
for filename in filenames:
    if not os.path.isfile(filename):
        url = url_base + filename
        r = requests.get(url)
        # wb+ truncates/creates; Response.content is the raw bytes payload
        with open(filename, "wb+") as f:
            f.write(r.content)
In [66]:
# Walk every XML file, counting each root-to-node tag path that carries text
# or attributes (i.e. is not just a container for deeper nodes), and print a
# small sample window of elements for inspection.
c = Counter()          # 'tag|tag|...' path -> number of occurrences
counter = 0            # running element count across all files
counter_min = 400      # print only elements in [counter_min, counter_max)
counter_max = 410
print("A sample of the data:")
for filename in filenames:
    tree = ET.parse(filename)
    countdowns = [1]   # per depth: how many children remain to be traversed
    current_tree = []  # tag path from the root to the current element
    current_tag_tree = []
    last_level = 0
    # BUG FIX: ElementTree.getiterator() was deprecated and removed in
    # Python 3.9; iter() is the supported equivalent (same document order).
    for elem in tree.iter():
        counter += 1
        level = str(len(countdowns))
        tag, text, tail = None, None, None
        attribs = {}
        children = 0
        if elem.tag:
            tag = elem.tag
        if elem.text:
            text = elem.text.strip()
            if len(text) == 0:
                text = None
        if elem.attrib.items():
            for key, value in elem.attrib.items():
                attribs[key] = value
        if list(elem):
            children = len(list(elem))
        if elem.tail:
            tail = elem.tail.strip()
            if len(tail) == 0:
                tail = None
        # Maintain the root-to-node path: going deeper appends, coming back
        # up trims, and a sibling at the same depth replaces the last entry.
        current_level = len(countdowns)
        if current_level > last_level:
            current_tree.append(tag)
        elif current_level < last_level:
            n = last_level - current_level
            current_tree = current_tree[:-n]
        current_tree[-1] = tag
        last_level = current_level
        if text or len(attribs) > 0:
            c.update(['|'.join(current_tree)])
        if counter_min <= counter < counter_max:
            print(countdowns)
            print(current_tree)
            print('level : '+level)
            if tag is not None:
                print('tag : '+tag)
            if text:
                print('text : '+text)
            if len(attribs) > 0:
                for key, value in attribs.items():
                    print('attribute : '+key +' : '+value)
            print('children : '+str(children))
            if tail:
                print('tail : '+tail)
            print('====================')
        # update countdowns for the NEXT iteration
        # do any calculations or collection BEFORE this point
        if children > 0:
            countdowns.append(children)
        else:
            while len(countdowns) > 0 and countdowns[-1] == 1:
                countdowns = countdowns[:-1]
            if len(countdowns) > 0:
                countdowns[-1] -= 1
In [25]:
# Show all paths leading to text or attributes, sorted alphabetically,
# with the count of occurrences collected in `c` above.
import pandas as pd

tagdf = pd.DataFrame({'path': list(c.keys()), 'cnt': list(c.values())})
# BUG FIX: DataFrame.sort() was deprecated in pandas 0.17 and later removed;
# sort_values() is the modern equivalent.
tagdf.sort_values('path', inplace=True)
tagdf.reset_index(drop=True, inplace=True)
for i in range(len(tagdf)):
    print("{0:5} {1}".format(tagdf.cnt.iloc[i], tagdf.path.iloc[i]))
The actual schemas appear in the next cell; this is an explanation as to how some elements of the schema work and why certain design choices were made.
TABLE 'plays'
One row per play. Poems go in their own table.
COLUMNS: 'title', 'title_long', 'title_abbrev' give the normal, long and abbreviated forms of the title.
They come from the <title short="">, <title> tags and dict above, respectively.
Note that I changed the nomenclature so that 'title_long' comes from <title> and 'title' comes from <title short="">
This is because the so-called short title is the by far most familiar form of the title of each work,
so it should be used as a Primary Key and should be, semantically, the base form.
All other columns are self-explanatory.
TABLE 'characters'
One row per character. Comes from the Dramatis Personae at the beginning of each file.
See the further explanatory note about character names below.
'name' is the long form of the character name, e.g. 'Macbeth'
'name_short' is the all-caps abbreviation, e.g. 'MACB.'
Each character has one row where the fields 'alias' and 'alias_short' repeat 'name' and 'name_short', respectively.
If the character appears in a grouping with another (i.e. a single line of dialogue is attributed to two or more named
characters simultaneously), there is another row with the same 'name' and 'name_short' but with the other two columns as, e.g.:
'alias': 'Macbeth and Lennox'
'alias_short': 'MACB. AND LEN.'
'speaking_bool': 1 if the character has dialogue, 0 if it is a non-speaking part (or group)
Explanatory note for columns 'act' and 'scene' in following tables:
'scene': 'Scene 1', 'Scene 2', etc., or 'Prologue' or 'Epilogue'
'act': 'Act 1', 'Act 2', etc, but '0' if the scene is 'Prologue' or 'Epilogue'.
This is necessary because SQL will not allow a null value in a primary key column, even if it still
results in a unique combination of values.
TABLE 'scene_personae'
One row per character, per scene. Comes from the list of characters at the beginning of each scene.
Sometimes characters are repeated in these lists, sometimes with slightly different names, e.g. 'Brutus' and 'Junius Brutus'
The TagParser class below catches these exceptions and only allows one row in the table.
I chose to keep the name that most closely matches alias_short.
'alias_short' matches the field of the same name in the character table
but 'scene_name' will often match nothing in the character table.
For example:
KING. = alias_short in character and scene_personae (and dialogue) tables, and name_short in character table
Claudius, King of Denmark = alias and name in character table
Claudius = scene_name in scene_personae table and dialogue_name in dialogue table
TABLE 'scene_locations'
This table exists only to record the <location> tag, one per scene.
TABLE 'dialogue'
One row per line of dialogue in the play. Speeches are often contiguous lines (and therefore rows in the table) by the same speaker.
'alias_short' is the only character identification
'line_num' is the number of the line in the scene, 'line_num_global' in the play.
'offset' identifies when two characters share a line -- not simultaneously (they have an alias indicating both names in that case)
but consecutively. The first character speaking has a row with offset=0, then the second character has its own row with offset>0,
but the same line numbers. Since it is part of the primary key, it cannot be null, so the value -99 is given for lines
with no offset.
'contains_tags' indicates whether the xml file of the line included interior tags such as <latin>, <recite>, etc. The actual tags
have been discarded.
TABLE stagedirs
One row per stage direction in the play. A total of three stage directions in the entire oeuvre have interior tags such as <latin>,
so they were discarded.
TABLE poems
Just like the table 'plays', one per poetical work
TABLE poem_lines
There is a complex set of possibilities of nested tags in poems (sonnet, stanza, quatrain, couplet, tercet), so
each line is given a serial ID, and then tags are numbered where possible and given a boolean (1=true, 0=false) value.
Some are identified as dedications and subtitles as well.
'intro_or_body' indicates whether the line is in the poem proper or in front matter.
'offset' works as it does in dialogue, but it signifies a change in nested tags, not speaker, as there are no speakers.
'rhyme' signifies the a/b/c rhyme scheme; it is NaN for non-rhyming lines.
In [9]:
# Connect to MySQL and create the shakespeare_complete_works database plus
# every table that does not already exist, so the cell is safe to re-run.
# Credentials are kept outside the notebook in two sibling text files;
# `with` closes the files, and strip() guards against a trailing newline.
with open("../mysql_emanresu.txt", "r") as f:
    myu = f.read().strip()
with open("../mysql_drowssap.txt", "r") as f:
    mypw = f.read().strip()
cnx = mysql.connector.connect(user=myu, password=mypw,
                              host='127.0.0.1', buffered=True)
# BUG FIX: the original cell used `cursor` without ever creating it
# (and a later cell calls cursor.close()).
cursor = cnx.cursor()
dbname = 'shakespeare_complete_works'
# IF NOT EXISTS makes re-running the notebook idempotent, matching the
# per-table existence checks below.
cursor.execute("CREATE DATABASE IF NOT EXISTS {} DEFAULT CHARACTER SET 'utf8'".format(dbname))
cursor.execute("USE "+dbname)
cursor.execute('SHOW TABLES')
tables = [x[0] for x in cursor]  # the cursor iterates over result rows
if 'plays' not in tables:
    # One row per play; 'title' (the familiar short title) is the PK.
    cmd = ("CREATE TABLE `plays` ("
           " `title_long` varchar(55) NOT NULL,"
           " `title` varchar(25) NOT NULL,"
           " `title_abbrev` varchar(9) NOT NULL,"
           " `category` enum('Canon', 'Non-canon') NOT NULL,"
           " `num_scenes` tinyint(2) NOT NULL,"
           " `num_chars` tinyint(3) NOT NULL,"
           " `genre` varchar(7) NOT NULL,"
           " `pub_date` varchar(15) NOT NULL,"
           " `playwright` varchar(60) NOT NULL,"
           " `copyright` varchar(60) NOT NULL,"
           " `license` varchar(60) NOT NULL,"
           " `licenseurl` varchar(99) NOT NULL,"
           " `source` varchar(60) NOT NULL,"
           " `sourceurl` varchar(99) NOT NULL,"
           " `termsurl` varchar(99) NOT NULL,"
           " `version` varchar(6) NOT NULL,"
           " PRIMARY KEY (`title`)"
           ") ENGINE=InnoDB")
    cursor.execute(cmd)
if 'poems' not in tables:
    # One row per poetical work; same shape as 'plays' minus scene/char counts.
    cmd = ("CREATE TABLE `poems` ("
           " `title_long` varchar(45) NOT NULL,"
           " `title` varchar(25) NOT NULL,"
           " `title_abbrev` varchar(9) NOT NULL,"
           " `category` enum('Canon', 'Non-canon') NOT NULL,"
           " `pub_date` varchar(15) NOT NULL,"
           " `playwright` varchar(60) NOT NULL,"
           " `copyright` varchar(60) NOT NULL,"
           " `license` varchar(60) NOT NULL,"
           " `licenseurl` varchar(99) NOT NULL,"
           " `source` varchar(60) NOT NULL,"
           " `sourceurl` varchar(99) NOT NULL,"
           " `termsurl` varchar(99) NOT NULL,"
           " `version` varchar(6) NOT NULL,"
           " PRIMARY KEY (`title`)"
           ") ENGINE=InnoDB")
    cursor.execute(cmd)
if 'poem_lines' not in tables:
    # One row per poem line; nested-tag structure is flattened into numbered
    # and boolean columns (see the schema explanation above).
    cmd = ("CREATE TABLE `poem_lines` ("
           " `id` int(5) NOT NULL AUTO_INCREMENT,"
           " `title` varchar(25) NOT NULL,"
           " `intro_or_body` enum('Intro', 'Body'),"
           " `sonnet_num` smallint(3),"
           " `stanza_num` smallint(3),"
           " `quatrain_num` tinyint(2),"
           " `quatrain_bool` tinyint(1),"
           " `stanzasmall_bool` tinyint(1),"
           " `couplet_bool` tinyint(1),"
           " `tercet_bool` tinyint(1),"
           " `dedication_bool` tinyint(1),"
           " `subtitle_bool` tinyint(1),"
           " `line` MEDIUMTEXT,"
           " `line_number_global` smallint(5),"
           " `line_number` smallint(4),"
           " `offset` tinyint(2),"
           " `rhyme` varchar(1),"
           " `contains_tags` varchar(63),"
           " PRIMARY KEY (`id`),"
           " FOREIGN KEY (`title`) REFERENCES poems(`title`)"
           ") ENGINE=InnoDB")
    cursor.execute(cmd)
if 'characters' not in tables:
    # One row per character alias (see explanatory note above on aliases).
    cmd = ("CREATE TABLE `characters` ("
           " `title` varchar(25) NOT NULL,"
           " `gender` varchar(6),"
           " `name` varchar(60) NOT NULL,"
           " `name_short` varchar(30) NOT NULL,"
           " `alias` varchar(74),"
           " `alias_short` varchar(74) NOT NULL,"
           " `speaking_bool` tinyint(1) NOT NULL,"
           " PRIMARY KEY (`title`, `name_short`, `alias_short`),"
           " FOREIGN KEY (`title`) REFERENCES plays(`title`)"
           ") ENGINE=InnoDB")
    cursor.execute(cmd)
if 'scene_personae' not in tables:
    # One row per character per scene.
    cmd = ("CREATE TABLE `scene_personae` ("
           " `title` varchar(25) NOT NULL,"
           " `alias_short` varchar(74) NOT NULL,"
           " `act` varchar(8) NOT NULL,"
           " `scene` varchar(16) NOT NULL,"
           " PRIMARY KEY (`title`, `alias_short`, `act`, `scene`),"
           " FOREIGN KEY (`title`) REFERENCES plays(`title`)"
           ") ENGINE=InnoDB")
    cursor.execute(cmd)
if 'scene_location' not in tables:
    # One row per scene, recording the <location> tag.
    cmd = ("CREATE TABLE `scene_location` ("
           " `title` varchar(25) NOT NULL,"
           " `location` varchar(255) NOT NULL,"
           " `act` varchar(8) NOT NULL,"
           " `scene` varchar(16) NOT NULL,"
           " PRIMARY KEY (`title`, `location`, `act`, `scene`),"
           " FOREIGN KEY (`title`) REFERENCES plays(`title`)"
           ") ENGINE=InnoDB")
    cursor.execute(cmd)
if 'dialogue' not in tables:
    # One row per line of dialogue; offset=-99 means "no offset" because the
    # column is part of the primary key and cannot be NULL.
    cmd = ("CREATE TABLE `dialogue` ("
           " `title` varchar(25) NOT NULL,"
           " `act` varchar(8) NOT NULL,"
           " `scene` varchar(16) NOT NULL,"
           " `alias_short` varchar(74) NOT NULL,"
           " `line_num` smallint(4) NOT NULL,"
           " `line_num_global` smallint(5) NOT NULL,"
           " `offset` tinyint(3),"
           " `contains_tags` varchar(50),"
           " `text` MEDIUMTEXT NOT NULL,"
           " PRIMARY KEY (`title`, `line_num_global`, `offset`),"
           " FOREIGN KEY (`title`) REFERENCES plays(`title`)"
           ") ENGINE=InnoDB")
    cursor.execute(cmd)
if 'stagedirs' not in tables:
    # One row per stage direction.
    cmd = ("CREATE TABLE `stagedirs` ("
           " `title` varchar(25) NOT NULL,"
           " `act` varchar(8) NOT NULL,"
           " `scene` varchar(16) NOT NULL,"
           " `sd_num` decimal(6,2) NOT NULL,"
           " `sd_num_global` decimal(7,2) NOT NULL,"
           " `sd` MEDIUMTEXT NOT NULL,"
           " PRIMARY KEY (`title`, `sd_num_global`),"
           " FOREIGN KEY (`title`) REFERENCES plays(`title`)"
           ") ENGINE=InnoDB")
    cursor.execute(cmd)
In [10]:
# TagParser accumulates one row-list per SQL column while the driver loop
# (next cell) walks a single work's XML tree.  One instance is created per
# XML file; every element is fed to update(), and write_sql_cmds() finally
# emits INSERT statements through the module-level `cursor`.
#
# NOTE(review): Python indentation was lost when this notebook was exported
# to flat text.  All statements below are reproduced verbatim; the block
# structure (which lines sit inside which if/else) must be recovered from
# the original .ipynb before this code can run.  Review notes below flag
# spots where the flat form suggests possible defects — confirm each
# against the original notebook.
class TagParser:
def __init__(self, filename):
self.filename = filename
self.cmd = None
# --- work-level metadata (feeds the 'plays' or 'poems' table) ---
self.title_long = None
self.title = None
self.title_abbrev = None
self.category = None
self.num_scenes = None
self.num_chars = None
self.genre = None
self.pub_date = None
self.playwright = None
self.copyright = None
self.license = None
self.licenseurl = None
self.source = None
self.sourceurl = None
self.termsurl = None
self.version = None
# --- current-position state while walking the tree ---
# act starts as int 0 (the schema uses '0' for prologue/epilogue);
# it is later assigned tag text, so the column mixes int and str values.
self.act = 0
self.name = None
self.name_short = None
self.gender = None
self.alias = None
self.alias_short = None
# --- parallel row lists for the 'characters' table (one index = one row) ---
self.char_title = []
self.char_gender = []
self.char_name = []
self.char_name_short = []
self.char_alias = []
self.char_alias_short = []
self.char_speaking = []
# --- 'scene_location' table rows ---
self.sl_title = []
self.sl_location = []
self.sl_act = []
self.sl_scene = []
# --- 'scene_personae' table rows ---
self.sp_title = []
self.sp_alias_short = []
self.sp_act = []
self.sp_scene = []
# --- 'dialogue' table rows ---
self.line_title = []
self.line_act = []
self.line_scene = []
self.line_alias_short = []
self.line_num = []
self.line_num_global = []
self.line_offset = []
self.contains_tags = []
self.line_text = []
# --- 'stagedirs' table rows (sd_alias appears unused elsewhere in view) ---
self.sd_title = []
self.sd_act = []
self.sd_scene = []
self.sd_alias = []
self.sd_num = []
self.sd_num_global= []
self.sd = []
# --- poem-walk state and 'poem_lines' table rows ---
self.sonnet_num = None
self.stanza_num = None
self.quatrain_num = None
self.pl_title = []
self.pl_intro_or_body = []
self.pl_sonnet_num = []
self.pl_stanza_num = []
self.pl_quatrain_num = []
self.pl_quatrain_bool = []
self.pl_stanzasmall_bool = []
self.pl_couplet_bool = []
self.pl_tercet_bool = []
self.pl_dedication_bool = []
self.pl_subtitle_bool = []
self.pl_line = []
self.pl_line_number_global = []
self.pl_line_number = []
self.pl_offset = []
self.pl_rhyme = []
self.pl_contains_tags = []
# switch_play: True once a <play> root is seen; selects which INSERTs
# write_sql_cmds() emits (plays vs poems branch).
self.switch_play = False
# sp_check4dups: [act, scene, name] triples already recorded, used to
# suppress duplicate scene-personae entries.
self.sp_check4dups = []
# Dispatch on the element tag; `path` is the root-to-node tag list, used to
# distinguish play context from poem context and nested constructs.
def update(self, tag, attribs, text, path):
#plays table
if tag == 'play':
self.switch_play = True
self.num_scenes = int(attribs['numberOfScenes'])
self.num_chars = int(attribs['numberOfCharacters'])
self.genre = attribs['genre']
if tag == 'title':
# The 'short' attribute becomes the base title (see schema notes above).
self.title_long = text
self.title = attribs['short']
self.title_abbrev = abbrevs[self.filename]
self.category = categories[self.filename]
if tag == 'playwright':
self.playwright = text
if tag == 'pubdate':
self.pub_date = text
if tag == 'copyright':
self.copyright = text
if tag == 'license':
self.license = text
if tag == 'licenseurl':
self.licenseurl = text
if tag == 'source':
self.source = text
if tag == 'sourceurl':
self.sourceurl = text
if tag == 'termsurl':
self.termsurl = text
if tag == 'version':
self.version = text
# --- play-only content: characters, scenes, dialogue, stage directions ---
if 'play' in path:
if tag == 'persona':
self.gender = attribs['gender']
if tag == 'persname':
# A persname inside <persaliases> adds an alias row for the character
# most recently seen; otherwise it starts a new character (the base
# row repeats name/name_short as alias/alias_short via name_short).
if "persaliases" in path:
self.char_title.append(self.title)
self.char_gender.append(self.gender)
self.char_name.append(self.name)
self.char_name_short.append(self.name_short)
self.char_alias.append(text)
self.char_alias_short.append(attribs['short'])
self.char_speaking.append(1)
else:
self.name = text
self.name_short = attribs['short']
self.char_title.append(self.title)
self.char_gender.append(self.gender)
self.char_name.append(self.name)
self.char_name_short.append(self.name_short)
self.char_alias.append(None)
self.char_alias_short.append(self.name_short)
self.char_speaking.append(1)
if tag == 'act':
self.act = text
# epilogue resets act to 0, matching the schema rule that act is '0'
# for prologues/epilogues.
if tag == 'epilogue':
self.act = 0
if tag == 'acttitle':
self.act = text
if tag == 'scenetitle':
self.scene = text
if tag == 'scenelocation':
self.sl_title.append(self.title)
self.sl_location.append(text)
self.sl_act.append(self.act)
self.sl_scene.append(self.scene)
if tag == 'scenepersona':
# duplicate names
# The per-title renames below collapse variant spellings found in the
# XML scene lists so each character yields only one row per scene.
if self.title[:6] == 'Winter':
if text in ['Shepherds', 'Shepherdesses']:
text = 'Shepherds and Shepherdesses'
if self.title == 'Henry IV, Pt. 1':
if text in ['Prince Henry', 'Prince of Wales']:
text = 'Prince Henry'
if self.title == 'King Richard II':
if text == 'Duke of Norfolk':
text = 'Mowbray'
if text in ['Bullingbrook', 'Aumerle']:
text = 'Duke Aumerle'
if self.title == 'Richard III':
if text == 'Earl of Derby':
text = 'Stanley'
if self.title == 'Coriolanus':
if text == 'Junius Brutus':
text = 'Brutus'
if text == 'Sicinius Velutus':
text = 'Sicinius'
if self.title == 'King John':
if text == 'Philip':
text = 'Philip the Bastard'
if self.title == 'Sir Thomas More':
if text == 'Clown':
text = 'Clown Betts'
if [self.act, self.scene, text] not in self.sp_check4dups:
# A 'short' attribute marks a named (speaking) character; without it
# the persona is treated as a non-speaking part and is also added to
# the characters table.
if 'short' in attribs:
self.sp_title.append(self.title)
self.sp_alias_short.append(attribs['short'])
self.sp_act.append(self.act)
self.sp_scene.append(self.scene)
self.sp_check4dups.append([self.act, self.scene, text])
else:
# NOTE(review): this branch appends to char_gender..char_speaking but
# apparently never to char_title, which would misalign the parallel
# character lists — confirm against the original notebook.
self.char_gender.append(None)
self.char_name.append(text)
self.char_name_short.append(text)
self.char_alias.append(None)
self.char_alias_short.append(text)
self.char_speaking.append(0)
self.sp_title.append(self.title)
self.sp_alias_short.append(text)
self.sp_act.append(self.act)
self.sp_scene.append(self.scene)
self.sp_check4dups.append([self.act, self.scene, text])
if tag == 'speaker':
self.alias_short = text
if tag == 'line':
# One dialogue row per line; the driver loop synthesizes a 'line' call
# with merged attribs/text when the XML line had interior tags.
self.line_title.append(self.title)
self.line_act.append(self.act)
self.line_scene.append(self.scene)
self.line_alias_short.append(self.alias_short)
self.line_num.append(attribs['number'])
self.line_num_global.append(attribs['globalnumber'])
if 'offset' in attribs.keys():
self.line_offset.append(attribs['offset'])
else:
# -99 = "no offset"; the dialogue PK includes offset so NULL is not allowed.
self.line_offset.append(-99)
if 'contains_tags' in attribs.keys():
self.contains_tags.append(attribs['contains_tags'])
else:
self.contains_tags.append(None)
self.line_text.append(text)
if tag == 'stagedir':
self.sd_title.append(self.title)
self.sd_act.append(self.act)
self.sd_scene.append(self.scene)
self.sd_num.append(attribs['sdnumber'])
self.sd_num_global.append(attribs['sdglobalnumber'])
if tag == 'dir':
self.sd.append(text)
# --- poem-only content: nested sonnet/stanza/quatrain structure is
# flattened into per-line numbers and booleans ---
if 'poem' in path:
if tag == 'intro':
self.intro_or_body = 'intro'
if tag == 'body':
self.intro_or_body = 'body'
if tag == 'sonnetnum':
self.sonnet_num = text
if tag == 'stanza':
self.stanza_num = attribs['num']
if tag == 'quatrain':
if 'quanum' in attribs.keys():
self.quatrain_num = attribs['quanum']
else:
self.quatrain_num = None
if tag in ['line', 'subtitle']:
self.pl_title.append(self.title)
if 'poembody' in path:
self.pl_intro_or_body.append('body')
elif 'poemintro' in path:
self.pl_intro_or_body.append('intro')
else:
self.pl_intro_or_body.append(None)
if 'sonnet' in path:
self.pl_sonnet_num.append(self.sonnet_num)
else:
self.pl_sonnet_num.append(None)
if 'stanza' in path:
self.pl_stanza_num.append(self.stanza_num)
else:
self.pl_stanza_num.append(None)
if 'quatrain' in path:
self.pl_quatrain_bool.append(1)
else:
self.pl_quatrain_bool.append(0)
self.pl_quatrain_num.append(self.quatrain_num)
if 'stanzasmall' in path:
self.pl_stanzasmall_bool.append(1)
else:
self.pl_stanzasmall_bool.append(0)
if 'couplet' in path:
self.pl_couplet_bool.append(1)
else:
self.pl_couplet_bool.append(0)
if 'tercet' in path:
self.pl_tercet_bool.append(1)
else:
self.pl_tercet_bool.append(0)
# NOTE(review): this 'stanzasmall' check duplicates the one a few lines
# above, which would append two pl_stanzasmall_bool entries per line and
# misalign the parallel lists — verify against the original notebook.
if 'stanzasmall' in path:
self.pl_stanzasmall_bool.append(1)
else:
self.pl_stanzasmall_bool.append(0)
if tag == 'subtitle':
self.pl_dedication_bool.append(0)
self.pl_subtitle_bool.append(1)
self.pl_line.append(text)
self.pl_line_number_global.append(None)
self.pl_line_number.append(None)
self.pl_offset.append(None)
self.pl_rhyme.append(None)
self.pl_contains_tags.append(None)
elif tag == 'line':
if 'dedication' in path:
self.pl_dedication_bool.append(1)
self.pl_line_number.append(None)
self.pl_offset.append(None)
self.pl_rhyme.append(None)
self.pl_contains_tags.append(None)
else:
self.pl_dedication_bool.append(0)
if 'number' in attribs.keys():
self.pl_line_number.append(attribs['number'])
if 'offset' in attribs.keys():
self.pl_offset.append(attribs['offset'])
else:
self.pl_offset.append(None)
if 'rhyme' in attribs.keys():
self.pl_rhyme.append(attribs['rhyme'])
else:
self.pl_rhyme.append(None)
if 'contains_tags' in attribs.keys():
self.pl_contains_tags.append(attribs['contains_tags'])
else:
self.pl_contains_tags.append(None)
self.pl_subtitle_bool.append(0)
self.pl_line.append(text)
self.pl_line_number_global.append(attribs['globalnumber'])
# NOTE(review): line_number/offset/rhyme/contains_tags were already
# appended in both branches above; if these four appends sit at the same
# level they would duplicate entries per line.  The original .ipynb
# indentation is needed to resolve which branch they belong to.
self.pl_line_number.append(None)
self.pl_offset.append(None)
self.pl_rhyme.append(None)
self.pl_contains_tags.append(None)
# Emit one INSERT per accumulated row.  Values are inlined via quote()
# (module-level helper defined after this class); switch_play selects the
# play tables vs the poem tables.
def write_sql_cmds(self):
if self.switch_play:
cmd = (("INSERT INTO plays (title_long, title, title_abbrev, category,"
" num_scenes, num_chars, genre, pub_date, playwright, copyright,"
" license, licenseurl, source, sourceurl, termsurl, version) VALUE (")
+ ', '.join([quote(self.title_long),
quote(self.title),
quote(self.title_abbrev),
quote(self.category),
str(self.num_scenes),
str(self.num_chars),
quote(self.genre),
quote(self.pub_date),
quote(self.playwright),
quote(self.copyright),
quote(self.license),
quote(self.licenseurl),
quote(self.source),
quote(self.sourceurl),
quote(self.termsurl),
quote(self.version)])
+ ");")
self.send_to_sql(cmd)
# one row per character alias
for i in range(len(self.char_title)):
cmd = (("INSERT INTO characters (title, gender, name, name_short, alias,"
" alias_short, speaking_bool) VALUE (")
+ ', '.join([quote(self.char_title[i]),
quote(self.char_gender[i]),
quote(self.char_name[i]),
quote(self.char_name_short[i]),
quote(self.char_alias[i]),
quote(self.char_alias_short[i]),
str(self.char_speaking[i])])
+ ");")
self.send_to_sql(cmd)
# one row per scene location
for i in range(len(self.sl_title)):
cmd = (("INSERT INTO scene_location (title, location, act, scene"
") VALUE (")
+ ', '.join([quote(self.sl_title[i]),
quote(self.sl_location[i]),
quote(self.sl_act[i]),
quote(self.sl_scene[i])])
+ ");")
self.send_to_sql(cmd)
# one row per character per scene
for i in range(len(self.sp_title)):
cmd = (("INSERT INTO scene_personae (title, alias_short, act, scene"
") VALUE (")
+ ', '.join([quote(self.sp_title[i]),
quote(self.sp_alias_short[i]),
quote(self.sp_act[i]),
quote(self.sp_scene[i])])
+ ");")
self.send_to_sql(cmd)
# one row per line of dialogue
for i in range(len(self.line_title)):
cmd = (("INSERT INTO dialogue (title, act, scene, alias_short, line_num,"
" line_num_global, offset, contains_tags, text"
") VALUE (")
+ ', '.join([quote(self.line_title[i]),
quote(self.line_act[i]),
quote(self.line_scene[i]),
quote(self.line_alias_short[i]),
str(self.line_num[i]),
str(self.line_num_global[i]),
str(self.line_offset[i]),
quote(self.contains_tags[i]),
quote(self.line_text[i])])
+ ");")
self.send_to_sql(cmd)
# one row per stage direction
for i in range(len(self.sd_title)):
cmd = (("INSERT INTO stagedirs (title, act, scene, sd_num,"
" sd_num_global, sd"
") VALUE (")
+ ', '.join([quote(self.sd_title[i]),
quote(self.sd_act[i]),
quote(self.sd_scene[i]),
str(self.sd_num[i]),
str(self.sd_num_global[i]),
quote(self.sd[i]) ])
+ ");")
self.send_to_sql(cmd)
else:
# poem branch: one 'poems' row plus one 'poem_lines' row per line
cmd = (("INSERT INTO poems (title_long, title, title_abbrev, category,"
" pub_date, playwright, copyright,"
" license, licenseurl, source, sourceurl, termsurl, version) VALUE (")
+ ', '.join([quote(self.title_long),
quote(self.title),
quote(self.title_abbrev),
quote(self.category),
quote(self.pub_date),
quote(self.playwright),
quote(self.copyright),
quote(self.license),
quote(self.licenseurl),
quote(self.source),
quote(self.sourceurl),
quote(self.termsurl),
quote(self.version)])
+ ");")
self.send_to_sql(cmd)
for i in range(len(self.pl_title)):
cmd = (("INSERT INTO poem_lines (title, intro_or_body, sonnet_num, stanza_num, quatrain_num,"
" quatrain_bool, stanzasmall_bool, couplet_bool, tercet_bool, dedication_bool,"
" subtitle_bool, line, line_number_global, line_number, offset,"
" rhyme, contains_tags"
") VALUE (")
+ ', '.join([quote(self.pl_title[i]),
quote(self.pl_intro_or_body[i]),
quote(self.pl_sonnet_num[i]).replace("\"", ""), # so that it will play nice with NULLs
quote(self.pl_stanza_num[i]).replace("\"", ""),
quote(self.pl_quatrain_num[i]).replace("\"", ""),
quote(self.pl_quatrain_bool[i]),
quote(self.pl_stanzasmall_bool[i]),
quote(self.pl_couplet_bool[i]),
quote(self.pl_tercet_bool[i]),
quote(self.pl_dedication_bool[i]),
quote(self.pl_subtitle_bool[i]),
quote(self.pl_line[i]),
quote(self.pl_line_number_global[i]),
quote(self.pl_line_number[i]),
quote(self.pl_offset[i]),
quote(self.pl_rhyme[i]),
quote(self.pl_contains_tags[i]) ])
+ ");")
self.send_to_sql(cmd)
# Executes one statement via the module-level `cursor` created in the
# database-setup cell (a global, not an attribute of this class).
def send_to_sql(self, cmd):
#open('eraseme.txt', 'a+').write(cmd+'\n') # for development
cursor.execute(cmd)
def quote(s):
    # Render a value as an inline SQL literal: None becomes the bare keyword
    # NULL; anything else is stringified, embedded double quotes are
    # backslash-escaped, and the result is wrapped in double quotes.
    if s is None:
        return "NULL"
    escaped = str(s).replace('"', '\\"')
    return '"{}"'.format(escaped)
In [11]:
# Driver loop: walk each XML file element by element, patch known data
# errors, merge interior tags (<latin>, <recite>, ...) back into their
# parent <line>, feed everything to TagParser.update(), then emit the
# INSERTs and commit.
#
# NOTE(review): indentation was lost in this notebook-to-text export; the
# statements are reproduced verbatim and the if/for nesting must be
# recovered from the original .ipynb.
# `skip` appears unused in the visible code.
skip = []
for filename in filenames:
# dev leftover: truncates the debug dump used by send_to_sql's commented line
open('eraseme.txt', 'w+').write('')
print(filename, end=' ')
parser = TagParser(filename)
tree = ET.parse(filename)
countdowns = [1] # keeps track of how many children there remain to be traversed
to_collect = 0 # to collect interior tags where applicable
# collected_attribs doubles as a sentinel: [] means "not collecting";
# while collecting a tagged line it holds a dict of the line's attributes.
collected_attribs = []
current_tree = [] # keeps track of current path
current_tag_tree = []
last_level = 0
# NOTE(review): getiterator() was removed in Python 3.9; iter() is the
# supported equivalent — confirm the Python version this must run under.
for elem in tree.getiterator():
level = str(len(countdowns))
tag, text, tail = None, None, None
attribs = {}
# line_attribs appears unused below
line_attribs = {}
children = 0
if elem.tag:
tag = elem.tag
if elem.text:
text = elem.text.strip()
if len(text) == 0:
text = None
if elem.attrib.items():
for key, value in elem.attrib.items():
attribs[key] = value
if list(elem):
children = len(list(elem))
if elem.tail:
tail = elem.tail.strip()
if len(tail) == 0:
tail = None
# There is a typo in one file that must be corrected:
if filename == 'two_noble_kinsmen.xml':
if 'lng' in attribs.keys():
attribs['long'] = attribs['lng']
# There is a blank line in Richard III at position 2623
if filename == 'richard_iii.xml' and 'globalnumber' in attribs.keys():
if attribs['globalnumber'] == '2623':
# renaming the tag makes TagParser.update() ignore the element
tag='skipme'
# There are a total of three stage directions that have
# interior tags. Rather than write a parser for them,
# I identified them by error-catching (the 'abandoned tail'
# print statement below) and will manually correct them here.
if filename == 'coriolanus.xml':
if tag == 'dir' and text == 'Exeunt Coriolanus, Cominius, ':
text += ' cum aliis (Menenius, Senators, and Patricians).'
if tag == 'latin' and text == 'cum aliis':
tail = ''
if filename == 'hamlet.xml':
if tag == 'dir' and text == 'Flourish. Enter Claudius, King of Denmark, Gertrude the Queen; Council: as Polonius; and his son Laertes, Hamlet, ':
text += 'cum aliis including Voltemand and Cornelius.'
if tag == 'latin' and text == 'cum aliis':
tail = ''
# maintain the root-to-node tag path (same scheme as the exploration cell)
current_level = len(countdowns)
if current_level > last_level:
current_tree.append(tag)
elif current_level < last_level:
n = last_level - current_level
current_tree = current_tree[:-n]
current_tree[-1] = tag
last_level = current_level
# If a collection just finished, either extend it (interior tag of an
# interior tag) or flush the merged line to the parser.
if to_collect == 0 and collected_attribs != []: # this has to go here to catch interior tags of interior tags of line tags
if tag in ['french', 'latin', 'recite', 'spanish', 'italian', 'dutch']:
to_collect += 1
else:
# record which interior tags appeared, then replay as one 'line'
collected_attribs['contains_tags'] = ' '.join(collected_tags)
parser.update('line', collected_attribs, ' '.join(collected_text), current_tree)
collected_attribs = []
# collect everything from interior tags, if applicable, for lines
if tag == 'line' and children > 0:
# +1 because the line element itself is processed in this pass too
to_collect = children + 1
collected_text = []
collected_tags = []
collected_attribs = {k: v for k, v in attribs.items()}
if to_collect > 0:
to_collect -= 1
if text and len(text.strip()) > 0:
collected_text.append(text.strip())
if tail and len(tail.strip()) > 0:
collected_text.append(tail.strip())
tail = ''
if tag != 'line':
collected_tags.append(tag)
if to_collect == 0:
# any leftover tail here means an interior tag the collector missed
if tail is not None and len(tail) > 0:
print("abandoned tail: "+tail)
if text:
# normalize em-dashes, then let ftfy repair mojibake
text = text.replace("—", "--")
text = ftfy.fix_text(text)
if attribs:
for key in attribs.keys():
attribs[key] = ftfy.fix_text(attribs[key])
# ordinary (non-collected) elements are passed through directly
if collected_attribs == []:
parser.update(tag, attribs, text, current_tree)
# update countdowns for the NEXT iteration
# do any calculations or collection BEFORE this point
if children > 0:
countdowns.append(children)
else:
while len(countdowns) > 0 and countdowns[-1] == 1:
countdowns = countdowns[:-1]
if len(countdowns) > 0:
countdowns[-1] -= 1
# one batch of INSERTs per file, then commit everything and close
parser.write_sql_cmds()
cnx.commit()
cursor.close()
cnx.close()
In [12]:
# in case of error
# for i in range(4,44):
# cursor.execute("DROP DATABASE shakespeare_complete_works")
In [5]:
# Re-connect via SQLAlchemy and export every table to a CSV file in the
# working directory.  `with` closes the credential files and strip() guards
# against a trailing newline corrupting the connection URL.
with open("../mysql_emanresu.txt", "r") as f:
    myu = f.read().strip()
with open("../mysql_drowssap.txt", "r") as f:
    mypw = f.read().strip()
engine = create_engine('mysql+mysqlconnector://'+myu+':'+mypw+'@localhost:3306/shakespeare_complete_works')
for table in ['plays', 'characters', 'scene_location', 'scene_personae',
              'dialogue', 'stagedirs', 'poems', 'poem_lines']:
    # NOTE(review): Engine.has_table() was deprecated in SQLAlchemy 1.4 and
    # removed in 2.0; inspect(engine).has_table(table) is the replacement —
    # confirm the installed SQLAlchemy version.
    assert engine.has_table(table)
    df = pd.read_sql(table, engine)
    df.to_csv(table+'.csv', index=False)
    print(table, len(df))
In [4]:
# Sanity check: reload each exported CSV and print its last ten rows.
tables = ['plays', 'characters', 'scene_location', 'scene_personae',
          'dialogue', 'stagedirs', 'poems', 'poem_lines']
for position, table_name in enumerate(tables):
    df = pd.read_csv(table_name + '.csv', encoding='latin-1')
    # separator line before every table except the first
    if position > 0:
        print("==============================\n")
    print(table_name + ': ')
    print(df.tail(10))
In [ ]: