{ 'enriched_field_units': 1,
'notices': [],
'original_media_type': 'text/html',
'snapshots': [ { 'snapshot': { 'html': '<html>\n'
'<head>\n'
' <title>Simple HTML '
'Page</title>\n'
'</head>\n'
'<body>\n'
' <h1>Chapter 1</h1>\n'
' <p>The content of the '
'first chapter.</p>\n'
'</body>\n'
'</html>'},
'step': 'html_input'},
{ 'snapshot': { 'html': "<?xml version='1.0' "
"encoding='UTF-8' "
"standalone='yes'?><html>\n"
'<head>\n'
' <meta '
'content="text/html; '
'charset=UTF-8" '
'http-equiv="Content-Type"/>\n'
' <title>Simple HTML '
'Page</title>\n'
'\n'
'</head>\n'
'<body>\n'
'\n'
' <h1>Chapter 1</h1>\n'
' <p>The content of the '
'first chapter.</p>\n'
'\n'
'</body></html>'},
'step': 'html_output'},
{ 'snapshot': { 'extracted_metadata': { 'title': 'Simple '
'HTML '
'Page'},
'html': "<?xml version='1.0' "
"encoding='UTF-8' "
"standalone='yes'?><html>\n"
'<head>\n'
' <meta '
'content="text/html; '
'charset=UTF-8" '
'http-equiv="Content-Type"/>\n'
' \n'
' <title>Simple HTML '
'Page</title>\n'
'\n'
'\n'
'</head>\n'
'<body>\n'
'\n'
'\n'
' <h1>Chapter 1</h1>\n'
' <p>The content of the '
'first chapter.</p>\n'
'\n'
'</body></html>',
'metadata': {},
'text': 'Simple HTML Page\n'
'\n'
'Chapter 1\n'
'\n'
'The content of the first '
'chapter.'},
'step': 'json_output'},
{ 'snapshot': { 'extracted_metadata': { 'title': 'Simple '
'HTML '
'Page'},
'html': "<?xml version='1.0' "
"encoding='UTF-8' "
"standalone='yes'?><html>\n"
'<head>\n'
' <meta '
'content="text/html; '
'charset=UTF-8" '
'http-equiv="Content-Type"/>\n'
' \n'
' <title>Simple HTML '
'Page</title>\n'
'\n'
'\n'
'</head>\n'
'<body>\n'
'\n'
'\n'
' <h1>Chapter 1</h1>\n'
' <p>The content of the '
'first chapter.</p>\n'
'\n'
'</body></html>',
'metadata': {},
'text': 'Simple HTML Page\n'
'\n'
'Chapter 1\n'
'\n'
'The content of the first '
'chapter.'},
'step': 'json_normalizations_output'},
{ 'snapshot': { 'enriched_field_units': 1,
'enriched_text': { 'concepts': [ { 'dbpedia': 'http://dbpedia.org/resource/HTML',
'freebase': 'http://rdf.freebase.com/ns/m.03g20',
'opencyc': 'http://sw.opencyc.org/concept/Mx4rvWVukJwpEbGdrcN5Y29ycA',
'relevance': 0.932431,
'text': 'HTML',
'yago': 'http://yago-knowledge.org/resource/HTML'}],
'docSentiment': { 'mixed': False,
'type': 'neutral'},
'entities': [],
'keywords': [ { 'relevance': 0.915478,
'sentiment': { 'mixed': False,
'type': 'neutral'},
'text': 'Simple '
'HTML '
'Page'},
{ 'relevance': 0.539264,
'sentiment': { 'mixed': False,
'type': 'neutral'},
'text': 'content'},
{ 'relevance': 0.511578,
'sentiment': { 'mixed': False,
'type': 'neutral'},
'text': 'chapter'}],
'language': 'english',
'relations': [],
'status': 'OK',
'taxonomy': [ { 'confident': False,
'label': '/technology '
'and '
'computing/programming '
'languages/javascript',
'score': 0.577361},
{ 'confident': False,
'label': '/business '
'and '
'industrial/company/bankruptcy',
'score': 0.57735},
{ 'confident': False,
'label': '/art '
'and '
'entertainment/books '
'and '
'literature',
'score': 0.367906}]},
'extracted_metadata': { 'title': 'Simple '
'HTML '
'Page'},
'html': "<?xml version='1.0' "
"encoding='UTF-8' "
"standalone='yes'?><html>\n"
'<head>\n'
' <meta '
'content="text/html; '
'charset=UTF-8" '
'http-equiv="Content-Type"/>\n'
' \n'
' <title>Simple HTML '
'Page</title>\n'
'\n'
'\n'
'</head>\n'
'<body>\n'
'\n'
'\n'
' <h1>Chapter 1</h1>\n'
' <p>The content of the '
'first chapter.</p>\n'
'\n'
'</body></html>',
'metadata': {},
'text': 'Simple HTML Page\n'
'\n'
'Chapter 1\n'
'\n'
'The content of the first '
'chapter.'},
'step': 'enrichments_output'},
{ 'snapshot': { 'enriched_field_units': 1,
'enriched_text': { 'concepts': [ { 'dbpedia': 'http://dbpedia.org/resource/HTML',
'freebase': 'http://rdf.freebase.com/ns/m.03g20',
'opencyc': 'http://sw.opencyc.org/concept/Mx4rvWVukJwpEbGdrcN5Y29ycA',
'relevance': 0.932431,
'text': 'HTML',
'yago': 'http://yago-knowledge.org/resource/HTML'}],
'docSentiment': { 'mixed': False,
'type': 'neutral'},
'entities': [],
'keywords': [ { 'relevance': 0.915478,
'sentiment': { 'mixed': False,
'type': 'neutral'},
'text': 'Simple '
'HTML '
'Page'},
{ 'relevance': 0.539264,
'sentiment': { 'mixed': False,
'type': 'neutral'},
'text': 'content'},
{ 'relevance': 0.511578,
'sentiment': { 'mixed': False,
'type': 'neutral'},
'text': 'chapter'}],
'language': 'english',
'relations': [],
'status': 'OK',
'taxonomy': [ { 'confident': False,
'label': '/technology '
'and '
'computing/programming '
'languages/javascript',
'score': 0.577361},
{ 'confident': False,
'label': '/business '
'and '
'industrial/company/bankruptcy',
'score': 0.57735},
{ 'confident': False,
'label': '/art '
'and '
'entertainment/books '
'and '
'literature',
'score': 0.367906}]},
'extracted_metadata': { 'title': 'Simple '
'HTML '
'Page'},
'html': "<?xml version='1.0' "
"encoding='UTF-8' "
"standalone='yes'?><html>\n"
'<head>\n'
' <meta '
'content="text/html; '
'charset=UTF-8" '
'http-equiv="Content-Type"/>\n'
' \n'
' <title>Simple HTML '
'Page</title>\n'
'\n'
'\n'
'</head>\n'
'<body>\n'
'\n'
'\n'
' <h1>Chapter 1</h1>\n'
' <p>The content of the '
'first chapter.</p>\n'
'\n'
'</body></html>',
'metadata': {},
'text': 'Simple HTML Page\n'
'\n'
'Chapter 1\n'
'\n'
'The content of the first '
'chapter.'},
'step': 'normalizations_output'}],
'status': 'completed'}