In [ ]:
# Select the <main property="mainContentOfPage"> node, then serialize it;
# the selector value is the cell's last expression so the notebook echoes it.
main_selector = response.xpath('//main[@property="mainContentOfPage"]')
main_selector.extract()
# extracts total content of page
In [ ]:
# Read the keywords <meta> tag's content attribute directly with XPath.
# The original regexed the serialized tag with a greedy 'content="(.*)"',
# which can over-capture when other quoted attributes follow 'content'.
keywords = response.xpath('//meta[@name="keywords"]/@content').extract()
# extracts keywords from meta tag as list
In [ ]:
# Raw string for the regex: '\s' in a plain string literal is an invalid
# escape (DeprecationWarning today, SyntaxError in future Pythons).
# Behavior is unchanged: \s then a greedy capture up to a literal CR.
item['number'] = r.xpath('//title/text()').re(r'Decision\s(.*)\r')
# extracts file name, but not reliably — the greedy (.*) plus the literal
# \r anchor make this fragile; TODO: tighten the pattern
In [ ]:
# Walk <head> and pull Dublin Core metadata. XPaths are made relative
# (.//) so each iteration queries its own node instead of the whole
# document, and attribute values are read directly instead of regexed.
for metadata in response.xpath('//head'):
    title = metadata.xpath('.//meta[@name="dcterms.title"]/@content').extract()
    # NOTE(review): the original read doctype from the "keywords" meta —
    # almost certainly a copy-paste slip; assuming dcterms.type here.
    # Confirm against the page source.
    doctype = metadata.xpath('.//meta[@name="dcterms.type"]/@content').extract()
    keywords = metadata.xpath('.//meta[@name="keywords"]/@content').extract()
    print(dict(title=title, doctype=doctype, keywords=keywords))
# extracts metadata
In [ ]:
In [ ]:
%%bash
# list the notebook's working directory contents
ls
In [ ]:
%%bash
# peek at the first 10 lines of the scraped output
head ../scrapy/result11a.json
In [ ]:
%%bash
# NOTE(review): grep is missing its PATTERN argument — as written, the file
# path is taken as the pattern and grep blocks reading from stdin.
# TODO: supply a pattern, e.g.  grep -E 'title' ../scrapy/result11.json
grep -E ../scrapy/result11.json
In [ ]:
import ijson
from ijson import items
In [ ]:
filename = "../scrapy/result11.json"
with open(filename, 'r') as f:
    # Materialize inside the with-block: ijson streams lazily, so the file
    # must still be open while the list is built (the exported cell had
    # lost this indentation).
    # NOTE(review): this rebinds 'items' over the function imported from
    # ijson; kept because later cells read this name, but it shadows.
    items = list(ijson.items(f, 'metadata'))
In [ ]:
# Preview the first few parsed records. The original's '[:0]' slice is
# always the empty list — almost certainly a typo for a small preview.
print(items[:10])
In [ ]:
# ijson.items needs an OPEN file object, not a path string — and the name
# 'items' was rebound to a list two cells up, so call through the module.
with open("../scrapy/result11.json", 'rb') as f:
    objects = ijson.items(f, 'file.metadata.item')
    # stream only the objects tagged as titles
    titles = (o for o in objects if o['type'] == 'title')
    for title in titles:
        print(title)
In [ ]:
url = 'http://www.crtc.gc.ca/eng/archive/2016/2016-491.htm'
# take the final path segment, then drop the extension
page = url.rsplit("/", 1)[-1]
file = page.partition(".")[0]
print(file)
In [ ]:
import json
import glob
import os
import pandas as pd
import numpy as np
from pprint import pprint
In [ ]:
# Read the raw JSON text; 'with' guarantees the handle is closed
# (the original open(...).read() leaked the file handle).
with open("../scrapy/result12.json") as fh:
    json_data = fh.read()
In [ ]:
# Parse the raw text read in the previous cell and pretty-print it.
data = json.loads(json_data)
pprint(data)
In [ ]:
# Path to the scraped CRTC output and an accumulator for its raw lines.
crtc_files = "../scrapy/result11.json"
crtc_data = []
In [ ]:
# Collect each raw line of the file. (The notebook export had lost the
# loop indentation; restored here.)
with open(crtc_files) as fh:
    for line in fh:
        crtc_data.append(line)
In [ ]:
# show the raw lines collected from result11.json
print(crtc_data)
In [ ]:
# show the parsed result12.json payload
print(data)
In [ ]:
# Load result12.json into a DataFrame, one row per top-level key;
# the bare name on the last line makes the notebook render the frame.
crtc = pd.read_json("../scrapy/result12.json", orient="index")
crtc
In [ ]:
# plain-text print of the DataFrame built above
print(crtc)
In [ ]:
import nltk  # used here but never imported anywhere in this notebook (NameError)

# NOTE(review): format='text' presumably returns the raw file contents as a
# string, so [:160] keeps a short preview — confirm against nltk.data.load docs.
crtc = nltk.data.load("../scrapy/result9.json", format='text')[:160]
In [ ]:
import ijson
filename = "../scrapy/result9.json"
text = []
with open(filename, 'r') as f:
    # assumes one JSON object per line (JSON Lines) — TODO confirm the
    # scrapy export format; a single multi-line JSON document would fail here
    for line in f:
        text.append(json.loads(line))
In [ ]:
# The previous cell's 'with' already closed f, so reopen the file here.
# Also: list() exhausts the ijson generator, so build the list once and
# print it — the original's second list(objects) was always empty.
with open(filename, 'rb') as fh:
    columns = list(ijson.items(fh, 'file.metadata.subject.item'))
print(columns)
In [ ]:
# finally something that is working. This code reads each level of the
# json file (file, metadata, text). Indentation restored from the export;
# the loop runs after the file is closed since json.load is eager.
with open("../scrapy/result12.json") as json_file:
    data = json.load(json_file)
for d in data:
    print(d['file'][0])  # changing the number from 0 to 2 reads each level
In [ ]:
# echo the parsed records (the notebook displays the last expression)
data
In [ ]:
# Pull the 'metadata' field out of every record.
file_name = [record['metadata'] for record in data]
print(file_name)
In [ ]:
# confirm the container type returned by json.load above
type(data)
In [ ]:
# grab the first record to inspect its structure
first_elem = data[0]
print(first_elem)
In [ ]:
# pull the 'file' entry from the first record
# NOTE(review): 'file' also shadows the name assigned in the URL-splitting cell
file = first_elem['file']
pprint(file)
In [ ]:
# NOTE(review): an earlier cell indexes d['file'] with integers (d['file'][0]),
# which suggests 'file' is a list; file['text'] would then raise TypeError —
# confirm the record structure. This also rebinds 'text' from the line-reading cell.
text = file['text']
In [ ]:
# KEEP THIS
# Stream every (prefix, event, value) triple — handy for discovering the
# JSON structure. 'with' closes the handle that the original bare open()
# call leaked, and the loop body's lost indentation is restored.
with open("../scrapy/result12.json") as fh:
    for prefix, the_type, value in ijson.parse(fh):
        print(prefix, the_type, value)
In [ ]:
In [ ]:
import xml.dom.minidom

# Bind the parsed document to its own name: the original assigned it to
# 'xml', clobbering the module object and breaking any later xml.* use.
dom = xml.dom.minidom.parse("../scrapy/result10.xml")  # or xml.dom.minidom.parseString(xml_string)
pretty_xml_as_string = dom.toprettyxml()
print(pretty_xml_as_string)
In [ ]: