Working Scrapy Code Snippets


In [ ]:
# Select every <main property="mainContentOfPage"> element of the response,
# then serialize the selection to a list of strings (the cell's output).
main_nodes = response.xpath('//main[@property="mainContentOfPage"]')
main_nodes.extract()

# extracts total content of page

In [ ]:
# Read the `content` attribute directly instead of running a regex over the
# serialized <meta> tag: the greedy pattern content="(.*)" would swallow any
# attribute that follows `content` inside the same tag.
keywords = response.xpath('//meta[@name="keywords"]/@content').extract()

# extracts keywords from meta tag as list

In [ ]:
# Raw string: same pattern as before, without relying on the invalid "\s"
# escape sequence in a normal string literal.
# The pattern only matches when a carriage return follows the captured text,
# which is why titles without one yield no result.
# NOTE(review): `r` and `item` are not defined in this cell; the other
# snippets call the page object `response` - confirm which name applies.
item['number'] = r.xpath('//title/text()').re(r'Decision\s(.*)\r')

# extracts file name, but not reliably

In [ ]:
# For each <head> element, collect the `content` attribute of selected <meta>
# tags and print them as one dict.
for metadata in response.xpath('//head'):
    # './/' keeps each query relative to the current <head>; a leading '//'
    # would search the whole document regardless of `metadata`.
    # '/@content' reads the attribute value directly instead of applying a
    # greedy regex to the serialized tag.
    title = metadata.xpath('.//meta[@name="dcterms.title"]/@content').extract()
    # NOTE(review): doctype uses the same name="keywords" query as `keywords`
    # below, so both hold the same values - confirm the intended meta name.
    doctype = metadata.xpath('.//meta[@name="keywords"]/@content').extract()
    keywords = metadata.xpath('.//meta[@name="keywords"]/@content').extract()
    print(dict(title=title, doctype=doctype, keywords=keywords))

# extracts metadata

In [ ]:



In [ ]:
%%bash
# List the contents of the current working directory.
ls

In [ ]:
%%bash


# Print the beginning of the file (head shows the first 10 lines by default).
head ../scrapy/result11a.json

In [ ]:
%%bash

# grep expects the pattern first and the file second. With a single argument
# the path itself is taken as the pattern and grep reads standard input.
# TODO: replace the placeholder pattern below with the text to search for.
grep -E '"metadata"' ../scrapy/result11.json

In [ ]:
import ijson
from ijson import items

In [ ]:
# Stream the JSON file with ijson and materialize everything found under the
# given prefix into a list.
filename = "../scrapy/result11.json"
# ijson parses bytes, so the file is opened in binary mode.
with open(filename, 'rb') as f:
    # NOTE(review): the prefix 'metadata' only matches a "metadata" key of a
    # top-level JSON object; if the file holds a top-level array the prefix
    # would need to start with 'item.' - confirm against the data.
    objects = ijson.items(f, 'metadata')
    # NOTE: this rebinds `items`, hiding the function brought in by
    # `from ijson import items`; call `ijson.items` explicitly afterwards.
    items = list(objects)

In [ ]:
# Preview the first parsed element; the slice [:0] is always empty and would
# print [] no matter what was loaded.
print(items[:1])

In [ ]:
# Stream the objects found under the prefix and print those whose 'type' is
# 'title'.
f = "../scrapy/result11.json"
# ijson needs an open (binary) file object, not a path string, and the parser
# is lazy, so all consumption happens while the file is still open.
with open(f, 'rb') as json_file:
    # Use ijson.items explicitly: the bare name `items` may have been rebound
    # to a list by an earlier cell.
    # NOTE(review): the prefix assumes a top-level object with a "file" key -
    # confirm against the actual layout of the file.
    objects = ijson.items(json_file, 'file.metadata.item')
    titles = (o for o in objects if o['type'] == 'title')
    for title in titles:
        print(title)


In [ ]:
# Derive a bare file name from a URL: keep the last path segment, then drop
# everything from the first dot onward.
url = 'http://www.crtc.gc.ca/eng/archive/2016/2016-491.htm'
page = url.rsplit("/", 1)[-1]    # text after the final slash
file = page.partition(".")[0]    # text before the first dot
print(file)

In [ ]:
import json
import glob
import os
import pandas as pd
import numpy as np
from pprint import pprint

In [ ]:
# Read the whole file into a string; the context manager closes the handle,
# which the bare open(...).read() never did.
with open("../scrapy/result12.json") as json_file:
    json_data = json_file.read()

In [ ]:
# Parse the JSON text held in json_data and pretty-print the resulting object.
data = json.loads(json_data)
pprint(data)

In [ ]:
# Path of the file to read, and an empty list to collect its contents.
crtc_files = "../scrapy/result11.json"
crtc_data = []

In [ ]:
# Append every line of the file (line endings included) to crtc_data.
with open(crtc_files) as source:
    crtc_data.extend(source)

In [ ]:
# Show everything collected in crtc_data.
print(crtc_data)

In [ ]:
# Print the current value of data.
print(data)

In [ ]:
# Load the JSON file with pandas and display the result as the cell output.
# NOTE(review): orient="index" expects a {index: {column: value}} mapping -
# confirm the file actually has that shape.
crtc = pd.read_json("../scrapy/result12.json", orient = "index")
crtc

In [ ]:
# Print the plain-text representation of crtc.
print(crtc)

In [ ]:
# Peek at the first 160 characters of the file using only the standard
# library: `nltk` is not imported anywhere in this notebook, and
# nltk.data.load resolves relative names against the nltk data directories
# rather than the working directory.
with open("../scrapy/result9.json", encoding="utf-8") as text_file:
    crtc = text_file.read(160)

In [ ]:
import ijson
# Treat the file as one JSON document per line and collect the parsed values.
filename = "../scrapy/result9.json"
text = []
with open(filename, 'r') as f:
    text.extend(json.loads(raw_line) for raw_line in f)

In [ ]:
# Collect every value found under the prefix and print them.
# The file is opened here (binary mode, as ijson parses bytes) instead of
# reusing an `f` left over from another cell, which by now is either closed
# or a plain path string.
with open(filename, 'rb') as f:
    objects = ijson.items(f, 'file.metadata.subject.item')
    # `objects` is a one-shot iterator: materialize it once and reuse the
    # list. A second list(objects) would always be empty.
    columns = list(objects)
print(columns)

In [ ]:
# Working approach: parse the whole file with json.load, then walk the
# resulting collection and print part of each element's 'file' entry.
with open("../scrapy/result12.json") as fh:
    data = json.load(fh)

for entry in data:
    # Per the original note, changing the index from 0 to 2 reads each level
    # (file, metadata, text).
    print(entry['file'][0])

In [ ]:
# Display the current value of data as the cell output.
data

In [ ]:
# Gather the 'metadata' value of every element of data, then print the list.
file_name = [record['metadata'] for record in data]
print(file_name)

In [ ]:
# Display the type of data as the cell output.
type(data)

In [ ]:
# Take the first element of data and print it.
first_elem = data[0]
print(first_elem)

In [ ]:
# Look up the 'file' entry of the first element and pretty-print it.
file = first_elem['file']
pprint(file)

In [ ]:
# Look up the 'text' entry of file.
# NOTE(review): this assumes file supports string keys (e.g. a dict) - confirm.
text = file['text']

In [ ]:
# KEEP THIS
# Print every low-level parse event (prefix, event type, value) that ijson
# emits for the file. Useful for discovering the prefixes to pass to
# ijson.items.
# The context manager closes the file, and binary mode gives ijson the bytes
# it parses.
with open("../scrapy/result12.json", "rb") as json_file:
    for prefix, the_type, value in ijson.parse(json_file):
        print(prefix, the_type, value)

In [ ]:


In [ ]:
import xml.dom.minidom

# Parse the XML file and print an indented rendering of it.
# The parsed document gets its own name so that `xml` keeps referring to the
# standard-library package instead of being rebound to the document.
document = xml.dom.minidom.parse("../scrapy/result10.xml") # or xml.dom.minidom.parseString(xml_string)
pretty_xml_as_string = document.toprettyxml()
print(pretty_xml_as_string)

In [ ]: