In [ ]:
# Select the <main property="mainContentOfPage"> node, then serialize it;
# the selector value is the cell's last expression so the notebook echoes it.
main_selector = response.xpath('//main[@property="mainContentOfPage"]')
main_selector.extract()
# extracts total content of page
In [ ]:
# Read the keywords <meta> tag's content attribute directly with XPath.
# The original regexed the serialized tag with a greedy 'content="(.*)"',
# which can over-capture when other quoted attributes follow 'content'.
keywords = response.xpath('//meta[@name="keywords"]/@content').extract()
# extracts keywords from meta tag as list
In [ ]:
# Raw string for the regex: '\s' in a plain string literal is an invalid
# escape (DeprecationWarning today, SyntaxError in future Pythons).
# Behavior is unchanged: \s then a greedy capture up to a literal CR.
item['number'] = r.xpath('//title/text()').re(r'Decision\s(.*)\r')
# extracts file name, but not reliably — the greedy (.*) plus the literal
# \r anchor make this fragile; TODO: tighten the pattern
In [ ]:
# Walk <head> and pull Dublin Core metadata. XPaths are made relative
# (.//) so each iteration queries its own node instead of the whole
# document, and attribute values are read directly instead of regexed.
for metadata in response.xpath('//head'):
    title = metadata.xpath('.//meta[@name="dcterms.title"]/@content').extract()
    # NOTE(review): the original read doctype from the "keywords" meta —
    # almost certainly a copy-paste slip; assuming dcterms.type here.
    # Confirm against the page source.
    doctype = metadata.xpath('.//meta[@name="dcterms.type"]/@content').extract()
    keywords = metadata.xpath('.//meta[@name="keywords"]/@content').extract()
    print(dict(title=title, doctype=doctype, keywords=keywords))
# extracts metadata
In [ ]:
In [ ]:
%%bash
# list the notebook's working directory contents
ls
In [ ]:
%%bash
# peek at the first 10 lines of the scraped output
head ../scrapy/result11a.json
In [ ]:
%%bash
# NOTE(review): grep is missing its PATTERN argument — as written, the file
# path is taken as the pattern and grep blocks reading from stdin.
# TODO: supply a pattern, e.g.  grep -E 'title' ../scrapy/result11.json
grep -E ../scrapy/result11.json
In [ ]:
import ijson
from ijson import items
In [ ]:
filename = "../scrapy/result11.json"
with open(filename, 'r') as f:
    # Materialize inside the with-block: ijson streams lazily, so the file
    # must still be open while the list is built (the exported cell had
    # lost this indentation).
    # NOTE(review): this rebinds 'items' over the function imported from
    # ijson; kept because later cells read this name, but it shadows.
    items = list(ijson.items(f, 'metadata'))
In [ ]:
# Preview the first few parsed records. The original's '[:0]' slice is
# always the empty list — almost certainly a typo for a small preview.
print(items[:10])
In [ ]:
# ijson.items needs an OPEN file object, not a path string — and the name
# 'items' was rebound to a list two cells up, so call through the module.
with open("../scrapy/result11.json", 'rb') as f:
    objects = ijson.items(f, 'file.metadata.item')
    # stream only the objects tagged as titles
    titles = (o for o in objects if o['type'] == 'title')
    for title in titles:
        print(title)
In [ ]:
url = 'http://www.crtc.gc.ca/eng/archive/2016/2016-491.htm'
# take the final path segment, then drop the extension
page = url.rsplit("/", 1)[-1]
file = page.partition(".")[0]
print(file)
In [ ]:
import json
import glob
import os
import pandas as pd
import numpy as np
from pprint import pprint
In [ ]:
# Read the raw JSON text; 'with' guarantees the handle is closed
# (the original open(...).read() leaked the file handle).
with open("../scrapy/result12.json") as fh:
    json_data = fh.read()
In [ ]:
# Parse the raw text read in the previous cell and pretty-print it.
data = json.loads(json_data)
pprint(data)
In [ ]:
# Path to the scraped CRTC output and an accumulator for its raw lines.
crtc_files = "../scrapy/result11.json"
crtc_data = []
In [ ]:
# Collect each raw line of the file. (The notebook export had lost the
# loop indentation; restored here.)
with open(crtc_files) as fh:
    for line in fh:
        crtc_data.append(line)
In [ ]:
# show the raw lines collected from result11.json
print(crtc_data)
In [ ]:
# show the parsed result12.json payload
print(data)
In [ ]:
# Load result12.json into a DataFrame, one row per top-level key;
# the bare name on the last line makes the notebook render the frame.
crtc = pd.read_json("../scrapy/result12.json", orient="index")
crtc
In [ ]:
# plain-text print of the DataFrame built above
print(crtc)
In [ ]:
import nltk  # used here but never imported anywhere in this notebook (NameError)

# NOTE(review): format='text' presumably returns the raw file contents as a
# string, so [:160] keeps a short preview — confirm against nltk.data.load docs.
crtc = nltk.data.load("../scrapy/result9.json", format='text')[:160]
In [ ]:
import ijson
filename = "../scrapy/result9.json"
text = []
with open(filename, 'r') as f:
    # assumes one JSON object per line (JSON Lines) — TODO confirm the
    # scrapy export format; a single multi-line JSON document would fail here
    for line in f:
        text.append(json.loads(line))
In [ ]:
# The previous cell's 'with' already closed f, so reopen the file here.
# Also: list() exhausts the ijson generator, so build the list once and
# print it — the original's second list(objects) was always empty.
with open(filename, 'rb') as fh:
    columns = list(ijson.items(fh, 'file.metadata.subject.item'))
print(columns)
In [ ]:
# finally something that is working. This code reads each level of the
# json file (file, metadata, text). Indentation restored from the export;
# the loop runs after the file is closed since json.load is eager.
with open("../scrapy/result12.json") as json_file:
    data = json.load(json_file)
for d in data:
    print(d['file'][0])  # changing the number from 0 to 2 reads each level
In [ ]:
# echo the parsed records (the notebook displays the last expression)
data
In [ ]:
# Pull the 'metadata' field out of every record.
file_name = [record['metadata'] for record in data]
print(file_name)
In [ ]:
# confirm the container type returned by json.load above
type(data)
In [ ]:
# grab the first record to inspect its structure
first_elem = data[0]
print(first_elem)
In [ ]:
# pull the 'file' entry from the first record
# NOTE(review): 'file' also shadows the name assigned in the URL-splitting cell
file = first_elem['file']
pprint(file)
In [ ]:
# NOTE(review): an earlier cell indexes d['file'] with integers (d['file'][0]),
# which suggests 'file' is a list; file['text'] would then raise TypeError —
# confirm the record structure. This also rebinds 'text' from the line-reading cell.
text = file['text']
In [ ]:
# KEEP THIS
# Stream every (prefix, event, value) triple — handy for discovering the
# JSON structure. 'with' closes the handle that the original bare open()
# call leaked, and the loop body's lost indentation is restored.
with open("../scrapy/result12.json") as fh:
    for prefix, the_type, value in ijson.parse(fh):
        print(prefix, the_type, value)
In [ ]:
In [ ]:
import xml.dom.minidom

# Bind the parsed document to its own name: the original assigned it to
# 'xml', clobbering the module object and breaking any later xml.* use.
dom = xml.dom.minidom.parse("../scrapy/result10.xml")  # or xml.dom.minidom.parseString(xml_string)
pretty_xml_as_string = dom.toprettyxml()
print(pretty_xml_as_string)
In [ ]: