In [1]:
from lxml import html
import requests
import os
from urllib import url2pathname
from os import listdir
from os.path import isfile, join
import unicodecsv as csv
In [2]:
# Needed to let Requests fetch local file:// URLs; the adapter class in this cell is adapted from Stack Overflow
class LocalFileAdapter(requests.adapters.BaseAdapter):
    """Protocol Adapter to allow Requests to GET file:// URLs

    @todo: Properly handle non-empty hostname portions.
    """

    @staticmethod
    def _chkpath(method, path):
        """Return an HTTP status for the given filesystem path."""
        if method.lower() in ('put', 'delete'):
            return 501, "Not Implemented"  # TODO
        elif method.lower() not in ('get', 'head'):
            return 405, "Method Not Allowed"
        elif os.path.isdir(path):
            return 400, "Path Not A File"
        elif not os.path.isfile(path):
            return 404, "File Not Found"
        elif not os.access(path, os.R_OK):
            return 403, "Access Denied"
        else:
            return 200, "OK"

    def send(self, req, **kwargs):  # pylint: disable=unused-argument
        """Return the file specified by the given request

        @type req: C{PreparedRequest}
        @todo: Should I bother filling `response.headers` and processing
               If-Modified-Since and friends using `os.stat`?
        """
        path = os.path.normcase(os.path.normpath(url2pathname(req.path_url)))
        response = requests.Response()

        response.status_code, response.reason = self._chkpath(req.method, path)
        if response.status_code == 200 and req.method.lower() != 'head':
            try:
                response.raw = open(path, 'rb')
            except (OSError, IOError) as err:
                response.status_code = 500
                response.reason = str(err)

        if isinstance(req.url, bytes):
            response.url = req.url.decode('utf-8')
        else:
            response.url = req.url

        response.request = req
        response.connection = self

        return response

    def close(self):
        pass
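As a quick sanity check, the adapter can be mounted on a throwaway session and pointed at a temporary file. The cell below is a minimal sketch, not part of the original pipeline; it assumes a POSIX-style temp path so that prefixing `file://` yields a valid URL.
In [ ]:
import tempfile

# Write a throwaway HTML file, fetch it back through the adapter, then clean up.
# Assumes a POSIX-style path; a Windows path would need extra massaging before
# being joined onto 'file://'.
tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False)
tmp.write('<html><body><p>hello</p></body></html>')
tmp.close()

check_session = requests.session()
check_session.mount('file://', LocalFileAdapter())
resp = check_session.get('file://' + tmp.name)
print(resp.status_code)  # expect 200
print(resp.text)         # expect the HTML written above
os.remove(tmp.name)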
In [3]:
requests_session = requests.session()
requests_session.mount('file://', LocalFileAdapter())

# Filenames
html_path = '/path/to/saved/pages/'  # Change this to the directory you saved your .html files to (keep the trailing slash)
onlyfiles = sorted(f for f in listdir(html_path) if isfile(join(html_path, f)))

# Pre-initialize
questions = []
answers_self = []
answers_target = []

for name in onlyfiles:
    filename = 'file://' + html_path + name
    page = requests_session.get(filename)
    tree = html.fromstring(page.text)
    cur_questions = tree.xpath("//*[contains(@class, 'question public')]//*[starts-with(@id,'qtext_')]//p/text()")
    cur_answers_self = tree.xpath("//*[contains(@class, 'question public')]//*[starts-with(@id,'answer_viewer_')]/text()")
    cur_answers_target = tree.xpath("//*[contains(@class, 'question public')]//*[starts-with(@id,'answer_target_')]/text()")
    questions += cur_questions
    answers_self += cur_answers_self
    answers_target += cur_answers_target
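Before writing anything out, it is worth confirming that the three lists stayed aligned: `zip` in the next cell silently truncates to the shortest list, so a length mismatch here means dropped rows. A small check along these lines:
In [ ]:
# The three lists should all be the same length; if a page is missing one of
# the answer spans, zip() in the next cell will silently drop rows.
print('%d questions, %d self answers, %d target answers'
      % (len(questions), len(answers_self), len(answers_target)))
if questions:
    print(questions[0])  # eyeball the first scraped question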
In [122]:
output = zip(questions, answers_self, answers_target)
with open('output.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='^', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(output)
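To confirm the round trip, the file can be read back with the matching unicodecsv reader settings; a minimal sketch:
In [ ]:
# Read output.csv back with the same dialect settings to confirm the rows
# survived the round trip.
with open('output.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter='^', quotechar='"')
    for row in reader:
        print(row)
        break  # first row is enough for a spot check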