In [1]:
from lxml import html
import requests
import os
from urllib import url2pathname
from os import listdir
from os.path import isfile, join
import unicodecsv as csv
In [2]:
# Needed to let Requests fetch local file:// URLs; the adapter class in this cell is adapted from Stack Overflow
class LocalFileAdapter(requests.adapters.BaseAdapter):
    """Protocol Adapter to allow Requests to GET file:// URLs

    @todo: Properly handle non-empty hostname portions.
    """

    @staticmethod
    def _chkpath(method, path):
        """Return an HTTP status for the given filesystem path."""
        if method.lower() in ('put', 'delete'):
            return 501, "Not Implemented"  # TODO
        elif method.lower() not in ('get', 'head'):
            return 405, "Method Not Allowed"
        elif os.path.isdir(path):
            return 400, "Path Not A File"
        elif not os.path.isfile(path):
            return 404, "File Not Found"
        elif not os.access(path, os.R_OK):
            return 403, "Access Denied"
        else:
            return 200, "OK"

    def send(self, req, **kwargs):  # pylint: disable=unused-argument
        """Return the file specified by the given request

        @type req: C{PreparedRequest}
        @todo: Should I bother filling `response.headers` and processing
               If-Modified-Since and friends using `os.stat`?
        """
        path = os.path.normcase(os.path.normpath(url2pathname(req.path_url)))
        response = requests.Response()

        response.status_code, response.reason = self._chkpath(req.method, path)
        if response.status_code == 200 and req.method.lower() != 'head':
            try:
                response.raw = open(path, 'rb')
            except (OSError, IOError) as err:
                response.status_code = 500
                response.reason = str(err)

        if isinstance(req.url, bytes):
            response.url = req.url.decode('utf-8')
        else:
            response.url = req.url

        response.request = req
        response.connection = self

        return response

    def close(self):
        pass
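As a quick sanity check, the adapter can be mounted on a throwaway session and pointed at a temporary file. The cell below is a minimal sketch, not part of the original pipeline; it assumes a POSIX-style temp path so that prefixing `file://` yields a valid URL.
In [ ]:
import tempfile

# Write a throwaway HTML file, fetch it back through the adapter, then clean up.
# Assumes a POSIX-style path; a Windows path would need extra massaging before
# being joined onto 'file://'.
tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False)
tmp.write('<html><body><p>hello</p></body></html>')
tmp.close()

check_session = requests.session()
check_session.mount('file://', LocalFileAdapter())
resp = check_session.get('file://' + tmp.name)
print(resp.status_code)  # expect 200
print(resp.text)         # expect the HTML written above
os.remove(tmp.name)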
In [3]:
requests_session = requests.session()
requests_session.mount('file://', LocalFileAdapter())

# Filenames
html_path = '/path/to/saved/pages/'  # Change this to the directory you saved your .html files to (keep the trailing slash)
onlyfiles = sorted(f for f in listdir(html_path) if isfile(join(html_path, f)))

# Pre-initialize
questions = []
answers_self = []
answers_target = []

for name in onlyfiles:
    filename = 'file://' + html_path + name
    page = requests_session.get(filename)
    tree = html.fromstring(page.text)
    cur_questions = tree.xpath("//*[contains(@class, 'question public')]//*[starts-with(@id,'qtext_')]//p/text()")
    cur_answers_self = tree.xpath("//*[contains(@class, 'question public')]//*[starts-with(@id,'answer_viewer_')]/text()")
    cur_answers_target = tree.xpath("//*[contains(@class, 'question public')]//*[starts-with(@id,'answer_target_')]/text()")
    questions += cur_questions
    answers_self += cur_answers_self
    answers_target += cur_answers_target
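Before writing anything out, it is worth confirming that the three lists stayed aligned: `zip` in the next cell silently truncates to the shortest list, so a length mismatch here means dropped rows. A small check along these lines:
In [ ]:
# The three lists should all be the same length; if a page is missing one of
# the answer spans, zip() in the next cell will silently drop rows.
print('%d questions, %d self answers, %d target answers'
      % (len(questions), len(answers_self), len(answers_target)))
if questions:
    print(questions[0])  # eyeball the first scraped question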
In [122]:
output = zip(questions, answers_self, answers_target)
with open('output.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='^', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(output)
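To confirm the round trip, the file can be read back with the matching unicodecsv reader settings; a minimal sketch:
In [ ]:
# Read output.csv back with the same dialect settings to confirm the rows
# survived the round trip.
with open('output.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter='^', quotechar='"')
    for row in reader:
        print(row)
        break  # first row is enough for a spot check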