In [83]:
from lxml import html
import requests
import urllib2
import os
from urllib import url2pathname
from os import listdir
from os.path import isfile, join
import unicodecsv as csv
import re
from more_itertools import unique_everseen

In [2]:
# Needed so that requests can fetch local file:// URLs; the code in this cell is from Stack Overflow.
class LocalFileAdapter(requests.adapters.BaseAdapter):
    """Protocol adapter that lets Requests GET/HEAD ``file://`` URLs.

    Mount an instance on a session for the ``file://`` prefix; requests for
    such URLs are then answered from the local filesystem with an HTTP-style
    status code instead of going over the network.

    @todo: Properly handle non-empty hostname portions.
    """

    @staticmethod
    def _chkpath(method, path):
        """Return an HTTP-style status for the given method and filesystem path.

        @param method: Request method; compared case-insensitively.
        @param path: Local filesystem path the request resolves to.
        @return: C{(status_code, reason)} tuple: 501 for PUT/DELETE, 405 for
                 any other method except GET/HEAD, 400 for a directory, 404
                 for a path that is not a regular file, 403 for an unreadable
                 file, and 200 otherwise.
        """
        verb = method.lower()
        if verb in ('put', 'delete'):
            return 501, "Not Implemented"  # TODO
        elif verb not in ('get', 'head'):
            return 405, "Method Not Allowed"
        elif os.path.isdir(path):
            return 400, "Path Not A File"
        elif not os.path.isfile(path):
            return 404, "File Not Found"
        elif not os.access(path, os.R_OK):
            return 403, "Access Denied"
        else:
            return 200, "OK"

    def send(self, req, **kwargs):  # pylint: disable=unused-argument
        """Build a response for the local file named by the request.

        @type req: C{PreparedRequest}
        @param kwargs: Accepted for interface compatibility and ignored.
        @return: C{requests.Response} whose status comes from L{_chkpath};
                 for a successful non-HEAD request C{raw} is the file opened
                 in binary mode.  If opening fails the status becomes 500 and
                 the reason carries the error text.
        @todo: Should I bother filling `response.headers` and processing
               If-Modified-Since and friends using `os.stat`?
        """
        # Turn the URL path into a normalised native filesystem path.
        path = os.path.normcase(os.path.normpath(url2pathname(req.path_url)))
        response = requests.Response()

        response.status_code, response.reason = self._chkpath(req.method, path)
        if response.status_code == 200 and req.method.lower() != 'head':
            try:
                response.raw = open(path, 'rb')
            # "as" form is valid on Python 2.6+ and Python 3, unlike the
            # comma form which is Python 2 only.
            except (OSError, IOError) as err:
                response.status_code = 500
                response.reason = str(err)

        # Response.url is text; decode it if the request carried bytes.
        if isinstance(req.url, bytes):
            response.url = req.url.decode('utf-8')
        else:
            response.url = req.url

        response.request = req
        response.connection = self

        return response

    def close(self):
        """Nothing to release: the adapter holds no persistent resources."""
        pass

In [87]:
# Session that can also fetch file:// URLs through the local-file adapter.
requests_session = requests.session()
requests_session.mount('file://', LocalFileAdapter())

# Sorted names of the regular files (sub-directories skipped) in the input folder.
html_path = yourpath # Change this
onlyfiles = sorted(
    entry for entry in listdir(html_path) if isfile(join(html_path, entry))
)

# Output columns, filled in parallel by the parsing loop.
qn_id, questions, answers_self, answers_accepted, importance = [], [], [], [], []

# Parse every saved page and append one entry per question to the output
# columns (qn_id, questions, answers_self, answers_accepted, importance).

# XPath prefix shared by all queries: restrict matches to public question blocks.
public_qn = "//*[contains(@class, 'question public')]"
# Captures the trailing run of digits of an element id, e.g. '18247' in
# 'their_answer_1_18247'.
trailing_digits = re.compile('.*?([0-9]+)$')

for page_name in onlyfiles:
    # join() supplies the path separator when html_path has no trailing one,
    # matching how the file list itself was built.
    filename = 'file://' + join(html_path, page_name)

    page = requests_session.get(filename)
    tree = html.fromstring(page.text)

    cur_questions = tree.xpath(public_qn + "//*[starts-with(@id,'qtext_')]//p/text()")
    cur_answers_self_items = tree.xpath(public_qn + "//*[starts-with(@id,'my_answer') and @checked]")
    cur_answers_accepted_items = tree.xpath(public_qn + "//*[starts-with(@id,'their_answer_') and @checked]")
    cur_importance_items = tree.xpath(public_qn + "//*[starts-with(@id,'importance') and @checked]")

    # Read the ids straight from the matched elements so ids and elements are
    # aligned by construction.
    cur_answers_accepted_id = [item.get('id') for item in cur_answers_accepted_items]
    answers_accepted_id = [trailing_digits.match(element_id).group(1)
                           for element_id in cur_answers_accepted_id]

    # Group the accepted-answer labels by question id in a single pass,
    # remembering the order in which question ids first appear.
    cur_qn_id = []
    accepted_by_qn = {}
    for question_id, item in zip(answers_accepted_id, cur_answers_accepted_items):
        if question_id not in accepted_by_qn:
            accepted_by_qn[question_id] = []
            cur_qn_id.append(question_id)
        accepted_by_qn[question_id].append(item.label.text_content())

    # NOTE(review): question text, own answer and importance are paired with a
    # question id purely by position - this assumes the page yields exactly one
    # of each per question, in the same order. Confirm against the saved HTML.
    for j, question_id in enumerate(cur_qn_id):
        # Resolve the whole row before appending anything, so a lookup failure
        # cannot leave the output columns with different lengths.
        question_text = cur_questions[j]
        answer_self = cur_answers_self_items[j].label.text_content()
        # Several accepted answers for one question are joined with '/'.
        qn_answers_accepted = '/'.join(accepted_by_qn[question_id])
        answer_importance = cur_importance_items[j].label.text_content()

        qn_id.append(question_id)
        questions.append(question_text)
        answers_self.append(answer_self)
        answers_accepted.append(qn_answers_accepted)
        importance.append(answer_importance)

In [88]:
# Transpose the column lists into one row per question and write them to
# self_qa.csv, '^'-delimited; the file is opened in binary mode for the
# unicodecsv writer.
output = zip(qn_id, questions, answers_self, answers_accepted, importance)
with open('self_qa.csv', 'wb') as csvfile:
    row_writer = csv.writer(
        csvfile,
        quoting=csv.QUOTE_MINIMAL,
        quotechar='"',
        delimiter='^',
    )
    row_writer.writerows(output)

In [89]:
# Show every collected question id.
print(qn_id)


['18247', '83660', '9415', '42640', '453', '299', '165', '86', '169', '1138', '16479', '28549', '19676', '6520', '68818', '41113', '53629', '70135', '26723', '70468', '22803', '42839', '28741', '16571', '50565', '46166', '16345', '137', '59426', '82428', '12560', '63930', '20838', '7239', '457', '19893', '20108', '8865', '553', '6619', '51349', '535', '512', '15586', '15746', '391', '15743', '401', '41099', '158', '1062', '430', '6634', '436', '441', '426', '47751', '60892', '188', '62455', '4018', '19079', '13669', '617', '193', '86049', '18825', '20', '168', '44384', '81307', '79920', '86750', '35415', '63', '1158', '49094', '6155', '16487', '15878', '20452', '15744', '43965', '58829', '23733', '8079', '26317', '58162', '180', '7535', '22569', '127', '17808', '12540', '13077', '16269', '28918', '62140', '60318', '434', '6888', '35520', '93', '23527', '28478', '1287', '83184', '18711', '15182', '160', '542', '185', '68580', '33170', '50006', '55348', '47635', '18639', '12625', '38912', '365', '126', '78853', '1830', '68738', '1146', '15757', '35252', '22706', '86895', '13095', '6019', '37893', '1274', '48770', '1439', '2457', '35347', '46091', '84366', '18087', '59825', '217', '42190', '57718', '1447', '18664', '26484', '45835', '57847', '1049', '45553', '1222', '511', '23618', '64476', '67651', '86217', '57731', '35894', '1011', '32057', '84', '18636', '26367', '7085', '49175', '52827', '12796', '25294', '61428', '76', '36823', '66165', '106', '49093', '113', '64334', '55185', '9401', '23305', '27527', '17732', '54016', '763', '6377', '17778', '9418', '59597', '412', '42010', '17582', '1122', '38230', '55980', '44595', '43509', '24333', '13032', '82681', '34059', '46632', '12', '7273', '72735', '1712', '5764', '309', '24053', '35955', '13100', '15409', '234', '67418', '6537', '42120', '30271', '46176', '21986', '24684', '13086', '48328', '30723', '66827', '84910', '33870', '6722', '15839', '1112', '48557', '54052', '9522', '31898', '216', '20981', '16820', 
'82528', '71315', '39714', '35862', '898', '50379', '382', '39210', '2781', '19365', '6867', '60308', '20298', '63068', '496', '27794', '26299', '52728', '25561', '26052', '15751', '60145', '55264', '357', '18212', '105', '18955', '38196', '19125', '55355', '38051', '220', '54695', '26529', '153', '155', '29389', '1730', '20240', '34517', '366', '267', '38173', '19854', '13136', '46780', '17457', '83480', '124', '56606', '49107', '86909', '86184', '69186', '30830', '50755', '6111', '1173', '35475', '58812', '388', '6347', '34333', '60901', '43304', '73', '23132', '156915', '39576', '156918', '546', '416', '114', '38223', '7060', '64', '477', '54', '18125', '54127', '45598', '156914', '218', '25114', '354', '307', '48981', '21793', '23147', '47764', '18504', '40570', '26582', '32855', '209', '18682', '48953', '12503', '48325', '26720', '19392', '531', '18979', '20818', '57717', '26684', '20950', '85658', '142', '33561', '33897', '16317', '1119', '171', '12584', '15037', '18398', '35593', '61733', '66544', '32', '255', '26573', '395', '28831', '1588', '15414', '39', '1172', '68401', '71', '15742', '53299', '28', '67641', '338', '21527', '18841', '49086', '35276', '83646', '463', '38767', '265', '887', '20530', '40288', '219', '30607', '19681', '5417', '323', '1446', '78', '48347', '7077', '23293', '294', '21175', '81847', '86417', '41711', '1117', '416235', '20011', '21896', '42776', '13033', '75932', '35904', '43815', '37070', '36790', '21487', '42918', '19919', '28537', '30416', '491', '383', '18547', '23338', '45303', '75677', '31920', '19458', '508', '1471', '6971', '38054', '45758', '41352', '49053', '19145', '67821', '28757', '21249', '142190', '40944', '6350', '256', '59866', '64969', '963', '60852', '28742', '22439', '85315', '85879', '53965', '94', '85590', '86544', '85572', '23507', '32657', '31983', '32973', '17168', '35606', '22496', '19075', '30384', '18609', '16752', '81', '82566', '41733', '28662', '13106', '42415', '346', '88', '44354', '28754', 
'45428', '46563', '18594', '33602', '1290', '31374', '27243', '34255', '6258', '20212', '22122', '63010', '15280', '13054', '72086', '18843', '20976', '15704', '35778', '1132', '31055', '48947', '14', '9379', '226', '459187', '9628', '26525', '27672', '41393', '29', '34113', '34094', '86366', '21488', '461426', '1128', '213', '57', '8054', '27239', '665', '15752', '13', '30903', '23696', '20778', '46856', '74', '26557', '18692', '46403', '1040', '9447', '35203', '66548', '15030', '459194', '486', '29384', '9668', '30207', '53540', '21411', '24282', '19815', '27477', '31877', '174', '37693', '17017', '128', '61786', '461423', '30', '214', '527', '22021', '62254', '31821', '411', '122', '55', '393', '274', '60843', '318', '1185', '614', '23993', '461422', '313640', '1454', '1052', '210', '44', '61281', '29032', '48372', '8155', '324', '20889', '21', '37708', '146', '39373', '461421', '36045', '27164', '19265', '14913', '301', '136', '1028', '1707', '308', '66506', '17', '59457', '26841', '459190', '321556', '25228', '15889', '594', '979', '1766', '19378', '40194', '63114', '784', '165644', '20519', '31840', '19898', '41850', '15698', '156913', '134', '39185', '221106', '65', '55744', '13006', '19931', '31581', '52682', '53611', '16713', '18875', '56', '1099', '170849', '18797', '85835', '43261', '442', '35355', '87', '48753', '20135', '455', '1201', '20418', '40441', '178', '19874', '445141', '221105', '80041', '1401', '24345', '212814', '20021', '20725', '12970', '24125', '46927', '359387', '18834', '416236', '1815', '23834', '28903', '23543', '29829', '358014', '35660', '18530', '12964', '12605', '838', '70', '612', '175', '212813', '179268', '77', '80', '79', '8672', '14835', '49', '997', '44639', '154', '46', '1440', '403', '358084', '149', '41', '358080', '20930', '24375', '325', '1597', '41953', '27921', '123', '9688', '358077', '48', '501', '16053', '60100', '35']

In [58]:
# Debugging aid: list the ids of the checked 'their_answer_' inputs found in
# whichever page `tree` currently holds.
accepted_id_query = (
    "//*[contains(@class, 'question public')]"
    "//*[starts-with(@id,'their_answer_') and @checked]/./@id"
)
cur_answers_accepted_id = tree.xpath(accepted_id_query)
print(cur_answers_accepted_id)


['their_answer_1_18247', 'their_answer_2_18247', 'their_answer_3_18247', 'their_answer_1_83660', 'their_answer_2_9415', 'their_answer_1_42640', 'their_answer_1_453', 'their_answer_2_299', 'their_answer_2_165', 'their_answer_2_86', 'their_answer_1_169', 'their_answer_2_1138', 'their_answer_3_1138']

In [75]:
# Keep only the trailing run of digits of each element id.
tmp = [
    re.match('.*?([0-9]+)$', element_id).group(1)
    for element_id in cur_answers_accepted_id
]

In [80]:
# Positions in `tmp` whose value equals '18247'.
indices = [position for position, value in enumerate(tmp) if value == '18247']
print(indices)


[0, 1, 2]

In [39]:
# Label text of the fourth checked own-answer input on the page parsed last.
# The parsing loop names this list `cur_answers_self_items`; the previous
# name `cur_answers_self` is not defined anywhere in the notebook.
print(cur_answers_self_items[3].label.text_content())


['Can you identify a French Manicure?', 'Do you like the idea of living alone?', 'Would you date someone with a facial piercing?', 'Do you consider yourself to be a picky eater?', 'Do you think any person is truly selfless?', 'Do you believe in the accuracy of horoscopes?', 'Are you totally anti-war?', 'Do you own a gun?', 'Should the death penalty be abolished?', 'Have you ever been to prison?']
Yes.

In [ ]:
# Reduced export without ids and importance: question text, own answer and
# accepted answers, one '^'-delimited row per question, written to output.csv.
# NOTE(review): this cell referenced `answers_target`, which is not defined
# anywhere in the notebook; `answers_accepted` is the list the parsing loop
# fills with the other party's answers - confirm that is the intended column.
output = zip(questions, answers_self, answers_accepted)
with open('output.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter='^', quotechar='"',quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerows(output)