Scraper


In [ ]:
import os

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.log import configure_logging
from scraper.guardianukscraper.spiders.guardian_spider import GuardianSpider

# Load the project settings module explicitly, since the notebook does not
# run from inside the scrapy project directory.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'scraper.guardianukscraper.settings'
settings = Settings()
settings.setmodule(os.environ['SCRAPY_SETTINGS_MODULE'], priority='project')
settings.set('USER_AGENT',
             'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')

print settings['MONGODB_SERVER']
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

# CrawlerProcess builds its own Crawler; pass the spider class directly.
process = CrawlerProcess(settings)
process.crawl(GuardianSpider)
process.start()
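
CrawlerProcess.start() runs the Twisted reactor, which cannot be restarted once stopped, so re-running this cell in the same kernel raises ReactorNotRestartable. A sketch of the CrawlerRunner alternative, which lets several crawls be scheduled on a single reactor run (assuming the same settings object and spider as above):

In [ ]:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

# Schedule the crawl and stop the reactor when it finishes.
runner = CrawlerRunner(settings)
d = runner.crawl(GuardianSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()  # blocks until the crawl completes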

In [2]:
import copy  # used by split_into_sentences and summarize below
import logging

import pymongo
from scraper.guardianukscraper import settings
from scrapy.settings import Settings

from linkfinder.linkfinder.link_finder import preprocess_docs
from linkfinder.linkfinder.link_finder import strong_similarities_and_appropriate_links_thresh
from linkfinder.linkfinder.link_finder import perform_queries_and_get_links
from linkfinder.linkfinder.link_finder import find_links_between_in

sets = Settings()
sets.setmodule(settings, priority='project')
connection = pymongo.MongoClient(
        sets['MONGODB_SERVER'],
        sets['MONGODB_PORT']
)
db = connection[sets['MONGODB_DB']]
collection = db[sets['MONGODB_COLLECTION']]

def order_comments(comments):
    # Reverse the scraped comment order in place.
    comments.reverse()

def join_all_comments_paragraphs(comments):
    # Comments scraped as a list of paragraphs are flattened into one string.
    for comment in comments:
        if isinstance(comment['content'], list):
            comment['content'] = " \n ".join(comment['content'])
            
def text_list_into_sentences_dict_and_list(text, start_id):
    # start_id is currently unused; sentence ids are assigned by position
    # in the combined list built in split_into_sentences.
    import nltk.data
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    dicts = []
    sentences = []

    if isinstance(text, list):
        # A list means one entry per comment; record the comment index.
        for i, t in enumerate(text):
            for s in sent_detector.tokenize(t):
                dicts.append({'text': s, 'comment': i})
                sentences.append(s)
    else:
        # A single string is article text; -1 marks "not a comment".
        for s in sent_detector.tokenize(text):
            dicts.append({'text': s, 'comment': -1})
            sentences.append(s)
    return dicts, sentences
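
# Shape check with a hypothetical toy input (not from the corpus):
#   text_list_into_sentences_dict_and_list(["One. Two.", "Three."], 0)
# returns
#   ([{'text': 'One.', 'comment': 0}, {'text': 'Two.', 'comment': 0},
#     {'text': 'Three.', 'comment': 1}],
#    ['One.', 'Two.', 'Three.'])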

def article_body_into_sentences_dict_and_list(article_body):
    joined_body = " \n ".join(article_body)
    return text_list_into_sentences_dict_and_list(joined_body, 0)

def comments_to_sentences_dict_and_list(comments, start_id):
    join_all_comments_paragraphs(comments)
    paragraphs = [c['content'] for c in comments]
    return text_list_into_sentences_dict_and_list(paragraphs, start_id)

def split_into_sentences(article_dict):
    a_dicts, a_sents = article_body_into_sentences_dict_and_list(
        article_dict['body']
    )
    start_comment_id = len(a_sents)
    c_dicts, c_sents = comments_to_sentences_dict_and_list(
        article_dict['comments'],
        start_comment_id
    )

    # Article sentences come first, so a sentence's position in the combined
    # list is its id; comment sentence ids therefore start at len(a_sents).
    all_dicts = copy.deepcopy(a_dicts)
    all_sentences = copy.deepcopy(a_sents)
    all_dicts.extend(copy.deepcopy(c_dicts))
    all_sentences.extend(copy.deepcopy(c_sents))

    article_dict['comment_sentences'] = c_sents
    article_dict['article_sentences'] = a_sents
    article_dict['all_sentences'] = all_sentences
    article_dict['all_sentences_dicts'] = all_dicts
    
def preprocess_article(article_dict):
    order_comments(article_dict['comments'])
    split_into_sentences(article_dict)
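
# preprocess_article mutates article_dict in place; afterwards the dict also
# carries 'article_sentences', 'comment_sentences', 'all_sentences' and
# 'all_sentences_dicts' (set by split_into_sentences above).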

def classify_links(s_dict, all_sentences, comment_start_index):
    '''
    Receive a similarity links structure of the form

    {comment_sentence_no: [(sentence_id, percentage), ...]}

    comment_start_index is the offset to add to a comment sentence
    number to turn it into a sentence id.

    Each list of links is replaced in place by triples of
    (sentence_id, percentage, "type of link").
    '''
    for comment_no, link_list in s_dict.iteritems():
        comment_sentence_id = comment_no + comment_start_index
        classified_links = []
        for l in link_list:
            # TODO: replace the stub with a real classifier, e.g.
            # classify(all_sentences[comment_sentence_id], all_sentences[l[0]])
            category = 'stub'
            # l[1] is a numpy scalar; .item() turns it into a plain
            # Python float so the summary can be serialized.
            classified_links.append((l[0], l[1].item(), category))
        s_dict[comment_no] = classified_links
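
# Toy illustration of the in-place transformation (scores are made up):
#   before classify_links: {0: [(3, 0.82), (7, 0.64)]}
#   after classify_links:  {0: [(3, 0.82, 'stub'), (7, 0.64, 'stub')]}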
    
def summarize(article_dict):
    preprocess_article(article_dict)
    summary = copy.deepcopy(article_dict)
    del summary['body']

    docs = summary['all_sentences']
    comments = summary['comment_sentences']

    # Note that the dictionary is keyed by comment sentence number, not by
    # sentence id: add len(summary['article_sentences']) to a key to get
    # the comment sentence id.
    similarity_dict = find_links_between_in(docs, comments)
    summary['links'] = similarity_dict
    print summary.keys()
    return summary
    
art1 = collection.find_one()
summary = summarize(art1)
print summary['links'][0]
classify_links(summary['links'], 
               summary['all_sentences'], 
               len(summary['article_sentences']))
def convert_keys_to_string(dictionary):
    """Recursively converts dictionary keys to strings."""
    if not isinstance(dictionary, dict):
        return dictionary
    return dict((str(k), convert_keys_to_string(v)) 
        for k, v in dictionary.items())

str_summary = convert_keys_to_string(summary)

str_summary['links']
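
# The key conversion matters because BSON document keys must be strings;
# pymongo raises InvalidDocument for the integer keys in 'links'. A
# hypothetical persistence step (insert() is the pymongo 2.x API; use
# insert_one() on pymongo >= 3):
# db['summaries'].insert(str_summary)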


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-2-42909157d6b8> in <module>()
    144 
    145 art1 = collection.find_one()
--> 146 summary = summarize(art1)
    147 print summary['links'][0]
    148 classify_links(summary['links'], 

<ipython-input-2-42909157d6b8> in summarize(article_dict)
    127 
    128 def summarize(article_dict):
--> 129     preprocess_article(article_dict)
    130     summary = copy.deepcopy(article_dict)
    131     del summary['body']

<ipython-input-2-42909157d6b8> in preprocess_article(article_dict)
     98 def preprocess_article(article_dict):
     99     order_comments(article_dict['comments'])
--> 100     split_into_sentences(article_dict)
    101 
    102 def classify_links(s_dict, all_sentences, comment_start_index):

<ipython-input-2-42909157d6b8> in split_into_sentences(article_dict)
     69 
     70     a_dicts, a_sents = article_body_into_sentences_dict_and_list(
---> 71         article_dict['body']
     72     )
     73     start_comment_id = len(a_sents)

KeyError: 'body'
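
The KeyError means the first document returned by find_one() has no 'body' field, i.e. a record that was scraped without an article body. A sketch of a guard using MongoDB's $exists operator, so only fully scraped articles are summarized:

In [ ]:
# Skip records that lack an article body.
art1 = collection.find_one({'body': {'$exists': True}})
summary = summarize(art1)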

In [ ]:
def output_top_sentence_pairs(s_dict, all_art_sentences,
                              all_sentences,
                              all_comment_sentences):
    comment_start_index = len(all_art_sentences)

    for comment_sentence, links in s_dict.iteritems():
        sentence_id = comment_sentence + comment_start_index
        print "\nLink found"
        print sentence_id, [l[0] for l in links]
        print "s{0} is:\n{1}\nSimilar to:".format(
            sentence_id,
            all_comment_sentences[comment_sentence].strip().encode('utf8'))
        # After classify_links each link is a (sentence_id, score, category)
        # triple, so index into it rather than unpacking a pair.
        for l in links:
            print "S{0}: {1}".format(
                l[0], all_sentences[l[0]].strip().encode('utf8'))

%time output_top_sentence_pairs(summary['links'], summary['article_sentences'], summary['all_sentences'], summary['comment_sentences'])

In [1]:
collection = db['summaries']
summ = collection.find_one()
summ


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-8460f9057e41> in <module>()
----> 1 collection = db['summaries']
      2 summ = collection.find_one()
      3 summ

NameError: name 'db' is not defined
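
The NameError comes from running this cell in a fresh kernel (note the In [1] prompt) before the connection cell above. A sketch that re-establishes the connection first, reusing the earlier settings code:

In [ ]:
import pymongo
from scrapy.settings import Settings
from scraper.guardianukscraper import settings

sets = Settings()
sets.setmodule(settings, priority='project')
connection = pymongo.MongoClient(sets['MONGODB_SERVER'], sets['MONGODB_PORT'])
db = connection[sets['MONGODB_DB']]

collection = db['summaries']
summ = collection.find_one()
summ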