In [ ]:
import os
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.log import configure_logging
from scraper.guardianukscraper.spiders.guardian_spider import GuardianSpider

# Load the Scrapy project settings from the scraper package.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'scraper.guardianukscraper.settings'
settings = Settings()
settings.setmodule(os.environ['SCRAPY_SETTINGS_MODULE'], priority='project')
settings.set('USER_AGENT',
             'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
print settings['MONGODB_SERVER']

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

# Run the spider with the project settings. The Twisted reactor can only be
# started once per process, so this cell can only be run once per kernel.
process = CrawlerProcess(settings)
process.crawl(GuardianSpider)
process.start()
In [2]:
import copy
import logging

import pymongo
from scrapy.settings import Settings

from scraper.guardianukscraper import settings
from linkfinder.linkfinder.link_finder import (
    preprocess_docs,
    strong_similarities_and_appropriate_links_thresh,
    perform_queries_and_get_links,
    find_links_between_in,
)
sets = Settings()
sets.setmodule(settings, priority='project')
connection = pymongo.MongoClient(
sets['MONGODB_SERVER'],
sets['MONGODB_PORT']
)
db = connection[sets['MONGODB_DB']]
collection = db[sets['MONGODB_COLLECTION']]
def order_comments(comments):
    """Reverse the comment list in place."""
    comments.reverse()
def join_all_comments_paragraphs(comments):
    """Collapse each comment's list of paragraphs into a single string."""
    for comment in comments:
        if isinstance(comment['content'], list):
            comment['content'] = " \n ".join(comment['content'])
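# Tiny made-up illustration of join_all_comments_paragraphs: paragraph lists
# are joined into one string, while content that is already a string is left
# untouched.
example_comments = [{'content': ['Par one.', 'Par two.']},
                    {'content': 'Already a string.'}]
join_all_comments_paragraphs(example_comments)
print repr(example_comments[0]['content'])  # -> 'Par one. \n Par two.'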
def text_list_into_sentences_dict_and_list(text, start_id):
    """Split text (a string or a list of strings) into sentences.

    Returns (dicts, sentences), where each dict records the sentence text and
    the index of the comment it came from (-1 for article-body sentences).
    Note: start_id is currently unused; callers compute the offset themselves.
    """
    import nltk.data
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    dicts = []
    sentences = []
    if isinstance(text, list):
        for i, t in enumerate(text):
            for s in sent_detector.tokenize(t):
                dicts.append({'text': s, 'comment': i})
                sentences.append(s)
    else:
        for s in sent_detector.tokenize(text):
            dicts.append({'text': s, 'comment': -1})
            sentences.append(s)
    return dicts, sentences
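# Quick sanity check of the returned structure (the toy strings are made up;
# this assumes the NLTK punkt model is available, as the function above
# already requires):
example_dicts, example_sents = text_list_into_sentences_dict_and_list(
    ["First comment. It has two sentences.", "Second comment."], 0)
print example_dicts
# -> [{'text': 'First comment.', 'comment': 0},
#     {'text': 'It has two sentences.', 'comment': 0},
#     {'text': 'Second comment.', 'comment': 1}]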
def article_body_into_sentences_dict_and_list(article_body):
joined_body = " \n ".join(article_body)
dicts, sentences = text_list_into_sentences_dict_and_list(joined_body,
0)
return dicts, sentences
def comments_to_sentences_dict_and_list(comments, start_id):
join_all_comments_paragraphs(comments)
paragraphs = [c['content'] for c in comments]
dicts, sentences = text_list_into_sentences_dict_and_list(paragraphs,
start_id)
return dicts, sentences
def split_into_sentences(article_dict):
import copy
a_dicts, a_sents = article_body_into_sentences_dict_and_list(
article_dict['body']
)
start_comment_id = len(a_sents)
c_dicts, c_sents = comments_to_sentences_dict_and_list(
article_dict['comments'],
start_comment_id
)
# if a_dicts is None or c_dicts is None:
# print "Dicts are empty"
# else:
# # print "THESE ARE THE DICTS:\n{0}".format(c_dicts)
# print c_dicts[0]
# print article_dict['comments'][0]['content']
all_dicts = copy.deepcopy(a_dicts)
all_sentences = copy.deepcopy(a_sents)
all_dicts.extend(copy.deepcopy(c_dicts))
all_sentences.extend(copy.deepcopy(c_sents))
article_dict['comment_sentences'] = c_sents
article_dict['article_sentences'] = a_sents
article_dict['all_sentences'] = all_sentences
article_dict['all_sentences_dicts'] = all_dicts
# if all_dicts[len(article_dict['article_sentences'])]['text'] in article_dict['comments'][c_dicts[0]['comment']]['content']:
# print "Comment sentences keep a reference to their comment. Passed."
def preprocess_article(article_dict):
order_comments(article_dict['comments'])
split_into_sentences(article_dict)
def classify_links(s_dict, all_sentences, comment_start_index):
    '''
    Receive a similarity links structure of the form

        {comment_sentence_no: [(sentence_id, percentage), ...]}

    comment_start_index is the offset to add to the comment sentence number
    to turn it into a sentence id.
    Each (sentence_id, percentage) pair in the list is replaced in place by a
    triple: (sentence_id, percentage, "type of link").
    '''
for comment_no, link_list in s_dict.iteritems():
comment_sentence_id = comment_no + comment_start_index
classified_links = []
for l in link_list:
# category = classify(all_sentences[comment_sentence_id],
# all_sentences[l[0]])
category = 'stub'
classified_links.append((l[0], l[1].item(), category))
s_dict[comment_no] = classified_links
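# Toy illustration of the in-place rewrite performed by classify_links
# (made-up data; numpy scalars stand in for the similarity scores the link
# finder appears to produce, since classify_links calls .item() on them):
import numpy as np
toy_links = {0: [(2, np.float64(0.83)), (5, np.float64(0.41))]}
classify_links(toy_links, [], 10)
print toy_links  # -> {0: [(2, 0.83, 'stub'), (5, 0.41, 'stub')]}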
def summarize(article_dict):
preprocess_article(article_dict)
summary = copy.deepcopy(article_dict)
del summary['body']
docs = summary['all_sentences']
comments = summary['comment_sentences']
    # Note that the dictionary is keyed by the comment sentence number, not
    # the sentence id, so you have to add len(summary['article_sentences'])
    # to the key to get the comment sentence id.
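    # e.g. with 40 article sentences, key 3 in the returned dictionary
    # refers to all_sentences[40 + 3].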
similarity_dict = find_links_between_in(docs, comments)
summary['links'] = similarity_dict
print summary.keys()
return summary
art1 = collection.find_one()
summary = summarize(art1)
print summary['links'][0]
classify_links(summary['links'],
summary['all_sentences'],
len(summary['article_sentences']))
def convert_keys_to_string(dictionary):
"""Recursively converts dictionary keys to strings."""
if not isinstance(dictionary, dict):
return dictionary
return dict((str(k), convert_keys_to_string(v))
for k, v in dictionary.items())
str_summary = convert_keys_to_string(summary)
str_summary['links']
In [ ]:
def output_top_sentence_pairs(s_dict, all_art_sentences,
                              all_sentences,
                              all_comment_sentences):
    """Print each comment sentence together with the sentences it links to."""
    d = s_dict
    comment_start_index = len(all_art_sentences)
    sentences = all_sentences
    comment_sentences = all_comment_sentences
    for comment_sentence, links in d.iteritems():
        sentence_id = comment_sentence + comment_start_index
        print "\nLink found"
        print sentence_id, [l[0] for l in links]
        print "s{0} is:\n{1}\nSimilar to:".format(
            sentence_id,
            comment_sentences[comment_sentence].strip().encode('utf8'))
        for link in links:
            # Links may be (id, score) pairs or (id, score, category)
            # triples after classify_links, so only the sentence id is used.
            l_id = link[0]
            print "S{0}: {1}".format(l_id, sentences[l_id].strip().encode('utf8'))
%time output_top_sentence_pairs(summary['links'], summary['article_sentences'], summary['all_sentences'], summary['comment_sentences'])
In [1]:
collection = db['summaries']
summ = collection.find_one()
summ