In [28]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import os
#Creates folder data if it doesn't exit
os.makedirs(os.path.dirname("./data"), exist_ok=True)

#Download corpora if you haven't downloaded before (otherwise comment lines below)
#import nltk
#nltk.download()


showing info http://www.nltk.org/nltk_data/
Out[28]:
True

Crawler

Starting from a website crawls all the outgoing links, looking for a word/sentence and saving the results


In [21]:
from time import sleep
from random import random
from urllib.parse import urljoin
import bs4 as bs
import requests

def get_links_website(website,keyword_search):
    """
    Finds all the links in the website, and how many times the keyword was found
    """
    try: r = requests.get(website, timeout=100)
    except: 
        print("error {} in page {}".format(r.status_code,website))
        return 0,[]
    if r.ok:
        html = bs.BeautifulSoup(r.text, "lxml")
    else: 
        print(r.status_code)
        return 0,[]
    
    #Find all snapshots
    snapshots = []
    all_links = html.find_all("a", href=True)
    hits = r.text.count(keyword_search)
    return hits,all_links

def crawl_one_level(snapshot, link_base, links_snapshot_set,keyword):
    """
    get's the links of the website (link_base) and filter them, keeping the ones that stay in the domain
    """
    hits, links = get_links_website(link_base,keyword)
    for link in links:
        link_joined = urljoin(link_base, link["href"])
        if (snapshot in link_joined) and (link_joined not in links_snapshot_set):
            links_snapshot_set.add(link_joined)
    return hits, links_snapshot_set


def crawl_all_levels(snapshot,keyword):
    """
    crawls the website snapshot, without leaving the domain
    """
    #Get source code
    dict_hits = dict()
    links_snapshot_set_old = set([snapshot])
    new_links = set([snapshot])
    while len(new_links) > 0:
        for i, link in enumerate(new_links):
            sleep(random()*2)            
            hits, links_snapshot_set_new = crawl_one_level(snapshot, link,links_snapshot_set_old.copy(),keyword)
            print("{0}: Link: {1} has the word \"{2}\" {3} times".format(i, link,keyword,hits))
            dict_hits.update({link: hits})
        new_links = links_snapshot_set_new - links_snapshot_set_old
        print("\nNew links to follow: ", len(new_links))
        links_snapshot_set_old = links_snapshot_set_old | links_snapshot_set_new
        
    return dict_hits

def BFS_keyword(website,keyword,filename):
    """
    calls the script and saves the results to a file
    """
    dict_hits = crawl_all_levels(website,keyword)
    print(dict_hits)
    with open(filename, "w+") as f:
        f.write("link" + "\t" + "number_hits" + "\n")
        for link in dict_hits:
            f.write("{0}\t{1}\n".format(link,dict_hits[link]))

In [22]:
##USAGE
initial_website = "https://www.amnesty.org/en/"
words_to_find = "campaign"
file_to_save = "results_crawling_amnesty.csv"
BFS_keyword(initial_website,words_to_find,file_to_save)


0: Link: https://www.amnesty.org/en/ has the word "campaign" 13 times

New links to follow:  32
0: Link: https://www.amnesty.org/en/latest/ has the word "campaign" 17 times
1: Link: https://www.amnesty.org/en/latest/news/2016/01/cameroon-authorities-must-drop-non-denunciation-charges-against-three-journalists/ has the word "campaign" 10 times
2: Link: https://www.amnesty.org/en/human-rights-education/ has the word "campaign" 12 times
3: Link: https://www.amnesty.org/en/#maincontent has the word "campaign" 13 times
4: Link: https://www.amnesty.org/en/latest/campaigns/2016/01/family-reunification-refugees-sweden-syria/ has the word "campaign" 27 times
5: Link: https://www.amnesty.org/en/about-us/contact/ has the word "campaign" 10 times
6: Link: https://www.amnesty.org/en/about-us/privacy-policy/ has the word "campaign" 9 times
7: Link: https://www.amnesty.org/en/latest/news/2016/01/myanmar-prisoner-amnesty-marred-by-outrageous-jailing-of-activist-for-facebook-post/ has the word "campaign" 12 times
8: Link: https://www.amnesty.org/en/latest/news/2016/01/amnesty-appoints-telecommunications-entrepreneur-bassim-haidar-to-its-global-council/ has the word "campaign" 10 times
9: Link: https://www.amnesty.org/en/get-involved/ has the word "campaign" 10 times
10: Link: https://www.amnesty.org/en/who-we-are/jobs-and-internships/ has the word "campaign" 0 times
11: Link: https://www.amnesty.org/en/latest/campaigns/ has the word "campaign" 42 times
12: Link: https://www.amnesty.org/en/jobs/ has the word "campaign" 14 times
13: Link: https://www.amnesty.org/en/media-centre/ has the word "campaign" 9 times
14: Link: https://www.amnesty.org/en/annual-report/ has the word "campaign" 13 times
15: Link: https://www.amnesty.org/en/about-us/cookie-statement/ has the word "campaign" 9 times
16: Link: https://www.amnesty.org/en/wire-magazine/ has the word "campaign" 10 times
17: Link: https://www.amnesty.org/en/latest/campaigns/2016/01/10-ways-you-help-us-respond-to-a-crisis/ has the word "campaign" 25 times
18: Link: https://www.amnesty.org/en/latest/news/ has the word "campaign" 13 times
19: Link: https://www.amnesty.org/en/latest/education/ has the word "campaign" 20 times
20: Link: https://www.amnesty.org/en/who-we-are/ has the word "campaign" 19 times
21: Link: https://www.amnesty.org/en/get-involved/join/ has the word "campaign" 9 times
22: Link: https://www.amnesty.org/en/about-us/how-were-run/ has the word "campaign" 9 times
23: Link: https://www.amnesty.org/en/get-involved/take-action/ has the word "campaign" 0 times
404
24: Link: https://www.amnesty.org/en/donate/ has the word "campaign" 0 times
25: Link: https://www.amnesty.org/en/countries/ has the word "campaign" 10 times
26: Link: https://www.amnesty.org/en/latest/news/2016/01/turkey-onslaught-on-kurdish-areas-putting-tens-of-thousands-of-lives-at-risk/ has the word "campaign" 12 times
27: Link: https://www.amnesty.org/en/what-we-do/ has the word "campaign" 20 times
28: Link: https://www.amnesty.org/en/who-we-are/how-were-run/ has the word "campaign" 9 times
29: Link: https://www.amnesty.org/en/get-involved/volunteer/ has the word "campaign" 0 times
30: Link: https://www.amnesty.org/en/about-us/permissions/ has the word "campaign" 9 times
31: Link: https://www.amnesty.org/en/latest/research/ has the word "campaign" 13 times

New links to follow:  20
0: Link: https://www.amnesty.org/en/latest/research/?p=3 has the word "campaign" 13 times
1: Link: https://www.amnesty.org/en/documents/mde12/3251/2016/en/ has the word "campaign" 0 times
2: Link: https://www.amnesty.org/en/latest/research/#maincontent has the word "campaign" 13 times
3: Link: https://www.amnesty.org/en/documents/asa11/3288/2016/fr/ has the word "campaign" 9 times
4: Link: https://www.amnesty.org/en/latest/news/2016/01/lesotho-army-chief-cannot-continue-to-disobey-a-court-order/ has the word "campaign" 10 times
5: Link: https://www.amnesty.org/en/documents/mde12/3221/2016/en/ has the word "campaign" 9 times
6: Link: https://www.amnesty.org/en/documents/afr16/3259/2016/en/ has the word "campaign" 9 times
7: Link: https://www.amnesty.org/en/documents/asa11/3288/2016/en/ has the word "campaign" 9 times
8: Link: https://www.amnesty.org/en/documents/amr18/3234/2016/es/ has the word "campaign" 9 times
9: Link: https://www.amnesty.org/en/documents/mde13/3261/2016/es/ has the word "campaign" 9 times
10: Link: https://www.amnesty.org/en/latest/research/?p=2 has the word "campaign" 13 times
11: Link: https://www.amnesty.org/en/documents/ior51/3248/2016/en/ has the word "campaign" 9 times
12: Link: https://www.amnesty.org/en/documents/eur44/3230/2016/en/ has the word "campaign" 9 times
13: Link: https://www.amnesty.org/en/rss/?contentType=2564 has the word "campaign" 0 times
14: Link: https://www.amnesty.org/en/latest/research/?p=4851 has the word "campaign" 13 times
15: Link: https://www.amnesty.org/en/documents/mde14/3229/2016/en/ has the word "campaign" 9 times
16: Link: https://www.amnesty.org/en/documents/asa16/3244/2016/fr/ has the word "campaign" 9 times
17: Link: https://www.amnesty.org/en/documents/mde13/3261/2016/en/ has the word "campaign" 9 times
18: Link: https://www.amnesty.org/en/documents/mde12/3221/2016/ar/ has the word "campaign" 9 times
19: Link: https://www.amnesty.org/en/documents/eur46/3255/2016/en/ has the word "campaign" 9 times

New links to follow:  4
---------------------------------------------------------------------------
InvalidSchema                             Traceback (most recent call last)
<ipython-input-21-614c9af2b1dc> in get_links_website(website, keyword_search)
     10     """
---> 11     try: r = requests.get(website, timeout=100)
     12     except:

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/requests/api.py in get(url, params, **kwargs)
     68     kwargs.setdefault('allow_redirects', True)
---> 69     return request('get', url, params=params, **kwargs)
     70 

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/requests/api.py in request(method, url, **kwargs)
     49     session = sessions.Session()
---> 50     response = session.request(method=method, url=url, **kwargs)
     51     # By explicitly closing the session, we avoid leaving sockets open which

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    467         send_kwargs.update(settings)
--> 468         resp = self.send(prep, **send_kwargs)
    469 

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/requests/sessions.py in send(self, request, **kwargs)
    569         # Get the appropriate adapter to use
--> 570         adapter = self.get_adapter(url=request.url)
    571 

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/requests/sessions.py in get_adapter(self, url)
    643         # Nothing matches :-/
--> 644         raise InvalidSchema("No connection adapters were found for '%s'" % url)
    645 

InvalidSchema: No connection adapters were found for 'mailto:?body=Share this https://www.amnesty.org/en/documents/eur46/3255/2016/en/'

During handling of the above exception, another exception occurred:

UnboundLocalError                         Traceback (most recent call last)
<ipython-input-22-fe1d2a6b346d> in <module>()
      3 words_to_find = "campaign"
      4 file_to_save = "results_crawling_amnesty.csv"
----> 5 BFS_keyword(initial_website,words_to_find,file_to_save)

<ipython-input-21-614c9af2b1dc> in BFS_keyword(website, keyword, filename)
     61     calls the script and saves the results to a file
     62     """
---> 63     dict_hits = crawl_all_levels(website,keyword)
     64     print(dict_hits)
     65     with open(filename, "w+") as f:

<ipython-input-21-614c9af2b1dc> in crawl_all_levels(snapshot, keyword)
     48         for i, link in enumerate(new_links):
     49             sleep(random()*2)
---> 50             hits, links_snapshot_set_new = crawl_one_level(snapshot, link,links_snapshot_set_old.copy(),keyword)
     51             print("{0}: Link: {1} has the word \"{2}\" {3} times".format(i, link,keyword,hits))
     52             dict_hits.update({link: hits})

<ipython-input-21-614c9af2b1dc> in crawl_one_level(snapshot, link_base, links_snapshot_set, keyword)
     29     get's the links of the website (link_base) and filter them, keeping the ones that stay in the domain
     30     """
---> 31     hits, links = get_links_website(link_base,keyword)
     32     for link in links:
     33         link_joined = urljoin(link_base, link["href"])

<ipython-input-21-614c9af2b1dc> in get_links_website(website, keyword_search)
     11     try: r = requests.get(website, timeout=100)
     12     except:
---> 13         print("error {} in page {}".format(r.status_code,website))
     14         return 0,[]
     15     if r.ok:

UnboundLocalError: local variable 'r' referenced before assignment

LEXIS NEXIS Crawler

Queries a search in lexis nexis, using the CSI (http://www.lexisnexis.com/hottopics/lnacademic/) as the newspaper identifier

Disclaimer: We accept no liability for the content of this code, or for the consequences of any actions taken on the basis of the information provided.

Code from: http://yc-lexisnexis-scraper.readthedocs.org/


In [ ]:
import random
import re
import os
import requests
import urllib.request, urllib.parse, urllib.error
import urllib.parse as urlparse

import selenium.common.exceptions
import selenium.webdriver
import selenium.webdriver.common.desired_capabilities
import selenium.webdriver.support.ui
from selenium.webdriver.support import expected_conditions


class LexisNexisScraper():
    """
    Class for downloading documents given a query string to Lexis Nexis academic (http://www.lexisnexis.com/hottopics/lnacademic/).

    Example::

            downloader = LexisNexisScraper(mass_download_mode=True)
            for (content, (doc_index, doc_count)) in downloader.iter_search_results(6318, 'DATE(=1987)'):
                print doc_id

     """

    _RE_STYLESHEET = re.compile(r'\<STYLE TYPE\=\"text\/css\"\>(\<\!\-\-)?(?P<css_string>.+?)(\-\-\>)?\<\/STYLE\>', flags=re.S | re.U | re.I)
    _RE_LEXIS_DOC = re.compile(r'\<DOC NUMBER\=(?P<docid>\d+)\>\s+\<DOCFULL\>(?P<doc>.+?)\<\/DOCFULL\>', flags=re.S | re.U | re.I)

    def __init__(self, wait_timeouts=(15, 180), documents_per_download=(250, 500), user_agent_string='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0', mass_download_mode=False):
        """
        Constructs a downloader object.

        :param float,float wait_timeouts: tuple of `(short, long)` where `short` and `long` are the no. of seconds to wait while page elements are loaded (for Webdriver). `long` timeout is used when waiting for LexisNexis to format documents for mass downloads.
        :param int,int documents_per_download: a range specifying the number of documents to download each time when using :attr:`mass_download_mode`.
        :param bool mass_download_mode: whether to mass download articles using the download link or page through each document one by one and download.
        """

        self._USER_AGENT_STRING = user_agent_string
        self._DOCUMENTS_PER_DOWNLOAD = documents_per_download

  
        self._driver = selenium.webdriver.Firefox()
        self._driver.set_window_size(800, 600)

        self._short_wait = selenium.webdriver.support.ui.WebDriverWait(self._driver, wait_timeouts[0], poll_frequency=0.05)
        self._long_wait = selenium.webdriver.support.ui.WebDriverWait(self._driver, wait_timeouts[1], poll_frequency=1)

        self.mass_download_mode_ = mass_download_mode
    #end def

    def __del__(self):
        try: self._driver.quit()
        except: pass

    def iter_search_results(self, csi, search_query, start_from=1):
        """
        A generator function that executes LexisNexis search query on source data CSI (:attr:`csi`), with query :attr:`search_query` and downloads all documents returned by search.

        :param str csi: LexisNexis CSI (see `<http://amdev.net/rpt_download.php>`_ for full list).
        :param str search_query: execute search query string.
        :param int start_from: document index to start downloading from.
        :returns: a tuple `(doc_content, (index, results_count))`, where `doc_content` is the HTML content of the `index`th document, and `results_count` is the number of documents returned by specified search query.
        """

        self._driver.get('http://www.lexisnexis.com/hottopics/lnacademic/?' + urllib.parse.urlencode({'verb': 'sr', 'csi': csi, 'sr': search_query}))
        if not self._have_results(): return []

        if self.mass_download_mode_: return self._mass_download(start_from)
        return self._sequential_download(start_from)
    #end def

    def _have_results(self):    # todo: kinda slow, due to having wait for multiple timeouts
        self._switch_to_frame('main')
        if self._wait_for_element('//td[text()[contains(., \'No Documents Found\')]]', raise_error=False) is not None: return False
        if self._wait_for_element('//frame[@title=\'Results Content Frame\']', raise_error=False) is not None: return True
        if self._wait_for_element('//frame[@title=\'Results Document Content Frame\']', raise_error=False) is not None: return True

        raise Exception('Page loaded improperly while checking for results frame.')
    #end def

    def _mass_download(self, start_from=1):    # Returns documents as a list of strings containing HTML
        self._switch_to_frame('navigation')

        try: documents_count = int(self._driver.find_element_by_xpath('//form[@name=\'results_docview_DocumentForm\']/input[@name=\'totalDocsInResult\']').get_attribute('value'))
        except: documents_count = -1

        def download_sequence(start, end):
            docs_left = end - start + 1
            cur = start
            while docs_left > self._DOCUMENTS_PER_DOWNLOAD[1]:
                download_count = random.randint(*self._DOCUMENTS_PER_DOWNLOAD)
                yield (cur, cur + download_count - 1)
                docs_left -= download_count
                cur += download_count
            #end while

            yield (cur, cur + docs_left - 1)
        #end def

        def lexis_nexis_download_window_appears(current_handle):
            def f(driver):
                for handle in driver.window_handles:
                    if current_handle != handle:
                        driver.switch_to.window(handle)    # switch first to check window title
                        if driver.title.endswith('Download Documents'): return True    # this is our new window!
                    #end if
                #end for

                return False
            #end def

            return f
        #end class

        for download_start, download_end in download_sequence(start_from, documents_count):
            self._switch_to_frame('navigation')

            parent_window_handle = self._driver.current_window_handle

            # check for download icon and click it
            self._wait_for_element('//img[@title=\'Download Documents\']').click()

            # wait for download window to appear
            self._short_wait.until(lexis_nexis_download_window_appears(parent_window_handle))
            self._wait_for_element('//img[@title=\'Download\']')

            # get all the form items
            selenium.webdriver.support.ui.Select(self._driver.find_element_by_xpath('//select[@name=\'delFmt\']')).select_by_value('QDS_EF_HTML')
            selenium.webdriver.support.ui.Select(self._driver.find_element_by_xpath('//select[@name=\'delView\']')).select_by_value('GNBFI')
            selenium.webdriver.support.ui.Select(self._driver.find_element_by_xpath('//select[@name=\'delFontType\']')).select_by_value('COURIER')    # i like courier

            search_term_bold = self._driver.find_element_by_xpath('//input[@type=\'checkbox\'][@id=\'termBold\']')
            if not search_term_bold.is_selected(): search_term_bold.click()
            doc_new_page = self._driver.find_element_by_xpath('//input[@type=\'checkbox\'][@id=\'docnewpg\']')
            if not doc_new_page.is_selected(): doc_new_page.click()

            self._driver.find_element_by_xpath('//input[@type=\'radio\'][@id=\'sel\']').click()
            self._driver.find_element_by_xpath('//input[@type=\'text\'][@id=\'rangetextbox\']').send_keys('{}-{}'.format(download_start, download_end))

            self._driver.find_element_by_xpath('//img[@title=\'Download\']').click()

            download_url = self._long_wait.until(expected_conditions.presence_of_element_located((selenium.webdriver.common.by.By.XPATH, '//center[@class=\'suspendbox\']/p/a'))).get_attribute('href')

            # set up cookies and use requests library to do download
            cookies = dict([(cookie['name'], cookie['value']) for cookie in self._driver.get_cookies()])
            response = requests.get(download_url, cookies=cookies, headers={'User-Agent': self._USER_AGENT_STRING})
            html_content = response.text

            m = self._RE_STYLESHEET.search(html_content)
            css_string = m.group('css_string').strip()

            for i, m in enumerate(self._RE_LEXIS_DOC.finditer(html_content)):
                page_content = m.group('doc').replace('<!-- Hide XML section from browser', '').replace('-->', '').strip()
                page_content = '\n'.join(['<HTML>', '<HEAD>', '<STYLE TYPE=\"text/css\">', css_string, '</STYLE>', '</HEAD>', '<BODY>', page_content, '</BODY>', '</HTML>'])

                yield (page_content, (download_start + i, documents_count))
            #end for

            self._driver.close()
            self._driver.switch_to.window(parent_window_handle)
        #end for
    #end def

    def _sequential_download(self, start_from=1):
        self._switch_to_frame('navigation')
        try: documents_count = int(self._driver.find_element_by_xpath('//form[@name=\'results_docview_DocumentForm\']/input[@name=\'totalDocsInResult\']').get_attribute('value'))
        except: documents_count = -1
        if documents_count <= 0: return

        if start_from > documents_count: return

        if documents_count == 1:
            self._switch_to_frame('content')
            page_content = self._driver.page_source
            yield (page_content, (1, 1))
            return
        #end if

        self._switch_to_frame('results')    # go to results list and grab the first link
        first_document_url = self._wait_for_element('//td/a[contains(@href, \'/lnacui2api/results/docview/docview.do\')]').get_attribute('href')

        url_obj = urlparse.urlparse(first_document_url)
        qs_dict = dict(urlparse.parse_qsl(url_obj.query))
        qs_dict['docNo'] = start_from
        doc_url = urlparse.urlunparse((url_obj.scheme, url_obj.netloc, url_obj.path, url_obj.params, urllib.parse.urlencode(qs_dict), url_obj.fragment))
        self._driver.get(doc_url)    # jump to the page we want

        # qs_dict['RELEVANCE'] = 'BOOLEAN'    # doesnt seem to work
        # http://www.lexisnexis.com/lnacui2api/results/docview/docview.do?docLinkInd=true&risb=21_T21153102977&format=GNBFI&sort=RELEVANCE&startDocNo=1&resultsUrlKey=29_T21153102981&cisb=22_T21153102980&treeMax=true&treeWidth=0&csi=6318&docNo=1

        for doc_index in range(start_from, documents_count + 1):
            self._switch_to_frame('content', in_iframe=False)
            page_content = self._driver.page_source
            yield (page_content, (doc_index, documents_count))

            self._switch_to_frame('navigation', in_iframe=False)
            next_page_elem = self._wait_for_element('//img[@title=\'View next document\']', raise_error=False)
            if next_page_elem is None:
                if doc_index != documents_count:
                    raise Exception('Next page icon could not be found: doc_index={}, documents_count={}'.format(doc_index, documents_count))
            else: next_page_elem.click()
        #end while
    #end def

    def _switch_to_frame(self, frame_name, in_iframe=True):
        self._driver.switch_to.default_content()

        if in_iframe:
            frame = self._safe_wait(expected_conditions.frame_to_be_available_and_switch_to_it('mainFrame'))
            if not frame: raise SwitchFrameException(frame_name)
        #end if

        try:
            if frame_name == 'main': return frame
            elif frame_name == 'results': frame = self._wait_for_element('//frame[@title=\'Results Content Frame\']')
            elif frame_name == 'navigation': frame = self._wait_for_element('//frame[@title=\'Results Navigation Frame\']')
            elif frame_name == 'content': frame = self._wait_for_element('//frame[@title=\'Results Document Content Frame\']')
        except selenium.common.exceptions.TimeoutException:
            raise SwitchFrameException(frame_name)

        self._safe_wait(expected_conditions.frame_to_be_available_and_switch_to_it(frame))

        return frame
    #end def

    def _safe_wait(self, poll_func):
        try: return self._short_wait.until(poll_func)
        except selenium.common.exceptions.TimeoutException: return None
    #end def

    def _wait_for_element(self, xpath, raise_error=True):
        elem = self._safe_wait(expected_conditions.presence_of_element_located((selenium.webdriver.common.by.By.XPATH, xpath)))
        if raise_error and elem is None: raise selenium.common.exceptions.TimeoutException(msg='XPath \'{}\' presence wait timeout.'.format(xpath))
        return elem
    #end def
#end class


class SwitchFrameException(Exception):
    """
    Exception class when we are unable to load the require page properly.
    This is usually due to
    #. Page taking too long to load. This happens sometimes when loading LexisNexis for the first time.
    #. Improper page loading.
    """

    def __init__(self, frame_name): self.frame_name = frame_name

    def __str__(self): return 'Exception while switching to frame \'{}\'.'.format(self.frame_name)
#end class

import bs4 as bs
def scrape_news(csi,init_date,end_date,search,path="./"):
    """
    call the code from that other person, saves the results (one file = one article)
    """
    import os      
    os.makedirs(os.path.dirname(path), exist_ok=True)
    query = "{0} AND Date(geq({1}) AND leq({2}))".format(search,init_date,end_date)

    scraper = LexisNexisScraper(mass_download_mode=True)

    for (content, (doc_index, doc_count)) in scraper.iter_search_results(csi,query):
        with open("{0}/{1}_{2}_{3}.html".format(path,search.replace(" ","_").replace("/","-"),doc_index, doc_count), "w+") as fOut:
            fOut.write(content)    


def format_new(content):
    """
    finds the article as a paragraph with class c8 or c9, and the other parts that we want too
    """
    html = bs.BeautifulSoup(content, "lxml")
    #print(html.prettify())
    new = " ".join([_.text for _ in html.find_all("p",{"class": ["c8","c9"]}) if len(_.text) > 100])
    other_parts = html.find_all("p",{"class": "c1"})
    date = other_parts[2].text
    newspaper = other_parts[1].text
    print(date)
    return date,newspaper,new

def format_all_files(path,file_out_name,search):
    """
    format the files, saving date, newspaper and article and saves the result
    """
    import os      
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open("./data/{0}".format(file_out_name),"w+") as fOut:        
        for file in sorted(os.listdir(path)):
            if not search.replace(" ","_") in file: continue
            print(file)
            date,newspaper,new = format_new(open("./data/news_raw/{0}".format(file)))    
            if len(new) > 100:
                fOut.write("{0}\t{1}\t{2}\n".format(date,newspaper,new))
            #break

In [ ]:
##USAGE
#CSI = Newspaper identifier: 
csi = "166103"

#Initial and final dates
init_date = "11/20/2015" #format: mm/dd/yyyy
end_date = "1/20/2016"  #format: mm/dd/yyyy

#Query (Use AND, OR, and parenthesis), saves the results to /data/news_raw
path = "./data/news_raw/"
search = "Pablo Iglesias AND Podemos"
scrape_news(csi,init_date,end_date,search,path=path)

#Where to save the results, formatting it as date, newspaper, article
file_out_name = "Pablo_Iglesias_Podemos.csv"
format_all_files(path,file_out_name,search)

Other useful stuff

  • Linke histogram in linear and log scale
  • Correlation
  • Cross-correlation
  • Compare samples (ttest, MWU,anova)
  • Linear regression

In [3]:
def make_line_hist(x,minValue,maxValue,nbins=10,logscale=False):
    """
    make a line histogram (instead of bars, just the top line)
    """
    if logscale:
        freqs,bins = np.histogram(x,bins=np.logspace(np.log10(minValue),np.log10(maxValue),nbins),normed=True) 
    else:
        freqs,bins = np.histogram(x,bins=np.linspace(minValue,maxValue,nbins),normed=True)
        
    
    plt.plot((bins[:-1]+bins[1:])/2,freqs)
    return freqs

def correlation(x,y,typeCorr="pearson"):
    from scipy.stats import pearsonr, spearmanr
    if typeCorr == "pearson":
        corr,p_value = pearsonr(x,y)
    else:
        corr,p_value = spearmanr(x,y)
    
    print("Correlation: {0:2.2g}. P-value: {1:2.5f}".format(corr,p_value))
    return corr,p_value
    
def cross_correlation(x,y,time):
    """
    Calculates the normalized cross-correlation and plots it
    """
    import numpy as np
    modeC = "same"
    x = (x - np.mean(x))/np.std(x)
    y =  (y - np.mean(y))/np.std(y)

    timeInt = np.diff(time).mean().days
    numPoints = len(x)
    fig = plt.figure(figsize=(6,3.5))        
    d = np.correlate(y,x,modeC)

    plt.plot([0,0],[-0.5,1],color="grey")
    plt.xlabel("Lag")
    plt.ylabel("Correlation")
    plt.plot(np.linspace(len(x)/2*timeInt,-len(x)/2*timeInt,len(x)),d/numPoints)
    plt.show()

def compare_samples(populations,parametric=False):
    """
    check if the samples come from the same population or not
    """
    from scipy.stats import mannwhitneyu, ttest_ind, f_oneway, kruskal, ranksums
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    populations = [np.array(pop) for pop in populations] #obscure line to take out missing values
    populations = [pop[~np.isnan(pop)]  for pop in populations]

    if len(populations) == 2:
        if parametric:
            stat, p_value = ttest_ind(*populations)
            print("P-value t-test: {0:2.10f}".format(p_value))
        else:
            stat, p_value1 = mannwhitneyu(*populations)
            print("P-value MWW: {0:2.10f}".format(p_value))
            stat, p_value2 = ranksums(*populations)
            print("P-value Ranksum: {0:2.10f}".format(p_value))
    
    if len(populations) > 2:
        if parametric:
            stat, p_value = f_oneway(*populations)
            print("P-value anova: {0:2.10f}".format(p_value))
        else:
            stat, p_value = kruskal(*populations)  
            print("P-value kruskal: {0:2.10f}".format(p_value))
            
        if p_value < 0.05:
            flatten_pop = []
            label_pop = []
            for i,pop in enumerate(populations):
                flatten_pop += list(pop)
                label_pop += ["pop{0}".format(i)]*len(pop)
                    
            res2 = pairwise_tukeyhsd(np.asarray(flatten_pop),label_pop)
            print("Printing pair comparisons using Tukey HSD")
            print(res2)
            res2.plot_simultaneous(comparison_name=None,xlabel='diffs',ylabel='grups')
            
    print(("Means: " + ", {}"*len(populations)).format(*[np.mean(_) for _ in populations]))
    print(("STDs: " + ", {}"*len(populations)).format(*[np.std(_) for _ in populations]))
    
    
    return p_value

def linear_regression(vectors_X,vector_y,variables_names="ABCDEFG",formula="Y ~ A * B"):
    """
    linear_regression
    vectors_X: list of the vectors X
    vector_y: independent variable
    variables_names: whatever you want to name the variables in X 
    formula: formulat to use
    """
    #linear regression
    import pandas as pd
    import statsmodels.formula.api as sm
    d = dict(zip(variables_names,vectors_X))
    d["Y"] = vector_y
             
    df = pd.DataFrame(d)
    result = sm.ols(formula=formula, data=df).fit()
    print(result.summary())
    
    return result.params

In [23]:
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import pylab as plt
from nltk.stem.snowball import SnowballStemmer
from labMTsimple.storyLab import emotionFileReader, emotion, stopper, emotionV
import string
from dateutil.parser import parse
def get_happiness_raw(string_study,lang="english"):
    """
    Available languages: german      portuguese arabic   hindi   russian chinese  indonesian  spanish english  korean      urdu french   pashto
    """

    from labMTsimple.storyLab import emotionFileReader, emotion, stopper, emotionV
    labMT,labMTvector,labMTwordList = emotionFileReader(stopval=0.0,lang=lang,returnVector=True)

    #Then we can score the text and get the word vector at the same time:
    Valence,Fvec = emotion(string_study,labMT,shift=True,happsList=labMTvector)
   
    return Valence

def get_happiness_delete_neutral(string_study,lang="english"):
    """
    Available languages: german      portuguese arabic   hindi   russian chinese  indonesian  spanish english  korean      urdu french   pashto
    """

    from labMTsimple.storyLab import emotionFileReader, emotion, stopper, emotionV
    labMT,labMTvector,labMTwordList = emotionFileReader(stopval=0.0,lang=lang,returnVector=True)

    #Then we can score the text and get the word vector at the same time:
    Valence,Fvec = emotion(string_study,labMT,shift=True,happsList=labMTvector)
    StoppedVec = stopper(Fvec,labMTvector,labMTwordList,stopVal=1.0)
    Valence = emotionV(StoppedVec,labMTvector)
    
    return Valence
#Delete punctuation
def remove_punctuation(string_to_remove):
    transtable = {ord(c): None for c in string.punctuation}
    return string_to_remove.translate(transtable).lower()

def remove_stop_words(text):
    text = ' '.join([word for word in text.split() if word not in cached_stop])
    return text

def stem_string(string_to_stem,language="english"):
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer(language)
    return " ".join([stemmer.stem(word) for word in string_to_stem.split(" ")])


def dates_vs_yvalues(df,column_ys,column_dates,groupbyday = True,filter_zeros=True,color="",smoothing=2/3,print_points=True,label=""):
    from statsmodels.nonparametric.smoothers_lowess import lowess
    try: df[column_dates] = df[column_dates].apply(lambda x: parse(x,yearfirst=True,dayfirst=dayfirst))
    except: pass

    
    if filter_zeros:
        df = df.loc[df[column_ys]>0,:]
    
    if groupbyday:
        df = df.groupby(column_dates).mean().reset_index()
        
    y = lowess(df[column_ys],df[column_dates],frac=smoothing,return_sorted=False)
    
    if color:
        plt.plot(df[column_dates],y,color=color,label=label)
        if print_points:
            plt.plot(df[column_dates],df[column_ys],"o",color=color)
    else:
        plt.plot(df[column_dates],y,label=label)
        if print_points:
            plt.plot(df[column_dates],df[column_ys],"o")

In [25]:
## Sentiment analysis
"""
Available languages: 
german       portuguese 
arabic       hindi   
russian      chinese  
indonesian   spanish 
english      korean      
urdu         french   
pashto  """
    
language = "spanish"
filenames = ["./data/Pedro_Sanchez_PSOE.csv","./data/Pablo_Iglesias_Podemos.csv","./data/Albert_Rivera_Ciudadanos.csv","./data/Mariano_Rajoy_PP.csv"]

separator_csv = "\t" #tab
header = None
index_col = None 
columns_csv = ["Date","Newspaper","Article"]
column_text = "Article" #leave empty to not get the valence
column_date = "Date" #leave empty to not parse the date
dayfirst = False
extra_stop_words = [_.lower() for _ in ["Podemos","Pablo","Iglesias","Ciudadanos","Albert","Rivera","Pedro","Sánchez","PSOE","Mariano","Rajoy","PP"]]

cached_stop = set(stopwords.words(language) + extra_stop_words)
    
## Fill up
for filename in filenames:
    print(filename)
    df = pd.read_csv(filename,sep=separator_csv,header=header,index_col=index_col)
    df.columns = columns_csv
    df = df.drop_duplicates(subset=[column_text])
    
    if column_text:
        valences_articles = []
        for article in df[column_text]:
            try:
                t_article = remove_punctuation(article)
                t_article = remove_stop_words(t_article)
                #t_article = stem_string(t_article)
                valence = get_happiness_delete_neutral(t_article,language)
            except: 
                valence = np.NaN
            valences_articles.append(valence)
        df["Valence"] = valences_articles

    if column_date:
        dates = []
        for date in df[column_date]:
            try:
                date = parse(date,fuzzy=True,dayfirst=dayfirst)
            except: 
                date = pd.NaT
            dates.append(date)
        df["Date_parsed"] = dates

    df.to_csv(filename[:-4]+"_formatted.csv",sep=separator_csv,index=index_col)

In [29]:
## Plot happiness

print_points = False
smoothing = 0.2
colors = ["#912CEE","orange","#00BFFF","red"] #Watch out, have 
labels = ["Podemos","Ciudadanos","PP","PSOE"]
plt.figure(figsize=(15,5))
column_ys = "Valence"
column_date = "Date_parsed"

valences  = []
for i,filename in enumerate(filenames): #enumerate gives you the index adn the content of the elements in the list   
    df = pd.read_csv(filename[:-4]+"_formatted.csv",sep=separator_csv,index_col=index_col)
    valences.append(df[column_ys].values)
    if (len(colors) >= len(filenames)) and (len(labels) >= len(filenames) ):
        dates_vs_yvalues(df,column_ys,column_date,groupbyday = True,filter_zeros=True,color=colors[i],smoothing=smoothing,print_points=print_points,label=labels[i])
    else:
        dates_vs_yvalues(df,column_ys,column_date,groupbyday = True,filter_zeros=True,smoothing=smoothing,print_points=print_points)        

compare_samples(valences,parametric=True)
elections = parse("Dec 20, 2015")
plt.plot([elections,elections],[6.1,6.3],color="grey")
plt.legend()
plt.show()


Multiple Comparison of Means - Tukey HSD,FWER=0.05
=============================================
group1 group2 meandiff  lower   upper  reject
---------------------------------------------
 pop0   pop1   0.0253  -0.0038  0.0544 False 
 pop0   pop2  -0.0362  -0.0742  0.0019 False 
 pop0   pop3   -0.052  -0.0817 -0.0222  True 
 pop1   pop2  -0.0615  -0.1022 -0.0207  True 
 pop1   pop3  -0.0773  -0.1104 -0.0442  True 
 pop2   pop3  -0.0158   -0.057  0.0254 False 
---------------------------------------------
Means: , 6.207445859019601, 6.2327378678673435, 6.171276248561799, 6.155461246531734
STDs: , 0.2096490063447481, 0.23645909867206838, 0.2809897568605098, 0.24835029650668283
P-value: 0.0000000043
/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/matplotlib/axes/_axes.py:519: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

Classifier

Partially coded Check: classifier.ipynb

Latent Dirichlet Allocation (Semantic analysis)


In [2]:
from matplotlib import cm
import lda
import pylab as plt
from collections import Counter
import bisect
import re
import pickle
import string

#Delete punctuation
def remove_punctuation(string_to_remove):
    transtable = {ord(c): None for c in string.punctuation}
    return string_to_remove.translate(transtable).lower()

def remove_stop_words(text,cached_stop):
    text = ' '.join([word for word in text.lower().split() if word not in cached_stop])
    return text

#Remove endings
def stem_string(string_to_stem,language="english"):
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer(language)
    return " ".join([stemmer.stem(word) for word in string_to_stem.split(" ")])

def bi_contains(lst, item):
    """ efficient `item in lst` for sorted lists """
    pos = bisect.bisect_left(lst, item)
    return [((item <= lst[-1]) and (lst[pos] == item)),pos]

def create_dictionary(lang,cached_stop):
    """
    top 5000 words, using labMTsimple
    """
    from labMTsimple.storyLab import emotionFileReader, emotion, stopper, emotionV
    labMT,labMTvector,labMTwordList = emotionFileReader(stopval=0.0,lang=lang,returnVector=True)
    vocab = sorted(list(set(labMTwordList) - set(cached_stop)))
    pickle.dump(vocab,open('./data/vocab.txt',"wb+"))
    
def create_corpus(all_articles,language="english"):
    """
    create the corpus (a numpy array (kind of a list of lists) with the number of times each word appear among all articles)
    """
    vocab = pickle.load(open('./data/vocab.txt',"rb+"))
    len_vocab = len(vocab)
    print("Number of articles: {0}".format(len(all_articles)))
    allMessages = np.zeros((len(all_articles),len_vocab))
    j = -1
    for mess in all_articles:
        mess = remove_stop_words(remove_punctuation(mess),cached_stop)
        j += 1
        # Kind-of efficient way to count words (better to use a dictionary)
        file = re.split(' |\n',mess)
        c = Counter(file)
        del c['']
        values = np.zeros(len_vocab)
        for word in c:
            pos = bi_contains(vocab, word)
            if pos[0]:
                values[pos[1]] = c[word]
        allMessages[j,:] += values
        
    allMessages = np.array(allMessages,dtype=int)
    pickle.dump(allMessages, open('./data/corpus.dat',"wb+"))

def lda_run(allMessages,vocab,topicNum=10,n_iter=1000):
    """
    run lda with allMessages and print topics
    """
    print(topicNum)
    model = lda.LDA(n_topics=topicNum, n_iter=n_iter, random_state=1)
    
    model.fit(allMessages)
    topic_word = model.topic_word_
    print(model.loglikelihood())
    np.savetxt("./data/ldaTopics"+str(topicNum)+".dat",np.asarray(topic_word))
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:]#-[:n_top_words:-1]
        freq_words = np.array(topic_dist)[np.argsort(topic_dist)][:]#-n_top_words:-1]

        print('Topic {}: {}'.format(i, ' '.join(topic_words[::-1][:5])))
        print('Topic {}: {}'.format(i, ' '.join([str(_) for _ in freq_words[::-1][:5]])))


    doc_topic = model.doc_topic_

    allDisSorted = []
    for i in range(allMessages.shape[0]):
        allDisSorted.append(doc_topic[i])

    allDisSorted = np.asarray(allDisSorted)
    np.savetxt("./data/allDistComb1Day"+str(topicNum)+".dat",allDisSorted)
    return model.loglikelihood()
    

def plotNumTopics(x,y,ax2,loglikelihood=False):
    """
    plot points and the smooth line. if KL divergence then divides the results by comb(x,2) to get the mean 
    """
    from statsmodels.nonparametric.smoothers_lowess import lowess
    import pylab as plt
    from scipy.misc import comb
    x = np.asarray(x)
    if loglikelihood:
        y = np.asarray(y)
    else:
        y = np.asarray(y)/comb(x,2)
            
    a = lowess(y,x,frac = 0.3)
    ax2.plot(x,y,'.',linewidth=2,markersize=7,color='orange')

    ax2.plot(x,a[:,1],color='orange',linewidth=2,label='Distance')
    ax2.set_xlabel('Number of Topics',fontsize=12)
    ax2.set_ylabel('Average distance between topic',fontsize=12)



def find_distances(tit="",retTopics = False):
    """
    Finds the distance between topics using:
    (1) the frequency at which each topic appears every day (topics that always appear in the same dates are probably related) 
    (2) the frequency of words appearing in the topics (topics with the same words are probably related)
    """
    from scipy.stats import pearsonr,spearmanr,kendalltau,entropy
    import pylab as plt
    from scipy.spatial.distance import pdist, squareform
    from scipy.cluster.hierarchy import linkage, dendrogram,leaders,fcluster


    def KL_H(u,v):
        return entropy(u,v)*entropy(v,u)/entropy(u)/entropy(v)


    varValues = np.transpose(np.loadtxt("./data/allDistComb1Day"+tit+".dat"))
    data_dist_a = pdist(varValues,lambda u,v: KL_H(u,v))

    varValues2 = (np.loadtxt("./data/ldaTopics"+tit+".dat"))
    data_dist_b = pdist(varValues2,lambda u,v: KL_H(u,v))

    return np.mean(data_dist_a),np.mean(data_dist_b)

def find_number_topics(range_lookup,n_iter=1000):
    """
    iterates through range_lookup and plots the distance between topics, to choose the best number
    
    """
    allMessages =  pickle.load(open('./data/corpus.dat',"rb+"))
    vocab = pickle.load(open('./data/vocab.txt',"rb+"))
        
    distances = []
    for i in range_lookup:
        loglikelihood = lda_run(allMessages,vocab,topicNum = i,n_iter=n_iter)  
        freq_distance, topic_distance = find_distances(tit=str(i),retTopics=True)
        distances.append([loglikelihood,freq_distance,topic_distance])
    
    fig = plt.figure(figsize = (15,5))
    loglikelihood,freq_distance,topic_distance = zip(*distances)
    ax = fig.add_subplot(1,3,1)
    plotNumTopics(range_lookup, loglikelihood,ax,loglikelihood=True)
    ax = fig.add_subplot(1,3,2)
    plotNumTopics(range_lookup, freq_distance,ax,loglikelihood=True)
    ax =fig.add_subplot(1,3,3)
    plotNumTopics(range_lookup, topic_distance,ax,loglikelihood=True)
    plt.show()
    


def correlationMerge(tit="",threshold=0.7):
    """
    Cluster topics that are close (in time and in frequency of words)
    """
    from scipy.stats import pearsonr,spearmanr,kendalltau,entropy
    import pylab as plt
    from scipy.spatial.distance import pdist, squareform
    from scipy.cluster.hierarchy import linkage, dendrogram,leaders,fcluster

    vocab = pickle.load(open('./data/vocab.txt',"rb+"))

    def KL_E(u,v):
        return entropy(u,v)*entropy(v,u)/entropy(u)/entropy(v)

    varValues = np.transpose(np.loadtxt("./data/allDistComb1Day"+str(tit)+".dat"))
    data_dist_a = pdist(varValues,lambda u,v: KL_E(u,v))

    varValues2 = (np.loadtxt("./data/ldaTopics"+str(tit)+".dat"))
    data_dist_b = pdist(varValues2,lambda u,v: KL_E(u,v))
    data_dist = data_dist_a*1 * data_dist_b*1

    #print(data_dist)
    fig = plt.figure(figsize=(8,8))

    # plot first dendrogram:
    ax1 = fig.add_axes([0.05,0.1,0.2,0.6])
    Y = linkage(data_dist, method='weighted') # method?

    t = max(Y[:, 2]) * threshold
    Z1 = dendrogram(Y, orientation='right',
                leaf_font_size=18,color_threshold=t) # adding/removing the axes
                #labels=["Profit","Decay","Profit \n(Ratio)","IMDB rating","Metascore","RottenT \nAudience","RottenT \nCritics","Profit \n(Deviation)"],
    ax1.set_xticks([]) # turn off xticks

    # plot second dendrogram:
    ax2 = fig.add_axes([0.32,0.71,0.58,0.2])
    Z2 = dendrogram(Y,color_threshold=t)
    ax2.set_xticks([])
    ax2.set_yticks([])

    clusters = fcluster(Y,t=t,criterion='distance')
    #print( clusters)
    lisTop = np.asarray(range(len(clusters)))

    #print(np.unique(clusters))
    prov = 0
    for g in np.unique(clusters):
        group = lisTop[clusters==g]
        for i2 in group:
            topic_words = np.array(vocab)[np.argsort(varValues2[i2,:])][:]#-[:n_top_words:-1]
            freq_words = np.array(varValues2[i2,:])[np.argsort(varValues2[i2,:])][:]#-n_top_words:-1]
            prov += np.sum(freq_words[::-1][:10])


    # get the distance matrix:
    D = squareform(data_dist)

    # reorder rows/cols of D to match dendrograms
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = D[idx1,:]
    D = D[:,idx2]
    # add matrix
    axmatrix = fig.add_axes([0.32,0.1,0.58,0.6])
    im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    # Plot colorbar.
    axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
    plt.colorbar(im, cax=axcolor)
    #plt.savefig("./data/"+'MatrixPrediction'+tit+'.pdf', bbox_inches='tight' ,dpi=100)
    #plt.savefig("./data/"+'MatrixPrediction'+tit+'.png', bbox_inches='tight' ,dpi=100)
    plt.show()
    return clusters,vocab,varValues2

def automatic_annotate(clusters,vocab,varValues2):
    """
    Finds the wods that characterize each cluster (prints them) and by topic (saves them to ./data/topics.csv)
    """
    sumValues = np.sum(varValues2,0)
    lisTop = np.asarray(range(len(clusters)))
    cols = ['g','r','c','b','m','y','k','g','b']
    print(clusters)
    
    for g in np.unique(clusters):
        group = lisTop[clusters==g]
        #print(group)
        prov = np.zeros(len(sumValues))
        for i2 in group:
            topic_words = np.array(vocab)[np.argsort(varValues2[i2,:])][:]#-[:n_top_words:-1]
            freq_words = np.array(varValues2[i2,:])[np.argsort(varValues2[i2,:])][:]#-n_top_words:-1]
            prov += varValues2[i2,:]

            sumValues1 = np.sum(varValues2,0)
  
            topic_words1 = np.array(vocab)[np.argsort(varValues2[i2,:]/sumValues1)][:]#-[:n_top_words:-1]
            sumValues2 = np.ones(len(np.sum(varValues2,0)))
            topic_words2 = np.array(vocab)[np.argsort(varValues2[i2,:])][:]#-[:n_top_words:-1]           
            x = []
            indX = 0
            while len(x) < 20:
                x += list(topic_words1[::-1][indX:indX+1])
                x += list(topic_words2[::-1][indX:indX+1])
                indX += 1
            

            print(i2,' '.join(x))
            #print('Topic {}: {}'.format(i2, ' '.join(topic_words[::-1][:50])))
            #print('Topic {}: {}'.format(i2, ' '.join([str(_) for _ in freq_words[::-1][:50]])))
        sumValues1 = np.sum(varValues2,0)
        topic_words1 = np.array(vocab)[np.argsort(prov/sumValues)][:]#-[:n_top_words:-1]
        sumValues2 = np.ones(len(np.sum(varValues2,0)))
        topic_words2 = np.array(vocab)[np.argsort(prov/sumValues)][:]#-[:n_top_words:-1]
        x = []
        indX = 0
        while len(x) < 20:
            x += list(topic_words1[::-1][indX:indX+1])
            if topic_words2[::-1][indX:indX+1] not in x:
                x += list(topic_words2[::-1][indX:indX+1])
            indX += 1
        
        print("All cluster: ",' '.join(x))
        print("-"*30)
   

    with open("./data/topics.csv","w+") as f:
        for i2 in range(len(clusters)):
        
            topic_words = np.array(vocab)[np.argsort(varValues2[i2,:])][:]#-[:n_top_words:-1]
            freq_words = np.array(varValues2[i2,:])[np.argsort(varValues2[i2,:])][:]#-n_top_words:-1]
         
            sumValues1 = np.sum(varValues2,0)
            topic_words1 = np.array(vocab)[np.argsort(varValues2[i2,:]/sumValues1)][:]#-[:n_top_words:-1]
            sumValues2 = np.ones(len(np.sum(varValues2,0)))
            topic_words2 = np.array(vocab)[np.argsort(varValues2[i2,:]/sumValues2)][:]#-[:n_top_words:-1]
            x = []
            indX = 0
            while len(x) < 30:
                x += list(topic_words1[::-1][indX:indX+1])
                x += list(topic_words2[::-1][indX:indX+1])
                
                indX += 1


            f.write("- {0}\t{1}\n".format(i2,' '.join(x)))

def weightWord(tit="",word='hoi'):
    """
    returns how much weight a word has in every article (based on its topic)
    """
    vocab = pickle.load(open('./data/vocab.txt',"rb"))

    #By day
    varValues = np.transpose(np.loadtxt("./data/allDistComb1Day"+str(tit)+".dat"))
    #By freq
    varValues2 = (np.loadtxt("./data/ldaTopics"+str(tit)+".dat"))

    weigthTopics = np.ones(int(tit))
    
    len_vocab = len(vocab)
    # Kind-of efficient way to count words (better to use a dictionary)
    file = re.split(' |\n',word)
    c = Counter(file)
    del c['']
    values = np.zeros(len_vocab)
    count = 0 
    for word in c:
        pos = bi_contains(vocab, word)
        if pos[0]:
            weigthTopics += np.log10(varValues2[:,pos[1]]/np.sum(varValues2[:,pos[1]])*c[word])
            

    weigthTopics = (weigthTopics - np.min(weigthTopics))
    weigthTopics /= np.sum(weigthTopics)
    
    return weigthTopics

    
def plot_imshows(all_topics,filenames,num_topics):
    """
    given all_topics = 2D array, one dimension = topics, the other = different queries we are comparing
    """
    all_topics[all_topics==0] = np.NaN

    plt.figure(figsize = (15,10))
    plt.subplot(2,1,1)
    plt.imshow(all_topics*10,interpolation="none",aspect="auto",cmap=plt.cm.YlGnBu)
    plt.yticks(range(len(filenames)),[_[_.find("a/")+2:_.find(".csv")] for _ in filenames])
    plt.xticks(range(num_topics),[str(_) for _ in range(num_topics)])
    plt.grid(False)
    plt.colorbar()
    plt.title("Not normalized")
    for topic_number in range(num_topics):
        all_topics[:,topic_number] = (all_topics[:,topic_number] - np.mean(all_topics[:,topic_number]))/np.std(all_topics[:,topic_number])

    plt.subplot(2,1,2)
    plt.title("Normalized")
    plt.imshow(all_topics,interpolation="none",aspect="auto", cmap=plt.cm.YlGnBu)
    plt.yticks(range(len(filenames)),[_[_.find("a/")+2:_.find(".csv")] for _ in filenames])
    plt.xticks(range(num_topics),[str(_) for _ in range(num_topics)])
    plt.grid(False)
    plt.colorbar()
    plt.show()

In [47]:
import pandas as pd
import re
from nltk.corpus import stopwords
filenames = ["./data/Pablo_Iglesias_Podemos.csv","./data/Albert_Rivera_Ciudadanos.csv","./data/Mariano_Rajoy_PP.csv","./data/Pedro_Sanchez_PSOE.csv"]
separator_csv = "\t" #tab
header = None
index_col = None 
columns_csv = ["Date","Newspaper","Article"]
column_text = "Article" #leave empty to not get the valence
column_date = "Date" #leave empty to not parse the date
language = "spanish"

## Put all the articles together
all_messages = []
## Fill up
for filename in filenames:
    print(filename)
    df = pd.read_csv(filename,sep=separator_csv,header=header,index_col=index_col)
    df.columns = columns_csv
    all_messages = all_messages + [article for article in df[column_text].values]
    
extra_stop_words = [_.lower() for _ in ["elpais.com","elpais","Podemos","Pablo","Iglesias","Ciudadanos","Albert","Rivera","Pedro","Sanchez","Sánchez","PSOE","Mariano","Rajoy","PP"]]
cached_stop = set(stopwords.words(language) + extra_stop_words)

## Create dictionary with words 
create_dictionary(lang=language, cached_stop=cached_stop)

## Create corpus (counts words for every article)
create_corpus(all_messages,language=language)


./data/Pablo_Iglesias_Podemos.csv
./data/Albert_Rivera_Ciudadanos.csv
./data/Mariano_Rajoy_PP.csv
./data/Pedro_Sanchez_PSOE.csv
Number of articles: 2803

In [ ]:
## Tof ind the number of topics. Start with a large range and low number of iterations. SLOW
find_number_topics(range(2,100,10),n_iter=100)

In [ ]:
## Tof ind the number of topics. Narrow the range and middle number of iterations. SLOW
find_number_topics(range(5,50,2),n_iter=1000)

In [48]:
## Settle to some number of topics. SLOW
find_number_topics([20],n_iter=10000)


WARNING:lda:all zero column in document-term matrix found
20
-3465191.4084653277
Topic 0: campaña si mejor hoy sido
Topic 0: 0.0286869722833 0.0125920886285 0.00934264839534 0.00878415085526 0.00853028833705
Topic 1: españa años cambio país electoral
Topic 1: 0.0219070064422 0.0214398064754 0.0137050514699 0.0102789183802 0.00944834066145
Topic 2: gobierno presidente ayer españa pacto
Topic 2: 0.0309569097496 0.0273298552974 0.019696170927 0.0193587705129 0.0192744204094
Topic 3: años dos mas cataluña elecciones
Topic 3: 0.0184205482078 0.0144934244052 0.0131843831377 0.0117818389225 0.0114078271318
Topic 4: si partido puede r gobierno
Topic 4: 0.0206547237813 0.0142878798789 0.0137194116733 0.0122792922192 0.0115592324921
Topic 5: política aunque cada años maría
Topic 5: 0.0109647887746 0.0100660847059 0.00916738063712 0.00766954052257 0.00760962691798
Topic 6: debate país candidatos cara televisión
Topic 6: 0.0407272923546 0.0229154501784 0.0172725800674 0.0164384166597 0.0141812686153
Topic 7: francia ataque país seguridad españa
Topic 7: 0.0136860553804 0.00962908469332 0.00941271292334 0.00854722584344 0.00778992464853
Topic 8: candidato madrid gobierno líder socialista
Topic 8: 0.0188069727451 0.0177142462775 0.0131708046493 0.0119055424237 0.0113879351496
Topic 9: programa años reforma partido laboral
Topic 9: 0.017577103152 0.0137739815086 0.0137225879729 0.0129002914013 0.012232175437
Topic 10: partido si general federal secretario
Topic 10: 0.0212821325549 0.0179793147719 0.0156620151983 0.0151825739072 0.0149694888889
Topic 11: españa política cataluña constitución reforma
Topic 11: 0.0151562504903 0.0146228117244 0.0145600542225 0.0106063316044 0.00985324158187
Topic 12: gobierno españa elecciones política ser
Topic 12: 0.0267931527861 0.0226711902563 0.0164089779515 0.0127626264828 0.0105034739425
Topic 13: congreso elecciones presidente diputados rey
Topic 13: 0.0240264677386 0.0159453787902 0.0128172153263 0.0104710927283 0.0102538591544
Topic 14: debate presidente líder cara debates
Topic 14: 0.0458033025315 0.0258525701797 0.0255479788461 0.0215502175924 0.0211694784254
Topic 15: campaña programas televisión cara partidos
Topic 15: 0.0256183755777 0.0192139259276 0.0179445755465 0.0160982477195 0.0157520612519
Topic 16: congreso acuerdo partido mesa dos
Topic 16: 0.0333892753511 0.0208728365071 0.020343972894 0.0187221244804 0.0173470790863
Topic 17: hoy sino grupo socialista candidato
Topic 17: 0.0102507587076 0.00998278253063 0.00937983613257 0.00937983613257 0.00937983613257
Topic 18: partido votos elecciones resultado dos
Topic 18: 0.0270465997852 0.021814794078 0.0215394358828 0.0158792952054 0.0142883367447
Topic 19: presidente líder violencia candidato ayer
Topic 19: 0.0355392017656 0.0219080383295 0.0219080383295 0.0215834868192 0.0178511444498

In [49]:
## Cluster topics
clusters,vocab,varValues2 = correlationMerge(20,threshold=0.6)



In [50]:
## Tells you what every cluster is about, and save the words in every topic to a file ./data/topics.csv
automatic_annotate(clusters,vocab,varValues2)


[2 2 2 2 2 2 1 3 2 2 2 2 2 2 1 1 2 1 2 4]
6 formato debate arroyo país pantalla candidatos usuarios cara vega televisión vieron redes lectores sociales realización electoral digitales diario horario cada
14 enfrentar debate norteamericano presidente luna líder cancelar cara relativamente debates chiste candidato molestar país universidades ayer piedra solo cohetes gobierno
15 pastor campaña factores programas mil televisión tradiciones cara garaje partidos barro política renuncias socialista rápida según agrega electoral conversión políticos
17 cuales hoy entidades sino estadounidense grupo colegios socialista delegado candidato financieros líder llame millones afán parte acerca político sencillo nuevo
All cluster:  digital emisión formato arroyo página cuales pantalla diseñado tv entidades cadenas tuits usuarios estadounidense mundial vega canales colegios andrea pastor
------------------------------
0 hijo campaña esposa si ss mejor entrega hoy guste sido eficaz política mirando centro club partido dejando día wars electoral
1 j españa residencia años coyuntura cambio venía país kilómetros electoral frío elecciones pila cuatro hacían momento semejante tres viva mayoría
2 comunicado gobierno tropas presidente prestar ayer castro españa directiva pacto continuación partidos posiblemente líder horizonte tras específica acuerdo visitar quiere
3 alcaldía años acabaron dos abandonar mas desarrollar cataluña sagrada elecciones contento gobierno pinta barcelona f partido delito frente fotografías sido
4 r si p partido puedes puede urgente r dimensión gobierno proposición elecciones retos votos caer izquierda humilde pacto simón partidos
5 entretenimiento política barrio aunque formatos cada profesores años argentina maría docentes ser café año limitada tan aires entretenimiento ¡qué hoy
8 hija candidato prestación madrid encontró gobierno orgánica líder hijas socialista recepción dijo villanueva presidente libros general cumplen secretario cena corrupción
9 ingresos programa pobreza años salario reforma reducción partido empleos laboral saldo voto embarazo según femenino ley indemnización social enseñanza ayer
10 intentarlo partido ordinario si empiece general prohibición federal post secretario pongan gobierno mecanismo socialistas inconveniente cataluña imposición fuerzas estatutos españa
11 corona españa convivencia política bandera cataluña causa constitución instrumentos reforma sucesión político exigir electoral producto nuevo procedimientos ley rigor sistema
12 mercados gobierno ajuste españa perdida elecciones optimismo política crece ser adelantado si usan legislatura griego comisión daría país relacionados cuenta
13 teoría congreso firmas elecciones rebelión presidente desastre diputados cuentan rey requisito propio abril ahora blog situación juntas semana llegaba gobierno
16 reglamento congreso senadores acuerdo obstáculos partido aceptación mesa separado dos pretensiones cámara independientes partidos ampliamente presidente proposiciones grupos marcelo diputados
18 valorado partido regional votos pierde elecciones región resultado notable dos zaragoza voto sant solo municipio campaña distritos fuerza recibieron menos
All cluster:  congreso diputados ley formar fuerza poder parlamento constitucional entonces claro dirección medidas pueden discurso euros independencia quieren r podría realidad
------------------------------
7 embajada francia militar ataque militares país silencio seguridad soldados españa misiones español gol según agente policía herido embajada presos exteriores
All cluster:  embajada militar militares silencio soldados misiones gol agente herido presos adicional inteligencia heridos revisar ocupan fallecido occidental pensamiento asesinato grados
------------------------------
19 escribió presidente condena líder detenido violencia tuit candidato parejas ayer vengo si vigencia gobierno fomento mas sales mensaje dependencias ningún
All cluster:  escribió condena detenido tuit parejas vengo vigencia fomento sales dependencias consentimiento etiquetas agradezco armada encontrarse mentir útiles decretos tela hierro
------------------------------

In [51]:
## Returns of the word weight by day
weightWord(20,word="prisión")


Out[51]:
array([ 0.03573729,  0.03696589,  0.02545959,  0.42841658,  0.01953572,
        0.04490829,  0.03384607,  0.03924666,  0.0426419 ,  0.03641078,
        0.        ,  0.00907863,  0.02201745,  0.02710525,  0.01979245,
        0.0428206 ,  0.01553519,  0.05109621,  0.00767803,  0.06170741])

In [52]:
## Plots what topics are important for the different data sets

import pandas as pd
import re
from nltk.corpus import stopwords
filenames = ["./data/Pablo_Iglesias_Podemos.csv","./data/Albert_Rivera_Ciudadanos.csv","./data/Mariano_Rajoy_PP.csv","./data/Pedro_Sanchez_PSOE.csv"]

separator_csv = "\t" #tab
header = None
index_col = None 
columns_csv = ["Date","Newspaper","Article"]
column_text = "Article" #leave empty to not get the valence
column_date = "Date" #leave empty to not parse the date
language = "spanish"
extra_stop_words = [_.lower() for _ in ["elpais.com","elpais","Podemos","Pablo","Iglesias","Ciudadanos","Albert","Rivera","Pedro","Sanchez","Sánchez","PSOE","Mariano","Rajoy","PP"]]

cached_stop = set(stopwords.words(language) + extra_stop_words)
num_topics = 20

all_topics = np.zeros((len(filenames),num_topics))
## Fill up
for i,filename in enumerate(filenames):
    print(filename)
    df = pd.read_csv(filename,sep=separator_csv,header=header,index_col=index_col)
    df.columns = columns_csv
    all_messages =  " ".join(df["Article"].values)
    mess = remove_stop_words(remove_punctuation(all_messages),cached_stop)
    all_topics[i,:] = weightWord(num_topics,word=mess)
    print(all_topics[i,:])

plot_imshows(all_topics,filenames,num_topics)


./data/Pablo_Iglesias_Podemos.csv
[ 0.0871278   0.06126705  0.02846147  0.06108005  0.04953509  0.10587406
  0.06209662  0.06511508  0.07018493  0.05241638  0.0158719   0.05085792
  0.04888463  0.04977496  0.00317499  0.04026726  0.01261875  0.08438401
  0.          0.05100704]
./data/Albert_Rivera_Ciudadanos.csv
[ 0.08621772  0.05587206  0.02658259  0.07426361  0.04194938  0.09106686
  0.07213482  0.07564686  0.06185776  0.04615836  0.01674264  0.06487386
  0.05611718  0.05218169  0.          0.0366947   0.0100268   0.08837279
  0.00261742  0.0406229 ]
./data/Mariano_Rajoy_PP.csv
[ 0.08515701  0.06520322  0.02908716  0.07225688  0.04729776  0.09252304
  0.06134866  0.06160554  0.07017827  0.05675866  0.01622324  0.05049147
  0.05476302  0.04818749  0.00269903  0.04408163  0.0102077   0.07880732
  0.          0.05312289]
./data/Pedro_Sanchez_PSOE.csv
[ 0.08157824  0.06669365  0.02844501  0.06734884  0.04612649  0.10225567
  0.05907193  0.06454068  0.0738567   0.05982238  0.01398857  0.04570904
  0.05062676  0.0509661   0.          0.04265173  0.00972529  0.08039625
  0.0007525   0.05544417]

In [4]:
make_line_hist?