In [67]:
import numpy as np
import bs4
from bs4 import BeautifulSoup
from mechanize import Browser
from urllib2 import urlopen
import re
from datetime import datetime
import time
import sqlite3
import json
import time
import requests
import matplotlib.pyplot as plt
%matplotlib inline
import sys

In [303]:
date = datetime(year=2016, month=3, day=19)
url = "https://en.wikipedia.org/wiki/{:s}_{:d}".format(date.strftime("%B"), 19) 
page = urlopen(url)
contents = page.read()
soup = BeautifulSoup(contents, 'html.parser')

In [304]:
soup.find('span', {'id': 'Holidays_and_observances'}).find_next('ul')


Out[304]:
<ul>\n<li>Christian <a class="mw-redirect" href="/wiki/Feast_day" title="Feast day">feast day</a>:\n<ul>\n<li><a href="/wiki/Saint_Joseph" title="Saint Joseph">Joseph of Nazareth</a> (<a href="/wiki/Western_Christianity" title="Western Christianity">Western Christianity</a>)</li>\n<li><a href="/wiki/March_19_(Eastern_Orthodox_liturgics)" title="March 19 (Eastern Orthodox liturgics)">March 19 (Eastern Orthodox liturgics)</a></li>\n</ul>\n</li>\n<li>Earliest day on which <a href="/wiki/Maundy_Thursday" title="Maundy Thursday">Maundy Thursday</a> can fall, while April 22 is the latest; celebrated on Thursday before <a href="/wiki/Easter" title="Easter">Easter</a>. (<a href="/wiki/Christianity" title="Christianity">Christianity</a>)</li>\n<li><a href="/wiki/Flag_days_in_Finland#Days_on_which_flying_the_Finnish_flag_is_an_established_custom" title="Flag days in Finland">Minna Canth's Birthday</a> (<a href="/wiki/Finland" title="Finland">Finland</a>)</li>\n<li><a href="/wiki/St_Joseph%27s_Day" title="St Joseph's Day">St Joseph's Day</a> (<a class="mw-redirect" href="/wiki/Roman_Catholicism" title="Roman Catholicism">Roman Catholicism</a> and <a href="/wiki/Anglican_Communion" title="Anglican Communion">Anglican Communion</a>) related observances:\n<ul>\n<li><a href="/wiki/Father%27s_Day" title="Father's Day">Father's Day</a> (Spain, Portugal, Belgium, Italy, <a href="/wiki/Honduras" title="Honduras">Honduras</a>, and <a href="/wiki/Bolivia" title="Bolivia">Bolivia</a>)</li>\n<li><a href="/wiki/Falles" title="Falles">Las Fallas</a>, celebrated on the week leading to March 19. (<a href="/wiki/Valencia" title="Valencia">Valencia</a>)</li>\n<li><a href="/wiki/Mission_San_Juan_Capistrano" title="Mission San Juan Capistrano">"Return of the Swallow"</a>, annual observance of the <a href="/wiki/American_cliff_swallow" title="American cliff swallow">swallows</a>' return to <a href="/wiki/Mission_San_Juan_Capistrano" title="Mission San Juan Capistrano">Mission San Juan Capistrano</a> in <a href="/wiki/California" title="California">California</a>.</li>\n</ul>\n</li>\n<li><a href="/wiki/Kashubian_Unity_Day" title="Kashubian Unity Day">Kashubian Unity Day</a> (<a href="/wiki/Poland" title="Poland">Poland</a>)</li>\n<li>The first day of <a href="/wiki/Quinquatria" title="Quinquatria">Quinquatria</a>, held in honor of <a href="/wiki/Minerva" title="Minerva">Minerva</a>. (<a href="/wiki/Roman_Empire" title="Roman Empire">Roman Empire</a>)</li>\n</ul>

In [310]:
conn = sqlite3.connect('/Users/spardy/WikipediaDates.db')
c = conn.cursor()

for month in [3]:#xrange(2, 13):
    for day in [19]:#xrange(1, 32):
        try:
            date = datetime(year=2016, month=month, day=day)
        except ValueError:
            continue

        url = "https://en.wikipedia.org/wiki/{:s}_{:d}".format(date.strftime("%B"), day) 
        page = urlopen(url)
        contents = page.read()
        soup = BeautifulSoup(contents, 'html.parser')

        #loop over non-holidays
        for _id in ['Events', 'Births', 'Deaths']:
            #Create a table, dropping the old one if it exists
            table_name = "{:s}{:d}{:d}".format(_id, month, day)
            c.execute("DROP TABLE if exists %s" % table_name)
            c.execute("CREATE TABLE %s (year, name)" % table_name)

            entries = soup.find('span', {'id': _id}).find_next('ul')
            for entry in entries:
                if (entry is not None) and (entry.name is not None):
                    m = re.match(r".*?([0-9]+) (.*)", entry.text)
                    if m is not None:
                        yr = m.groups()[0]
                        name = m.groups()[1].replace('</li>', '')
                        c.execute("INSERT INTO %s VALUES (?, ?)" % table_name, (yr, name))


        holidays = soup.find('span', {'id': 'Holidays_and_observances'}).find_next('ul')
        table_name = "{:s}{:d}{:d}".format('Holidays', month, day)
        c.execute("DROP TABLE if exists %s" % table_name)
        c.execute("CREATE TABLE %s (name)" % table_name)
        for holiday in holidays.text.split('\n'):
            if holiday != '':
                c.execute("INSERT INTO %s VALUES (?)" % table_name, (holiday,)) 

        conn.commit()

conn.close()

In [324]:
dataset = np.zeros((12, 31, 4))+np.nan

with open('/Users/spardy/Code/Web/Blog/resources/wikipedia_calendar.csv', 'w') as f:
    f.write('Date,Events,Births,Deaths,Holidays\n')
    with sqlite3.connect('/Users/spardy/WikipediaDates.db') as conn:
        c = conn.cursor()
        for month in xrange(1, 13):
            for day in xrange(1, 32):
                try:
                    date = datetime(year=2016, month=month, day=day)
                except ValueError:
                    continue
                f.write('2016-{:02d}-{:02d},'.format(month, day))
                for i, _id in enumerate(['Events', 'Births', 'Deaths']):
                    table_name = "{:s}{:d}{:d}".format(_id, month, day)
                    data = c.execute("SELECT count(*) FROM sqlite_master WHERE name ='%s' and type='table';" % table_name).fetchall()
                    if data[0][0] > 0:
                        data = c.execute("SELECT * FROM %s" % table_name).fetchall()
                        dataset[month-1, day-1, i] = len(data)
                        f.write("{:d},".format(len(data)))
                table_name = "{:s}{:d}{:d}".format('Holidays', month, day)
                data = c.execute("SELECT count(*) FROM sqlite_master WHERE name ='%s' and type='table';" % table_name).fetchall()
                if data[0][0] > 0:
                    data = c.execute("SELECT * FROM %s" % table_name).fetchall()

                    if any(['Christian feast day:' in d for d in data]):
                        dataset[month-1, day-1, 3] = len(data) - 1
                        f.write("{:d}".format(len(data) - 1))
                    else:
                        dataset[month-1, day-1, 3] = len(data)
                        f.write("{:d}".format(len(data)))
                
                f.write('\n')
        
        #print "On January {:d} there are {:d} Events, {:d} Births, {:d} Deaths, and {:d} Holidays".format(day, *nums)

In [319]:
with sqlite3.connect('/Users/spardy/WikipediaDates.db') as conn:
        c = conn.cursor()
        table_name = "{:s}{:d}{:d}".format('Holidays', 3, 16)
        data = c.execute("SELECT count(*) FROM sqlite_master WHERE name ='%s' and type='table';" % table_name).fetchall()
        if data[0][0] > 0:
            data = c.execute("SELECT * FROM %s" % table_name).fetchall()
            print data
            print any(['Christian feast day:' in d for d in data])


[(u'Christian feast day:',), (u'Abb\xe1n',), (u'Heribert of Cologne',), (u'March 16 (Eastern Orthodox liturgics)',), (u'Day of the Book Smugglers (Lithuania)',), (u'Latvian Legion Day (Latvia)',), (u"Saint Urho's Day (Finnish Americans and Finnish Canadians)",)]
True

In [325]:
#plt.plot(dataset[:, :, 0].ravel())
#plt.plot(dataset[:, :, 1].ravel())
#plt.plot(dataset[:, :, 2].ravel())
plt.plot(dataset[:, :, 3].ravel())


Out[325]:
[<matplotlib.lines.Line2D at 0x11a5b43d0>]

In [326]:
for i in xrange(4):
    print np.nanmin(dataset[:, :, i]), np.nanmax(dataset[:, :, i])
    print np.nanargmin(dataset[:, :, i].ravel()), np.nanargmax(dataset[:, :, i].ravel())


30.0 156.0
198 0
109.0 735.0
59 88
56.0 418.0
59 88
5.0 42.0
95 217

Old using EarthCalendar.net they do not allow this type of use


In [88]:
year = 2016
end_pattern = 'Copyright'
holidays = {}

for month in xrange(1, 13):
    for day in xrange(1, 31):
        try:
            date = datetime(year=year, month=month, day=day)
        except ValueError:
            continue
        
        url = "http://www.earthcalendar.net/_php/lookup.php?mode=date&m={:d}&d={:d}&y={:d}".format(month,
                                                                                           day,
                                                                                           year) 
        page = urlopen(url)
        contents = page.read()
        soup = BeautifulSoup(contents, 'html.parser')

        holidays[date] = {}

        for holiday, place in zip(soup.find_all('font', {"face": "Tahoma",
                                                 "size": "2",
                                                 "color": "#000040"})[::2],
                                     soup.find_all('font', {"face": "Tahoma",
                                                "size": "2",
                                                "color": "#000040"})[1::2]):
            if holiday.contents[0].find(end_pattern) > -1:
                break

            if isinstance(holiday.contents[0], bs4.element.Tag):
                if holiday.contents[0].name == 'a':
                    _holiday = holiday.contents[0].contents[0]
            else:
                _holiday = holiday.contents[0]
            
            if isinstance(place.contents[0], bs4.element.Tag):
                if place.contents[0].name == 'select':
                    place_list = place.contents[0].find_all('option')
                    _place = [place_entry.contents[0] for place_entry in place_list]
            else:
                _place = place.contents[0]
                
                
            holidays[date][_holiday] = _place

In [92]:
for day, holiday in holidays.iteritems():
    if  len(holiday.keys()) == 0:   
        print day, len(holiday.keys())


2016-02-13 00:00:00 0
2016-12-29 00:00:00 0
2016-04-20 00:00:00 0
2016-10-22 00:00:00 0
2016-02-29 00:00:00 0
2016-02-20 00:00:00 0
2016-01-23 00:00:00 0

In [93]:
for day, holiday in holidays.iteritems():
    print day, len(holiday.keys())


2016-07-18 00:00:00 2
2016-04-04 00:00:00 8
2016-09-17 00:00:00 3
2016-06-05 00:00:00 5
2016-01-10 00:00:00 1
2016-10-11 00:00:00 3
2016-08-02 00:00:00 6
2016-03-29 00:00:00 4
2016-12-20 00:00:00 3
2016-11-29 00:00:00 3
2016-01-06 00:00:00 20
2016-05-12 00:00:00 7
2016-06-17 00:00:00 4
2016-03-01 00:00:00 15
2016-09-29 00:00:00 5
2016-08-22 00:00:00 1
2016-07-15 00:00:00 3
2016-11-17 00:00:00 7
2016-10-30 00:00:00 2
2016-04-19 00:00:00 6
2016-05-20 00:00:00 5
2016-02-02 00:00:00 9
2016-09-02 00:00:00 1
2016-08-26 00:00:00 4
2016-07-27 00:00:00 4
2016-11-05 00:00:00 3
2016-06-12 00:00:00 4
2016-09-14 00:00:00 2
2016-01-19 00:00:00 2
2016-10-18 00:00:00 4
2016-03-22 00:00:00 7
2016-04-11 00:00:00 3
2016-01-15 00:00:00 2
2016-08-05 00:00:00 7
2016-07-23 00:00:00 7
2016-06-24 00:00:00 17
2016-09-26 00:00:00 1
2016-03-26 00:00:00 5
2016-10-06 00:00:00 5
2016-05-09 00:00:00 9
2016-08-09 00:00:00 5
2016-12-11 00:00:00 1
2016-02-21 00:00:00 4
2016-11-26 00:00:00 4
2016-03-14 00:00:00 8
2016-12-07 00:00:00 7
2016-07-08 00:00:00 2
2016-06-20 00:00:00 2
2016-05-29 00:00:00 3
2016-08-29 00:00:00 2
2016-02-25 00:00:00 4
2016-04-22 00:00:00 2
2016-07-04 00:00:00 10
2016-11-14 00:00:00 5
2016-09-07 00:00:00 4
2016-01-24 00:00:00 1
2016-10-25 00:00:00 5
2016-02-13 00:00:00 0
2016-11-02 00:00:00 10
2016-06-11 00:00:00 3
2016-05-17 00:00:00 2
2016-04-02 00:00:00 2
2016-07-16 00:00:00 5
2016-01-20 00:00:00 7
2016-09-19 00:00:00 5
2016-03-19 00:00:00 4
2016-06-07 00:00:00 4
2016-10-13 00:00:00 2
2016-12-18 00:00:00 2
2016-10-01 00:00:00 10
2016-05-14 00:00:00 1
2016-04-14 00:00:00 9
2016-03-07 00:00:00 1
2016-06-19 00:00:00 6
2016-12-14 00:00:00 1
2016-08-20 00:00:00 7
2016-02-16 00:00:00 2
2016-05-02 00:00:00 6
2016-07-13 00:00:00 3
2016-11-23 00:00:00 2
2016-04-17 00:00:00 2
2016-03-11 00:00:00 2
2016-02-04 00:00:00 3
2016-05-22 00:00:00 4
2016-11-11 00:00:00 16
2016-08-24 00:00:00 3
2016-07-25 00:00:00 12
2016-01-29 00:00:00 1
2016-04-29 00:00:00 5
2016-09-08 00:00:00 17
2016-06-14 00:00:00 7
2016-10-20 00:00:00 4
2016-03-20 00:00:00 5
2016-02-08 00:00:00 4
2016-12-29 00:00:00 0
2016-01-09 00:00:00 2
2016-04-09 00:00:00 5
2016-10-08 00:00:00 4
2016-07-21 00:00:00 5
2016-06-26 00:00:00 5
2016-03-24 00:00:00 2
2016-09-20 00:00:00 1
2016-08-15 00:00:00 27
2016-05-11 00:00:00 1
2016-12-09 00:00:00 2
2016-02-23 00:00:00 4
2016-01-05 00:00:00 2
2016-11-24 00:00:00 2
2016-03-12 00:00:00 8
2016-12-05 00:00:00 6
2016-06-22 00:00:00 3
2016-02-27 00:00:00 1
2016-08-19 00:00:00 7
2016-07-02 00:00:00 2
2016-04-20 00:00:00 0
2016-11-12 00:00:00 4
2016-09-01 00:00:00 10
2016-01-26 00:00:00 4
2016-10-27 00:00:00 5
2016-02-15 00:00:00 5
2016-05-19 00:00:00 4
2016-01-22 00:00:00 2
2016-07-30 00:00:00 3
2016-06-01 00:00:00 11
2016-03-17 00:00:00 4
2016-10-15 00:00:00 7
2016-09-13 00:00:00 2
2016-08-06 00:00:00 5
2016-12-16 00:00:00 6
2016-01-02 00:00:00 11
2016-10-03 00:00:00 4
2016-04-12 00:00:00 3
2016-03-05 00:00:00 4
2016-12-12 00:00:00 8
2016-09-25 00:00:00 5
2016-06-29 00:00:00 3
2016-05-04 00:00:00 9
2016-02-18 00:00:00 1
2016-08-10 00:00:00 4
2016-07-11 00:00:00 6
2016-11-21 00:00:00 5
2016-03-09 00:00:00 1
2016-02-06 00:00:00 4
2016-11-09 00:00:00 5
2016-08-30 00:00:00 6
2016-05-24 00:00:00 7
2016-04-27 00:00:00 5
2016-07-07 00:00:00 5
2016-06-08 00:00:00 5
2016-09-10 00:00:00 3
2016-10-22 00:00:00 0
2016-12-27 00:00:00 6
2016-02-10 00:00:00 2
2016-01-11 00:00:00 8
2016-10-10 00:00:00 6
2016-07-19 00:00:00 5
2016-04-07 00:00:00 5
2016-03-30 00:00:00 3
2016-12-23 00:00:00 6
2016-06-04 00:00:00 2
2016-09-22 00:00:00 5
2016-05-13 00:00:00 3
2016-08-13 00:00:00 8
2016-11-30 00:00:00 6
2016-01-07 00:00:00 8
2016-12-03 00:00:00 1
2016-06-16 00:00:00 4
2016-03-02 00:00:00 4
2016-02-29 00:00:00 0
2016-11-18 00:00:00 6
2016-05-01 00:00:00 25
2016-08-17 00:00:00 5
2016-04-18 00:00:00 4
2016-09-03 00:00:00 4
2016-10-29 00:00:00 5
2016-11-06 00:00:00 4
2016-05-21 00:00:00 5
2016-02-01 00:00:00 7
2016-01-16 00:00:00 4
2016-10-17 00:00:00 7
2016-04-30 00:00:00 14
2016-07-28 00:00:00 3
2016-03-23 00:00:00 2
2016-06-03 00:00:00 3
2016-12-30 00:00:00 2
2016-09-15 00:00:00 6
2016-08-04 00:00:00 5
2016-10-05 00:00:00 4
2016-04-10 00:00:00 1
2016-01-12 00:00:00 6
2016-12-10 00:00:00 6
2016-09-27 00:00:00 2
2016-03-27 00:00:00 4
2016-02-20 00:00:00 0
2016-05-06 00:00:00 8
2016-11-27 00:00:00 2
2016-08-08 00:00:00 8
2016-07-09 00:00:00 7
2016-03-15 00:00:00 3
2016-12-06 00:00:00 6
2016-11-15 00:00:00 6
2016-08-28 00:00:00 2
2016-02-24 00:00:00 3
2016-05-26 00:00:00 4
2016-01-25 00:00:00 5
2016-04-25 00:00:00 12
2016-10-24 00:00:00 4
2016-07-05 00:00:00 6
2016-06-10 00:00:00 6
2016-09-04 00:00:00 1
2016-02-12 00:00:00 5
2016-12-25 00:00:00 17
2016-11-03 00:00:00 9
2016-10-12 00:00:00 13
2016-07-17 00:00:00 4
2016-01-21 00:00:00 3
2016-04-05 00:00:00 3
2016-03-28 00:00:00 1
2016-12-21 00:00:00 7
2016-09-16 00:00:00 10
2016-06-06 00:00:00 8
2016-05-15 00:00:00 8
2016-08-03 00:00:00 8
2016-09-28 00:00:00 4
2016-01-01 00:00:00 21
2016-11-28 00:00:00 5
2016-12-01 00:00:00 5
2016-06-18 00:00:00 1
2016-11-16 00:00:00 4
2016-08-23 00:00:00 3
2016-05-03 00:00:00 10
2016-04-16 00:00:00 7
2016-07-14 00:00:00 7
2016-08-27 00:00:00 3
2016-11-04 00:00:00 9
2016-05-23 00:00:00 5
2016-02-03 00:00:00 6
2016-01-18 00:00:00 3
2016-10-19 00:00:00 3
2016-07-26 00:00:00 4
2016-04-28 00:00:00 5
2016-03-21 00:00:00 23
2016-12-28 00:00:00 1
2016-09-09 00:00:00 2
2016-06-13 00:00:00 3
2016-07-22 00:00:00 4
2016-10-07 00:00:00 4
2016-09-21 00:00:00 10
2016-04-08 00:00:00 3
2016-01-14 00:00:00 5
2016-12-08 00:00:00 8
2016-06-25 00:00:00 4
2016-03-25 00:00:00 8
2016-02-22 00:00:00 5
2016-11-25 00:00:00 8
2016-08-14 00:00:00 8
2016-05-08 00:00:00 13
2016-06-21 00:00:00 10
2016-08-18 00:00:00 1
2016-03-13 00:00:00 1
2016-12-04 00:00:00 1
2016-11-13 00:00:00 1
2016-05-28 00:00:00 6
2016-02-26 00:00:00 1
2016-01-27 00:00:00 5
2016-10-26 00:00:00 4
2016-07-03 00:00:00 5
2016-04-23 00:00:00 8
2016-05-16 00:00:00 1
2016-09-06 00:00:00 7
2016-02-14 00:00:00 2
2016-11-01 00:00:00 14
2016-10-14 00:00:00 9
2016-04-03 00:00:00 1
2016-01-23 00:00:00 0
2016-12-19 00:00:00 4
2016-09-18 00:00:00 2
2016-03-18 00:00:00 5
2016-04-15 00:00:00 11
2016-08-01 00:00:00 13
2016-06-28 00:00:00 6
2016-09-30 00:00:00 4
2016-01-03 00:00:00 2
2016-10-02 00:00:00 4
2016-03-06 00:00:00 4
2016-12-15 00:00:00 6
2016-11-22 00:00:00 7
2016-05-05 00:00:00 11
2016-08-21 00:00:00 1
2016-02-17 00:00:00 2
2016-03-10 00:00:00 2
2016-07-12 00:00:00 8
2016-05-25 00:00:00 14
2016-08-25 00:00:00 5
2016-02-05 00:00:00 6
2016-11-10 00:00:00 6
2016-10-21 00:00:00 7
2016-04-26 00:00:00 2
2016-07-24 00:00:00 6
2016-01-28 00:00:00 1
2016-12-26 00:00:00 16
2016-02-09 00:00:00 1
2016-09-11 00:00:00 5
2016-06-15 00:00:00 3
2016-04-06 00:00:00 4
2016-07-20 00:00:00 6
2016-09-23 00:00:00 4
2016-01-08 00:00:00 6
2016-10-09 00:00:00 8
2016-06-27 00:00:00 4
2016-12-22 00:00:00 3
2016-01-04 00:00:00 6
2016-08-12 00:00:00 5
2016-05-10 00:00:00 5
2016-03-03 00:00:00 8
2016-06-23 00:00:00 7
2016-08-16 00:00:00 7
2016-12-02 00:00:00 4
2016-02-28 00:00:00 2
2016-05-30 00:00:00 8
2016-11-19 00:00:00 7
2016-10-28 00:00:00 2
2016-07-01 00:00:00 14
2016-04-21 00:00:00 10
2016-05-18 00:00:00 5
2016-07-29 00:00:00 5
2016-11-07 00:00:00 6
2016-09-12 00:00:00 1
2016-01-17 00:00:00 1
2016-04-01 00:00:00 5
2016-10-16 00:00:00 8
2016-12-17 00:00:00 3
2016-06-02 00:00:00 3
2016-03-16 00:00:00 5
2016-01-13 00:00:00 5
2016-04-13 00:00:00 8
2016-08-07 00:00:00 4
2016-09-24 00:00:00 11
2016-06-30 00:00:00 5
2016-10-04 00:00:00 5
2016-08-11 00:00:00 4
2016-03-04 00:00:00 1
2016-12-13 00:00:00 3
2016-11-20 00:00:00 4
2016-05-07 00:00:00 1
2016-02-19 00:00:00 3
2016-03-08 00:00:00 7
2016-07-10 00:00:00 3
2016-05-27 00:00:00 3
2016-07-06 00:00:00 5
2016-02-07 00:00:00 2
2016-11-08 00:00:00 7
2016-10-23 00:00:00 6
2016-09-05 00:00:00 1
2016-04-24 00:00:00 2
2016-01-30 00:00:00 1
2016-12-24 00:00:00 6
2016-02-11 00:00:00 4
2016-06-09 00:00:00 3

In [ ]: