In [67]:
import numpy as np
import bs4
from bs4 import BeautifulSoup
from mechanize import Browser
from urllib2 import urlopen
import re
from datetime import datetime
import time
import sqlite3
import json
import time
import requests
import matplotlib.pyplot as plt
%matplotlib inline
import sys
In [303]:
# Fetch the Wikipedia "day of year" page (e.g. "March_19") for one date.
date = datetime(year=2016, month=3, day=19)
# FIX: use date.day instead of the hard-coded literal 19, so changing the
# `date` above is enough to fetch a different day's page.
url = "https://en.wikipedia.org/wiki/{:s}_{:d}".format(date.strftime("%B"), date.day)
page = urlopen(url)
contents = page.read()
# html.parser keeps this dependency-free; `soup` is reused by the next cell.
soup = BeautifulSoup(contents, 'html.parser')
In [304]:
# Preview (cell output): the first <ul> after the "Holidays and observances"
# heading -- the same list the scraper cell below parses into SQLite.
soup.find('span', {'id': 'Holidays_and_observances'}).find_next('ul')
Out[304]:
In [310]:
# Scrape Wikipedia's "Month Day" pages and persist Events/Births/Deaths/
# Holidays into one SQLite table per (category, month, day).
# NOTE(review): the commented-out ranges suggest the intended full run was
# xrange(2, 13) / xrange(1, 32); the list literals restrict this execution
# to March 19 only.
conn = sqlite3.connect('/Users/spardy/WikipediaDates.db')
c = conn.cursor()
for month in [3]:#xrange(2, 13):
    for day in [19]:#xrange(1, 32):
        try:
            date = datetime(year=2016, month=month, day=day)
        except ValueError:
            # Impossible calendar date (e.g. Feb 30) -- skip it.
            continue
        url = "https://en.wikipedia.org/wiki/{:s}_{:d}".format(date.strftime("%B"), day)
        page = urlopen(url)
        contents = page.read()
        soup = BeautifulSoup(contents, 'html.parser')
        #loop over non-holidays
        for _id in ['Events', 'Births', 'Deaths']:
            #Create a table, dropping the old one if it exists
            # Table names come from a fixed whitelist plus loop counters, so the
            # %-interpolation below is not exposed to external input.
            table_name = "{:s}{:d}{:d}".format(_id, month, day)
            c.execute("DROP TABLE if exists %s" % table_name)
            c.execute("CREATE TABLE %s (year, name)" % table_name)
            # The first <ul> after the section heading holds the entries.
            entries = soup.find('span', {'id': _id}).find_next('ul')
            for entry in entries:
                # Children of the <ul> include bare whitespace strings; those
                # have entry.name == None, so only real tags are parsed.
                if (entry is not None) and (entry.name is not None):
                    # Split "YEAR <dash> text" items into (year, description).
                    m = re.match(r".*?([0-9]+) (.*)", entry.text)
                    if m is not None:
                        yr = m.groups()[0]
                        name = m.groups()[1].replace('</li>', '')
                        # Row values (unlike table names) go through ? placeholders.
                        c.execute("INSERT INTO %s VALUES (?, ?)" % table_name, (yr, name))
        # Holidays have no year component -- a single name column suffices.
        holidays = soup.find('span', {'id': 'Holidays_and_observances'}).find_next('ul')
        table_name = "{:s}{:d}{:d}".format('Holidays', month, day)
        c.execute("DROP TABLE if exists %s" % table_name)
        c.execute("CREATE TABLE %s (name)" % table_name)
        # One <li> per line in the rendered text; skip blank lines.
        for holiday in holidays.text.split('\n'):
            if holiday != '':
                c.execute("INSERT INTO %s VALUES (?)" % table_name, (holiday,))
conn.commit()
conn.close()
In [324]:
# Aggregate per-day row counts from the SQLite tables into `dataset`
# (12 months x 31 days x [Events, Births, Deaths, Holidays]; NaN marks
# impossible dates or missing tables) and mirror the counts into a CSV.
dataset = np.zeros((12, 31, 4)) + np.nan
with open('/Users/spardy/Code/Web/Blog/resources/wikipedia_calendar.csv', 'w') as f:
    f.write('Date,Events,Births,Deaths,Holidays\n')
    with sqlite3.connect('/Users/spardy/WikipediaDates.db') as conn:
        c = conn.cursor()
        for month in xrange(1, 13):
            for day in xrange(1, 32):
                try:
                    date = datetime(year=2016, month=month, day=day)
                except ValueError:
                    continue  # impossible date (e.g. Feb 30): no CSV row at all
                f.write('2016-{:02d}-{:02d},'.format(month, day))
                for i, _id in enumerate(['Events', 'Births', 'Deaths']):
                    table_name = "{:s}{:d}{:d}".format(_id, month, day)
                    # Parameterized existence check (the original interpolated the
                    # name into the SQL text; ? placeholders are the safe idiom).
                    exists = c.execute(
                        "SELECT count(*) FROM sqlite_master WHERE name = ? AND type = 'table';",
                        (table_name,)).fetchone()[0]
                    field = ''
                    if exists > 0:
                        rows = c.execute("SELECT * FROM %s" % table_name).fetchall()
                        dataset[month - 1, day - 1, i] = len(rows)
                        field = "{:d}".format(len(rows))
                    # BUG FIX: the original wrote neither a value nor a comma when a
                    # table was missing, shifting every later column of that CSV row.
                    # An empty field keeps the CSV rectangular.
                    f.write(field + ',')
                table_name = "{:s}{:d}{:d}".format('Holidays', month, day)
                exists = c.execute(
                    "SELECT count(*) FROM sqlite_master WHERE name = ? AND type = 'table';",
                    (table_name,)).fetchone()[0]
                if exists > 0:
                    rows = c.execute("SELECT * FROM %s" % table_name).fetchall()
                    count = len(rows)
                    # Rows are 1-tuples, so `in d` is an exact element match: the
                    # "Christian feast day:" header row is stored in the table but
                    # is not itself a holiday, hence the -1.
                    if any('Christian feast day:' in d for d in rows):
                        count -= 1
                    dataset[month - 1, day - 1, 3] = count
                    f.write("{:d}".format(count))
                # Missing Holidays table leaves the last field empty (still valid CSV).
                f.write('\n')
In [319]:
# Debug check for March 16: confirm the Holidays table exists and inspect
# whether the "Christian feast day:" marker row is present (Python 2 prints).
with sqlite3.connect('/Users/spardy/WikipediaDates.db') as conn:
    c = conn.cursor()
    table_name = "{:s}{:d}{:d}".format('Holidays', 3, 16)
    # Existence check against sqlite_master before selecting from the table.
    data = c.execute("SELECT count(*) FROM sqlite_master WHERE name ='%s' and type='table';" % table_name).fetchall()
    if data[0][0] > 0:
        data = c.execute("SELECT * FROM %s" % table_name).fetchall()
        print data
        # Each d is a 1-tuple row, so `in d` is an exact-match membership test
        # against the row's single column, not a substring search.
        print any(['Christian feast day:' in d for d in data])
In [325]:
# Holiday counts per day across 2016, flattened month-major (NaN gaps are
# impossible dates). Switch the channel index to inspect the other series.
channel = 3  # 0=Events, 1=Births, 2=Deaths, 3=Holidays
plt.plot(dataset[:, :, channel].ravel())
Out[325]:
In [326]:
# Print the min/max counts and their positions for each of the four channels
# (0=Events, 1=Births, 2=Deaths, 3=Holidays); Python 2 print statements.
for i in xrange(4):
    print np.nanmin(dataset[:, :, i]), np.nanmax(dataset[:, :, i])
    # argmin/argmax index into the flattened (12*31,) array:
    # index // 31 -> month-1, index % 31 -> day-1.
    print np.nanargmin(dataset[:, :, i].ravel()), np.nanargmax(dataset[:, :, i].ravel())
In [88]:
# Scrape earthcalendar.net for every date of `year`, building
# holidays[date] = {holiday_name: place_or_place_list}.
year = 2016
end_pattern = 'Copyright'  # footer text marking the end of the holiday list
holidays = {}
for month in xrange(1, 13):
    # BUG FIX: was xrange(1, 31), which silently skipped the 31st of every
    # month; the ValueError guard below already rejects impossible dates.
    for day in xrange(1, 32):
        try:
            date = datetime(year=year, month=month, day=day)
        except ValueError:
            continue
        url = ("http://www.earthcalendar.net/_php/lookup.php"
               "?mode=date&m={:d}&d={:d}&y={:d}".format(month, day, year))
        page = urlopen(url)
        contents = page.read()
        soup = BeautifulSoup(contents, 'html.parser')
        holidays[date] = {}
        # Holiday names and their places alternate in identically styled <font>
        # cells; fetch them once instead of running find_all twice.
        cells = soup.find_all('font', {"face": "Tahoma",
                                       "size": "2",
                                       "color": "#000040"})
        for holiday, place in zip(cells[::2], cells[1::2]):
            # Stop at the page footer (for a plain string this is str.find;
            # for a Tag it is BeautifulSoup's find, which returns None -> False
            # under Python 2 comparison -- preserved from the original).
            if holiday.contents[0].find(end_pattern) > -1:
                break
            # BUG FIX: _holiday/_place were only assigned inside the isinstance
            # branches, so a plain-string cell reused the previous iteration's
            # value (or raised NameError on the first pass).
            if isinstance(holiday.contents[0], bs4.element.Tag):
                if holiday.contents[0].name == 'a':
                    _holiday = holiday.contents[0].contents[0]
                else:
                    _holiday = holiday.contents[0]
            else:
                _holiday = holiday.contents[0]
            if isinstance(place.contents[0], bs4.element.Tag):
                if place.contents[0].name == 'select':
                    # Multiple places are rendered as a <select> dropdown.
                    place_list = place.contents[0].find_all('option')
                    _place = [place_entry.contents[0] for place_entry in place_list]
                else:
                    _place = place.contents[0]
            else:
                _place = place.contents[0]
            holidays[date][_holiday] = _place
In [92]:
# Sanity check: list the dates for which earthcalendar.net returned no
# holidays at all (keys are datetime objects; Python 2 prints/iteritems).
for day, holiday in holidays.iteritems():
    if len(holiday.keys()) == 0:
        print day, len(holiday.keys())
In [93]:
# Dump the number of scraped holidays for every date (Python 2 prints).
for day, holiday in holidays.iteritems():
    print day, len(holiday.keys())
In [ ]: