In [3]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib2
In [37]:
URLS = ["http://www.hsa-haiku.org/hendersonawards/henderson.htm",
"http://www.hsa-haiku.org/bradyawards/brady.htm"]
In [6]:
# scratch cell: assumes `page` was fetched in an earlier run
soup = BeautifulSoup(page, 'html.parser')
In [38]:
CSS_PATH = "table > tr > td > blockquote > p"
In [39]:
# collect the <p> elements holding haiku from each award page
haikus = []
for URL in URLS:
    page = urllib2.urlopen(URL).read()
    soup = BeautifulSoup(page, 'html.parser')
    haikus.extend(soup.select(CSS_PATH))
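Aside: the fetch / parse / select pattern in the cell above repeats for every source below, so a small helper could capture it. This is a sketch rather than a cell from the original notebook, and the name fetch_and_select is made up:

import urllib2
from bs4 import BeautifulSoup

def fetch_and_select(url, css_path):
    # download one page and return the elements matching a CSS selector
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'html.parser')
    return soup.select(css_path)

# usage inside the loop above: haikus.extend(fetch_and_select(URL, CSS_PATH))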
In [40]:
len(haikus)
Out[40]:
In [44]:
for i in xrange(10):
    print i, haikus[280 + i].text
In [53]:
URL = "http://www.hsa-haiku.org/museumhaikuliteratureawards/museumhaikuliterature-award.htm"
CSS_PATH = "p.haiku"
page = urllib2.urlopen(URL).read()
soup = BeautifulSoup(page, 'html.parser')
temp = soup.select(CSS_PATH)
In [57]:
len(temp)
haikus.extend(temp)
In [173]:
# total number of alphabetic characters across all scraped haiku
count = 0
for k in haikus + j_haikus + c_haikus:
    for c in k.text:
        if c.isalpha():
            count += 1
print count
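The same character count can be written as a single generator expression, equivalent to the nested loops above:

count = sum(1 for k in haikus + j_haikus + c_haikus for c in k.text if c.isalpha())
print count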
In [97]:
URL = "http://www.hsa-haiku.org/virgilioawards/virgilio.htm"
CSS_PATH = "div"
page = urllib2.urlopen(URL).read()
soup = BeautifulSoup(page, 'html.parser')
temp = soup.select(CSS_PATH)
In [107]:
temp[2].select("p")
Out[107]:
In [189]:
# normalise each haiku: lower-case, strip, and join its non-empty lines with tabs
data = []
for k in haikus + j_haikus + c_haikus:
    k = k.text.lower().strip()
    if not k:
        continue
    l = "\t".join(line.strip() for line in k.splitlines() if line.strip())
    data.append(l)
print len(data)
data = pd.DataFrame(data, columns=["text"])
In [118]:
data.to_json("haikus.json")
In [119]:
data = pd.read_json("haikus.json")
In [201]:
data.text.apply(lambda x: len(x)).mean()
# write one haiku per line, dropping non-ASCII characters
with open("haiku.txt", "wb+") as fp:
    for k in data.values:
        print >> fp, k[0].encode('ascii', 'ignore')
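Note that encode('ascii', 'ignore') silently drops accented and non-Latin characters. If those should survive, a UTF-8 writer would work instead; a sketch, with haiku_utf8.txt as a made-up filename rather than one the notebook uses:

import io

with io.open("haiku_utf8.txt", "w", encoding="utf-8") as fp:
    for k in data.values:
        fp.write(k[0] + u"\n")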
In [204]:
%matplotlib inline
In [205]:
data.text.apply(lambda x: len(x)).hist()
Out[205]:
In [135]:
# Source http://sacred-texts.com/shi/jh/index.htm
URLS = ["http://sacred-texts.com/shi/jh/jh0%s.htm" % k for k in xrange(2,8)]
URLS
Out[135]:
In [133]:
CSS_PATH = "table > tr:nth-of-type(1) > td:nth-of-type(1) > p"
j_haikus = []
for URL in URLS:
page = urllib2.urlopen(URL).read()
soup = BeautifulSoup(page, 'html.parser')
j_haikus.extend(soup.select(CSS_PATH))
print len(j_haikus)
In [131]:
CSS_PATH = "table > tr:nth-of-type(1) > td:nth-of-type(1) > p"
j_haikus = []
page = urllib2.urlopen(URL).read()
soup = BeautifulSoup(page, 'html.parser')
j_haikus.extend(soup.select(CSS_PATH))
print len(j_haikus)
In [132]:
j_haikus
Out[132]:
In [156]:
# Contest Archive
# http://www.thehaikufoundation.org/contest-archive-details/?IDsponsor=8
BASE_URL = "http://www.thehaikufoundation.org/contest-archive-details/?IDsponsor=8"
URL_SELECTOR = "main.content li a"
page = urllib2.urlopen(BASE_URL).read()
soup = BeautifulSoup(page, 'html.parser')
URLS = [k.attrs['href'] for k in soup.select(URL_SELECTOR)]
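k.attrs['href'] raises a KeyError if a matched <a> tag has no href; the archive links here all have one, but Tag.get gives a defensive variant (an alternative, not something the notebook needs):

URLS = [k.get('href') for k in soup.select(URL_SELECTOR) if k.get('href')]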
In [157]:
URLS
Out[157]:
In [162]:
CSS_PATH = "table pre"
c_haikus = []
for URL in URLS:
page = urllib2.urlopen(URL).read()
soup = BeautifulSoup(page, 'html.parser')
c_haikus.extend(soup.select(CSS_PATH))
print len(c_haikus)
In [166]:
BASE_URL = "http://www.thehaikufoundation.org/contest-archive/"
URL_SELECTOR = "div.entry-content li a"
page = urllib2.urlopen(BASE_URL).read()
soup = BeautifulSoup(page, 'html.parser')
URLS = []
BASE_URLS = [k.attrs['href'] for k in soup.select(URL_SELECTOR)]
print len(BASE_URLS)
URL_SELECTOR = "main.content li a"
for u in BASE_URLS:
page = urllib2.urlopen(u).read()
soup = BeautifulSoup(page, 'html.parser')
URLS.extend([k.attrs['href'] for k in soup.select(URL_SELECTOR)])
print len(URLS)
In [169]:
URLS[:10]
Out[169]:
In [171]:
CSS_PATH = "table pre"
c_haikus = []
for URL in URLS:
page = urllib2.urlopen(URL).read()
soup = BeautifulSoup(page, 'html.parser')
c_haikus.extend(soup.select(CSS_PATH))
print len(c_haikus), "->",
In [172]:
c_haikus[:10]
Out[172]:
In [182]:
c_haikus[10].text.strip().lower().splitlines()
Out[182]:
In [210]:
BASE_URL = "http://www.dijitari.com/xvii/categories/all_vdesc_1.html"
page = urllib2.urlopen(BASE_URL).read()
soup = BeautifulSoup(page, 'html.parser')
In [220]:
URL_SELECTOR = "table > tr > td > font p a"
BASE_URLS = [k.attrs['href'] for k in soup.select(URL_SELECTOR)]
print len(BASE_URLS)
print BASE_URLS
# relative hrefs scraped above, pasted in and prefixed with the base URL below
BASE_URLS = [u'all_vdesc_1.html', u'all_vdesc_2.html', u'all_vdesc_3.html', u'all_vdesc_4.html',
u'all_vdesc_5.html', u'all_vdesc_6.html', u'all_vdesc_7.html', u'all_vdesc_8.html',
u'all_vdesc_9.html', u'all_vdesc_10.html', u'all_vdesc_11.html', u'all_vdesc_12.html',
u'all_vdesc_13.html', u'all_vdesc_14.html', u'all_vdesc_15.html', u'all_vdesc_16.html',
u'all_vdesc_17.html', u'all_vdesc_18.html', u'all_vdesc_19.html', u'all_vdesc_20.html',
u'all_vdesc_21.html', u'all_vdesc_22.html', u'all_vdesc_23.html', u'all_vdesc_24.html',
u'all_vdesc_25.html', u'all_vdesc_26.html', u'all_vdesc_27.html', u'all_vdesc_28.html',
u'all_vdesc_29.html', u'all_vdesc_30.html', u'all_vdesc_31.html', u'all_vdesc_32.html',
u'all_vdesc_33.html', u'all_vdesc_34.html']
BASE_URL = "http://www.dijitari.com/xvii/categories/"
BASE_URLS = [BASE_URL + k for k in BASE_URLS]
print BASE_URLS
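Hard-coding the relative filenames works, but urlparse.urljoin can derive the same absolute URLs straight from the scraped hrefs; a sketch assuming the selector above returns those relative all_vdesc_*.html paths, with PAGE_URL restating the page fetched in In [210]:

from urlparse import urljoin

PAGE_URL = "http://www.dijitari.com/xvii/categories/all_vdesc_1.html"
BASE_URLS = [urljoin(PAGE_URL, k.attrs['href']) for k in soup.select(URL_SELECTOR)]
print BASE_URLS[:3]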
In [239]:
CSS_PATH = "p table > tr > td > font > font"
URLS = BASE_URLS
x_haikus = []
for URL in URLS:
page = urllib2.urlopen(URL).read()
soup = BeautifulSoup(page, 'html.parser')
x_haikus.extend(soup.select(CSS_PATH))
print len(x_haikus), "->",
In [228]:
CSS_PATH = "p table > tr > td > font > font"
x_haikus = soup.select(CSS_PATH)
In [236]:
x_haikus[0].text.split("--")[0].replace(u'\xa0', ' ').splitlines()
Out[236]:
In [241]:
"""
data = []
for k in haikus + j_haikus + c_haikus:
k = k.text.lower().strip()
if k is None:
continue
l = "\t".join(l.strip() for l in k.splitlines() if l is not None)
data.append(l)
print len(data)
data = pd.DataFrame(data, columns=["text"])
"""
data_x = []
for k in x_haikus:
k = k.text.split("--")[0].replace(u'\xa0', ' ').lower().strip()
if k is None:
continue
l = "\t".join(l.strip() for l in k.splitlines() if l is not None)
data_x.append(l)
print len(data_x)
data_x = pd.DataFrame(data_x, columns=["text"])
data_x.head()
Out[241]:
In [243]:
print "data_x", data_x.text.apply(lambda x: len(x)).sum()
print "data", data.text.apply(lambda x: len(x)).sum()
In [249]:
data_all = data.append(data_x)
print data_all.shape, data.shape, data_x.shape
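DataFrame.append keeps each frame's original index, so data_all carries duplicate index labels; passing ignore_index=True (a variation on the cell above, not what the notebook does) renumbers the combined frame:

data_all = data.append(data_x, ignore_index=True)
print data_all.shape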
In [250]:
data_all.head()
Out[250]:
In [251]:
with open("haiku_all.txt", "wb+") as fp:
for k in data_all.values:
print >> fp, k[0].encode('ascii', 'ignore')
In [252]:
data_all.text.apply(lambda x: len(x)).hist()
Out[252]:
In [260]:
data_all.text[data_all.text.apply(lambda x: len(x)) > 100].values[8].replace(u'\xa0', ' ')
Out[260]:
In [ ]: