In [126]:
import requests
import time
# Sample dailymile workout_data URLs; the trailing comment on each notes which optional sections that entry has.
sampleurl1 = 'http://www.dailymile.com/people/danstoner/entries/35646080/workout_data' # with gear and effort
sampleurl2 = 'http://www.dailymile.com/people/danstoner/entries/35103337/workout_data' # with gear, effort, map, and comment
sampleurl3 = 'http://www.dailymile.com/people/danstoner/entries/35419201/workout_data' # empty page, only comment
sampleurl4 = 'http://www.dailymile.com/people/danstoner/entries/35373143/workout_data' # includes weather
#sampleurl5 = 'http://www.dailymile.com/people/danstoner/entries/35542790' # with gear, effort, map, and weather
# Entry that is requested with an explicit timeout in a separate cell.
url = 'http://www.dailymile.com/people/danstoner/entries/35587204/workout_data'
#requests.packages.urllib3.disable_warnings()
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
In [78]:
# Small inline HTML fragment for trying out PyQuery traversal without a network request.
d = pq('<span><p class="hello">Hi</p><p>Bye</p></span>')
In [84]:
# Walk the direct children of the parsed fragment and show the text of each.
for child in d.children():
    print(child.text)
In [2]:
# Fetch the "gear and effort" sample and list the <dt> labels it contains.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(sampleurl1, timeout=5)
r.raise_for_status()
blurb = pq(r.content)
# Iterating a PyQuery selection yields lxml elements; `.text` is their leading text.
for detail in blurb('dt'):
    print(detail.text)
#blurb().text()
In [3]:
# Fetch the "gear, effort, map, and comment" sample and list the <dt> labels it contains.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(sampleurl2, timeout=5)
r.raise_for_status()
blurb = pq(r.content)
# Iterating a PyQuery selection yields lxml elements; `.text` is their leading text.
for detail in blurb('dt'):
    print(detail.text)
In [8]:
# Fetch the comment-only sample and list any <dt> labels it contains.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(sampleurl3, timeout=5)
r.raise_for_status()
# The body is wrapped in a synthetic <document> root before parsing.
# NOTE(review): presumably because the bare response has no element of its own
# for PyQuery to parse -- confirm.
blurb = pq('<document>' + r.content + '</document>')
for detail in blurb('dt'):
    print(detail.text)
In [16]:
#print blurb.children()
# Iterating a PyQuery selection directly yields raw lxml elements, whose `.text`
# is a plain attribute and cannot be called. `.items()` yields PyQuery wrappers
# instead, on which `.text()` is a real method.
for detail in blurb('dt').items():
    print(detail.text())
In [118]:
# Fetch the sample that includes weather and print the leading text of every
# grandchild element of the parsed root.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(sampleurl4, timeout=5)
r.raise_for_status()
blurb = pq(r.content)
for each in blurb.children().children():
    print(each.text)
#print blurb
#for detail in blurb('dt'):
# print detail.text
# print detail.keys
<dl id="workout_details" class="details left">
<dt>Gear</dt>
<dd>
<ul class="keyword_list">
<li><a href="#" onclick="javascript: return false"><span>Merrell Trail Glove 3 - pair 1</span></a></li>
</ul>
</dd>
<dt style="display: none">Effort</dt>
<dd style="display: none">
<ul class="effort-rating" style="display: block">
<li class="current-rating" style="width: 60%;">Currently 3/5 Effort.</li>
</ul>
</dd>
<dt>Weather</dt>
<dd>
<ul class="keyword_list">
<li><a href="#" onclick="javascript: return false"><span>hot</span></a></li>
<li><a href="#" onclick="javascript: return false"><span>sunny</span></a></li>
</ul>
</dd>
<dt style="display: none">Calories</dt>
<dd style="display: none">
1067
</dd>
</dl>
<div class="clearl empty_div"></div>
In [230]:
# Fetch the weather sample again, this time for BeautifulSoup.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(sampleurl4, timeout=5)
r.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser is
# installed, so the resulting tree can differ between machines. lxml is already
# required by pyquery, so it is available wherever this notebook runs.
soup = BeautifulSoup(r.content, 'lxml')
In [240]:
# Display the parsed tree to inspect the markup.
soup
Out[240]:
In [241]:
# Collect every <dt> tag and every <dd> tag from the parsed page as two separate lists.
dt_items = soup.find_all(name="dt")
dd_items = soup.find_all(name="dd")
In [233]:
# Show the text of each <dt> tag, one per line.
for label_tag in dt_items:
    print(label_tag.get_text())
In [234]:
# Show the text of each <dd> tag, prefixed so multi-line values are easy to tell apart.
for value_tag in dd_items:
    print("ITEM: %s" % value_tag.get_text())
In [242]:
# First pass at extracting text: <dt> text as-is, <dd> text with every newline removed.
texts = {}
dt_texts = [tag.text for tag in dt_items]
dd_texts = [tag.text.replace('\n', '') for tag in dd_items]
In [243]:
# Display the extracted <dt> texts.
dt_texts
Out[243]:
In [244]:
# Display the extracted <dd> texts.
dd_texts
Out[244]:
In [238]:
# Pair each <dt> text with the <dd> text at the same list position.
texts = {}
for position, label in enumerate(dt_texts):
    texts[label] = dd_texts[position]
In [245]:
# Display the label -> value mapping.
texts
Out[245]:
In [258]:
# Second pass at extracting text: keep interior newlines (multi-value entries
# such as the weather keywords are separated by them) but trim the padding
# around each <dd> value.
dt_texts = [tag.text for tag in dt_items]
# A bare strip() removes every kind of surrounding whitespace in one go; the
# earlier strip('\n').strip(' ').strip('\n') chain only handled one fixed
# nesting of newlines and spaces and left residue on deeper-indented markup.
dd_texts = [tag.text.strip() for tag in dd_items]
In [259]:
# Display the trimmed <dd> texts.
dd_texts
Out[259]:
In [297]:
# Pair each <dt> text with the <dd> text at the same list position.
extended_stuff = {}
for position, label in enumerate(dt_texts):
    extended_stuff[label] = dd_texts[position]
In [298]:
# Display the label -> value mapping before normalisation.
extended_stuff
Out[298]:
In [299]:
# Normalise the scraped values in place.
if 'Effort' in extended_stuff:
    # Reduce the effort text to the single character that precedes the '/'.
    slash_at = extended_stuff['Effort'].find('/')
    # Only rewrite when a '/' with a character before it exists. find() returns
    # -1 when there is no slash, which would otherwise index from the end of
    # the string -- and would raise IndexError when this cell is re-run on an
    # already-reduced value.
    if slash_at > 0:
        extended_stuff['Effort'] = extended_stuff['Effort'][slash_at - 1]
if 'Weather' in extended_stuff:
    # Flatten newline-separated entries into one space-separated string.
    extended_stuff['Weather'] = extended_stuff['Weather'].replace('\n', ' ')
In [300]:
# Display the mapping after normalisation.
extended_stuff
Out[300]:
In [125]:
# using sampleurl4
# Report, for every direct child of the parsed root, whether it is the "Gear" label.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(sampleurl4, timeout=5)
r.raise_for_status()
blurb = pq(r.content)
for detail in blurb.children():
    # `detail` is an lxml element; `.text` is its leading text (None when absent).
    if detail.text == 'Gear':
        print("GEAR!")
    else:
        print("not gear")
#for item in detail.items():
#print "ITEM:", detail.html()
#print "BLEH:",blurb.items('dt').closest('dd')
#print "ITEM:", detail.closest('dd')
#print "ITEM:",detail.children()
#for detail in blurb('dt'):
# if detail.text == 'Gear':
# for each in detail.iterkeys():
# print each
# print "."
#print detail.keys()
In [10]:
# Fetch the entry at `url`, giving up after 5 seconds, and raise on an HTTP error status.
r = requests.get(url,timeout=5)
r.raise_for_status()
In [11]:
# Display the raw response body.
r.content
Out[11]:
In [50]:
# Parse the fetched body, then display the text of the li.current-rating element(s).
doc = pq(r.content)
doc('li.current-rating').text()
Out[50]:
In [72]:
# Display the combined text of every ul.keyword_list in the document.
doc('ul.keyword_list').text()
Out[72]:
In [73]:
# Display the selection of <dt> elements.
doc('dt')
Out[73]:
In [74]:
# Extract the gear name(s); stays '' when the entry has no Gear section.
gear = ''
for detail in doc('dt'):
    if detail.text == 'Gear':
        # Look only inside the <dd> that follows the Gear label. Searching the
        # whole document for <span> would also pick up the keywords of other
        # sections (e.g. Weather), which use the same markup.
        gear = pq(detail).next_all('dd').eq(0).find('span').text()
In [75]:
# Display the extracted gear text.
gear
Out[75]:
In [ ]: