In [126]:
# Standard library first, then the third-party HTTP client and HTML parsers.
import time

import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

#requests.packages.urllib3.disable_warnings()

# Workout-detail fragments used as fixtures throughout the notebook.
# with gear and effort
sampleurl1 = 'http://www.dailymile.com/people/danstoner/entries/35646080/workout_data'
# with gear, effort, map, and comment
sampleurl2 = 'http://www.dailymile.com/people/danstoner/entries/35103337/workout_data'
# empty page, only comment
sampleurl3 = 'http://www.dailymile.com/people/danstoner/entries/35419201/workout_data'
# includes weather
sampleurl4 = 'http://www.dailymile.com/people/danstoner/entries/35373143/workout_data'
#sampleurl5 = 'http://www.dailymile.com/people/danstoner/entries/35542790' # with gear, effort, map, and weather

# Entry fetched by the later `requests.get(url, timeout=5)` cell.
url = 'http://www.dailymile.com/people/danstoner/entries/35587204/workout_data'

pyquery Samples


In [78]:
# Build a small PyQuery document from an inline HTML snippet to experiment with.
d = pq('<span><p class="hello">Hi</p><p>Bye</p></span>')

In [84]:
# Show the text of each direct child of the parsed snippet.
for child in d.children():
    print(child.text)


Hi
Bye

In [2]:
# List the <dt> labels present in the first sample (gear and effort).
# The timeout keeps a stalled connection from hanging the kernel.
r = requests.get(sampleurl1, timeout=5)
r.raise_for_status()
blurb = pq(r.content)
for detail in blurb('dt'):
    print(detail.text)


Gear
Effort
Calories

In [3]:
# List the <dt> labels present in the second sample
# (gear, effort, map, and comment).
# The timeout keeps a stalled connection from hanging the kernel.
r = requests.get(sampleurl2, timeout=5)
r.raise_for_status()
blurb = pq(r.content)
for detail in blurb('dt'):
    print(detail.text)


Gear
Effort
Calories

In [8]:
# Third sample: the entry whose details page is empty.
# The timeout keeps a stalled connection from hanging the kernel.
r = requests.get(sampleurl3, timeout=5)
r.raise_for_status()
# Wrap the body in a dummy root element before parsing
# (presumably because pq() cannot build a document from an empty body).
blurb = pq('<document>' + r.content + '</document>')
for detail in blurb('dt'):
    print(detail.text)

In [16]:
# `.text` on the elements yielded by blurb('dt') is a plain attribute, not a
# method; calling it raises TypeError as soon as the selection is non-empty.
for detail in blurb('dt'):
    print(detail.text)

In [118]:
# Fourth sample (includes weather): print the text of every grandchild of the
# parsed root, which in the recorded run were the <dt>/<dd> entries.
# The timeout keeps a stalled connection from hanging the kernel.
r = requests.get(sampleurl4, timeout=5)
r.raise_for_status()
blurb = pq(r.content)
for each in blurb.children().children():
    print(each.text)


Gear

    
Effort

    
Weather

    
Calories

    1067
  

Sample response body (sampleurl4)

<dl id="workout_details" class="details left"> 
  <dt>Gear</dt>
  <dd>
    <ul class="keyword_list">
      <li><a href="#" onclick="javascript: return false"><span>Merrell Trail Glove 3 - pair 1</span></a></li>
    </ul>
  </dd>
  <dt style="display: none">Effort</dt>
  <dd style="display: none">
    <ul class="effort-rating" style="display: block">
      <li class="current-rating" style="width: 60%;">Currently 3/5 Effort.</li>
    </ul>
  </dd>
  <dt>Weather</dt>
  <dd>
    <ul class="keyword_list">
      <li><a href="#" onclick="javascript: return false"><span>hot</span></a></li>
      <li><a href="#" onclick="javascript: return false"><span>sunny</span></a></li>
    </ul>
  </dd>
  <dt style="display: none">Calories</dt>
  <dd style="display: none">
    1067
  </dd>
</dl>
<div class="clearl empty_div"></div>

Beautiful Soup Samples


In [230]:
# Fetch the weather sample and parse it with Beautiful Soup.
# The timeout keeps a stalled connection from hanging the kernel.
r = requests.get(sampleurl4, timeout=5)
r.raise_for_status()
# Name the parser explicitly so the resulting tree does not depend on which
# parsers happen to be installed; lxml matches the <html><body> wrapper seen
# in the recorded output.
soup = BeautifulSoup(r.content, "lxml")

In [240]:
# Display the parsed tree.
soup


Out[240]:
<html><body><dl class="details left" id="workout_details">
<dt>Gear</dt>
<dd>
<ul class="keyword_list">
<li><a href="#" onclick="javascript: return false"><span>Merrell Trail Glove 3 - pair 1</span></a></li>
</ul>
</dd>
<dt style="display: none">Effort</dt>
<dd style="display: none">
<ul class="effort-rating" style="display: block">
<li class="current-rating" style="width: 60%;">Currently 3/5 Effort.</li>
</ul>
</dd>
<dt>Weather</dt>
<dd>
<ul class="keyword_list">
<li><a href="#" onclick="javascript: return false"><span>hot</span></a></li>
<li><a href="#" onclick="javascript: return false"><span>sunny</span></a></li>
</ul>
</dd>
<dt style="display: none">Calories</dt>
<dd style="display: none">
    1067
  </dd>
</dl>
<div class="clearl empty_div"></div>
</body></html>

In [241]:
# Collect all <dt> label tags and all <dd> value tags, in document order.
dt_items, dd_items = [soup.find_all(tag_name) for tag_name in ("dt", "dd")]

In [233]:
# Show the text of every label tag.
for label in dt_items:
    print(label.get_text())


Gear
Effort
Weather
Calories

In [234]:
# Show the raw text of every value tag, prefixed so item boundaries are visible.
for value in dd_items:
    print("ITEM: %s" % value.get_text())


ITEM: 

Merrell Trail Glove 3 - pair 1


ITEM: 

Currently 3/5 Effort.


ITEM: 

hot
sunny


ITEM: 
    1067
  

In [242]:
# First attempt: gather label and value strings, deleting every newline
# from the values.
texts = {}
dt_texts = [label.text for label in dt_items]
dd_texts = [value.text.replace('\n','') for value in dd_items]

In [243]:
# Display the collected <dt> labels.
dt_texts


Out[243]:
[u'Gear', u'Effort', u'Weather', u'Calories']

In [244]:
# Display the collected <dd> values.
dd_texts


Out[244]:
[u'Merrell Trail Glove 3 - pair 1',
 u'Currently 3/5 Effort.',
 u'hotsunny',
 u'    1067  ']

In [238]:
# Pair each label with the value found at the same position.
texts = {}
for position, label in enumerate(dt_texts):
    texts[label] = dd_texts[position]

In [245]:
# NOTE(review): the recorded output below is empty because this cell ran
# (In [245]) after the In [242] cell reset `texts = {}`, while the cell that
# fills it has the earlier execution count In [238]; re-run the cells in order
# to see the mapping.
texts


Out[245]:
{}

Again, this time trimming surrounding whitespace instead of deleting every newline


In [258]:
# Second attempt: gather label and value strings, trimming only the
# whitespace around each value so inner line breaks (e.g. between weather
# keywords) survive.
dt_texts = [each.text for each in dt_items]
# A bare strip() removes any mix of leading/trailing spaces and newlines,
# where the chained strip('\n').strip(' ').strip('\n') only handled one
# particular nesting of them.
dd_texts = [each.text.strip() for each in dd_items]

In [259]:
# Display the trimmed <dd> values.
dd_texts


Out[259]:
[u'Merrell Trail Glove 3 - pair 1',
 u'Currently 3/5 Effort.',
 u'hot\nsunny',
 u'1067']

In [297]:
# Build the label -> value mapping by matching positions in the two lists.
extended_stuff = {}
for position, label in enumerate(dt_texts):
    extended_stuff[label] = dd_texts[position]

In [298]:
# Display the label -> value mapping.
extended_stuff


Out[298]:
{u'Calories': u'1067',
 u'Effort': u'Currently 3/5 Effort.',
 u'Gear': u'Merrell Trail Glove 3 - pair 1',
 u'Weather': u'hot\nsunny'}

In [299]:
# Reduce "Currently 3/5 Effort." to the single character before the slash.
if 'Effort' in extended_stuff:
    slash_at = extended_stuff['Effort'].find('/')
    # Only rewrite when a slash with a character before it exists. Without the
    # guard a missing slash indexes [-2], which picks the wrong character or
    # raises IndexError when this cell is re-run on an already-reduced value.
    if slash_at > 0:
        extended_stuff['Effort'] = extended_stuff['Effort'][slash_at - 1]
# Weather keywords arrive one per line; join them with spaces.
if 'Weather' in extended_stuff:
    extended_stuff['Weather'] = extended_stuff['Weather'].replace('\n',' ')

In [300]:
# Display the mapping after the Effort and Weather values were rewritten.
extended_stuff


Out[300]:
{u'Calories': u'1067',
 u'Effort': u'3',
 u'Gear': u'Merrell Trail Glove 3 - pair 1',
 u'Weather': u'hot sunny'}

pyquery get Details sample


In [125]:
# using sampleurl4
# The timeout keeps a stalled connection from hanging the kernel.
r = requests.get(sampleurl4, timeout=5)
r.raise_for_status()
blurb = pq(r.content)
# NOTE(review): children() only yields the top-level elements (the recorded
# run printed "not gear" twice), so the "Gear" <dt> is never visited here;
# selecting blurb('dt') is what reaches the labels.
for detail in blurb.children():
    if detail.text == 'Gear':
        print("GEAR!")
    else:
        print("not gear")


not gear
not gear

In [10]:
# Fetch the workout-details fragment for `url` with a 5-second timeout and
# fail loudly on an HTTP error status.
r = requests.get(url, timeout=5)
r.raise_for_status()

In [11]:
# Display the raw response body.
r.content


Out[11]:
'<dl id="workout_details" class="details left"> \n  <dt>Gear</dt>\n  <dd>\n    <ul class="keyword_list">\n      <li><a href="#" onclick="javascript: return false"><span>VFF SeeYa LS</span></a></li>\n    </ul>\n  </dd>\n  <dt style="display: none">Effort</dt>\n  <dd style="display: none">\n    <ul class="effort-rating" style="display: block">\n      <li class="current-rating" style="width: 60%;">Currently 3/5 Effort.</li>\n    </ul>\n  </dd>\n  <dt style="display: none">Calories</dt>\n  <dd style="display: none">\n    489\n  </dd>\n</dl>\n<div class="clearl empty_div"></div>\n'

In [50]:
# Parse the response body and read the effort rating sentence from the
# <li class="current-rating"> element.
doc = pq(r.content)
doc('li.current-rating').text()


Out[50]:
'Currently 3/5 Effort.'

In [72]:
# Text of every <ul class="keyword_list"> in the fragment (gear, plus weather
# when the entry has it).
# NOTE(review): the recorded output below does not match the gear in the
# response body shown for Out[11]; presumably `r` was re-fetched from another
# entry in between — re-run from a fresh kernel to confirm.
doc('ul.keyword_list').text()


Out[72]:
'spikes - New Balance MXC700BS'

In [73]:
# Select every <dt> label element in the parsed fragment.
doc('dt')


Out[73]:
[<dt>, <dt>, <dt>]

In [74]:
# Read the gear name from the <dd> that directly follows the "Gear" <dt>.
# Searching the whole document for <span> would also pick up the Weather
# keywords on entries that have them.
gear = ''
for detail in doc('dt'):
    if detail.text == 'Gear':
        gear_dd = detail.getnext()  # lxml: next sibling element, or None
        if gear_dd is not None and gear_dd.tag == 'dd':
            gear = pq(gear_dd).find('span').text()

In [75]:
# Display the extracted gear name.
gear


Out[75]:
'spikes - New Balance MXC700BS'

In [ ]: