Python imports


In [10]:
from oneflow.core.models.nonrel import *
from oneflow.core.stats import *
from oneflow.base.utils.dateutils import *

from mongoengine.queryset import Q
from statsd import statsd

from oneflow import VERSION as version
# Print the 1flow version together with every attribute value stored on the
# imported `statsd` object, skipping its `_sock` attribute.
statsd_settings = u', '.join(
    u'%s' % (value,)
    for name, value in statsd.__dict__.items()
    if name != '_sock'
)
print(u'1flow v%s, statsd %s' % (version, statsd_settings))

Absolutize URLs now


In [2]:
# Count the articles with content_type == 0 whose `content_error` and
# `url_error` fields are both empty strings.
# NOTE(review): the meaning of content_type 0 is not visible here -- confirm.
Article.objects(content_type=0).filter(content_error='', url_error='').count()


Out[2]:
25644

In [3]:
# Count every article with content_type == 0, regardless of error fields.
Article.objects(content_type=0).count()


Out[3]:
92708

In [4]:
# Fetch a single article by its exact `url` value for manual inspection.
article = Article.objects.get(url='http://boingboing.net/2013/07/26/flexible-interactive-electron.html')

In [11]:
# Dump the article's `feedparser_original_data` attribute in full.
# The recorded output is a large dict that embeds the entry's HTML.
print article.feedparser_original_data


{'slash_comments': u'0', 'published_parsed': time.struct_time(tm_year=2013, tm_mon=7, tm_mday=26, tm_hour=10, tm_min=15, tm_sec=28, tm_wday=4, tm_yday=207, tm_isdst=0), 'author': u'David Pescovitz', 'links': [{'href': u'http://feedproxy.google.com/~r/boingboing/iBag/~3/Jgs2SnQKEJQ/story01.htm', 'type': u'text/html', 'rel': u'alternate'}], 'tags': [{'term': u'Post', 'scheme': None, 'label': None}, {'term': u'Technology', 'scheme': None, 'label': None}, {'term': u'Science', 'scheme': None, 'label': None}], 'feedburner_origlink': u'http://rss.feedsportal.com/c/35208/f/653965/s/2f340de7/sc/5/l/0Lboingboing0Bnet0C20A130C0A70C260Cflexible0Einteractive0Eelectron0Bhtml/story01.htm', 'title': u'Flexible, interactive electronic\xa0"skin"', 'wfw_commentrss': u'http://boingboing.net/2013/07/26/flexible-interactive-electron.html/feed', 'summary': u'<img alt="NewImage" border="0" class="alignright" height="291" src="http://media.boingboing.net/wp-content/uploads/2013/07/NewImage154.png" title="NewImage.png" width="375" />UC Berkeley researchers demonstrated a new kind of thin, flexible "electronic skin" that lights up when you touch it. Press on it, and it glows brighter. The prototype film consists of 16 x 16 pixels, each outfitted with a transistor, organic LED, and pressure sensor. It\'s made using the same fabrication tools employed by the semiconductor industry. I can\'t wait until fashion designers and other makers can get bolts of this stuff for cheap! 
"<a href="http://newscenter.berkeley.edu/2013/07/21/first-interactive-e-skin-built-on-plastic/">Paper-thin e-skin responds to touch, holds promise for sensory robotics and interactive environments</a>"<img border="0" height="1" src="http://rss.feedsportal.com/c/35208/f/653965/s/2f340de7/sc/5/mf.gif" width="1" /><div class="mf-viral"><table border="0"><tr><td valign="middle"><a href="http://share.feedsportal.com/share/twitter/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/twitter.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/facebook/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/facebook.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/linkedin/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/linkedin.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/gplus/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/googleplus.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/email/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/email.png" /></a></td><td valign="middle"></td></tr></table></div><br /><br /><a href="http://da.feedsportal.com/r/172449329748/u/49/f/653965/c/35208/s/2f340de7/a2.htm"><img border="0" 
src="http://da.feedsportal.com/r/172449329748/u/49/f/653965/c/35208/s/2f340de7/a2.img" /></a><img border="0" height="1" src="http://pi.feedsportal.com/r/172449329748/u/49/f/653965/c/35208/s/2f340de7/a2t.img" width="1" /><img height="1" src="http://feeds.feedburner.com/~r/boingboing/iBag/~4/Jgs2SnQKEJQ" width="1" />', 'content': [{'base': u'http://feeds.feedburner.com/boingboing/iBag', 'type': u'text/html', 'value': u'<img alt="NewImage" border="0" class="alignright" height="291" src="http://media.boingboing.net/wp-content/uploads/2013/07/NewImage154.png" title="NewImage.png" width="375" />UC Berkeley researchers demonstrated a new kind of thin, flexible "electronic skin" that lights up when you touch it. Press on it, and it glows brighter. The prototype film consists of 16 x 16 pixels, each outfitted with a transistor, organic LED, and pressure sensor. It\'s made using the same fabrication tools employed by the semiconductor industry. I can\'t wait until fashion designers and other makers can get bolts of this stuff for cheap! 
"<a href="http://newscenter.berkeley.edu/2013/07/21/first-interactive-e-skin-built-on-plastic/">Paper-thin e-skin responds to touch, holds promise for sensory robotics and interactive environments</a>"<img border="0" height="1" src="http://rss.feedsportal.com/c/35208/f/653965/s/2f340de7/sc/5/mf.gif" width="1" /><div class="mf-viral"><table border="0"><tr><td valign="middle"><a href="http://share.feedsportal.com/share/twitter/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/twitter.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/facebook/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/facebook.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/linkedin/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/linkedin.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/gplus/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/googleplus.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/email/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/email.png" /></a></td><td valign="middle"></td></tr></table></div><br /><br /><a href="http://da.feedsportal.com/r/172449329748/u/49/f/653965/c/35208/s/2f340de7/a2.htm"><img border="0" 
src="http://da.feedsportal.com/r/172449329748/u/49/f/653965/c/35208/s/2f340de7/a2.img" /></a><img border="0" height="1" src="http://pi.feedsportal.com/r/172449329748/u/49/f/653965/c/35208/s/2f340de7/a2t.img" width="1" /><img height="1" src="http://feeds.feedburner.com/~r/boingboing/iBag/~4/Jgs2SnQKEJQ" width="1" />', 'language': None}, {'base': u'http://feeds.feedburner.com/boingboing/iBag', 'type': u'text/html', 'value': u'UC Berkeley researchers demonstrated a new kind of thin, flexible "electronic skin" that lights up when you touch it. Press on it, and it glows brighter.<img border="0" height="1" src="http://rss.feedsportal.com/c/35208/f/653965/s/2f340de7/sc/5/mf.gif" width="1" /><div class="mf-viral"><table border="0"><tr><td valign="middle"><a href="http://share.feedsportal.com/share/twitter/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/twitter.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/facebook/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/facebook.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/linkedin/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/linkedin.png" /></a>&nbsp;<a href="http://share.feedsportal.com/share/gplus/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/googleplus.png" /></a>&nbsp;<a 
href="http://share.feedsportal.com/share/email/?u=http%3A%2F%2Fboingboing.net%2F2013%2F07%2F26%2Fflexible-interactive-electron.html&amp;t=Flexible%2C+interactive+electronic%C2%A0%22skin%22" target="_blank"><img border="0" src="http://res3.feedsportal.com/social/email.png" /></a></td><td valign="middle"></td></tr></table></div><br /><br /><a href="http://da.feedsportal.com/r/172449329748/u/49/f/653965/c/35208/s/2f340de7/a2.htm"><img border="0" src="http://da.feedsportal.com/r/172449329748/u/49/f/653965/c/35208/s/2f340de7/a2.img" /></a><img border="0" height="1" src="http://pi.feedsportal.com/r/172449329748/u/49/f/653965/c/35208/s/2f340de7/a2t.img" width="1" />', 'language': None}], 'guidislink': False, 'title_detail': {'base': u'http://feeds.feedburner.com/boingboing/iBag', 'type': u'text/plain', 'value': u'Flexible, interactive electronic\xa0"skin"', 'language': None}, 'link': u'http://feedproxy.google.com/~r/boingboing/iBag/~3/Jgs2SnQKEJQ/story01.htm', 'authors': [{}], 'author_detail': {'name': u'David Pescovitz'}, 'id': u'http://boingboing.net/?p=245647', 'published': u'Fri, 26 Jul 2013 10:15:28 PDT'}

In [4]:
# Call `absolutize_url.delay()` on every article whose `url_absolute` field
# is not True, then report how many calls succeeded and how long it took.
# (`.delay()` is presumably an asynchronous task launch -- confirm.)
#
# NOTE: 10k in 15secs, 20k in 1 min, 100k in 2 min, 400k in 10 mins,
# 1M in 31 mins, 1045395 launched in 34 minutes.

count = 0
failed = 0
start_time = pytime.time()

# no_cache() asks mongoengine for a non-caching queryset, so documents already
# iterated over are not retained by the queryset itself.
for article in Article.objects(url_absolute__ne=True).no_cache():
    try:
        article.absolutize_url.delay()
    except Exception as e:
        # Best effort: report the failing article and keep going. %r is used
        # instead of str(e) so a non-ASCII error message cannot raise a
        # UnicodeEncodeError from inside the handler and abort the loop.
        print('%s: %r' % (article.id, e))
        failed += 1
        continue
    count += 1

print('\n%s >> %s launched in %s.\n' % (now().isoformat(), count, naturaldelta(pytime.time() - start_time)))

if failed:
    print('%s failed to launch.' % failed)


2013-07-21T07:56:08.984567+00:00 >> 1045395 launched in 34 minutes.

Manual checks & experiments


In [2]:
# Look up one article by its id and display its stored `url`.
Article.objects.get(id='51ca270ce7a05b27f6bb665b').url


Out[2]:
u'http://www.theawl.com/2011/09/corks-v-screw-tops'

In [ ]:
# Call absolutize_url() directly (not via .delay()) on the first ten items of `arts`.
# NOTE(review): `arts` is not defined in any visible cell and this cell has no
# execution count -- define it explicitly before running.
for article in arts[:10]:
    article.absolutize_url()

In [33]:
# Request a feedsportal story link while sending a browser User-agent header.
# In the recorded output this ends, after one 301, on a 200 response served by
# da.feedsportal.com rather than on the publisher's own site.
feedsportal_url = 'http://rss.feedsportal.com/c/707/f/9951/s/2b27496a/l/0L0Sreseaux0Etelecoms0Bnet0Cactualites0Clire0Elancement0Emondial0Edu0Esamsung0Egalaxy0Es40E25980A0Bhtml/story01.htm'
browser_headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/22.0'}
r = requests.get(feedsportal_url, headers=browser_headers)


Starting new HTTP connection (1): rss.feedsportal.com
"GET /c/707/f/9951/s/2b27496a/l/0L0Sreseaux0Etelecoms0Bnet0Cactualites0Clire0Elancement0Emondial0Edu0Esamsung0Egalaxy0Es40E25980A0Bhtml/story01.htm HTTP/1.1" 301 0
Starting new HTTP connection (1): da.feedsportal.com
"GET /c/707/f/9951/s/2b27496a/l/0L0Sreseaux0Etelecoms0Bnet0Cactualites0Clire0Elancement0Emondial0Edu0Esamsung0Egalaxy0Es40E25980A0Bhtml/ia1.htm HTTP/1.1" 200 2918

In [17]:
# Display the HTTP headers of the response held in `r`.
r.headers


Out[17]:
CaseInsensitiveDict({'date': 'Thu, 18 Jul 2013 09:35:02 GMT', 'content-length': '2918', 'content-type': 'text/html;charset=utf-8', 'connection': 'close', 'server': 'FeedsPortal'})

In [9]:
# Keep the last entry of the response's redirect history.
# NOTE(review): `resp` is not used by any visible cell.
resp = r.history[-1]

In [27]:
# Parse the body of `r` and print the href of every <a> element whose text
# contains 'here to continue'.
soup = BeautifulSoup(r.content)
for anchor in soup.findAll('a'):
    if 'here to continue' not in anchor.text:
        continue
    print(anchor['href'])


http://www.reseaux-telecoms.net/actualites/lire-lancement-mondial-du-samsung-galaxy-s4-25980.html

In [31]:
# Request the same feedsportal story link with no custom headers.
# In the recorded output the redirect chain goes through two 301 responses and
# ends with a 200 from www.reseaux-telecoms.net.
r = requests.get('http://rss.feedsportal.com/c/707/f/9951/s/2b27496a/l/0L0Sreseaux0Etelecoms0Bnet0Cactualites0Clire0Elancement0Emondial0Edu0Esamsung0Egalaxy0Es40E25980A0Bhtml/story01.htm')


Starting new HTTP connection (1): rss.feedsportal.com
"GET /c/707/f/9951/s/2b27496a/l/0L0Sreseaux0Etelecoms0Bnet0Cactualites0Clire0Elancement0Emondial0Edu0Esamsung0Egalaxy0Es40E25980A0Bhtml/story01.htm HTTP/1.1" 301 0
Starting new HTTP connection (1): da.feedsportal.com
"GET /c/707/f/9951/s/2b27496a/l/0L0Sreseaux0Etelecoms0Bnet0Cactualites0Clire0Elancement0Emondial0Edu0Esamsung0Egalaxy0Es40E25980A0Bhtml/ia1.htm HTTP/1.1" 301 0
Starting new HTTP connection (1): www.reseaux-telecoms.net
"GET /actualites/lire-lancement-mondial-du-samsung-galaxy-s4-25980.html HTTP/1.1" 200 None

In [ ]: