Python imports


In [1]:
import time
from oneflow.core.models import *

In [9]:
# Select every article whose URL begins with the given prefix, then
# display how many matched (the bare last expression is the cell output).
url_prefix = 'http://da.feed'
arts = Article.objects(url__startswith=url_prefix)
arts.count()


Out[9]:
932

In [11]:
# Call absolutize_url() on the first 10 matches only, not the whole set.
sample = arts[:10]
for article in sample:
    article.absolutize_url()


http://da.feedsportal.com/c/32839/f/529606/s/10a6081d/l/0L0Spresence0Epc0N0Cactualite0CLong0ETerm0EHSDPA0E417570C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/10e28bb1/l/0L0Spresence0Epc0N0Cactualite0Cmicron0Eflash0E418660C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11056020/l/0L0Spresence0Epc0N0Cactualite0Chologramme0E4190A20C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/1112405f/l/0L0Spresence0Epc0N0Cactualite0Cpseudo0Eholographie0E419230C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/111b8af7/l/0L0Spresence0Epc0N0Cactualite0CMini0EDefine0E419450C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/1121e984/l/0L0Spresence0Epc0N0Cactualite0CSSD0E419570C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/113bca95/l/0L0Spresence0Epc0N0Cactualite0CFusion0EE0E350A0EE0E240A0EC0E50A0EC0E30A0E419830C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11477dc2/l/0L0Spresence0Epc0N0Cactualite0Cozmo0Esouris0Ewifi0E420A130C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/1149f346/l/0L0Spresence0Epc0N0Cactualite0Cqualcomme0Eatheros0E420A140C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/114b0bca/l/0L0Spresence0Epc0N0Cactualite0Cssd0Epuce0E420A190C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11504934/l/0L0Spresence0Epc0N0Cactualite0CSurface0E420A330C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/115203d5/l/0L0Spresence0Epc0N0Cactualite0Csandforce0Eowc0Epata0E420A430C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11558a4b/l/0L0Spresence0Epc0N0Cactualite0Cpegasystems0Ecloud0Ebpm0E420A540C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11734bc0/l/0L0Spresence0Epc0N0Cactualite0CNintendo0E3DS0E420A850C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/1174a368/l/0L0Spresence0Epc0N0Cactualite0CBlackwidow0E420A870C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11752be5/l/0L0Spresence0Epc0N0Cactualite0CPrix0Ei50E250A0AK0E420A930C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11752be6/l/0L0Spresence0Epc0N0Cactualite0CVAIO0E3D0E420A810C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/117f2692/l/0L0Spresence0Epc0N0Cactualite0CK3810E990A0A0E4210A50C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/117f5a65/l/0L0Spresence0Epc0N0Cactualite0CRadeon0E6950A0E1Go0E4210A90C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11827a6f/l/0L0Spresence0Epc0N0Cactualite0Ccloud0EUS0Eministere0EMicrosoft0E421130C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11845dca/l/0L0Spresence0Epc0N0Cactualite0CWPA0EWiFi0Ecloud0E42110A0C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/1184ea5f/l/0L0Spresence0Epc0N0Cactualite0CX580EP670EPure0E421140C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/11872088/l/0L0Spresence0Epc0N0Cactualite0CAMOLED0E4210A80C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/1188ceb3/l/0L0Spresence0Epc0N0Cactualite0CChrome0EH0B2640E421190C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/118d1542/l/0L0Spresence0Epc0N0Cactualite0Ccloud0Eubuntu0Eeucalyptus0Eopenstack0E42130A0C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/118f13a8/l/0L0Spresence0Epc0N0Cactualite0CHyDrive0EHLDS0E421340C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/119358de/l/0L0Spresence0Epc0N0Cactualite0C20A0Enm0E421360C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/1195fa52/l/0L0Spresence0Epc0N0Cactualite0CBulldozer0Eperformances0E421450C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/1199a40a/l/0L0Spresence0Epc0N0Cactualite0Ctablette0Etactile0Etaxe0E42150A0C0T0FRSS0E11/ia1.htm
http://da.feedsportal.com/c/32839/f/529606/s/119d350d/l/0L0Spresence0Epc0N0Cactualite0CXeon0ECore0Ei70E421570C0T0FRSS0E11/ia1.htm

In [ ]:


Content parsing (BoilerPipe)

Code and counts


In [2]:
# --- CONTENT: querysets grouped by content_type / content_error state ---
empty_articles    = Article.objects(content_type=0)  # base set, not reported on its own
unparsed_articles = Article.objects(content_type__exists=False)
html_articles     = Article.objects(content_type=1)
markdown_articles = Article.objects(content_type=2)
error2_articles   = Article.objects(content_error__exists=True)

# content_type == 0, split on whether a content_error was recorded
pending_articles  = empty_articles.filter(content_error__exists=False)
error_articles    = empty_articles.filter(content_error__exists=True)

# --- counts (one query each) ---
total_articles_count    = Article.objects().count()
unparsed_articles_count = unparsed_articles.count()
pending_articles_count  = pending_articles.count()
error_articles_count    = error_articles.count()
error2_articles_count   = error2_articles.count()
html_articles_count     = html_articles.count()
markdown_articles_count = markdown_articles.count()

# One timestamped line, meant to be pasted into the "Values history" list.
# "errors: A/B" is A = content_type 0 with an error, B = any article with
# a content_error.
content_line_format = '- %s: CONTENT pending: %s, errors: %s/%s, md: %s, html:%s, unparsed: %s, total: %s; fetched: %s, dupes: %s'
content_line_values = (
    time.strftime('%Y%m%d %H:%M'),
    pending_articles_count,
    error_articles_count,
    error2_articles_count,
    markdown_articles_count,
    html_articles_count,
    unparsed_articles_count,
    total_articles_count,
    global_feed_stats.fetched(),
    global_feed_stats.dupes(),
)
print(content_line_format % content_line_values)


- 20130711 13:56: CONTENT pending: 26263, errors: 4710/7800, md: 67586, html:0, unparsed: 2291701, total: 2390258; fetched: 196455, dupes: 4225707

Values history

  • 20130711 13:56: CONTENT pending: 26263, errors: 4710/7800, md: 67586, html:0, unparsed: 2291701, total: 2390258; fetched: 196455, dupes: 4225707

  • 20130706 ??:??: CONTENT pending: 14967, errors: 0854, md: 33341, html:0, unparsed: 2291701, total: 2340863

  • 20130707 14:16: CONTENT pending: 20967, errors: 1006, md: 33968, html:0, unparsed: 2291701, total: 2347642
  • 20130707 14:18: CONTENT pending: 20867, errors: 1092, md: 33982, html:0, unparsed: 2291701, total: 2347642 (after setting JAVA_HOME in .env; run: 100)
  • 20130707 14:21: CONTENT pending: 20539, errors: 1402, md: 34088, html:0, unparsed: 2291701, total: 2347727 (run:100)
  • 20130707 15:29: CONTENT pending: 19915, errors: 1961, md: 34537, html:0, unparsed: 2291701, total: 2348114 (restart w03+max=30, run:100)
  • 20130707 15:47: CONTENT pending: 19181, errors: 2266, md: 35056, html:0, unparsed: 2291701, total: 2348203 (run: 10k, which was incomplete)
  • 20130707 15:55: CONTENT pending: 18668, errors: 2859, md: 35056, html:0, unparsed: 2291701, total: 2348284 (run: remaining)
  • 20130707 16:51: CONTENT pending: 18862, errors: 2964, md: 35194, html:0, unparsed: 2291701, total: 2348721 (after w03 restart…)
  • 20130707 18:07: CONTENT pending: 18588, errors: 3342, md: 35596, html:0, unparsed: 2291701, total: 2349227 (again)
  • 20130707 18:24: CONTENT pending: 16327, errors: 3468, md: 37913, html:0, unparsed: 2291701, total: 2349390 (crontab restart */2 min…)
  • 20130707 18:35: CONTENT pending: 14844, errors: 3530, md: 39346, html:0, unparsed: 2291701, total: 2349421
  • 20130707 20:21: CONTENT pending: 12081, errors: 3094, md: 43329, html:0, unparsed: 2291701, total: 2350195
  • 20130707 20:25: CONTENT pending: 12137, errors: 3028/4268, md: 43414, html:0, unparsed: 2291701, total: 2350280
  • 20130707 21:31: CONTENT pending: 7728, errors: 3857/5097, md: 47437, html:0, unparsed: 2291701, total: 2350723; fetched: 117387, dupes: 2272633
  • 20130707 21:39: CONTENT pending: 7296, errors: 3985/5225, md: 47786, html:0, unparsed: 2291701, total: 2350768; fetched: 117477, dupes: 2280329
  • 20130708 07:19: CONTENT pending: 960, errors: 4878/6661, md: 55770, html:0, unparsed: 2291701, total: 2353309; fetched: 122555, dupes: 2723165
  • 20130708 07:34: CONTENT pending: 86, errors: 5001/6786, md: 56548, html:0, unparsed: 2291701, total: 2353336; fetched: 122609, dupes: 2725029
  • 20130708 07:40: CONTENT pending: 48, errors: 5005/6791, md: 56602, html:0, unparsed: 2291701, total: 2353356; fetched: 122649, dupes: 2730639
  • 20130708 08:52: CONTENT pending: 47, errors: 4929/6799, md: 56957, html:0, unparsed: 2291701, total: 2353634; fetched: 123205, dupes: 2767447
  • 20130708 09:44: CONTENT pending: 8, errors: 4685/6811, md: 57240, html:0, unparsed: 2291701, total: 2353634; fetched: 123205, dupes: 2767447
  • 20130708 10:44: CONTENT pending: 1, errors: 4363/6829, md: 57569, html:0, unparsed: 2291701, total: 2353634; fetched: 123205, dupes: 2767447
  • 20130708 15:31: CONTENT pending: 1, errors: 4233/6833, md: 57699, html:0, unparsed: 2291701, total: 2353634; fetched: 123205, dupes: 2767447

Full content parsing (Readability)

Code and counts


In [4]:
# --- FULL_CONTENT: querysets grouped by full_content_type / full_content_error ---
f_empty_articles    = Article.objects(full_content_type=0) # not counted
f_unparsed_articles = Article.objects(full_content_type__exists=False)
f_pending_articles  = f_empty_articles.filter(full_content_error__exists=False)
f_error_articles    = f_empty_articles.filter(full_content_error__exists=True)
f_error2_articles   = Article.objects(full_content_error__exists=True)
f_html_articles     = Article.objects(full_content_type=1)
f_markdown_articles = Article.objects(full_content_type=2)

# counts
# The total is computed here as well, so this cell does not rely on another
# cell having defined `total_articles_count` (NameError on a fresh kernel,
# or a stale total if that cell was run much earlier).
total_articles_count      = Article.objects().count()
f_unparsed_articles_count = f_unparsed_articles.count()
f_pending_articles_count  = f_pending_articles.count()
f_error_articles_count    = f_error_articles.count()
f_error2_articles_count   = f_error2_articles.count()
f_html_articles_count     = f_html_articles.count()
f_markdown_articles_count = f_markdown_articles.count()

# One timestamped line, meant to be pasted into the "Values history" list.
# "errors: A/B" is A = full_content_type 0 with an error, B = any article
# with a full_content_error.
print('- %s: FULL_CONTENT pending: %s, errors: %s/%s, md: %s, html:%s, unparsed: %s, total: %s' % (
    time.strftime('%Y%m%d %H:%M'),
    f_pending_articles_count,
    f_error_articles_count, f_error2_articles_count,
    f_markdown_articles_count,
    f_html_articles_count,
    f_unparsed_articles_count,
    total_articles_count))


- 20130708 16:00: FULL_CONTENT pending: 35373, errors: 16810/16812, md: 9750, html:0, unparsed: 2291701, total: 2353634

Values history

  • 20130707 16:53: FULL_CONTENT pending: 50145, errors: 2177, md: 4704, html:0, unparsed: 2291701, total: 2348721
  • 20130707 18:07: FULL_CONTENT pending: 50141, errors: 2193, md: 5198, html:0, unparsed: 2291701, total: 2349227
  • 20130707 18:24: FULL_CONTENT pending: 50189, errors: 2195, md: 5314, html:0, unparsed: 2291701, total: 2349390
  • 20130707 18:37: FULL_CONTENT pending: 50165, errors: 2198, md: 5378, html:0, unparsed: 2291701, total: 2349421
  • 20130707 20:22: FULL_CONTENT pending: 50223, errors: 2258, md: 6055, html:0, unparsed: 2291701, total: 2350195
  • 20130707 20:25: FULL_CONTENT pending: 50214, errors: 2259/2259, md: 6110, html:0, unparsed: 2291701, total: 2350280
  • 20130707 21:40: FULL_CONTENT pending: 50147, errors: 2279/2279, md: 6643, html:0, unparsed: 2291701, total: 2350768
  • 20130708 07:20: FULL_CONTENT pending: 50286, errors: 2397/2397, md: 8925, html:0, unparsed: 2291701, total: 2353309
  • 20130708 07:41: FULL_CONTENT pending: 49914, errors: 2424/2424, md: 9325, html:0, unparsed: 2291701, total: 2353356
  • 20130708 08:53: FULL_CONTENT pending: 48100, errors: 4084/4085, md: 9750, html:0, unparsed: 2291701, total: 2353634
  • 20130708 09:45: FULL_CONTENT pending: 46568, errors: 5617/5617, md: 9750, html:0, unparsed: 2291701, total: 2353634
  • 20130708 10:45: FULL_CONTENT pending: 44822, errors: 7363/7363, md: 9750, html:0, unparsed: 2291701, total: 2353634
  • 20130708 15:32: FULL_CONTENT pending: 36197, errors: 15986/15987, md: 9750, html:0, unparsed: 2291701, total: 2353634

Various things and checks


In [12]:
# NOTE: re-enable content parsing in constance first!!
# Calls parse_content.delay() once for every article in `pending_articles`
# (content_type == 0 with no content_error recorded, per the CONTENT cell).
# NOTE(review): `.delay()` looks like an asynchronous task dispatch
# (Celery-style) and "constance" like a django-constance runtime setting;
# confirm both against the project before relying on this.
for article in pending_articles:
    article.parse_content.delay()

In [5]:
# NOTE: re-enable content parsing in constance first!!
# Calls parse_full_content.delay() once for every article in
# `f_pending_articles` (full_content_type == 0 with no full_content_error
# recorded, per the FULL_CONTENT cell).
# NOTE(review): the warning above is worded identically to the one on the
# content loop; the switch that matters here may be the full-content one.
# `.delay()` presumably dispatches an asynchronous task. Confirm both.
for article in f_pending_articles:
    article.parse_full_content.delay()

In [12]:
# Fetch one specific article by its id for manual inspection.
article_id = "51d968514adc897c6893920f"
article = Article.objects.get(id=article_id)

In [20]:
# Show the error, body and type fields of both parsing stages as one tuple.
parse_state = (
    article.content_error, article.full_content_error,
    article.content, article.full_content,
    article.content_type, article.full_content_type,
)
print(parse_state)


(u'HTTP Error 404: Not Found', u'This article could not be fetched or is otherwise invalid. This is most likely an issue with fetching the article from the source server. Please check that the source server is available and that your URL was properly escaped.', None, None, 0, 0)

In [ ]: