In [ ]:
import os
import shutil
import urllib
import urllib2
import zipfile
from urllib2 import HTTPError
In [ ]:
# Fetch a page's HTML as a string.
get_html = lambda url: urllib2.urlopen(url).read()
# Download a remote file to a local path.
download_file = lambda url, fout: urllib.urlretrieve(url, fout)
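A quick, hedged check (not part of the original pipeline): the cell below exercises get_html against a single node page from the range scraped later; it assumes network access and that the page still exists.
In [ ]:
# Sanity check (assumes network access): fetch one node page and
# report its size in bytes.
page = get_html('http://mnemosyne-proj.org/node/50')
print '%d bytes' % len(page)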
In [ ]:
# Create the output directory before downloading into it.
if not os.path.exists('shared_decks'):
    os.makedirs('shared_decks')

for i in xrange(50, 409):
    print i
    try:
        raw = get_html('http://mnemosyne-proj.org/node/%d' % i)
    except HTTPError:
        continue
    # Every shared-deck link ends in '.cards'; split on that suffix and
    # take the preceding href to recover each deck's URL.
    for s in raw.split('.cards"')[:-1]:
        fin = s.split('href="')[-1]
        deck = fin.split('/')[-1]
        print deck
        download_file(fin + '.cards',
                      os.path.join('shared_decks', deck + '.zip'))
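As a small follow-up sketch (not in the original), this cell counts the archives the scrape deposited in shared_decks/, to confirm it produced files before moving on.
In [ ]:
# Count the .zip archives downloaded so far (sanity check only).
print len([x for x in os.listdir('shared_decks') if x.endswith('.zip')])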
In [ ]:
fs = [x for x in os.listdir('shared_decks') if x.endswith('.zip')]
for f in fs:
    abs_f = os.path.join('shared_decks', f)
    extract_dir = abs_f.replace('.zip', '')
    try:
        # Unpack the archive, keep only cards.xml (renamed after the
        # deck), and remove the rest of the extracted files.
        with zipfile.ZipFile(abs_f, 'r') as g:
            g.extractall(extract_dir)
        os.rename(os.path.join(extract_dir, 'cards.xml'),
                  abs_f.replace('.zip', '.xml'))
        shutil.rmtree(extract_dir)
    except (zipfile.BadZipfile, OSError):
        # Skip corrupt archives and decks that lack a cards.xml.
        pass
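A hedged verification cell (not in the original): parse one extracted deck with the standard library's xml.etree to confirm the XML is well-formed; the file chosen is simply whichever .xml is listed first.
In [ ]:
import xml.etree.ElementTree as ET

# Parse the first extracted deck and report how many top-level
# elements its XML contains (sanity check only).
xmls = [x for x in os.listdir('shared_decks') if x.endswith('.xml')]
if xmls:
    tree = ET.parse(os.path.join('shared_decks', xmls[0]))
    print xmls[0], len(tree.getroot())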
In [ ]: