In [19]:
import os
import shutil
import sh
from pandas import DataFrame, Series
from itertools import islice
import yaml
# Local checkout locations: the text file naming the repos to process (one
# repo directory name per line) and the directory that contains those repos.
REPOS_LIST = "/Users/raymondyee/C/src/gitenberg/Second-Folio/list_of_repos.txt"
GITENBERG_DIR = "/Users/raymondyee/C/src/gitenberg/"

# Read the repo list inside a `with` block so the file handle is closed.
with open(REPOS_LIST) as repos_file:
    repos = repos_file.read().strip().split("\n")

# For every repo that has a metadata.yaml, drop the `covers` entries whose
# `image_path` is missing or does not point at an existing file in the repo,
# and rewrite metadata.yaml in place when at least one entry was dropped.
# islice(repos, None) walks the whole list; swap None for a small number to
# try the cell on just the first few repos.
for (i, repo) in enumerate(islice(repos, None)):
    repo_dir = os.path.join(GITENBERG_DIR, repo)
    dest = os.path.join(repo_dir, "metadata.yaml")
    if not os.path.exists(dest):
        continue

    # "r+" lets the same handle be read and then overwritten in place.
    with open(dest, "r+") as yfile:
        # safe_load instead of yaml.load: it never constructs arbitrary Python
        # objects, and the data is written back with safe_dump anyway.
        metadata = yaml.safe_load(yfile)
        if not isinstance(metadata, dict):
            # Empty file or a non-mapping document: nothing to prune.
            continue

        # `or []` also covers a metadata file that says `covers: null`.
        covers = metadata.get('covers') or []

        # Keep only the covers whose image file actually exists.
        kept_covers = [
            cover for cover in covers
            if cover.get("image_path") is not None
            and os.path.exists(os.path.join(repo_dir, cover.get("image_path")))
        ]

        if len(kept_covers) < len(covers):
            print (i, dest, len(covers), len(kept_covers))
            metadata['covers'] = kept_covers
            # Rewind, overwrite, then truncate so no stale tail of the old
            # (longer) content survives after the new, shorter document.
            yfile.seek(0)
            yfile.write(yaml.safe_dump(metadata, default_flow_style=False, allow_unicode=True))
            yfile.truncate()
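Fixes — repos whose metadata.yaml listed covers with missing image files (index, metadata path, covers listed, covers kept):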
(0, '/Users/raymondyee/C/src/gitenberg/Adventures-of-Huckleberry-Finn_76/metadata.yaml', 6, 1) (16, '/Users/raymondyee/C/src/gitenberg/Dracula_345/metadata.yaml', 2, 1) (18, '/Users/raymondyee/C/src/gitenberg/Household-Stories-by-the-Brothers-Grimm_19068/metadata.yaml', 2, 1) (38, '/Users/raymondyee/C/src/gitenberg/Les-Mis-rables_135/metadata.yaml', 2, 1) (40, '/Users/raymondyee/C/src/gitenberg/The-Count-of-Monte-Cristo_1184/metadata.yaml', 2, 1)
In [20]:
def git_cover_metadata(repo):
    """Stage, commit and push metadata.yaml for one repo under GITENBERG_DIR.

    Parameters:
        repo: name of the repo directory, relative to GITENBERG_DIR.

    Returns:
        None when the push completed, and also when the repo has no
        metadata.yaml (in which case nothing is run). If any step raises an
        Exception, that exception object is returned rather than raised, so a
        caller looping over many repos can print it and carry on.
    """
    repo_dir = os.path.join(GITENBERG_DIR, repo)
    metadata_path = os.path.join(repo_dir, "metadata.yaml")
    try:
        if not os.path.exists(metadata_path):
            return None

        # Run every git command with _cwd=repo_dir instead of calling sh.cd():
        # sh.cd changes the working directory of the whole kernel and leaves it
        # changed for every later cell.
        print ("add")
        sh.git("add", "metadata.yaml", _cwd=repo_dir)

        print ("commit")
        try:
            sh.git("commit", "-m", "remove covers from metadata.yaml with bad paths",
                   _cwd=repo_dir)
        except sh.ErrorReturnCode:
            # git exits non-zero when there is nothing to commit; that is not
            # fatal here, so note it and still attempt the push. Only git's own
            # failures are tolerated -- KeyboardInterrupt is no longer swallowed.
            print ("commit returned an error (possibly nothing to commit); continuing")

        print ("push")
        sh.git("push", _cwd=repo_dir)
        return None
    except Exception as e:
        return e
In [21]:
# Repo directory names whose metadata.yaml should be committed and pushed.
# Each entry must be the bare directory name: git_cover_metadata() appends
# "metadata.yaml" itself, so an entry such as 'Dracula_345/metadata.yaml'
# resolves to a path that does not exist and the repo is silently skipped.
repos_fixed = [
    'Adventures-of-Huckleberry-Finn_76',
    'Dracula_345',
    'Household-Stories-by-the-Brothers-Grimm_19068',
    'Les-Mis-rables_135',
    'The-Count-of-Monte-Cristo_1184',
]
In [23]:
# Commit and push each fixed repo in turn, echoing its position and name
# followed by whatever git_cover_metadata() hands back for it.
for i, repo in enumerate(repos_fixed):
    print (i, repo)
    result = git_cover_metadata(repo)
    print (result)
In [ ]: