In [1]:
import os
import json
import shutil
import sh
import yaml
from pandas import DataFrame, Series
from itertools import islice
# Local checkout locations. These are absolute paths on the original author's
# machine; adjust them before running the notebook anywhere else.
REPOS_LIST = "/Users/raymondyee/C/src/gitenberg/Second-Folio/list_of_repos.txt"
GITENBERG_DIR = "/Users/raymondyee/C/src/gitenberg/"
METADATA_DIR = "/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata"
COVERS_DATA = "/Users/raymondyee/C/src/gitenberg/Second-Folio/covers_data.json"

# One repository name per line. The file is read inside a `with` block so the
# handle is closed, and blank lines are dropped so an empty or padded file does
# not produce a bogus "" repo name.
with open(REPOS_LIST) as repos_file:
    repos = [line.strip() for line in repos_file if line.strip()]
In [3]:
# Round-trip check for the first two repos: load each source metadata YAML,
# re-serialize it with safe_dump(default_flow_style=False, allow_unicode=True),
# and print whether the text on disk is already identical to that output.
# (Original note: compare the yaml file with what it would be if we serialized
# it in the way I think it's being done by Eric.)
for repo in islice(repos, 0, 2):
    pg_id = repo.split("_")[-1]
    source = os.path.join(METADATA_DIR, "{}.yaml".format(pg_id))
    print (source)
    if os.path.exists(source):
        # Read the file once and parse from the text, instead of opening a
        # second handle on the same path that is never closed.
        with open(source, "r") as yfile:
            yaml_0 = yfile.read()
        # safe_load pairs with safe_dump; plain yaml.load without a Loader is
        # unsafe and is rejected by recent PyYAML releases.
        y = yaml.safe_load(yaml_0)
        yaml_1 = yaml.safe_dump(y, default_flow_style=False, allow_unicode=True)
        print (source, yaml_0 == yaml_1)
Now we are ready to compare the files in `/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata` with the `metadata.yaml` file in each repo.
In [29]:
# comparing source / destination
# For the first repo only: run `diff` on the source metadata file and the
# repo's metadata.yaml, and copy the source over the repo copy when diff exits
# with a non-zero status.
for repo in islice(repos, 0, 1):
    pg_id = repo.split("_")[-1]
    source = os.path.join(METADATA_DIR, "{}.yaml".format(pg_id))
    dest = os.path.join(GITENBERG_DIR, repo, "metadata.yaml")
    try:
        sh.diff(source, dest)
    except sh.ErrorReturnCode:
        # diff exits 1 when the files differ and >1 on trouble (for example
        # when dest does not exist yet); both cases call for a (re)copy.
        # Only non-zero exits are caught, so an unrelated failure such as a
        # missing `diff` binary no longer triggers an overwrite.
        shutil.copyfile(source, dest)
        print (source, dest)
In [38]:
def git_reload_metadata(repo, message="update metadata.yaml with RTC as publisher; adding _version"):
    """Stage, commit and push ``metadata.yaml`` in one local repo checkout.

    Args:
        repo: directory name of the repo under ``GITENBERG_DIR``.
        message: commit message; defaults to the message this notebook
            originally hard-coded.

    Returns:
        ``None`` when ``metadata.yaml`` does not exist or when add/push
        completed; otherwise the exception raised by ``git add`` or
        ``git push`` (returned, not raised, so a batch loop can keep going).

    Prints "add", "commit" and "push" as each step starts.
    """
    repo_dir = os.path.join(GITENBERG_DIR, repo)
    metadata_path = os.path.join(repo_dir, "metadata.yaml")
    if not os.path.exists(metadata_path):
        return None

    try:
        # `_cwd` runs git inside the repo without changing the working
        # directory of the whole notebook process (which `sh.cd` did).
        print ("add")
        sh.git("add", "metadata.yaml", _cwd=repo_dir)
        print ("commit")
        try:
            sh.git("commit", "-m", message, _cwd=repo_dir)
        except sh.ErrorReturnCode:
            # Best effort: `git commit` exits non-zero when there is nothing
            # to commit; still attempt the push. Narrower than a bare
            # `except`, so KeyboardInterrupt is no longer swallowed.
            pass
        print ("push")
        sh.git.push(_cwd=repo_dir)
    except Exception as e:
        return e
    return None
In [41]:
# Commit and push the reloaded metadata.yaml for every repo except the first.
def do_git_reload_metadata():
    """Run git_reload_metadata over repos[1:], printing progress.

    For each repo, prints its zero-based position within the sliced list and
    its name, then prints whatever git_reload_metadata returned (None or the
    exception it captured).
    """
    for (i, repo) in enumerate(repos[1:]):
        print (i, repo)
        outcome = git_reload_metadata(repo)
        print (outcome)


do_git_reload_metadata()
In [ ]:
import yaml
In [30]:
# Add the cover-art attribution record to the repo's metadata.yaml, unless an
# identical record is already listed under 'covers'. Only the first repo is
# processed here.
with open(COVERS_DATA) as covers_file:
    covers_data = json.load(covers_file)
# Index the cover records by their 'GitHub repo' field for direct lookup.
covers_data_dict = dict((c['GitHub repo'], c) for c in covers_data)

for repo in islice(repos, 0, 1):
    pg_id = repo.split("_")[-1]
    dest = os.path.join(GITENBERG_DIR, repo, "metadata.yaml")
    cover_artist = covers_data_dict[repo]['cover_artist']
    cover_metadata = {
        "cover_type": "original",
        "image_path": "cover.jpg",
        "rights": "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)",
        "rights_url": "https://creativecommons.org/licenses/by-nc/4.0/",
        "attribution": u"{}, 2015".format(cover_artist)
    }
    print (dest)
    # is cover_metadata already in the YAML?
    if os.path.exists(dest):
        with open(dest, "r+") as yfile:
            # safe_load pairs with safe_dump; plain yaml.load without a Loader
            # is unsafe and is rejected by recent PyYAML releases.
            y = yaml.safe_load(yfile)
            # `or []` also covers an explicit `covers: null` entry.
            covers = y.get('covers') or []
            # Dict equality gives the same answer as comparing sets of items,
            # without requiring every existing cover value to be hashable.
            if cover_metadata not in covers:
                covers.append(cover_metadata)
                y['covers'] = covers
                # Rewrite in place; truncate in case the new text is shorter.
                yfile.seek(0)
                yfile.write(yaml.safe_dump(y, default_flow_style=False, allow_unicode=True))
                yfile.truncate()
            else:
                print ("already in", dest, cover_metadata)
In [ ]:
def git_cover_metadata(repo, message="update metadata.yaml with cover attribution"):
    """Stage, commit and push ``metadata.yaml`` in one local repo checkout.

    Args:
        repo: directory name of the repo under ``GITENBERG_DIR``.
        message: commit message; defaults to the message this notebook
            originally hard-coded.

    Returns:
        ``None`` when ``metadata.yaml`` does not exist or when add/push
        completed; otherwise the exception raised by ``git add`` or
        ``git push`` (returned, not raised, so a batch loop can keep going).

    Prints "add", "commit" and "push" as each step starts.
    """
    repo_dir = os.path.join(GITENBERG_DIR, repo)
    metadata_path = os.path.join(repo_dir, "metadata.yaml")
    if not os.path.exists(metadata_path):
        return None

    try:
        # `_cwd` runs git inside the repo without changing the working
        # directory of the whole notebook process (which `sh.cd` did).
        print ("add")
        sh.git("add", "metadata.yaml", _cwd=repo_dir)
        print ("commit")
        try:
            sh.git("commit", "-m", message, _cwd=repo_dir)
        except sh.ErrorReturnCode:
            # Best effort: `git commit` exits non-zero when there is nothing
            # to commit; still attempt the push. Narrower than a bare
            # `except`, so KeyboardInterrupt is no longer swallowed.
            pass
        print ("push")
        sh.git.push(_cwd=repo_dir)
    except Exception as e:
        return e
    return None
In [ ]:
# Run git_cover_metadata on every repo except the first, printing the
# zero-based loop position, the repo name, and the value the call returned.
for i, repo in enumerate(repos[1:]):
    print (i, repo)
    outcome = git_cover_metadata(repo)
    print (outcome)
In [ ]:
# Scratch cell: echo the current value of `dest`, i.e. the metadata.yaml path
# assigned by whichever loop cell ran last in this kernel.
dest
In [ ]:
# yaml.safe_dump(pg_json,default_flow_style=False,allow_unicode=True)
# http://pyyaml.org/wiki/PyYAMLDocumentation
# Load the YAML at `dest` and print the parsed object.
# The file is only read, so it is opened with "r": "rw" is not a valid mode
# string and raises ValueError on Python 3.
with open(dest, "r") as yfile:
    y = yaml.safe_load(yfile)
    print(y)
In [ ]:
# Load the YAML at `dest` into `y`.
# Opened read-only ("rw" is not a valid mode string) and inside a `with`
# block so the handle is closed even if parsing raises.
with open(dest, "r") as yfile:
    y = yaml.safe_load(yfile)
In [ ]:
# Scratch cell: show the 'covers' entry of the most recently loaded YAML
# object `y` (assumes `y` is a dict; .get returns None when the key is absent).
y.get('covers')
In [ ]:
# Scratch cell: a bare string literal that is evaluated but never assigned;
# presumably a note on writing a unicode literal -- TODO confirm or delete.
"""u"中国"
"""
In [ ]:
import codecs

# `unicode` only exists on Python 2; on Python 3 the text type is `str`.
try:
    _TEXT_TYPE = unicode
except NameError:
    _TEXT_TYPE = str


def lit_to_unicode(s):
    """Recursively turn escape-encoded byte strings into text.

    Values whose exact type is the text type or ``int`` are returned
    unchanged. Byte strings are decoded with the ``unicode_escape`` codec, so
    literal sequences such as ``\\u4e2d`` become the characters they name.
    Lists and dicts are rebuilt with every item / value converted; dict keys
    are left as they are.

    On Python 2 ``bytes`` is ``str``, so this behaves exactly as the original
    implementation did. On Python 3 every ``str`` is already text and passes
    through untouched; only ``bytes`` values are decoded.

    Raises:
        Exception: ``("unexpected type", type(s))`` for any other type. The
            check for text/int is on the exact type, so ``bool``, ``float``
            and ``None`` are rejected.
    """
    if type(s) in (_TEXT_TYPE, int):
        return s
    elif isinstance(s, bytes):
        return codecs.decode(s, "unicode_escape")
    elif isinstance(s, list):
        return [lit_to_unicode(item) for item in s]
    elif isinstance(s, dict):
        return dict((k, lit_to_unicode(v)) for (k, v) in s.items())
    else:
        raise Exception("unexpected type", type(s))
In [ ]:
# fix unicode problems in the source yamls
# For every repo: parse the source metadata YAML, pass the parsed data through
# lit_to_unicode, re-serialize it, and rewrite the file only when the text
# changed. Names of rewritten repos are collected in `changed_yaml`.
from IPython.display import (HTML, display)
import difflib

differ = difflib.HtmlDiff()
changed_yaml = []

for repo in islice(repos, 0, None):
    pg_id = repo.split("_")[-1]
    source = os.path.join(METADATA_DIR, "{}.yaml".format(pg_id))
    with open(source, "r+") as yfile:
        # Read once and parse from the text rather than parsing the stream
        # and then rewinding to read it again.
        old_dump = yfile.read()
        # safe_load pairs with safe_dump; plain yaml.load without a Loader is
        # unsafe and is rejected by recent PyYAML releases.
        y = yaml.safe_load(old_dump)
        new_dump = yaml.safe_dump(lit_to_unicode(y), default_flow_style=False, allow_unicode=True)
        if old_dump != new_dump:
            # Rewrite in place; truncate in case the new text is shorter.
            # The `with` block closes the file, so no explicit close().
            yfile.seek(0)
            yfile.write(new_dump)
            yfile.truncate()
            changed_yaml.append(repo)
In [ ]:
changed_yaml
In [37]:
# now update publisher metadata in repos
# Fields written to each repo's metadata.yaml (every repo except the first):
#   publication_date: 2015-08-01
#   publisher: Recovering the Classics
#   rights: CC BY-NC
#   rights_url: http://creativecommons.org/licenses/by-nc/4.0/
# plus _version: 0.0.1. The file is rewritten only when its text changes.
for repo in islice(repos, 1, None):
    pg_id = repo.split("_")[-1]
    dest = os.path.join(GITENBERG_DIR, repo, "metadata.yaml")
    print (dest)
    with open(dest, "r+") as yfile:
        # Read once and parse from the text rather than parsing the stream
        # and then rewinding to read it again.
        old_dump = yfile.read()
        # safe_load pairs with safe_dump; plain yaml.load without a Loader is
        # unsafe and is rejected by recent PyYAML releases.
        y = yaml.safe_load(old_dump)
        # make changes
        y['publisher'] = 'Recovering the Classics'
        y['publication_date'] = '2015-08-01'
        y['rights'] = 'CC BY-NC'
        y['rights_url'] = 'http://creativecommons.org/licenses/by-nc/4.0/'
        y['_version'] = '0.0.1'
        new_dump = yaml.safe_dump(y, default_flow_style=False, allow_unicode=True)
        if old_dump != new_dump:
            print ("writing new_dump")
            # Rewrite in place; truncate in case the new text is shorter.
            # The `with` block closes the file, so no explicit close().
            yfile.seek(0)
            yfile.write(new_dump)
            yfile.truncate()
In [ ]: