In [1]:
import os
import json
import shutil
import sh
import yaml
from pandas import DataFrame, Series
from itertools import islice

# Local inputs used throughout this notebook.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running on another machine.

# Text file listing one repo name per line.
REPOS_LIST = "/Users/raymondyee/C/src/gitenberg/Second-Folio/list_of_repos.txt"
# Directory that is joined with a repo name to locate that repo's metadata.yaml.
GITENBERG_DIR = "/Users/raymondyee/C/src/gitenberg/"

# Directory holding the source metadata files, named "<pg_id>.yaml".
METADATA_DIR = "/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata"
# JSON file with the cover records (loaded later with the json module).
COVERS_DATA = "/Users/raymondyee/C/src/gitenberg/Second-Folio/covers_data.json"

# Read the list inside a `with` block so the file handle is closed right away
# instead of being left for the garbage collector.
with open(REPOS_LIST) as repos_file:
    repos = repos_file.read().strip().split("\n")

In [3]:
# Compare each source YAML file with what it would be if we re-serialized its
# parsed content with yaml.safe_dump(default_flow_style=False,
# allow_unicode=True) -- the way I think it's being done by Eric.
# Prints the path, then (path, True/False) where True means the file text
# already matches that serialization exactly.
for repo in islice(repos,0,2):
    # Repo names end in "_<pg_id>"; the source file is "<pg_id>.yaml".
    pg_id = repo.split("_")[-1]
    source = os.path.join(METADATA_DIR, "{}.yaml".format(pg_id))
    print (source)

    if os.path.exists(source):
        # Read the file once and parse the in-memory text. The previous code
        # opened the file a second time inside the `with` block and never
        # closed that extra handle.
        with open(source, "r") as yfile:
            yaml_0 = yfile.read()

        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; if these files never carry python-specific
        # tags, yaml.safe_load would be the safer call.
        y = yaml.load(yaml_0)
        yaml_1 = yaml.safe_dump(y,default_flow_style=False,allow_unicode=True)

        print (source, yaml_0 == yaml_1)


/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata/76.yaml
('/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata/76.yaml', True)
/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata/996.yaml
('/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata/996.yaml', True)

Now we are ready to compare the files in /Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata with the metadata.yaml file in each repo.


In [29]:
# Compare source / destination: copy the source YAML over the repo's
# metadata.yaml whenever `diff` does not report the two files as identical.

for repo in islice(repos,0,1):
    # Repo names end in "_<pg_id>"; the source file is "<pg_id>.yaml".
    pg_id = repo.split("_")[-1]
    source = os.path.join(METADATA_DIR, "{}.yaml".format(pg_id))
    dest = os.path.join(GITENBERG_DIR, repo, "metadata.yaml")

    try:
        # diff exits 0 when the files are identical; sh turns any non-zero
        # exit status (files differ, or a file could not be read) into an
        # sh.ErrorReturnCode exception.
        sh.diff(source, dest)
    except sh.ErrorReturnCode:
        # Catch only diff's non-zero exit. Unrelated failures (for example the
        # diff binary not being found) are no longer mistaken for
        # "files differ" and silently followed by a copy.
        shutil.copyfile(source, dest)
        print (source, dest)


('/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata/76.yaml', '/Users/raymondyee/C/src/gitenberg/Adventures-of-Huckleberry-Finn_76/metadata.yaml')

In [38]:
def git_reload_metadata(repo, message="update metadata.yaml with RTC as publisher; adding _version"):
    """Stage, commit and push metadata.yaml in the given repo.

    Parameters:
        repo: name of the repo directory under GITENBERG_DIR.
        message: commit message; defaults to the publisher/_version message
            this notebook was written for.

    Returns:
        None when the repo has no metadata.yaml (nothing is run) or when the
        add/commit/push sequence finished; otherwise the exception raised by
        one of the steps is returned (not raised) so a calling loop can print
        it and continue.

    The git commands run with ``_cwd=<repo dir>`` instead of ``sh.cd``, so the
    notebook process's own working directory is left untouched.
    """
    repo_dir = os.path.join(GITENBERG_DIR, repo)
    metadata_path = os.path.join(repo_dir, "metadata.yaml")

    if not os.path.exists(metadata_path):
        return None

    try:
        print ("add")
        sh.git("add", "metadata.yaml", _cwd=repo_dir)
        print ("commit")
        try:
            sh.git("commit", "-m", message, _cwd=repo_dir)
        except sh.ErrorReturnCode:
            # A failing commit (e.g. nothing staged) is tolerated so the push
            # below still runs. Only git's non-zero exit is ignored, not
            # KeyboardInterrupt or other errors as the bare `except:` did.
            pass
        print ("push")
        sh.git.push(_cwd=repo_dir)
    except Exception as e:
        return e

In [41]:
# commit and push the updated metadata.yaml (publisher / _version change) for every repo except the first

def do_git_reload_metadata():
    """Run git_reload_metadata on every repo after the first one.

    For each repo, prints its (index, name) pair followed by the value that
    git_reload_metadata returned for it.
    """
    remaining = islice(repos, 1, None)
    for (position, name) in enumerate(remaining):
        print (position, name)
        outcome = git_reload_metadata(name)
        print (outcome)

do_git_reload_metadata()


(0, 'Don-Quixote_996')
add
commit
push
None
(1, 'Dubliners_2814')
add
commit
push
None
(2, 'Jane-Eyre_1260')
add
commit
push
None
(3, 'Moby-Dick--Or-The-Whale_2701')
add
commit
push
None
(4, 'Narrative-of-the-Life-of-Frederick-Douglass-an-American-Slave_23')
add
commit
push
None
(5, 'Pride-and-Prejudice_1342')
add
commit
push
None
(6, 'The-Adventures-of-Sherlock-Holmes_1661')
add
commit
push
None
(7, 'The-Brothers-Karamazov_28054')
add
commit
push
None
(8, 'The-Time-Machine_35')
add
commit
push
None
(9, 'Frankenstein_84')
add
commit
push
None
(10, 'Middlemarch_145')
add
commit
push
None
(11, 'A-Tale-of-Two-Cities_98')
add
commit
push
None
(12, 'The-Call-of-the-Wild_215')
add
commit
push
None
(13, 'Crime-and-Punishment_2554')
add
commit
push
None
(14, 'The-Strange-Case-of-Dr.-Jekyll-and-Mr.-Hyde_42')
add
commit
push
None
(15, 'Dracula_345')
add
commit
push
None
(16, 'Flatland--A-Romance-of-Many-Dimensions--Illustrated-_201')
add
commit
push
None
(17, 'Household-Stories-by-the-Brothers-Grimm_19068')
add
commit
push
None
(18, 'Heart-of-Darkness_219')
add
commit
push
None
(19, 'A-Journey-into-the-Interior-of-the-Earth_3748')
add
commit
push
None
(20, 'Jude-the-Obscure_153')
add
commit
push
None
(21, 'King-Solomon-s-Mines_2166')
add
commit
push
None
(22, 'Little-Women_514')
add
commit
push
None
(23, 'Madame-Bovary_2413')
add
commit
push
None
(24, 'The-Life-and-Adventures-of-Robinson-Crusoe_521')
add
commit
push
None
(25, 'The-Awakening-and-Selected-Short-Stories_160')
add
commit
push
None
(26, 'The-Jungle_140')
add
commit
push
None
(27, 'The-Jungle-Book_236')
add
commit
push
None
(28, 'Metamorphosis_5200')
add
commit
push
None
(29, 'The-Picture-of-Dorian-Gray_174')
add
commit
push
None
(30, 'The-Red-Badge-of-Courage_73')
add
commit
push
None
(31, 'The-Scarlet-Letter_33')
add
commit
push
None
(32, 'The-War-of-the-Worlds_36')
add
commit
push
None
(33, 'The-Wonderful-Wizard-of-Oz_55')
add
commit
push
None
(34, 'This-Side-of-Paradise_805')
add
commit
push
None
(35, 'Anna-Karenina_1399')
add
commit
push
None
(36, 'Gulliver-s-Travels_829')
add
commit
push
None
(37, 'Les-Mis-rables_135')
add
commit
push
None
(38, 'Swann-s-Way_7178')
add
commit
push
None
(39, 'The-Count-of-Monte-Cristo_1184')
add
commit
push
None
(40, 'The-Hunchback-of-Notre-Dame_6539')
add
commit
push
None
(41, 'The-Three-Musketeers_1257')
add
commit
push
None
(42, 'Through-the-Looking-Glass_12')
add
commit
push
None
(43, 'Twenty-Thousand-Leagues-under-the-Sea_164')
add
commit
push
None
(44, 'War-and-Peace_2600')
add
commit
push
None
(45, 'Winesburg-Ohio--A-Group-of-Tales-of-Ohio-Small-Town-Life_416')
add
commit
push
None
(46, 'My-Antonia_242')
add
commit
push
None
(47, 'Divine-Comedy-Longfellow-s-Translation-Hell_1001')
add
commit
push
None
(48, 'The-Works-of-Edgar-Allan-Poe-The-Raven-EditionTable-Of-Contents-And-Index-Of-The-Five-Volumes_25525')
add
commit
push
None

In [ ]:
import yaml

In [30]:
# Append a cover entry to each repo's metadata.yaml unless an identical entry
# is already listed under its "covers" key.

# Load the cover records; `with` closes the file handle once it is parsed.
with open(COVERS_DATA) as covers_file:
    covers_data = json.load(covers_file)
# Index the records by their 'GitHub repo' value for lookup by repo name.
covers_data_dict = dict((c['GitHub repo'], c) for c in covers_data)

for repo in islice(repos,0,1):
    dest = os.path.join(GITENBERG_DIR, repo, "metadata.yaml")
    cover_artist = covers_data_dict[repo]['cover_artist']

    cover_metadata = {
      "cover_type": "original",
      "image_path": "cover.jpg",
      "rights": "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)",
      "rights_url": "https://creativecommons.org/licenses/by-nc/4.0/",
      "attribution": u"{}, 2015".format(cover_artist)
    }

    print (dest)

    # is cover_metadata already in the YAML?
    if os.path.exists(dest):
        with open(dest, "r+") as yfile:
            # NOTE(review): yaml.load without an explicit Loader can construct
            # arbitrary Python objects; if these files never carry
            # python-specific tags, yaml.safe_load would be the safer call.
            y = yaml.load(yfile)
            # `or []` also covers a "covers" key that is present but null.
            covers = y.get('covers') or []
            # Compare dicts directly: same result as comparing sets of items,
            # but it does not fail when an existing entry holds an unhashable
            # value such as a list.
            if cover_metadata not in covers:
                covers.append(cover_metadata)
                y['covers'] = covers
                # Rewrite the file in place, then cut off any leftover tail
                # of the old content.
                yfile.seek(0)
                yfile.write(yaml.safe_dump(y,default_flow_style=False,allow_unicode=True))
                yfile.truncate()
            else:
                print ("already in", dest, cover_metadata)


/Users/raymondyee/C/src/gitenberg/Adventures-of-Huckleberry-Finn_76/metadata.yaml

In [ ]:
def git_cover_metadata(repo, message="update metadata.yaml with cover attribution"):
    """Stage, commit and push metadata.yaml in the given repo.

    Parameters:
        repo: name of the repo directory under GITENBERG_DIR.
        message: commit message; defaults to the cover-attribution message.

    Returns:
        None when the repo has no metadata.yaml (nothing is run) or when the
        add/commit/push sequence finished; otherwise the exception raised by
        one of the steps is returned (not raised) so a calling loop can print
        it and continue.

    The git commands run with ``_cwd=<repo dir>`` instead of ``sh.cd``, so the
    notebook process's own working directory is left untouched.
    """
    repo_dir = os.path.join(GITENBERG_DIR, repo)
    metadata_path = os.path.join(repo_dir, "metadata.yaml")

    if not os.path.exists(metadata_path):
        return None

    try:
        print ("add")
        sh.git("add", "metadata.yaml", _cwd=repo_dir)
        print ("commit")
        try:
            sh.git("commit", "-m", message, _cwd=repo_dir)
        except sh.ErrorReturnCode:
            # A failing commit (e.g. nothing staged) is tolerated so the push
            # below still runs. Only git's non-zero exit is ignored, not
            # KeyboardInterrupt or other errors as the bare `except:` did.
            pass
        print ("push")
        sh.git.push(_cwd=repo_dir)
    except Exception as e:
        return e

In [ ]:
# Run git_cover_metadata on every repo after the first, printing the
# (index, name) pair and then the value the call returned.
for (position, repo) in enumerate(repos[1:]):
    print (position, repo)
    outcome = git_cover_metadata(repo)
    print (outcome)

In [ ]:
# Display the metadata.yaml path currently bound to `dest`.
# NOTE(review): the value depends on which earlier cell assigned `dest` last.
dest

In [ ]:
# yaml.safe_dump(pg_json,default_flow_style=False,allow_unicode=True)
# http://pyyaml.org/wiki/PyYAMLDocumentation

# Parse and print the YAML file at `dest`.
# Opened read-only: the file is only parsed here, and "rw" is not a valid
# open() mode (Python 3 rejects it with ValueError).
# NOTE(review): yaml.load without an explicit Loader can construct arbitrary
# Python objects; yaml.safe_load is the safer call for plain data files.
with open(dest, "r") as yfile:
    y = yaml.load(yfile)
    print(y)

In [ ]:
# Parse the YAML file at `dest` into `y`.
# Opened read-only ("rw" is not a valid open() mode), and inside a `with`
# block so the handle is closed even if parsing raises.
# NOTE(review): yaml.load without an explicit Loader can construct arbitrary
# Python objects; yaml.safe_load is the safer call for plain data files.
with open(dest, "r") as yfile:
    y = yaml.load(yfile)

In [ ]:
# Display the value stored under 'covers' in the parsed metadata `y`
# (assuming `y` is a dict, .get yields None when the key is absent).
y.get('covers')

In [ ]:
"""u"中国"
"""

In [ ]:
import codecs

def lit_to_unicode(s):
    """Recursively decode escape sequences in byte strings of a parsed YAML value.

    Parameters:
        s: a value as produced by the YAML parser -- a scalar, a list, or a
           dict (lists and dicts may nest).

    Returns:
        * text (``unicode`` on Python 2, ``str`` on Python 3), numbers
          (including bool and long), ``None`` and date/datetime values are
          returned unchanged;
        * byte strings (``str`` on Python 2) are decoded with the
          "unicode_escape" codec, turning literal sequences such as
          ``\\u4e2d`` into the characters they denote;
        * lists and dicts are rebuilt with every item / value converted
          (dict keys are left as they are).

    Raises:
        Exception("unexpected type", <type>) for any other kind of value.

    Earlier, only exact ``unicode`` and ``int`` values passed through, so
    ordinary YAML scalars such as null, booleans, floats and dates raised.
    """
    import datetime
    import numbers

    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is already text

    # Scalars that need no conversion. datetime.datetime is a subclass of
    # datetime.date and bool is a subclass of int, so both are covered.
    passthrough = (text_type, numbers.Number, type(None), datetime.date)

    if isinstance(s, passthrough):
        return s
    if isinstance(s, bytes):
        # `bytes` is an alias of `str` on Python 2, so this is the same branch
        # that used to test isinstance(s, str).
        return codecs.decode(s, "unicode_escape")
    if isinstance(s, list):
        return [lit_to_unicode(item) for item in s]
    if isinstance(s, dict):
        return dict((k, lit_to_unicode(v)) for (k, v) in s.items())

    raise Exception("unexpected type", type(s))

In [ ]:
# fix unicode problems in the source yamls

from IPython.display import (HTML, display)
import difflib

differ = difflib.HtmlDiff()

# Repos whose source YAML file was rewritten by the loop below.
changed_yaml = []

# Re-serialize every source YAML file after running its parsed content through
# lit_to_unicode; a file is rewritten only when the resulting text differs.
for repo in repos:
    # Repo names end in "_<pg_id>"; the source file is "<pg_id>.yaml".
    pg_id = repo.split("_")[-1]
    source = os.path.join(METADATA_DIR, "{}.yaml".format(pg_id))

    with open(source, "r+") as yfile:
        # Read the text once and parse it from memory, rather than parsing the
        # stream and then seeking back to read it a second time.
        old_dump = yfile.read()
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; if these files never carry python-specific
        # tags, yaml.safe_load would be the safer call.
        y = yaml.load(old_dump)

        new_dump = yaml.safe_dump(lit_to_unicode(y),default_flow_style=False,allow_unicode=True)

        if old_dump != new_dump:
            # Overwrite in place, then cut off any leftover tail of the old
            # content. The `with` block closes the file, so no explicit
            # close() is needed.
            yfile.seek(0)
            yfile.write(new_dump)
            yfile.truncate()
            changed_yaml.append(repo)

In [ ]:
# Display the list of repos collected in `changed_yaml`.
changed_yaml

In [37]:
# now update publisher metadata in repos

# Fields written into every repo's metadata.yaml. Gathered in one dict so the
# values are stated once and applied with a single update().
PUBLISHER_METADATA = {
    'publisher': 'Recovering the Classics',
    'publication_date': '2015-08-01',
    'rights': 'CC BY-NC',
    'rights_url': 'http://creativecommons.org/licenses/by-nc/4.0/',
    '_version': '0.0.1',
}

# The first repo in the list is skipped (the slice starts at index 1).
for repo in islice(repos,1,None):
    dest = os.path.join(GITENBERG_DIR, repo, "metadata.yaml")
    print (dest)

    with open(dest, "r+") as yfile:
        # Read the text once and parse it from memory, rather than parsing the
        # stream and then seeking back to read it a second time.
        old_dump = yfile.read()
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; if these files never carry python-specific
        # tags, yaml.safe_load would be the safer call.
        y = yaml.load(old_dump)

        # make changes
        y.update(PUBLISHER_METADATA)

        new_dump = yaml.safe_dump(y,default_flow_style=False,allow_unicode=True)

        if old_dump != new_dump:
            print ("writing new_dump")
            # Overwrite in place, then cut off any leftover tail of the old
            # content. The `with` block closes the file, so no explicit
            # close() is needed.
            yfile.seek(0)
            yfile.write(new_dump)
            yfile.truncate()


/Users/raymondyee/C/src/gitenberg/Don-Quixote_996/metadata.yaml
/Users/raymondyee/C/src/gitenberg/Dubliners_2814/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Jane-Eyre_1260/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Moby-Dick--Or-The-Whale_2701/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Narrative-of-the-Life-of-Frederick-Douglass-an-American-Slave_23/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Pride-and-Prejudice_1342/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Adventures-of-Sherlock-Holmes_1661/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Brothers-Karamazov_28054/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Time-Machine_35/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Frankenstein_84/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Middlemarch_145/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/A-Tale-of-Two-Cities_98/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Call-of-the-Wild_215/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Crime-and-Punishment_2554/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Strange-Case-of-Dr.-Jekyll-and-Mr.-Hyde_42/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Dracula_345/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Flatland--A-Romance-of-Many-Dimensions--Illustrated-_201/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Household-Stories-by-the-Brothers-Grimm_19068/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Heart-of-Darkness_219/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/A-Journey-into-the-Interior-of-the-Earth_3748/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Jude-the-Obscure_153/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/King-Solomon-s-Mines_2166/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Little-Women_514/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Madame-Bovary_2413/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Life-and-Adventures-of-Robinson-Crusoe_521/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Awakening-and-Selected-Short-Stories_160/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Jungle_140/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Jungle-Book_236/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Metamorphosis_5200/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Picture-of-Dorian-Gray_174/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Red-Badge-of-Courage_73/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Scarlet-Letter_33/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-War-of-the-Worlds_36/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Wonderful-Wizard-of-Oz_55/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/This-Side-of-Paradise_805/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Anna-Karenina_1399/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Gulliver-s-Travels_829/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Les-Mis-rables_135/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Swann-s-Way_7178/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Count-of-Monte-Cristo_1184/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Hunchback-of-Notre-Dame_6539/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Three-Musketeers_1257/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Through-the-Looking-Glass_12/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Twenty-Thousand-Leagues-under-the-Sea_164/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/War-and-Peace_2600/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Winesburg-Ohio--A-Group-of-Tales-of-Ohio-Small-Town-Life_416/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/My-Antonia_242/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/Divine-Comedy-Longfellow-s-Translation-Hell_1001/metadata.yaml
writing new_dump
/Users/raymondyee/C/src/gitenberg/The-Works-of-Edgar-Allan-Poe-The-Raven-EditionTable-Of-Contents-And-Index-Of-The-Five-Volumes_25525/metadata.yaml
writing new_dump

In [ ]: