Jennifer 8. Lee et al. have been using a Google spreadsheet to track the production of books in Project GITenberg:

This notebook uses the gspread Python library to read (and write?) the spreadsheet.


In [ ]:
from __future__ import print_function

import gspread
import json

# rtc50_settings.py holds the name, URL, and key for the Google spreadsheet
from rtc50_settings import (g_name, g_url, g_key)

OFFICIAL_NAME_KEY = "Name in rtc/books.json, Official Name"

Getting access to the spreadsheet (Method 1): OAuth2

Using OAuth2 for Authorization — gspread 0.2.5 documentation

Created: https://console.developers.google.com/project/nypl50-gspread/apiui/credential#

pip install --upgrade oauth2client

I'd have to share the spreadsheet with 535523918532-5ejclnn335tr2g1u0dqnvh7g39q78mim@developer.gserviceaccount.com -- so let's look at method 2


In [ ]:
g_url

In [ ]:
import json
import gspread
# SignedJwtAssertionCredentials comes from oauth2client < 2.0 (removed in later versions)
from oauth2client.client import SignedJwtAssertionCredentials

json_key = json.load(open('nypl50-gspread.json'))
scope = ['https://spreadsheets.google.com/feeds']

credentials = SignedJwtAssertionCredentials(json_key['client_email'],
                                            json_key['private_key'], scope)

gc = gspread.authorize(credentials)


wks = gc.open_by_key(g_key).sheet1
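
Aside: SignedJwtAssertionCredentials was removed in oauth2client 2.0; on newer installs, an equivalent sketch uses ServiceAccountCredentials from oauth2client.service_account:


In [ ]:
# equivalent service-account credentials for oauth2client >= 2.0 (sketch)
from oauth2client.service_account import ServiceAccountCredentials

credentials = ServiceAccountCredentials.from_json_keyfile_name('nypl50-gspread.json', scope)
gc = gspread.authorize(credentials)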

Calculations on the spreadsheet


In [ ]:
wks

In [ ]:
# load the rows

all_rows = wks.get_all_values()
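
To answer the "(and write?)" question above: gspread can write cells as well as read them. A minimal sketch, left commented out so it doesn't clobber the shared sheet:


In [ ]:
# update a single cell by label; wks.update_cell(row, col, value) also works
# wks.update_acell('A1', 'hello from gspread')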

In [ ]:
# use pandas

import pandas as pd
from pandas import (DataFrame, Series)

In [ ]:
df = DataFrame(all_rows[2:], columns=all_rows[1])
df.index = df.index + 3  # shift index to match spreadsheet row numbers (data starts at row 3)
df.head()

In [ ]:
# what does the status mean?

df[df["RTC Good Cover?"] == 'YES']["Gitenberg Status"].value_counts()

In [ ]:
# "RTC 1st GEN" vs "RTC 2nd GEN" vs "RTC Other Gen"

len(df[df["RTC 1st GEN"] == 'X'])

In [ ]:
from second_folio import all_repos

In [ ]:
set(all_repos) - set(df['Gitenberg URL'].map(lambda u: u.split("/")[-1]))

In [ ]:
# forget the whole part 1/part 2 distinction -- figure out which repos are ready to work on but haven't yet been done.

from github3 import (login, GitHub)
from github_settings import (username, password, token)
from itertools import islice

#gh = login(username, password=password)
gh = login(token=token)


def asciidoc_in_repo_root(repo, branch='master'):
    """return list of asciidocs in the root of repo"""
    repo_branch = repo.branch(branch)
    tree = repo.tree(repo_branch.commit.sha)
    return [hash_.path 
            for hash_ in tree.tree
            if hash_.path.endswith('.asciidoc')]


def asciidocs_for_repo_name(repo_name):
    try:
        repo = gh.repository('GITenberg', repo_name)
        return asciidoc_in_repo_root(repo, branch='master')
    except Exception as e:
        return e
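
For example, listing the asciidoc files in one repo (an invocation sketch using a repo named elsewhere in this notebook):


In [ ]:
asciidocs_for_repo_name("Adventures-of-Huckleberry-Finn_76")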

In [ ]:
# copy CSV to clipboard, making it easy to then paste it to 
# https://github.com/gitenberg-dev/Second-Folio/blob/master/Gitenberg%20Book%20List.csv

df.to_clipboard(encoding='utf-8', sep=',', index=False)

Cloning repos


In [ ]:
import sh

sh.cd("/Users/raymondyee/C/src/gitenberg/Adventures-of-Huckleberry-Finn_76")
# grep exits 1 on no match, so allow exit codes 0 and 1 via _ok_code
len(sh.grep(sh.git.remote.show("-n", "origin"),
        "git@github-GITenberg:GITenberg/Adventures-of-Huckleberry-Finn_76.git", _ok_code=[0, 1]))

In [ ]:
from itertools import islice
from second_folio import (repo_cloned, clone_repo)

repos_to_clone = (repo for repo in all_repos if not repo_cloned(repo)[0])
for (i, repo) in enumerate(islice(repos_to_clone,None)):
    output = clone_repo(repo)
    print ("\r{} {} {} {}".format(i, repo, output, repo_cloned(repo)))
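
For reference, repo_cloned and clone_repo live in second_folio; a hypothetical sketch of what they might look like (names suffixed _sketch to flag that these are guesses, not the real implementations):


In [ ]:
import os
import sh

GITENBERG_DIR = "/Users/raymondyee/C/src/gitenberg/"

def repo_cloned_sketch(repo):
    """(exists_locally, local_path) for a GITenberg repo"""
    path = os.path.join(GITENBERG_DIR, repo)
    return (os.path.exists(os.path.join(path, ".git")), path)

def clone_repo_sketch(repo):
    """clone the GITenberg repo into GITENBERG_DIR if not already present"""
    if repo_cloned_sketch(repo)[0]:
        return None
    sh.cd(GITENBERG_DIR)
    return sh.git.clone("https://github.com/GITenberg/{}.git".format(repo))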

In [ ]:
import requests
# rtc_covers_url = "https://raw.githubusercontent.com/plympton/rtc/master/books.json"
rtc_covers_url = "https://raw.githubusercontent.com/rdhyee/rtc/master/books.json"
covers = requests.get(rtc_covers_url).json()
covers_dict = dict([(cover['name'], cover) for cover in covers])

In [ ]:
len(covers_dict)

In [ ]:
# Are there any covers in the Plympton repo not in books.json?

df

In [ ]:
# not that many covers -- collect the set of cover names
cover_names = set([cover['name'] for cover in covers])

In [ ]:
# read off cover_map from df, keeping only rows with a non-empty official name
# http://stackoverflow.com/a/9762084
cover_map = dict(filter(lambda (k, v): v,
                        [tuple(x) for x in df[['Title', OFFICIAL_NAME_KEY]].values]))

In [ ]:
repos_with_covers = list(df[df[OFFICIAL_NAME_KEY].map(lambda s: len(s) > 0)]['Gitenberg URL'].map(lambda u: u.split("/")[-1]))
repos_with_covers

In [ ]:
len(repos_with_covers)

In [ ]:
# compare list of cover repo data in 
# https://raw.githubusercontent.com/gitenberg-dev/Second-Folio/master/covers_data.json

import requests
r = requests.get("https://raw.githubusercontent.com/gitenberg-dev/Second-Folio/master/covers_data.json")
covers_data = r.json()

In [ ]:
covers_data

In [ ]:
set(repos_with_covers) - set([c['GitHub repo'] for c in covers_data])

In [ ]:
set([c['GitHub repo'] for c in covers_data]) - set(repos_with_covers)

In [ ]:
mapped_cover_names = set(cover_map.values())

In [ ]:
(cover_names - mapped_cover_names), (mapped_cover_names - cover_names)

In [ ]:
[v['covers'][0]['filename']
 for (k, v) in covers_dict.items()]

In [ ]:
# Have I downloaded all the big images? (download_big_images below checks before fetching)

img_path = "/Users/raymondyee/Downloads/rtc/full_images/"
cover_names

In [ ]:
from IPython.display import HTML
from PIL import Image
import jinja2

In [ ]:
# let's look at the images for the books
# https://cdn.rawgit.com/plympton/rtc/master/rtc_books/
# https://cdn.rawgit.com/plympton/rtc/master/rtc_books_resized/

cover_url_base = "https://cdn.rawgit.com/plympton/rtc/master/rtc_books/"
small_cover_url_base = "https://cdn.rawgit.com/plympton/rtc/master/rtc_books_resized/"

from functools import partial

def cover_name_to_url(name, reduce=False):
    if reduce:
        url = small_cover_url_base
    else:
        url = cover_url_base

    cover = covers_dict.get(name)
    if cover is not None:
        return url + cover['covers'][0]["filename"]
    else:
        return None

def cover_name_to_artist(name):
    cover = covers_dict.get(name)
    if cover is not None:
        return cover['covers'][0]['artist']
    else:
        return None

cover_name_to_url_small = partial(cover_name_to_url, reduce=True)
cover_name_to_url_big = partial(cover_name_to_url, reduce=False)

# map over df (rtc50 isn't defined until the next statement, so mapping over it here would fail)
df['big_image_url'] = df[OFFICIAL_NAME_KEY].map(cover_name_to_url_big)
df['small_image_url'] = df[OFFICIAL_NAME_KEY].map(cover_name_to_url_small)

rtc50 = df[df["RTC Good Cover?"] == 'YES']

In [ ]:
rtc50.head()

In [ ]:
results = rtc50[['Title', 'big_image_url']].T.to_dict().values()

In [ ]:
results

In [ ]:
from IPython.display import HTML
from jinja2 import Template

CSS = """
<style>
  .wrap img {
    margin-left: 0px;
    margin-right: 0px;
    display: inline-block;
    width: 100px;
  }
</style>
"""

IMAGES_TEMPLATE = CSS + """
<div class="wrap">
 {% for item in items %}<img title="{{item.Title}}" src="{{item.big_image_url}}"/>{% endfor %}
</div>
"""

template = Template(IMAGES_TEMPLATE)
HTML(template.render(items=results))

In [ ]:
# loop over all the big images and download any we don't already have (png conversion happens below)


def download_big_images(limit=None):
    
    import requests
    from itertools import islice
    import os

    img_path = "/Users/raymondyee/Downloads/rtc/full_images/"
    for image in islice(results,limit):
        # check whether we have the cover already before downloading
        
        url = image['big_image_url']
        
        if url is not None:
            name = url.split("/")[-1]
            dest_path = img_path + name
        
            if not os.path.exists(dest_path):
                print (dest_path)
                content = requests.get(url).content
                with open(dest_path, "wb") as f:
                    f.write(content)

In [ ]:
download_big_images(limit=None)

In [ ]:
# loop over jpg and convert to png

def convert_small_jpg_to_png():

    import glob

    for f in glob.glob("/Users/raymondyee/Downloads/rtc/resized/*.jp*g"):
        im = Image.open(f)
        png_path = ".".join(f.split(".")[:-1]) + ".png"
        if im.mode not in ["1", "L", "P", "RGB", "RGBA"]:
            im = im.convert("RGB")
        im.save(png_path)

In [ ]:
# tally image file types among the covers
from collections import Counter
Counter(c['filename'].split(".")[-1] for cover in covers for c in cover['covers'])

In [ ]:
df['GitHub repo'] = df['Gitenberg URL'].map(lambda u: u.split("/")[-1])

In [ ]:
import numpy as np
df['local_big_file'] = df['big_image_url'].map(lambda u: u.split("/")[-1] if u is not None and u is not np.nan else None)

In [ ]:
df['cover_artist'] = df[OFFICIAL_NAME_KEY].map(cover_name_to_artist)

In [ ]:
import re

df['local_big_file'] = df['local_big_file'].map(lambda s: re.sub(r"\.png$", ".jpg", s) if s is not None else s)

In [ ]:
def write_covers_data():

    import json

    rtc50 = df[df["RTC Good Cover?"] == 'YES']

    covers_data_path = "/Users/raymondyee/C/src/gitenberg/Second-Folio/covers_data.json"

    with open(covers_data_path, "w") as f:
        f.write(json.dumps(rtc50[['GitHub repo', 'cover_artist', 'local_big_file']].T.to_dict().values(),
                           sort_keys=True, indent=2, separators=(',', ': ')))

In [ ]:
# write_covers_data()  # uncomment to regenerate covers_data.json

Getting covers into repos


In [ ]:
import sh

In [ ]:
# can control tty settings for sh
# https://amoffat.github.io/sh/#ttys

sh.ls("-1", _tty_out=False).split()

In [ ]:
dict([(c['GitHub repo'], c) for c in covers_data])

In [ ]:
# repos and covers_data_dict are defined in the setup cell below
s = Series(repos)
list(s.map(lambda r: covers_data_dict.get(r).get('local_big_file')))


In [ ]:
import os
import json
import shutil
import sh
from pandas import DataFrame, Series
from itertools import islice

REPOS_LIST = "/Users/raymondyee/C/src/gitenberg/Second-Folio/list_of_repos.txt"
COVERS_DATA = "/Users/raymondyee/C/src/gitenberg/Second-Folio/covers_data.json"
GITENBERG_DIR = "/Users/raymondyee/C/src/gitenberg/"
COVERS_DIR = "/Users/raymondyee/Downloads/rtc/full_images/"

repos=open(REPOS_LIST).read().strip().split("\n")
covers_data = json.loads(open(COVERS_DATA).read())
covers_data_dict = dict([(c['GitHub repo'], c) for c in covers_data])

def copy_repo_cover(repo, dry_run=False):
    cover_file = covers_data_dict[repo]['local_big_file']
    
    local_cover_path = None
    copied = False
    
    if cover_file is not None:
        local_cover_path = os.path.join(COVERS_DIR, cover_file)
        destination = os.path.join(GITENBERG_DIR, repo, "cover.jpg")
        if os.path.exists(local_cover_path) and not os.path.exists(destination):
            if not dry_run:
                shutil.copyfile(local_cover_path, destination)
                copied = True
            
    return (local_cover_path, copied)

def git_pull(repo):
    sh.cd(os.path.join(GITENBERG_DIR, repo))
    return sh.git("pull")


def copy_covers():
    for (i,repo) in enumerate(islice(repos,None)):
        print (i, repo, copy_repo_cover(repo, dry_run=False))
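
To preview without copying, copy_repo_cover can be called with dry_run=True (example repo name from earlier in this notebook):


In [ ]:
copy_repo_cover("Adventures-of-Huckleberry-Finn_76", dry_run=True)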

In [ ]:
copy_covers()

In [ ]:
# let's compute missing covers
for repo in repos:
    destination = os.path.join(GITENBERG_DIR, repo, "cover.jpg")
    if not os.path.exists(destination):
        print (repo)

In [ ]:
def git_add_cover_commit_push(repo):
    cover_path = os.path.join(GITENBERG_DIR, repo, "cover.jpg")
    
    try:
        if os.path.exists(cover_path):
            sh.cd(os.path.join(GITENBERG_DIR, repo))
            print ("add")
            sh.git("add", "cover.jpg")
            print ("commit")
            try:
                sh.git("commit", "-m", "add cover.jpg")
            except sh.ErrorReturnCode:
                # git commit exits nonzero when there is nothing to commit
                pass
            print ("push")
            sh.git.push()
        else:
            return None
    except Exception as e:
        return e

In [ ]:
for (i,repo) in enumerate(islice(repos,None)):
    print (i, repo)
    print (git_add_cover_commit_push(repo))

In [ ]:
def git_pull(repo):
    sh.cd(os.path.join(GITENBERG_DIR, repo))
    sh.git("pull")

In [ ]:
for (i,repo) in enumerate(islice(repos,None)):
    print (i, repo)
    git_pull(repo)

In [ ]:
sh.cd("/Users/raymondyee/C/src/gitenberg/Jane-Eyre_1260")
sh.git.push()

Generalized structure for iterating over repos


In [ ]:
import os
import json
import shutil
import sh
import yaml
from pandas import DataFrame, Series
from itertools import islice

REPOS_LIST = "/Users/raymondyee/C/src/gitenberg/Second-Folio/list_of_repos.txt"
GITENBERG_DIR = "/Users/raymondyee/C/src/gitenberg/"

METADATA_DIR = "/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata"
COVERS_DATA = "/Users/raymondyee/C/src/gitenberg/Second-Folio/covers_data.json"
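
The iteration itself is done with second_folio.apply_to_repos, which is used heavily below. A plausible sketch of such a helper (hypothetical -- the real implementation lives in second_folio.py, where all_repos is the module-level default list of repo names):


In [ ]:
def apply_to_repos(action, args=None, kwargs=None, repos=all_repos):
    """apply action(repo, *args, **kwargs) to each repo, yielding results (exceptions are yielded, not raised)"""
    for repo in repos:
        try:
            result = action(repo, *(args or ()), **(kwargs or {}))
        except Exception as e:
            result = e
        yield result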

Travis work


In [ ]:
import os
import glob
import sh
import yaml

from gitenberg import metadata
import jinja2

from second_folio import (GITENBERG_DIR, 
                          all_repos, 
                          apply_to_repos, 
                          travis_setup_releases, 
                          git_pull,
                          apply_travis,
                          finish_travis,
                          repo_is_buildable,
                          has_travis_with_gitenberg_build,
                          slugify,
                          latest_epub,
                          repo_version
                          )
from github_settings import (username, password)

In [ ]:
from itertools import islice, izip

repos = list(islice(all_repos, 0, None))

# determine which repos are "buildable"
repos_statuses = list(izip(repos,
                           apply_to_repos(repo_is_buildable, repos=repos),
                           apply_to_repos(has_travis_with_gitenberg_build, repos=repos)))

# we want to apply travis to repos that are buildable but that don't yet have .travis.yml.

repos_to_travisfy = [repo[0] for repo in repos_statuses if repo[1] and not repo[2]]
repos_to_travisfy

In [ ]:
for (i, repo) in enumerate(islice(repos_to_travisfy,1)):
    print (i, repo, end=" ")
    r1 = apply_travis(repo, username, password, overwrite_travis=True)
    print (r1, end=" ")
    if r1:
        r2 = finish_travis(repo)
        print (r2)
    else:
        print ("n/a")

In [ ]:
import requests
url = "https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/releases/download/0.0.17/Adventures-of-Huckleberry-Finn.epub"
# note: requests.head does not follow redirects by default (allow_redirects=False),
# so an existing release asset may report 301/302 rather than 200
r = requests.head(url)

In [ ]:
r.status_code, r.url, r.url == url

In [ ]:
epub_urls = list(apply_to_repos(latest_epub))

In [ ]:
import pandas as pd
from pandas import DataFrame

In [ ]:
df = DataFrame({'epub_url':epub_urls}, index=all_repos)
df.head()

In [ ]:
df['status_code'] = df.epub_url.apply(lambda u: requests.head(u).status_code)

In [ ]:
df['buildable'] = df.index.map(repo_is_buildable)

In [ ]:
k = df[df['status_code'] == 404][:3]
k['status_code'] = k.epub_url.apply(lambda u: requests.head(u).status_code)
k.head()

In [ ]:
df.loc[k.index] = k  # label-based assignment (.ix is deprecated)

In [ ]:
list(k.epub_url)

In [ ]:
df[(df.status_code == 404) & (df.buildable)]

In [ ]:
df['metadata_url'] = df.index.map(lambda repo: "https://github.com/GITenberg/{}/raw/master/metadata.yaml".format(repo))

In [ ]:
print "\n".join(list(df[~df.buildable].index))

In [ ]:
df.buildable.value_counts()

In [ ]:
df.to_clipboard(index_label="repo", sep=',')

In [ ]:
df[df.status_code == 404]

Misc


In [ ]:
# md is assumed to be a gitenberg metadata object loaded elsewhere (e.g., from a repo's metadata.yaml)
md.metadata.get("title"), md.metadata.get("_repo"), md.metadata.get("_version")

In [ ]:
# figure out which variables to feed to the template

from jinja2 import Environment, meta
env = Environment()
# env.parse needs the template source string, not a compiled Template object
parsed_content = env.parse(IMAGES_TEMPLATE)
meta.find_undeclared_variables(parsed_content)

In [ ]:
import sh

sh.cd("/Users/raymondyee/C/src/gitenberg/Adventures-of-Huckleberry-Finn_76")
sh.travis.whoami()

In [ ]:
import second_folio
from itertools import islice, izip

repos = list(islice(second_folio.all_repos, 1, None))

# git_mv_asciidoc is presumably another helper imported from second_folio
list(izip(repos, apply_to_repos(git_mv_asciidoc, repos=repos)))

In [ ]:
list(apply_to_repos(git_pull))

In [ ]:
line = "Detected repository as GITenberg/Don-Quixote_996, is this correct? |yes| "
"Detected" in line
