Jennifer 8. Lee et al have been using a Google spreadsheet to track the production of books in Project GITenberg:
This notebook uses the gspread Python library to parse (and write?) the spreadsheet.
In [ ]:
from __future__ import print_function
import gspread
import json
# rtc50_settings.py holds URL related to the Google spreadsheet
from rtc50_settings import (g_name, g_url, g_key)
# column header in the spreadsheet holding the book's official cover name
# (matches the 'name' field in rtc/books.json)
OFFICIAL_NAME_KEY = "Name in rtc/books.json, Official Name"
Using OAuth2 for Authorization — gspread 0.2.5 documentation
Created: https://console.developers.google.com/project/nypl50-gspread/apiui/credential#
pip install --upgrade oauth2client
I'd have to share the spreadsheet with 535523918532-5ejclnn335tr2g1u0dqnvh7g39q78mim@developer.gserviceaccount.com
— so let's look at method 2 instead.
In [ ]:
g_url
In [ ]:
import json
import gspread
# NOTE(review): oauth2client is deprecated and SignedJwtAssertionCredentials
# was removed in oauth2client >= 2.0 (replaced by ServiceAccountCredentials /
# google-auth) — confirm the pinned version before re-running.
from oauth2client.client import SignedJwtAssertionCredentials
# service-account key file downloaded from the Google Developers Console
json_key = json.load(open('nypl50-gspread.json'))
scope = ['https://spreadsheets.google.com/feeds']
credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'], scope)
gc = gspread.authorize(credentials)
# first worksheet of the spreadsheet identified by g_key
wks = gc.open_by_key(g_key).sheet1
In [ ]:
wks
In [ ]:
# load the rows
all_rows = wks.get_all_values()
In [ ]:
# use pandas
import pandas as pd
from pandas import (DataFrame, Series)
In [ ]:
# row 2 of the sheet (all_rows[1]) holds the column headers; data starts at row 3
df = DataFrame(all_rows[2:], columns=all_rows[1])
df.index = df.index + 3 # shift index to match spreadsheet
df.head()
In [ ]:
# what does the status mean?
df[df["RTC Good Cover?"] == 'YES']["Gitenberg Status"].value_counts()
In [ ]:
# "RTC 1st GEN" vs "RTC 2nd GEN" vs "RTC Other Gen"
len(df[df["RTC 1st GEN"] == 'X'])
In [ ]:
from second_folio import all_repos
In [ ]:
set(all_repos) - set(df['Gitenberg URL'].map(lambda u: u.split("/")[-1]))
In [ ]:
# just forget the whole part 1/part 2 -- figure out what repos are ready to work on haven't yet been done.
from github3 import (login, GitHub)
from github_settings import (username, password, token)
from itertools import islice
#gh = login(username, password=password)
# authenticate to GitHub with a personal access token rather than username/password
gh = login(token=token)
def asciidoc_in_repo_root(repo, branch='master'):
    """Return the paths of the .asciidoc files at the root of `repo`'s tree."""
    commit_sha = repo.branch(branch).commit.sha
    entries = repo.tree(commit_sha).tree
    asciidocs = []
    for entry in entries:
        if entry.path.endswith('.asciidoc'):
            asciidocs.append(entry.path)
    return asciidocs
def asciidocs_for_repo_name(repo_name):
    """List the root-level asciidocs of a GITenberg repo (by name)."""
    # uses the module-level `gh` GitHub client defined in an earlier cell
    try:
        repo = gh.repository('GITenberg', repo_name)
        return asciidoc_in_repo_root(repo, branch ='master')
    except Exception as e:
        # NOTE(review): the exception object is *returned*, not raised —
        # callers must check the result type
        return e
In [ ]:
# copy CSV to clipboard, making it easy to then paste it to
# https://github.com/gitenberg-dev/Second-Folio/blob/master/Gitenberg%20Book%20List.csv
df.to_clipboard(encoding='utf-8', sep=',', index=False)
In [ ]:
import sh
sh.cd("/Users/raymondyee/C/src/gitenberg/Adventures-of-Huckleberry-Finn_76")
len(sh.grep (sh.git.remote.show("-n", "origin"),
"git@github-GITenberg:GITenberg/Adventures-of-Huckleberry-Finn_76.git", _ok_code=[0,1]))
In [ ]:
from itertools import islice
from second_folio import (repo_cloned, clone_repo)
# repos not yet cloned locally (repo_cloned returns a tuple; [0] is the flag)
repos_to_clone = (repo for repo in all_repos if not repo_cloned(repo)[0])
for (i, repo) in enumerate(islice(repos_to_clone,None)):
    output = clone_repo(repo)
    # leading \r keeps the progress output on one line
    print ("\r{} {} {} {}".format(i, repo, output, repo_cloned(repo)))
In [ ]:
import requests
# cover metadata originally from the plympton repo; using the rdhyee fork:
# (original URL: https://raw.githubusercontent.com/plympton/rtc/master/books.json)
rtc_covers_url = "https://raw.githubusercontent.com/rdhyee/rtc/master/books.json"
covers = requests.get(rtc_covers_url).json()
# index the cover records by book name for O(1) lookup
covers_dict = {cover['name']: cover for cover in covers}
In [ ]:
len(covers_dict)
In [ ]:
# Are there any covers in the Plymton repo not in books.json?
df
In [ ]:
# not that many covers
cover_names = {cover['name'] for cover in covers}
In [ ]:
# read off cover_map from df: Title -> official cover name, keeping only rows
# whose official-name cell is truthy (non-empty).
# http://stackoverflow.com/a/9762084
# BUG FIX: the original used a Python-2-only tuple-unpacking lambda
# (`lambda (k,v): v`), which is a SyntaxError on Python 3.
cover_map = {title: name
             for (title, name) in df[['Title', OFFICIAL_NAME_KEY]].values
             if name}
In [ ]:
repos_with_covers = list(df[df[OFFICIAL_NAME_KEY].map(lambda s: len(s) > 0)]['Gitenberg URL'].map(lambda u: u.split("/")[-1]))
repos_with_covers
In [ ]:
len(repos_with_covers)
In [ ]:
# compare list of cover repo data in
# https://raw.githubusercontent.com/gitenberg-dev/Second-Folio/master/covers_data.json
import requests
r = requests.get("https://raw.githubusercontent.com/gitenberg-dev/Second-Folio/master/covers_data.json")
covers_data = r.json()
In [ ]:
covers_data
In [ ]:
set(repos_with_covers) - set([c['GitHub repo'] for c in covers_data])
In [ ]:
set([c['GitHub repo'] for c in covers_data]) - set(repos_with_covers)
In [ ]:
mapped_cover_names = set(cover_map.values())
In [ ]:
(cover_names - mapped_cover_names), (mapped_cover_names - cover_names)
In [ ]:
# first-cover filename of every book in covers_dict (the key was unused)
[record['covers'][0]['filename'] for record in covers_dict.values()]
In [ ]:
# Have I downloaded all the big images?
img_path = "/Users/raymondyee/Downloads/rtc/full_images/"
cover_names
In [ ]:
from IPython.display import HTML
from PIL import Image
import jinja2
In [ ]:
# let's look at the images for the books
# https://cdn.rawgit.com/plympton/rtc/master/rtc_books/
# https://cdn.rawgit.com/plympton/rtc/master/rtc_books_resized/
# base URLs for the full-size and resized cover images, served via the rawgit CDN
cover_url_base = "https://cdn.rawgit.com/plympton/rtc/master/rtc_books/"
small_cover_url_base = "https://cdn.rawgit.com/plympton/rtc/master/rtc_books_resized/"
from functools import partial
def cover_name_to_url(name, reduce=False):
    """Return the cover-image URL for book `name`, or None if unknown.

    With reduce=True the resized-image base URL is used; otherwise the
    full-size base URL. Looks up the module-level covers_dict.
    """
    cover = covers_dict.get(name)
    if cover is None:
        return None
    base = small_cover_url_base if reduce else cover_url_base
    return base + cover['covers'][0]["filename"]
def cover_name_to_artist(name):
    """Return the artist of the first cover recorded for `name` (None if unknown)."""
    cover = covers_dict.get(name)
    return cover['covers'][0]['artist'] if cover is not None else None
cover_name_to_url_small = partial(cover_name_to_url, reduce=True)
cover_name_to_url_big = partial(cover_name_to_url, reduce=False)
# BUG FIX: the original mapped over `rtc50` on these two lines *before*
# `rtc50` was defined below (NameError on a fresh kernel). Map over `df`
# (unknown names simply yield None) and derive rtc50 afterwards.
df['big_image_url'] = df[OFFICIAL_NAME_KEY].map(cover_name_to_url_big)
df['small_image_url'] = df[OFFICIAL_NAME_KEY].map(cover_name_to_url_small)
# the subset of books whose cover was judged good
rtc50 = df[df["RTC Good Cover?"] == 'YES']
In [ ]:
rtc50.head()
In [ ]:
results = rtc50[['Title', 'big_image_url']].T.to_dict().values()
In [ ]:
results
In [ ]:
from IPython.display import HTML
from jinja2 import Template
CSS = """
<style>
.wrap img {
margin-left: 0px;
margin-right: 0px;
display: inline-block;
width: 100px;
}
</style>
"""
# BUG FIX: the original template contained "{{item.}}", which is invalid
# Jinja2 syntax (TemplateSyntaxError at render time); each image's source
# is the record's big_image_url.
IMAGES_TEMPLATE = CSS + """
<div class="wrap">
{% for item in items %}<img title="{{item.Title}}" src="{{item.big_image_url}}"/>{% endfor %}
</div>
"""
template = Template(IMAGES_TEMPLATE)
# render all cover thumbnails inline in the notebook
HTML(template.render(items=results))
In [ ]:
# download the full-size cover image for each result record
# (the original comment said "convert them to png", which happens in a later cell)
def download_big_images(limit=None):
    """Download up to `limit` full-size cover images (all when limit is None).

    Iterates over the global `results` records; a cover whose file already
    exists under img_path is skipped, so the function is idempotent.
    """
    import requests
    from itertools import islice
    import os
    img_path = "/Users/raymondyee/Downloads/rtc/full_images/"
    for image in islice(results, limit):
        url = image['big_image_url']
        if url is not None:
            name = url.split("/")[-1]
            dest_path = img_path + name
            # check whether we have the cover already before downloading
            if not os.path.exists(dest_path):
                print (dest_path)
                content = requests.get(url).content
                # reuse dest_path (the original rebuilt img_path + name here)
                with open(dest_path, "wb") as f:
                    f.write(content)
In [ ]:
download_big_images(limit=None)
In [ ]:
# loop over jpg and convert to png
def convert_small_jpg_to_png():
    """Convert each resized .jpg/.jpeg cover to a .png written alongside it."""
    import glob
    for jpg_path in glob.glob("/Users/raymondyee/Downloads/rtc/resized/*.jp*g"):
        im = Image.open(jpg_path)
        png_path = ".".join(jpg_path.split(".")[:-1]) + ".png"
        # modes PIL can write to PNG directly; anything else goes via RGB
        if im.mode not in ["1", "L", "P", "RGB", "RGBA"]:
            im = im.convert("RGB")
        im.save(png_path)
In [ ]:
# image types in covers: tally the file extension of every cover image.
# BUG FIX: the original used py2-only builtin map/reduce and a comprehension
# whose for-clauses were in the wrong order (`cover` was referenced before
# its `for` clause); Counter was imported but never used.
from collections import Counter
Counter(c['filename'].split(".")[-1]
        for cover in covers
        for c in cover['covers'])
In [ ]:
df['GitHub repo']=df['Gitenberg URL'].map(lambda u:u.split("/")[-1])
In [ ]:
import numpy as np
# spreadsheet-derived cells may be NaN (a float) rather than None, so guard both
df['local_big_file'] = df['big_image_url'].map(lambda u:u.split("/")[-1] if u is not None and u is not np.nan else None)
In [ ]:
df['cover_artist'] = df[OFFICIAL_NAME_KEY].map(cover_name_to_artist)
In [ ]:
df['local_big_file'] = df['local_big_file'].map(lambda s: re.sub(r".png$", ".jpg", s) if s is not None else s)
In [ ]:
def write_covers_data():
    """Write the good-cover subset's repo/artist/filename records to covers_data.json."""
    import json
    rtc50 = df[df["RTC Good Cover?"] == 'YES']
    covers_data_path = "/Users/raymondyee/C/src/gitenberg/Second-Folio/covers_data.json"
    # BUG FIX: wrap .values() in list() — on py3 a dict_values view is not
    # JSON-serializable, so json.dumps would raise TypeError.
    records = list(rtc50[['GitHub repo', 'cover_artist', 'local_big_file']].T.to_dict().values())
    with open(covers_data_path, "w") as f:
        f.write(json.dumps(records, sort_keys=True, indent=2, separators=(',', ': ')))
In [ ]:
#write_covers_data()
In [ ]:
import sh
In [ ]:
# can control tty settings for sh
# https://amoffat.github.io/sh/#ttys
sh.ls("-1", _tty_out=False ).split()
In [ ]:
dict([(c['GitHub repo'], c) for c in covers_data])
In [ ]:
# NOTE(review): forward dependency — `repos` and `covers_data_dict` are
# defined in a LATER cell, so this cell fails on a fresh Run-All until that
# cell has executed; consider moving it below those definitions.
s = Series(repos)
# NOTE(review): covers_data_dict.get(r) returns None for an unknown repo,
# and None.get(...) then raises AttributeError — verify all repos are present.
list(s.map(lambda r: covers_data_dict.get(r).get('local_big_file')))
In [ ]:
import os
import shutil
import sh
from pandas import DataFrame, Series
from itertools import islice
# (the original imported os twice; the duplicate line is removed)

# local paths: repo list, cover metadata, clone directory, and downloaded images
REPOS_LIST = "/Users/raymondyee/C/src/gitenberg/Second-Folio/list_of_repos.txt"
COVERS_DATA = "/Users/raymondyee/C/src/gitenberg/Second-Folio/covers_data.json"
GITENBERG_DIR = "/Users/raymondyee/C/src/gitenberg/"
COVERS_DIR = "/Users/raymondyee/Downloads/rtc/full_images/"

# use context managers so the file handles are closed (the original leaked them)
with open(REPOS_LIST) as f:
    repos = f.read().strip().split("\n")
with open(COVERS_DATA) as f:
    covers_data = json.load(f)
# index cover records by repo name for O(1) lookup
covers_data_dict = dict([(c['GitHub repo'], c) for c in covers_data])
def copy_repo_cover(repo, dry_run=False):
    """Copy the repo's downloaded cover image into its clone as cover.jpg.

    Returns (local_cover_path, copied): the source path under COVERS_DIR
    (None when no cover file is recorded for the repo) and whether a copy
    would/did happen. With dry_run=True nothing is written.
    """
    cover_file = covers_data_dict[repo]['local_big_file']
    if cover_file is None:
        return (None, False)
    source_path = os.path.join(COVERS_DIR, cover_file)
    target_path = os.path.join(GITENBERG_DIR, repo, "cover.jpg")
    copied = False
    # copy only when the source exists and the clone has no cover.jpg yet
    if os.path.exists(source_path) and not os.path.exists(target_path):
        if not dry_run:
            shutil.copyfile(source_path, target_path)
        copied = True
    return (source_path, copied)
def git_pull(repo):
    """cd into the repo's clone under GITENBERG_DIR and run `git pull`."""
    # NOTE(review): git_pull is re-defined later in this notebook *without*
    # the return value — the two definitions should be reconciled.
    sh.cd(os.path.join(GITENBERG_DIR, repo))
    return sh.git("pull")
def copy_covers():
    """Copy the cover image into every repo in `repos`, printing progress."""
    for index, repo_name in enumerate(repos):
        print (index, repo_name, copy_repo_cover(repo_name, dry_run=False))
In [ ]:
copy_covers()
In [ ]:
# let's compute missing covers
for repo in repos:
    destination = os.path.join(GITENBERG_DIR, repo, "cover.jpg")
    # print the repos whose local clone still lacks a cover.jpg
    if not os.path.exists(destination):
        print (repo)
In [ ]:
def git_add_cover_commit_push(repo):
    """Stage, commit, and push cover.jpg in the repo's local clone.

    Returns None when the clone has no cover.jpg (or after a successful
    push), and returns the exception object when a git step fails.
    """
    cover_path = os.path.join(GITENBERG_DIR, repo, "cover.jpg")
    try:
        if os.path.exists(cover_path):
            sh.cd(os.path.join(GITENBERG_DIR, repo))
            print ("add")
            sh.git("add", "cover.jpg")
            print ("commit")
            try:
                # commit fails when there is nothing to commit (cover already
                # committed) — that is expected and deliberately ignored
                sh.git("commit", "-m", "add cover.jpg")
            except:
                pass
            print ("push")
            sh.git.push()
        else:
            return None
    except Exception as e:
        # NOTE(review): the exception is returned, not raised — callers must
        # inspect the result
        return e
In [ ]:
for (i,repo) in enumerate(islice(repos,None)):
print (i, repo)
print (git_add_cover_commit_push(repo))
In [ ]:
def git_pull(repo):
    """cd into the repo's clone under GITENBERG_DIR and run `git pull`.

    CONSISTENCY FIX: this redefinition previously discarded the command
    result; return it like the earlier definition of git_pull in this
    notebook so callers can inspect the output either way.
    """
    sh.cd(os.path.join(GITENBERG_DIR, repo))
    return sh.git("pull")
In [ ]:
for (i,repo) in enumerate(islice(repos,None)):
print (i, repo)
git_pull(repo)
In [ ]:
sh.cd("/Users/raymondyee/C/src/gitenberg/Jane-Eyre_1260")
sh.git.push()
In [ ]:
import os
import json
import shutil
import sh
import yaml
from pandas import DataFrame, Series
from itertools import islice
# local paths: repo list, clone directory, per-book metadata, and cover metadata
REPOS_LIST = "/Users/raymondyee/C/src/gitenberg/Second-Folio/list_of_repos.txt"
GITENBERG_DIR = "/Users/raymondyee/C/src/gitenberg/"
METADATA_DIR = "/Users/raymondyee/C/src/gitenberg-dev/giten_site/metadata"
COVERS_DATA = "/Users/raymondyee/C/src/gitenberg/Second-Folio/covers_data.json"
In [ ]:
import os
import glob
import sh
import yaml
from gitenberg import metadata
import jinja2
from second_folio import (GITENBERG_DIR,
all_repos,
apply_to_repos,
travis_setup_releases,
git_pull,
apply_travis,
finish_travis,
repo_is_buildable,
has_travis_with_gitenberg_build,
slugify,
latest_epub,
repo_version
)
from github_settings import (username, password)
In [ ]:
from itertools import islice
# PY3 FIX: use the builtin zip instead of py2-only itertools.izip — the
# result was wrapped in list() anyway, so behavior is unchanged on py2.
repos = list(islice(all_repos, 0, None))
# determine which repos are "buildable"
repos_statues = list(zip(repos,
                         apply_to_repos(repo_is_buildable, repos=repos),
                         apply_to_repos(has_travis_with_gitenberg_build, repos=repos)))
# we want to apply travis to repos that are buildable but that don't yet have .travis.yml.
repos_to_travisfy = [repo[0] for repo in repos_statues if repo[1] and not repo[2]]
repos_to_travisfy
In [ ]:
from __future__ import print_function
In [ ]:
# trial run: travisfy only the first repo (islice(..., 1)); raise the limit
# once the output looks right
for (i, repo) in enumerate(islice(repos_to_travisfy,1)):
    print (i, repo, end=" ")
    r1 = apply_travis(repo, username, password, overwrite_travis=True)
    print (r1, end=" ")
    # only finish the travis setup if applying it succeeded
    if r1:
        r2 = finish_travis(repo)
        print (r2)
    else:
        print ("n/a")
e.g., https://github.com/GITenberg/Metamorphosis_5200/releases/download/0.0.1/Metamorphosis.epub
In [ ]:
import requests
url = "https://github.com/GITenberg/Adventures-of-Huckleberry-Finn_76/releases/download/0.0.17/Adventures-of-Huckleberry-Finn.epub"
r = requests.head(url)
In [ ]:
r.status_code, r.url, r.url == url
In [ ]:
epub_urls = list(apply_to_repos(latest_epub))
In [ ]:
import pandas as pd
from pandas import DataFrame
In [ ]:
df = DataFrame({'epub_url':epub_urls}, index=all_repos)
df.head()
In [ ]:
df['status_code'] = df.epub_url.apply(lambda u: requests.head(u).status_code)
In [ ]:
df['buildable'] = df.index.map(repo_is_buildable)
In [ ]:
k = df[df['status_code'] == 404][:3]
k['status_code'] = k.epub_url.apply(lambda u: requests.head(u).status_code)
k.head()
In [ ]:
df.ix[k.index] = k
In [ ]:
list(k.epub_url)
In [ ]:
df[(df.status_code == 404) & (df.buildable)]
In [ ]:
df['metadata_url'] = df.index.map(lambda repo: "https://github.com/GITenberg/{}/raw/master/metadata.yaml".format(repo))
In [ ]:
print "\n".join(list(df[~df.buildable].index))
In [ ]:
df.buildable.value_counts()
In [ ]:
df.to_clipboard(index_label="repo", sep=',')
In [ ]:
df[df.status_code == 404]
In [ ]:
md.metadata.get("title"), md.metadata.get("_repo"), md.metadata.get("_version"),
In [ ]:
# figure out what elements to feed to template
#
from jinja2 import Environment, PackageLoader, meta
env = Environment()
# NOTE(review): `template` was last assigned a *compiled* jinja2.Template,
# but Environment.parse expects the template source string — this likely
# meant env.parse(IMAGES_TEMPLATE); confirm before re-running.
parsed_content = env.parse(template)
# names the template references that are not supplied by its context
meta.find_undeclared_variables(parsed_content)
In [ ]:
import sh
sh.cd("/Users/raymondyee/C/src/gitenberg/Adventures-of-Huckleberry-Finn_76")
sh.travis.whoami()
In [ ]:
from itertools import islice
# BUG FIXES: (1) `second_folio` itself was never imported (only names *from*
# it), so `second_folio.all_repos` raises NameError — use the imported
# `all_repos`; (2) py2-only izip fails to import on py3 — the result is
# wrapped in list() anyway, so the builtin zip is behavior-identical.
# NOTE(review): `git_mv_asciidoc` is not defined anywhere visible in this
# notebook — confirm it comes from second_folio.
repos = list(islice(all_repos, 1, None))
list(zip(repos, apply_to_repos(git_mv_asciidoc, repos=repos)))
In [ ]:
list(apply_to_repos(git_pull))
In [ ]:
from __future__ import print_function
In [ ]:
line = "Detected repository as GITenberg/Don-Quixote_996, is this correct? |yes| "
"Detected" in line
In [ ]: