In [ ]:
import requests

build database


In [1]:
from __future__ import print_function

from itertools import (islice, izip)

import arrow
import github3
import requests

from sqlalchemy import or_

from github_settings import (ry_username, ry_password,
                             username, password,
                             # token, 
                             GITENBERG_GITHUB_TOKEN,
                             GITENBERG_TRAVIS_ACCESS_TOKEN,
                             RDHYEE_GITHUB_TOKEN,
                             RDHYEE_TRAVIS_ACCESS_TOKEN, 
                             RDHYEE_TRAVIS_PROFILE_TOKEN)


from second_folio import (apply_to_repos, all_repos)

from gitenberg_utils import (GitenbergJob,
                             GitenbergTravisJob,
                             ForkBuildRepo,
                             BuildRepo,
                             BuildRepo2,
                             MetadataWrite,
                             RepoNameFixer,
                             repo_md,
                             GitenbergJobRunner,
                             MetadataWriterRunner,
                             RepoJobRunner,
                             StatusUpdateRunner)


from gitenberg_db import Repo, create_session


INFO:rdflib:RDFLib Version: 4.2.0
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): raw.githubusercontent.com
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): raw.githubusercontent.com

In [ ]:
import logging
logging.getLogger().getEffectiveLevel()

In [ ]:
l = logging.getLogger()
l.setLevel(30)

print (logging.getLogger().getEffectiveLevel())

reading in data using pandas


In [ ]:
# http://www.datacarpentry.org/python-ecology-lesson/08-working-with-sql

import sqlite3
from itertools import islice

# Create a SQL connection to our SQLite database
con = sqlite3.connect("gitenberg.db")
try:
    cur = con.cursor()

    # the result of a "cursor.execute" can be iterated over by row;
    # islice limits this peek to the first 3 rows
    for row in islice(cur.execute('SELECT * FROM repos;'), 3):
        print(row)
finally:
    # Close the connection even when the query raises (e.g. the table is
    # missing), so a failed run does not leave the database file open.
    con.close()

In [ ]:
import pandas as pd
from pandas import DataFrame, Series
import sqlite3

con = sqlite3.connect("gitenberg.db")

df = pd.read_sql('SELECT * FROM repos;', con, parse_dates=('updated','metadata_written'))
df.head()

In [ ]:
df.dtypes

In [ ]:
# let's pull out a list of repos that have been built

Build a specific book

Mr.-Spaceship_32522

In [ ]:
class MetadataWriterRunner2(MetadataWriterRunner):
    """MetadataWriterRunner restricted to an explicit collection of repo names.

    ``repos_list`` holds the ``Repo.repo_name`` values to process; when it is
    None, ``repos`` returns an empty list.
    """

    def __init__(self, dbfname, gh_username, gh_password, access_token=None, max_exceptions=None,
                 repos_list=None):
        super(MetadataWriterRunner2, self).__init__(
            dbfname, gh_username, gh_password, access_token, max_exceptions)
        self.repos_list = repos_list

    def repos(self, n=None):
        """Return at most ``n`` Repo rows whose name is in ``self.repos_list``.

        ``n=None`` places no limit on the number of rows.
        """
        if self.repos_list is None:
            return []

        matching = self.session().query(Repo).filter(
            Repo.repo_name.in_(self.repos_list))
        return islice(matching, n)
        
class RepoJobRunner2(RepoJobRunner):
    """RepoJobRunner that only works on the repo names given in ``repos_list``."""

    def __init__(self, dbfname, gh_username, gh_password, access_token=None, max_exceptions=None,
                 repos_list=None):
        parent = super(RepoJobRunner2, self)
        parent.__init__(dbfname, gh_username, gh_password,
                        access_token, max_exceptions)
        # names (Repo.repo_name values) this runner is limited to; None means "no repos"
        self.repos_list = repos_list

    def repos(self, n=None):
        """Return up to ``n`` Repo rows named in ``self.repos_list`` (``[]`` if it is None)."""
        wanted = self.repos_list
        if wanted is None:
            return []

        query = self.session().query(Repo)
        return islice(query.filter(Repo.repo_name.in_(wanted)), n)

In [ ]:
mwr2 = MetadataWriterRunner2("gitenberg.db", username, password, 
                             repos_list=('At-the-Sign-of-the-Eagle_6218',))

In [ ]:
mwr2.run(1)

In [ ]:
rjr2 = RepoJobRunner2("gitenberg.db", username, password, GITENBERG_TRAVIS_ACCESS_TOKEN, max_exceptions=20,
                    repos_list=('At-the-Sign-of-the-Eagle_6218', 
 ))

In [ ]:
rjr2.run(None)

metadatawrite


In [ ]:
mwr = MetadataWriterRunner("gitenberg.db", username, password)

In [ ]:
mwr.run(1)

In [ ]:
mwr.exceptions()

In [ ]:
job = BuildRepo2(username=username,
                password=password,
                repo_name='',
                repo_owner='GITenberg',
                update_travis_commit_msg='build using gitenberg.travis',
                tag_commit_message='build using gitenberg.travis',
                access_token=GITENBERG_TRAVIS_ACCESS_TOKEN)

Building books


In [ ]:
session = create_session("gitenberg.db")

In [ ]:
(session.query(Repo)
 .filter(or_(Repo.buildable == None, Repo.buildable == True))
 .filter(Repo.datebuilt == None)
 .filter(Repo.metadata_written != None)
).count()

In [ ]:
rjr = RepoJobRunner("gitenberg.db", username, password, GITENBERG_TRAVIS_ACCESS_TOKEN, max_exceptions=20)

In [ ]:
rjr.run(50)

In [ ]:
list(rjr.repo_names(1))

In [ ]:
def delete_repo_token(repo_name):
    """Call ``delete_repo_token()`` on a GitenbergTravisJob for ``repo_name``.

    The job is created for the 'GITenberg' owner with the notebook-level
    ``username`` / ``password`` and the GITenberg Travis access token.
    """
    commit_msg = 'build using gitenberg.travis'
    job = GitenbergTravisJob(
        username, password, repo_name, 'GITenberg',
        update_travis_commit_msg=commit_msg,
        tag_commit_message=commit_msg,
        access_token=GITENBERG_TRAVIS_ACCESS_TOKEN,
    )
    job.delete_repo_token()

In [ ]:
rjr.exceptions()

In [ ]:
rjr.gh.ratelimit_remaining

In [ ]:
# Count down to the GitHub rate-limit reset reported for rjr's client.
# timedelta.seconds ignores the days component, so a reset time that is
# already in the past (negative delta) would read as almost 24 hours;
# use the full duration and clamp it at zero instead.
dt = arrow.get(rjr.gh.rate_limit()['rate']['reset']) - arrow.now()
rjr.countdown(max(0, int(dt.total_seconds())))

StatusUpdater


In [ ]:
class StatusUpdateRunner2(StatusUpdateRunner):
    """StatusUpdateRunner limited to the repo names supplied as ``repos_list``."""

    def __init__(self, dbfname, gh_username, gh_password, access_token=None, max_exceptions=None,
                 repos_list=None):
        super(StatusUpdateRunner2, self).__init__(
            dbfname,
            gh_username,
            gh_password,
            access_token,
            max_exceptions,
        )
        self.repos_list = repos_list

    def repos(self, n=None):
        """Return the first ``n`` Repo rows whose ``repo_name`` is in ``self.repos_list``.

        Returns an empty list when no ``repos_list`` was given.
        """
        if self.repos_list is None:
            return []

        name_filter = Repo.repo_name.in_(self.repos_list)
        selected = self.session().query(Repo).filter(name_filter)
        return islice(selected, n)

In [ ]:
(session.query(Repo)
 .filter(Repo.datebuilt != None)
 .filter(Repo.last_build_id == None)
).count()

In [ ]:
sur = StatusUpdateRunner("gitenberg.db", username, password, GITENBERG_TRAVIS_ACCESS_TOKEN)

In [ ]:
sur.run(None)

In [ ]:
sur.gh.ratelimit_remaining

In [ ]:
# Count down to the GitHub rate-limit reset reported for sur's client.
# timedelta.seconds drops the days component (a negative delta shows up as
# nearly 24 hours), so use the full duration and never go below zero.
dt = arrow.get(sur.gh.rate_limit()['rate']['reset']) - arrow.now()
sur.countdown(max(0, int(dt.total_seconds())))

In [ ]:
sur.exceptions()

overall stats


In [ ]:
(session.query(Repo)
  .filter(Repo.ebooks_in_release_count == 3)
).count()

In [ ]:
session.query(Repo.ebooks_in_release_count).distinct().all()
SELECT ebooks_in_release_count, count (ebooks_in_release_count)
  FROM Repos
 GROUP BY ebooks_in_release_count

In [ ]:
# how many built
(session.query(Repo)
 .filter(Repo.datebuilt != None).count())

In [ ]:
# how many for which we know lastbuilt status

(session.query(Repo)
 .filter(Repo.last_build_state != None).count())

In [ ]:
# http://stackoverflow.com/a/4086229/7782

from sqlalchemy import func
(session.query(Repo.ebooks_in_release_count, func.count(Repo.ebooks_in_release_count))
   .group_by(Repo.ebooks_in_release_count).all())

In [ ]:
from sqlalchemy import func

build_states = (session.query(Repo.last_build_state, func.count(Repo.last_build_state))
   .group_by(Repo.last_build_state).all())
build_states

In [ ]:
__builtin__.sum([v for (k,v) in build_states])

In [ ]:
session.query(Repo).distinct(Repo.ebooks_in_release_count).count()

In [ ]:
sur.gh.ratelimit_remaining

In [ ]:
# Time left until the GitHub rate limit resets.  A reset time already in the
# past gives a negative timedelta whose .seconds is close to 86400, so the
# total is clamped at zero rather than read from .seconds.
dt = arrow.get(sur.gh.rate_limit()['rate']['reset']) - arrow.now()
sur.countdown(max(0, int(dt.total_seconds())))

In [ ]:
import json
import unicodecsv as csv
from StringIO import StringIO

# http://stackoverflow.com/a/11884806
def as_dict(repo):
    """Return a dict mapping each column name of ``repo.__table__`` to ``repo``'s value for it."""
    row = {}
    for column in repo.__table__.columns:
        row[column.name] = getattr(repo, column.name)
    return row

# return Repos that have a known build state
# `!= None` (rather than `is not None`) is deliberate: SQLAlchemy renders it
# as an IS NOT NULL test on the column.
results = (session.query(Repo)
 .filter(Repo.last_build_state != None))

# repos_file = StringIO()
# Opened in binary mode; presumably unicodecsv encodes each row to bytes
# itself using the `encoding` passed below -- TODO confirm.
with open("built_repos.tsv", "wb") as repos_file:

    # one TSV column per column of the Repo table, in table order
    headers = [c.name for c in Repo.__table__.columns]
    
    repo_csv = csv.DictWriter(repos_file, headers, encoding='utf-8', delimiter='\t')

    repo_csv.writeheader()
    # islice(..., None) imposes no limit: every row in `results` is written
    for result in islice(results,None):
        repo_csv.writerow(as_dict(result))

In [ ]:
!wc built_repos.tsv

debugging errors / failures


In [ ]:
failed_builds = (session.query(Repo)
 .filter(Repo.last_build_state ==  'failed'))

failed_builds.count()

In [ ]:
# Print the Travis build page URL for every repo in `failed_builds`.
for repo in failed_builds:
    url = "https://travis-ci.org/GITenberg/{repo_name}/builds/{last_build_id}".format(
        repo_name=repo.repo_name,
        last_build_id=repo.last_build_id)
    print (url)

let's look at https://travis-ci.org/GITenberg/American-Hand-Book-of-the-Daguerreotype_167/builds/150209405

cannot read from /home/travis/build/GITenberg/American-Hand-Book-of-the-Daguerreotype_167/book.epub

The case of the image file names doesn't match -- a case-sensitivity problem.

For https://travis-ci.org/GITenberg/Literary-Blunders--A-Chapter-in-the--History-of-Human-Error-_371/builds/150224012:

ebook-convert 371.txt book.epub --title "Literary Blunders: A Chapter in the "History of Human Error"" --authors "" ' returned non-zero exit status 1

A problem with how quotes are handled in the invocation of ebook-convert.


In [ ]:
# 

repo_name = "American-Hand-Book-of-the-Daguerreotype_167"

gtj = GitenbergTravisJob(username, password, repo_name, 'GITenberg',
        update_travis_commit_msg='build using gitenberg.travis',
        tag_commit_message='build using gitenberg.travis',
        access_token=GITENBERG_TRAVIS_ACCESS_TOKEN)

gtj.travis_repo

How to read log files from travis? revisit menegazzo/travispy: Travis CI API for Python


In [ ]:
# How to read log files from travis

b = gtj.travis.build(gtj.travis_repo.last_build_id)
j = b.jobs[-1]
j.id

In [ ]:
j.log.body[:100]

update repos with started status


In [ ]:
(session.query(Repo)
 .filter(Repo.last_build_state == 'started')
).count()

In [ ]:
class StatusUpdateRunnerForStartedJobs(StatusUpdateRunner):
    """StatusUpdateRunner that only visits repos whose last build state is 'started'."""

    def repos(self, n):
        """Return an iterator over at most ``n`` such Repo rows (no limit when ``n`` is None)."""
        started = self.session().query(Repo).filter(
            Repo.last_build_state == 'started')
        return islice(started, n)

In [ ]:
sur2 = StatusUpdateRunnerForStartedJobs("gitenberg.db", username, password, GITENBERG_TRAVIS_ACCESS_TOKEN)

In [ ]:
sur2.run(None)

rerunning jobs that have error status


In [ ]:
class ErroredRepoJobRunner(RepoJobRunner):
    """RepoJobRunner whose repo selection is the repos with last build state 'errored'."""

    def repos(self, n):
        """Return an iterator over the first ``n`` errored Repo rows (all of them if ``n`` is None)."""
        is_errored = Repo.last_build_state == 'errored'
        errored_query = self.session().query(Repo).filter(is_errored)
        return islice(errored_query, n)

In [ ]:
erjr = ErroredRepoJobRunner("gitenberg.db", username, password, GITENBERG_TRAVIS_ACCESS_TOKEN, max_exceptions=20)

In [ ]:
erjr.run(10)

In [ ]:
erjr.gh.ratelimit_remaining

In [ ]:
# Count down to the rate-limit reset of erjr's GitHub client, using erjr
# itself (the original called sur.countdown, a different runner that may not
# even exist in this session).
# timedelta.seconds ignores the days component, so a negative delta would
# read as almost 24 hours; use the full duration clamped at zero.
dt = arrow.get(erjr.gh.rate_limit()['rate']['reset']) - arrow.now()
erjr.countdown(max(0, int(dt.total_seconds())))

Misc


In [ ]:
# Flag every repo whose ebooks_in_release_count is 3 as having metadata and
# source and as buildable, and stamp it with the current time.
# NOTE(review): presumably 3 ebooks in the release means a complete build -- confirm.
for repo in session.query(Repo).filter_by(ebooks_in_release_count = 3):
    repo.has_metadata = True
    repo.has_source = True
    repo.buildable = True
    repo.updated = arrow.now().isoformat()


# persist all of the attribute changes made in the loop in one commit
session.commit()

In [ ]:
import gitenberg

b = gitenberg.Book(1)
b.parse_book_metadata()
b.meta.metadata

In [ ]:
import yaml
md = repo_md(1)
print (yaml.safe_dump(md,default_flow_style=False,
                                  allow_unicode=True))

In [ ]:
1/0

In [ ]:
def status_for_repo(repo_name):
    """Return ``status()`` of a GitenbergTravisJob created for ``repo_name``.

    The job is created for the 'GITenberg' owner with the notebook-level
    credentials; both commit messages are set to 'check status'.
    """
    job_kwargs = dict(
        username=username,
        password=password,
        repo_name=repo_name,
        repo_owner='GITenberg',
        update_travis_commit_msg='check status',
        tag_commit_message='check status',
        access_token=GITENBERG_TRAVIS_ACCESS_TOKEN,
    )
    return GitenbergTravisJob(**job_kwargs).status()

results_iter = apply_to_repos(status_for_repo, repos=all_repos)

In [ ]:
results = []

for (i,result) in enumerate(results_iter):
    
    results.append(result)
    if not isinstance(result, Exception):
        print ("\r{}: {}".format(i, result['repo_name']), end="")
    else:
        print ("\r{}: {}".format(i, str(result)), end="")

In [ ]:
[(i, result) for (i, result) in enumerate(results) if isinstance(result, Exception)]

In [ ]:
# Names of repos whose ebooks_in_release_count is not 3.
# `results` can contain Exception instances (failed status checks); they have
# no .get(), so they are left out here instead of raising AttributeError.
[result.get('repo_name') for result in results
 if not isinstance(result, Exception)
 and result.get('ebooks_in_release_count') != 3]

In [ ]:
# update the database based on result

result = results[0]
result

In [ ]:
# Copy each status result into the matching Repo row, then commit once.
for result in results:
    # Failed status checks are stored in `results` as Exception objects, not
    # dicts; there is nothing to record for them.
    if isinstance(result, Exception):
        continue

    repo = session.query(Repo).filter_by(repo_name=result['repo_name']).first()
    if repo is None:
        # .first() returns None when no row matches; report it and move on
        # rather than failing with AttributeError part-way through the loop.
        print ("no Repo row for {}".format(result['repo_name']))
        continue

    repo.updated = arrow.now().isoformat()
    repo.datebuilt = result['last_build_started_at']
    repo.version = result['version']
    repo.ebooks_in_release_count = result['ebooks_in_release_count']
    repo.last_build_id = result['last_build_id']
    repo.last_build_state = result['last_build_state']

session.commit()

In [ ]:
# building the rest

session.query(Repo).filter(Repo.datebuilt != None).count()

In [ ]:
repo_names = [repo.repo_name for repo in 
islice(session.query(Repo).filter(Repo.datebuilt == None).order_by(Repo.gutenberg_id.asc()),5)]

In [ ]:
from collections import OrderedDict
from itertools import islice

results = OrderedDict()

In [ ]:
repos_iter = iter(repo_names)

In [ ]:
def build_repos(repo_names, n=None):
    """Run a BuildRepo2 job for up to ``n`` repos and mark each one as started.

    Args:
        repo_names: iterable of repo names; it is consumed through ``islice``,
            so passing an iterator lets successive calls continue where the
            previous call stopped.
        n: maximum number of names to take from ``repo_names``; None means
            no limit.

    Side effects:
        For each name, stores ``(job, job.run())`` in the notebook-level
        ``results`` mapping, or the raised exception if anything fails, and
        prints a one-line progress message.  On success the matching Repo row
        gets ``updated`` and ``datebuilt`` set to the current time.

    NOTE: the session is not committed here; call ``session.commit()``
    afterwards to persist the timestamps.
    """
    for (i, repo_name) in enumerate(islice(repo_names, n)):
        try:
            bj = BuildRepo2(username=username, password=password, repo_name=repo_name,
                  repo_owner='GITenberg',
                  update_travis_commit_msg='build using gitenberg.travis',
                  tag_commit_message='build using gitenberg.travis',
                  access_token=GITENBERG_TRAVIS_ACCESS_TOKEN)
            results[repo_name] = (bj, bj.run())

            # just mark as started -- look the row up by the name being built
            # (the original used the unrelated global `result` here)
            repo = session.query(Repo).filter_by(repo_name=repo_name).first()
            if repo is not None:
                # a missing row must not replace the successful build result
                # with an AttributeError
                now = arrow.now().isoformat()
                repo.updated = now
                repo.datebuilt = now

        except Exception as e:
            # best-effort batch: record the failure and go on to the next repo
            results[repo_name] = e

        print ("\r{}: {}".format(i, results[repo_name]), end="")

In [ ]:
build_repos(repos_iter, 1)

Wondering: if not add_all, is there any add-or-update function? See "python - SQLAlchemy insert or update example" on Stack Overflow.


In [ ]:
repo1.version = '0.0.5'

In [ ]:
session.dirty

In [ ]:
session.new

In [ ]:
our_repo = session.query(Repo).filter_by(repo_name='Repo1').first() # doctest:+NORMALIZE_WHITESPACE
our_repo

personal access tokens


In [ ]:
gh = github3.login(ry_username, password=ry_password)

In [ ]:
from itertools import islice


auths = [{'name': auth.name, 'created_at':auth.created_at, 'updated_at':auth.updated_at} 
         for auth in islice(gh.iter_authorizations(),None)]

In [ ]:
sorted(auths, key=lambda r: r['created_at'])

In [ ]: