This notebook pulls the summary statistics used in the 6-month releases of PySAL, which is now (as of 2017-07) a meta package.
It assumes the subpackages have been git cloned into a directory below the location of this notebook, and it requires network connectivity for some of the reporting.
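Specifically, the loop further down changes into tmp/&lt;subpackage&gt; for each entry in packages.yml, so the clones are expected under a tmp/ directory next to this notebook. A minimal sanity check of that layout, assuming the whitespace-separated packages.yml format used later in this notebook, might look like this:

import os
import yaml

with open('packages.yml') as f:
    groupings = yaml.safe_load(f)

# every subpackage listed in packages.yml should have a clone under tmp/
missing = [sub for grouping in groupings.values()
           for sub in grouping.split()
           if not os.path.isdir(os.path.join('tmp', sub))]
print('missing clones:', missing or 'none')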
In [1]:
from __future__ import print_function
import os
import json
import re
import sys
import pandas
from datetime import datetime, timedelta
from time import sleep
from subprocess import check_output
try:
    from urllib import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3
import ssl
import yaml
context = ssl._create_unverified_context()
In [2]:
with open('packages.yml') as package_file:
    packages = yaml.load(package_file)
In [3]:
CWD = os.path.abspath(os.path.curdir)
Our last main release was 2017-11-03:
In [4]:
start_date = '2017-11-03'
since_date = '--since="{start}"'.format(start=start_date)
since_date
since = datetime.strptime(start_date+" 0:0:0", "%Y-%m-%d %H:%M:%S")
since
Out[4]:
datetime.datetime(2017, 11, 3, 0, 0)
In [ ]:
In [5]:
from datetime import datetime, timedelta
ISO8601 = "%Y-%m-%dT%H:%M:%SZ"
PER_PAGE = 100
element_pat = re.compile(r'<(.+?)>')
rel_pat = re.compile(r'rel=[\'"](\w+)[\'"]')
In [6]:
def parse_link_header(headers):
    link_s = headers.get('link', '')
    urls = element_pat.findall(link_s)
    rels = rel_pat.findall(link_s)
    d = {}
    for rel, url in zip(rels, urls):
        d[rel] = url
    return d

def get_paged_request(url):
    """Get a full list of results, handling the GitHub API v3's paging."""
    results = []
    while url:
        # print("fetching %s" % url, file=sys.stderr)
        f = urlopen(url)
        results.extend(json.load(f))
        links = parse_link_header(f.headers)
        url = links.get('next')
    return results

def get_issues(project="pysal/pysal", state="closed", pulls=False):
    """Get a list of the issues from the GitHub API."""
    which = 'pulls' if pulls else 'issues'
    url = "https://api.github.com/repos/%s/%s?state=%s&per_page=%i" % (project, which, state, PER_PAGE)
    return get_paged_request(url)

def _parse_datetime(s):
    """Parse dates in the format returned by the GitHub API."""
    if s:
        return datetime.strptime(s, ISO8601)
    else:
        return datetime.fromtimestamp(0)

def issues2dict(issues):
    """Convert a list of issues to a dict, keyed by issue number."""
    idict = {}
    for i in issues:
        idict[i['number']] = i
    return idict

def is_pull_request(issue):
    """Return True if the given issue is a pull request."""
    # issues returned by the v3 API carry a 'pull_request' key when they are PRs
    return 'pull_request' in issue

def issues_closed_since(period=timedelta(days=365), project="pysal/pysal", pulls=False):
    """Get all issues closed since a particular point in time.

    period can either be a datetime object, or a timedelta object.
    In the latter case, it is used as a time before the present.
    """
    which = 'pulls' if pulls else 'issues'
    if isinstance(period, timedelta):
        period = datetime.now() - period
    url = "https://api.github.com/repos/%s/%s?state=closed&sort=updated&since=%s&per_page=%i" % (project, which, period.strftime(ISO8601), PER_PAGE)
    allclosed = get_paged_request(url)
    # allclosed = get_issues(project=project, state='closed', pulls=pulls, since=period)
    filtered = [i for i in allclosed if _parse_datetime(i['closed_at']) > period]
    # exclude rejected PRs
    if pulls:
        filtered = [pr for pr in filtered if pr['merged_at']]
    return filtered

def sorted_by_field(issues, field='closed_at', reverse=False):
    """Return a list of issues sorted by closing date."""
    return sorted(issues, key=lambda i: i[field], reverse=reverse)

def report(issues, show_urls=False):
    """Summary report about a list of issues, printing number and title."""
    # titles may have unicode in them, so we must encode everything below
    if show_urls:
        for i in issues:
            role = 'ghpull' if 'merged_at' in i else 'ghissue'
            print('* :%s:`%d`: %s' % (role, i['number'],
                                      i['title'].encode('utf-8')))
    else:
        for i in issues:
            print('* %d: %s' % (i['number'], i['title'].encode('utf-8')))
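report() is defined above but not exercised in this section; a quick example of how the helpers compose for a single repository (pysal/libpysal is used purely as an illustrative project here, and the call requires network access and the since datetime defined above):

# list closed issues for one repository since the last release, newest first (sketch)
closed = issues_closed_since(since, project='pysal/libpysal', pulls=False)
report(sorted_by_field(closed, reverse=True), show_urls=False)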
In [7]:
all_issues = {}
all_pulls = {}
total_commits = 0
issue_details = {}
pull_details = {}
for package in packages:
    subpackages = packages[package].split()
    for subpackage in subpackages:
        prj = 'pysal/{subpackage}'.format(subpackage=subpackage)
        os.chdir(CWD)
        os.chdir('tmp/{subpackage}'.format(subpackage=subpackage))
        # sub_issues = issues_closed_since(project=prj, period=since)
        # sleep(5)
        issues = issues_closed_since(since, project=prj, pulls=False)
        pulls = issues_closed_since(since, project=prj, pulls=True)
        issues = sorted_by_field(issues, reverse=True)
        pulls = sorted_by_field(pulls, reverse=True)
        issue_details[subpackage] = issues
        pull_details[subpackage] = pulls
        n_issues, n_pulls = map(len, (issues, pulls))
        n_total = n_issues + n_pulls
        all_issues[subpackage] = n_total, n_pulls
os.chdir(CWD)
The issues are pulled since the last release date of the meta package. However, each package going into the meta release has a specific release tag that pins the code making it into the release. We don't want to report commits made after a package's tag date, so we have to do some filtering here before building our change-log statistics for the meta package.
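As a rough sketch of that filtering, assuming a tag_dates mapping from subpackage name to its release-tag date (the mapping below is hypothetical; the real dates would come from each subpackage's tag, e.g. via git or the GitHub releases API), and reusing _parse_datetime, issue_details, and pull_details from the cells above:

from datetime import datetime

# hypothetical cutoff dates; real values would be read from each subpackage's release tag
tag_dates = {
    'libpysal': datetime(2018, 4, 1),  # illustrative date only
}

def filter_by_tag_date(records, subpackage, tag_dates):
    """Keep only records closed on or before the subpackage's tag date (sketch)."""
    cutoff = tag_dates.get(subpackage)
    if cutoff is None:
        return records
    return [r for r in records if _parse_datetime(r['closed_at']) <= cutoff]

filtered_issues = {sub: filter_by_tag_date(recs, sub, tag_dates)
                   for sub, recs in issue_details.items()}
filtered_pulls = {sub: filter_by_tag_date(recs, sub, tag_dates)
                  for sub, recs in pull_details.items()}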
For now, let's pickle the issue and pull records so they can be filtered later without hitting the GitHub API again.
In [8]:
import pickle

with open("issue_details.p", "wb") as f:
    pickle.dump(issue_details, f)
with open("pull_details.p", "wb") as f:
    pickle.dump(pull_details, f)
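When the filtering step runs later, possibly in a fresh session, the records can be restored from these pickles instead of hitting the API again; a minimal sketch:

import pickle

with open("issue_details.p", "rb") as f:
    issue_details = pickle.load(f)
with open("pull_details.p", "rb") as f:
    pull_details = pickle.load(f)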
In [ ]: