In [1]:
    
import os
from collections import deque

import github3
import pandas as pd
    
In [ ]:
    
# Create a GitHub OAuth token (personal access token) here --> https://github.com/settings/tokens
    
In [2]:
    
# Authenticate against the GitHub API.  The token is read from the
# GITHUB_TOKEN environment variable so the credential is never stored in
# (or committed together with) the notebook.
_github_token = os.environ.get('GITHUB_TOKEN')
if not _github_token:
    raise RuntimeError(
        'Set the GITHUB_TOKEN environment variable to a GitHub OAuth token '
        '(see https://github.com/settings/tokens) before running this notebook.'
    )
gh = github3.login(token=_github_token)
    
In [3]:
    
def fetch_commits_for_user(user, blacklisted_repos=None):
    """Collect commit metadata for every repository listed for ``user``.

    The module-level authenticated client ``gh`` is used to list the
    repositories of ``user``; every commit of each non-blacklisted
    repository is then recorded.

    Parameters
    ----------
    user : str
        GitHub login (user or organization) passed to ``gh.repositories_by``.
    blacklisted_repos : iterable of str or str, optional
        Substrings matched case-insensitively against the lowercased
        ``full_name`` of each repository; repositories that contain any of
        them are skipped.  ``None`` (the default) means ``['homebrew']``.
        Pass an empty list to process every repository.

    Returns
    -------
    dict of str -> collections.deque
        Parallel columns holding one entry per commit, keyed by ``'repo'``,
        ``'authors'``, ``'messages'``, ``'dates'``, ``'committers'`` and
        ``'shas'``.
    """
    # Only fall back to the default when the caller passed nothing at all;
    # an explicit empty list means "do not skip anything".
    if blacklisted_repos is None:
        blacklisted_repos = ['homebrew']
    elif isinstance(blacklisted_repos, str):
        # A bare string would otherwise be iterated character by character.
        blacklisted_repos = [blacklisted_repos]
    # Repository names are lowercased below, so the patterns must be too.
    blacklist = [entry.lower() for entry in blacklisted_repos]

    all_repos = list(gh.repositories_by(user))
    n_repos = len(all_repos)
    columns = {
        key: deque()
        for key in ('repo', 'authors', 'messages', 'dates', 'committers', 'shas')
    }
    for position, repo in enumerate(all_repos, start=1):
        # make all repo names lowercase
        repo_name = repo.full_name.lower()
        if any(entry in repo_name for entry in blacklist):
            # don't process this blacklisted repo
            print('skipping %s. It is blacklisted. %s of %s' % (repo, position, n_repos))
            continue
        print('processing %s. %s of %s' % (repo, position, n_repos))
        try:
            commits = list(repo.commits())
        except github3.exceptions.ClientError as err:
            # Best effort: keep going, but say which repository was dropped
            # instead of skipping it silently.
            print('could not list commits for %s (%s). skipping.' % (repo, err))
            continue
        for commit in commits:
            committer = commit.commit.committer['name']
            try:
                author = commit.author.login
            except AttributeError:
                # there is no reported author of this commit.
                # use the name of the committer instead.
                author = committer
            columns['repo'].append(repo_name)
            columns['authors'].append(author)
            columns['messages'].append(commit.commit.message)
            columns['dates'].append(commit.commit.committer['date'])
            columns['committers'].append(committer)
            columns['shas'].append(commit.sha)
    return columns
    
In [4]:
    
# Commit columns for the repositories listed for the 'soft-matter' account.
softmatter_commits = fetch_commits_for_user(user='soft-matter')
    
    
In [5]:
    
# Commit columns for the repositories listed for the 'matplotlib' account.
matplotlib_commits = fetch_commits_for_user(user='matplotlib')
    
    
In [6]:
    
# Commit columns for the repositories listed for the 'NSLS-II' account.
nsls2_commits = fetch_commits_for_user(user='NSLS-II')
    
    
In [7]:
    
# Commit columns for the repositories listed for the 'scikit-xray' account.
skxray_commits = fetch_commits_for_user(user='scikit-xray')
    
    
In [8]:
    
# Commit columns for the repositories listed for the 'Nikea' account.
Nikea_commits = fetch_commits_for_user(user='Nikea')
    
    
In [9]:
    
# One fetch per 'NSLS-II-*' account; each result keeps its own variable
# because commit_order refers to them individually.
csx_commits = fetch_commits_for_user(user='NSLS-II-CSX')
chx_commits = fetch_commits_for_user(user='NSLS-II-CHX')
hxn_commits = fetch_commits_for_user(user='NSLS-II-HXN')
srx_commits = fetch_commits_for_user(user='NSLS-II-SRX')
xpd_commits = fetch_commits_for_user(user='NSLS-II-XPD')
ixs_commits = fetch_commits_for_user(user='NSLS-II-IXS')
    
    
In [10]:
    
# Commit columns for the repositories listed for the 'VisTrails' account.
# (The function name was garbled here before, which raised NameError.)
vistrails_commits = fetch_commits_for_user('VisTrails')
    
    
In [13]:
    
# Commit columns for the repositories listed for the 'ericdill' account.
ericdill_commits = fetch_commits_for_user(user='ericdill')
    
    
In [14]:
    
# Commit columns for the repositories listed for the 'danielballan' account.
danielballan_commits = fetch_commits_for_user(user='danielballan')
    
    
In [15]:
    
# Four more accounts; each result keeps its own variable because
# commit_order refers to them individually.
dchabot_commits = fetch_commits_for_user(user='dchabot')
arkilic_commits = fetch_commits_for_user(user='arkilic')
cowanml_commits = fetch_commits_for_user(user='cowanml')
areaDetector_commits = fetch_commits_for_user(user='areaDetector')
    
    
In [28]:
    
# Commit columns for the repositories listed for the 'synchbot' account.
synchbot_commits = fetch_commits_for_user(user='synchbot')
    
    
In [41]:
    
# Commit columns for the repositories listed for the 'klauer' account.
klauer_commits = fetch_commits_for_user(user='klauer')
    
    
In [51]:
    
# Commit columns for the repositories listed for the 'giltis' account.
giltis_commits = fetch_commits_for_user(user='giltis')
    
    
In [52]:
    
# Per-account commit tables in the order they are concatenated below.
# The order is significant: rows are later de-duplicated on 'shas', so when
# the same commit appears under several accounts (e.g. forks) the entry from
# the table listed first here is the one expected to survive.
commit_order = [
    skxray_commits,
    softmatter_commits,
    vistrails_commits,
    matplotlib_commits,
    areaDetector_commits,
    nsls2_commits,
    Nikea_commits,
    chx_commits,
    csx_commits,
    hxn_commits,
    srx_commits,
    ixs_commits,
    xpd_commits,
    synchbot_commits,
    ericdill_commits,
    danielballan_commits,
    dchabot_commits,
    arkilic_commits,
    cowanml_commits,
    klauer_commits,
    giltis_commits
]
    
In [53]:
    
from collections import defaultdict
    
In [54]:
    
# Concatenate the per-account tables column by column, in commit_order,
# then build a single DataFrame from the merged columns.
merged_columns = defaultdict(deque)
for commit_table in commit_order:
    for key, values in commit_table.items():
        merged_columns[key].extend(values)
df = pd.DataFrame(merged_columns)
    
# dfs = {repo_name: pd.DataFrame({column_name: pd.Series(column) for column_name, column in repo_data.items()}) 
#        for repo_name, repo_data in repo_info.items()}
    
In [55]:
    
# Number of commit rows collected before de-duplication.
df.shape[0]
    
    Out[55]:
In [56]:
    
# Keep one row per commit sha (the first occurrence) and take an
# independent copy so later column assignments do not touch `df`.
cleaned_df = df.drop_duplicates(subset='shas', keep='first').copy()
    
In [57]:
    
# Number of unique commits left after de-duplication.
cleaned_df.shape[0]
    
    Out[57]:
In [58]:
    
# # remove the user name where the repo came from
# repo_names = [repo.split('/')[-1] for repo in cleaned_df.repo]
# cleaned_df.update({'repo': pd.Series(repo_names)})
    
In [59]:
    
# Maps a lowercased author name to the identifier it should be replaced with
# during author normalization.  Keys must be lowercase because the lookup is
# done on the lowercased author string.
map_people = {
    'daniel allan': 'danielballan',
    'arman arkilic': 'arkilic',
    'daron chabot': 'dchabot',
    'thomas caswell': 'tacaswell',
}
    
In [60]:
    
# Normalize author identities: fall back to the committer when the author is
# the placeholder 'unknown', lowercase everything, and collapse the names
# listed in map_people onto their replacement identifiers.
authors = []
for author, committer in zip(cleaned_df.authors, cleaned_df.committers):
    if author == 'unknown':
        author = committer
    # Coerce to str *before* lowercasing so a missing name (None/NaN) cannot
    # raise AttributeError.
    author = str(author).lower()
    authors.append(str(map_people.get(author, author)).lower())
cleaned_df['authors'] = authors
    
In [61]:
    
# Persist the cleaned table; the DataFrame index is written as the first
# column (pandas' default, spelled out here).
cleaned_df.to_csv('cleaned-commit-info.csv', index=True)
    
In [ ]:
    
    
In [ ]: