In [1]:
import getpass
import os
from collections import deque

import github3
import pandas as pd
In [ ]:
# Go here to get a GitHub OAuth token --> https://github.com/settings/tokens
In [2]:
# Never paste the token into the notebook: it would be saved (and shared) with
# the file. Read it from the GITHUB_TOKEN environment variable, and fall back to
# an interactive prompt (input is not echoed) when the variable is unset/empty.
gh = github3.login(
    token=os.environ.get('GITHUB_TOKEN') or getpass.getpass('GitHub token: ')
)
In [3]:
def fetch_commits_for_user(user, blacklisted_repos=None):
    """Collect commit metadata for the repositories listed for ``user``.

    Uses the module-level ``gh`` session to enumerate repositories via
    ``gh.repositories_by(user)`` and then walks ``repo.commits()`` for each one.

    Parameters
    ----------
    user : str
        Account name handed to ``gh.repositories_by``.
    blacklisted_repos : iterable of str, optional
        Substrings of repository full names to skip. Matching is
        case-insensitive. When ``None`` (the default) ``['homebrew']`` is
        used; pass an empty list to process every repository.

    Returns
    -------
    dict of collections.deque
        Parallel columns keyed by ``'repo'``, ``'authors'``, ``'messages'``,
        ``'dates'``, ``'committers'`` and ``'shas'``; position ``i`` in every
        deque describes the same commit.

    Notes
    -----
    Repositories whose commit listing raises
    ``github3.exceptions.ClientError`` are reported and skipped.
    """
    # Only fall back to the default when nothing was passed, so that an
    # explicit empty list really means "no blacklist".
    if blacklisted_repos is None:
        blacklisted_repos = ['homebrew']
    # Repo names are lowercased below; lowercase the patterns as well so that
    # an entry such as 'Homebrew' still matches.
    blacklist = [pattern.lower() for pattern in blacklisted_repos]

    all_repos = list(gh.repositories_by(user))
    total = len(all_repos)

    repos = deque()
    authors = deque()
    messages = deque()
    dates = deque()
    committers = deque()
    shas = deque()

    for idx, repo in enumerate(all_repos, start=1):
        # make all repo names lowercase
        repo_name = repo.full_name.lower()
        if any(pattern in repo_name for pattern in blacklist):
            # don't process this blacklisted repo
            print('skipping %s. It is blacklisted. %s of %s' % (repo, idx, total))
            continue
        print('processing %s. %s of %s' % (repo, idx, total))
        try:
            commits = list(repo.commits())
        except github3.exceptions.ClientError as err:
            # Best effort: report the failure instead of dropping the repo
            # without a trace, then move on to the next repository.
            print('could not list commits for %s (%s). skipping.' % (repo, err))
            continue
        for commit in commits:
            repos.append(repo_name)
            committer = commit.commit.committer['name']
            committers.append(committer)
            try:
                authors.append(commit.author.login)
            except AttributeError:
                # there is no reported author of this commit.
                # use the name of the committer instead.
                authors.append(committer)
            messages.append(commit.commit.message)
            dates.append(commit.commit.committer['date'])
            shas.append(commit.sha)

    return {
        'repo': repos,
        'authors': authors,
        'messages': messages,
        'dates': dates,
        'committers': committers,
        'shas': shas,
    }
In [4]:
# Commit metadata for the 'soft-matter' account's repositories (default blacklist applies).
softmatter_commits = fetch_commits_for_user('soft-matter')
In [5]:
# Commit metadata for the 'matplotlib' account's repositories (default blacklist applies).
matplotlib_commits = fetch_commits_for_user('matplotlib')
In [6]:
# Commit metadata for the 'NSLS-II' account's repositories (default blacklist applies).
nsls2_commits = fetch_commits_for_user('NSLS-II')
In [7]:
# Commit metadata for the 'scikit-xray' account's repositories (default blacklist applies).
skxray_commits = fetch_commits_for_user('scikit-xray')
In [8]:
# Commit metadata for the 'Nikea' account's repositories (default blacklist applies).
Nikea_commits = fetch_commits_for_user('Nikea')
In [9]:
# Fetch the six NSLS-II-* accounts in one pass. The names on the left line up
# positionally with the account names on the right, and the accounts are
# queried in the order listed.
(csx_commits, chx_commits, hxn_commits,
 srx_commits, xpd_commits, ixs_commits) = [
    fetch_commits_for_user(account)
    for account in (
        'NSLS-II-CSX', 'NSLS-II-CHX', 'NSLS-II-HXN',
        'NSLS-II-SRX', 'NSLS-II-XPD', 'NSLS-II-IXS',
    )
]
In [10]:
# Commit metadata for the 'VisTrails' account's repositories (default blacklist applies).
vistrails_commits = fetch_commits_for_user('VisTrails')
In [13]:
# Commit metadata for the 'ericdill' account's repositories (default blacklist applies).
ericdill_commits = fetch_commits_for_user('ericdill')
In [14]:
# Commit metadata for the 'danielballan' account's repositories (default blacklist applies).
danielballan_commits = fetch_commits_for_user('danielballan')
In [15]:
# Fetch four more accounts in one pass. The names on the left line up
# positionally with the account names on the right, and the accounts are
# queried in the order listed.
(dchabot_commits, arkilic_commits,
 cowanml_commits, areaDetector_commits) = [
    fetch_commits_for_user(account)
    for account in ('dchabot', 'arkilic', 'cowanml', 'areaDetector')
]
In [28]:
# Commit metadata for the 'synchbot' account's repositories (default blacklist applies).
synchbot_commits = fetch_commits_for_user('synchbot')
In [41]:
# Commit metadata for the 'klauer' account's repositories (default blacklist applies).
klauer_commits = fetch_commits_for_user('klauer')
In [51]:
# Commit metadata for the 'giltis' account's repositories (default blacklist applies).
giltis_commits = fetch_commits_for_user('giltis')
In [52]:
# Order in which the per-account results are concatenated into one table.
# NOTE(review): the order looks significant - the later
# `df.drop_duplicates('shas')` keeps the first occurrence of each sha, so a
# commit that shows up under several accounts is attributed to whichever
# account is listed first here. Confirm that this precedence is intended.
commit_order = [
skxray_commits,
softmatter_commits,
vistrails_commits,
matplotlib_commits,
areaDetector_commits,
nsls2_commits,
Nikea_commits,
chx_commits,
csx_commits,
hxn_commits,
srx_commits,
ixs_commits,
xpd_commits,
synchbot_commits,
ericdill_commits,
danielballan_commits,
dchabot_commits,
arkilic_commits,
cowanml_commits,
klauer_commits,
giltis_commits
]
In [53]:
from collections import defaultdict
In [54]:
# Stack the per-account columns end to end, following commit_order, and turn
# the merged columns into a single DataFrame with one row per fetched commit.
merged_columns = defaultdict(deque)
for account_commits in commit_order:
    for key, values in account_commits.items():
        merged_columns[key].extend(values)
df = pd.DataFrame(merged_columns)
In [55]:
# Row count of the combined table, before duplicate shas are dropped.
len(df)
Out[55]:
In [56]:
# The same sha can be fetched more than once (presumably via forks); keep only
# the first row for each sha. The copy makes cleaned_df independent of df so
# later column assignments do not touch the original table.
cleaned_df = df.drop_duplicates(subset='shas', keep='first').copy()
In [57]:
# Row count after dropping rows with duplicate shas.
len(cleaned_df)
Out[57]:
In [58]:
# # remove the user name where the repo came from
# repo_names = [repo.split('/')[-1] for repo in cleaned_df.repo]
# cleaned_df.update({'repo': pd.Series(repo_names)})
In [59]:
# Lookup used when normalising the `authors` column: lowercased author name ->
# replacement identifier (presumably the person's GitHub login - verify).
# Keys must be lowercase because names are lowercased before the lookup.
map_people = {
'daniel allan': 'danielballan',
'arman arkilic': 'arkilic',
'daron chabot': 'dchabot',
'thomas caswell': 'tacaswell',
}
In [60]:
from collections import deque
# Normalise the author column: fall back to the committer when the author is
# the literal placeholder 'unknown', lowercase the name, and replace it with
# its map_people entry when one exists.
authors = deque()
for raw_author, committer in zip(cleaned_df.authors, cleaned_df.committers):
    name = (committer if raw_author == 'unknown' else raw_author).lower()
    authors.append(str(map_people.get(name, name)).lower())
cleaned_df['authors'] = authors
In [61]:
# Save the de-duplicated, author-normalised table. The path is relative to the
# current working directory; with pandas' default index=True the row index is
# written as the first (unnamed) column.
cleaned_df.to_csv('cleaned-commit-info.csv')
In [ ]:
In [ ]: