In [1]:
# NOTE: The RuntimeWarnings (if any) are harmless. See ContinuumIO/anaconda-issues#6678.
from pandas.io import gbq
import pandas as pd
In [2]:
import getpass
import subprocess

# Configuration Variables. Modify as desired.
# Read the active GCP project from the local gcloud CLI configuration.
_gcloud_cmd = ["gcloud", "config", "get-value", "project"]
PROJECT = subprocess.check_output(_gcloud_cmd).decode().strip()
In [26]:
import datetime

# Build BigQuery _TABLE_SUFFIX literals ("YYYYMM", double-quoted) for the
# current month and the preceding months, newest first.
_now = datetime.datetime.now()
month = _now.month
year = _now.year
num_months = 12
months = []
for _ in range(num_months):
    months.append('"{0}{1:02}"'.format(year, month))
    # Step one month back, rolling over the year at January.
    if month > 1:
        month -= 1
    else:
        month = 12
        year -= 1
In [27]:
import json
import os
import requests

# Download the Kubeflow devstats GitHub-users dump once and cache it locally,
# so re-running the notebook does not re-fetch it.
if not os.path.exists(".cache"):
    os.makedirs(".cache")
users_file = os.path.join(".cache", "github_users.json")
if not os.path.exists(users_file):
    url = "https://github.com/kubeflow/community/blob/master/devstats/data/github_users.json?raw=true"
    r = requests.get(url, allow_redirects=True)
    # Fail loudly on HTTP errors; otherwise an error page would be written
    # to the cache and silently poison every later run.
    r.raise_for_status()
    with open(users_file, "wb") as hf:
        hf.write(r.content)

with open(users_file) as hf:
    data = json.load(hf)

# Keep only the columns needed to map a GitHub login to a company.
users = pd.DataFrame(data)
users = users[["login", "company"]]
In [28]:
# Dedupe companies: canonical names, keyed by substrings we match against
# a user's (lowercased) company field.
c = ["cisco", "datawire", "google", "ibm", "intel", "teradata", "red hat"]
known_companies = {name: name for name in c}
# Common alternate spelling.
known_companies["redhat"] = "red hat"


def normalize_company(name):
    """Map a free-form company string onto a canonical company name.

    Lowercases and trims the input, strips leading/trailing "!" and "@"
    characters, then returns the canonical name of the first known
    company whose key occurs as a substring. Unknown companies are
    returned in cleaned form; None becomes the string "None".
    """
    if name is None:
        return "None"
    cleaned = name.strip().lower().strip("!").strip("@")
    for alias, canonical in known_companies.items():
        if alias in cleaned:
            return canonical
    return cleaned
# Replace the raw free-form company strings with canonical names.
users["company"] = users["company"].apply(normalize_company)
In [29]:
def combine_company(names):
    """Return the first entry in ``names`` that is not "None".

    Used to collapse a user's multiple company records into a single
    value; returns None when every entry is the placeholder "None".
    """
    real_names = (n for n in names if n != "None")
    return next(real_names, None)
# Collapse duplicate login rows into a single company per user,
# yielding a Series indexed by GitHub login.
user_map= users.groupby("login")["company"].apply(combine_company)
# You can now look up users as user_map[actor]
# Spot-check a single login (cell output).
user_map["jlewi"]
Out[29]:
In [30]:
# Query the public GitHub Archive BigQuery dataset for every PR "opened"
# event in the kubeflow org. _TABLE_SUFFIX restricts the wildcard table
# to the months computed earlier.
query = """
SELECT
  DATE(created_at) AS pr_date,
  actor.id,
  actor.login
FROM `githubarchive.month.*`
WHERE
  _TABLE_SUFFIX IN ({0})
  AND type = 'PullRequestEvent'
  AND org.login = 'kubeflow'
  AND JSON_EXTRACT(payload, '$.action') IN ('"opened"')
""".format(",".join(months))
# query is already a str, so the previous str(query) wrapper was redundant.
prs = gbq.read_gbq(query, dialect='standard', project_id=PROJECT)
In [31]:
# Series of PR actor ids indexed by creation date, in chronological order.
p = pd.Series(prs["id"].values, index=prs["pr_date"]).sort_index()
In [32]:
# Inspect the fetched PR events (cell output).
# NOTE(review): this renders the entire DataFrame; consider prs.head()
# to keep the saved notebook output small.
prs
Out[32]:
In [33]:
# Attach each PR author's (normalized) company.
# Series.map leaves NaN for logins missing from user_map instead of
# raising KeyError the way user_map[prs["login"]] indexing would; for
# logins present in the map the result is identical.
prs["company"] = prs["login"].map(user_map)
In [34]:
# Work on an explicit copy so adding the "count" column below does not
# mutate a view of prs (avoids pandas' SettingWithCopyWarning and the
# ambiguity of writing through a chained slice).
d = prs[["pr_date", "company"]].copy()
# One row per PR; summing this column during the pivot yields PR counts.
d["count"] = 1
In [35]:
# Daily PR counts: one row per date, one column per company; dates with
# no PRs from a company are filled with 0.
pr_counts = d.pivot_table(
    values="count",
    index="pr_date",
    columns="company",
    aggfunc="sum",
    fill_value=0,
)
In [46]:
# Some solutions here: https://stackoverflow.com/questions/46470743/how-to-efficiently-compute-a-rolling-unique-count-in-a-pandas-time-series
# Need to figure out how to do a time based window
# 28-day rolling sum of the daily per-company PR counts.
# NOTE(review): rolling('28d') requires a datetime-like index — pr_date
# comes back from BigQuery DATE(); confirm the index dtype is datetime64.
counts = pr_counts.rolling('28d').sum()
In [128]:
# Altair is used below but was never imported in the visible notebook,
# which makes this cell fail with NameError on a fresh kernel.
import altair as alt

# Companies to chart.
names = ["google", "cisco", "microsoft", "arrikto", "ibm", "seldon"]

# .copy() so adding the "day" column does not mutate a view of counts
# (avoids pandas' SettingWithCopyWarning).
companies_df = counts[names].copy()
companies_df["day"] = companies_df.index

# Long format — one (day, company, value) row per point — as Altair expects.
companies = pd.melt(companies_df, value_vars=names, id_vars=["day"])

chart = alt.Chart(companies, title="PRs")
point = chart.mark_point().encode(
    x=alt.X('day', title="Day"),
    y=alt.Y("value", title="# PRs"),
    color="company",
)
point.interactive()
Out[128]: