In this tutorial, we'll load some repository data for Julia packages. The data was scraped from https://github.com/svaksha/Julia.jl and queried through the excellent GitHub API wrapper https://github.com/JuliaWeb/GitHub.jl.
In [ ]:
# # to load the data, you'll need to check out my forked repo:
# Pkg.clone("https://github.com/tbreloff/Julia.jl")
# Pkg.checkout("Julia", "tb_github")
# # and you'll want to install Plots
# Pkg.add("Plots")
# Pkg.checkout("Plots")
# Pkg.add("PyPlot")
# Pkg.add("PlotRecipes")
# Pkg.checkout("PlotRecipes")
In [ ]:
In [ ]:
# Load Plots and PlotRecipes (graphs and other cool stuff).
using Plots, PlotRecipes

# Call the `pyplot` backend function with the size, legend and font settings
# used throughout this notebook.
pyplot(
    size = (700, 500),
    leg = false,
    guidefont = font(6),
    titlefont = font(9)
)
In [ ]:
# Read the stargazer and contributor tables from the CSV files that live in
# the "Julia" package directory.
using DataFrames
stardf, contribdf = map(("stargazers.csv", "contributors.csv")) do csvname
    readtable(Pkg.dir("Julia", csvname))
end
# Helper that extracts a list from its string form: the first five and the
# last three characters of `s` are dropped, the remainder is split on commas,
# and empty pieces are discarded.
function splitlist(s)
    body = s[6:end-3]
    return filter(piece -> !isempty(piece), split(body, ","))
end
In [ ]:
# Number of stargazers per row of `stardf`, followed by the grand total.
starcounts = map(stardf[:gazers]) do gazerlist
    length(splitlist(gazerlist))
end;
sum(starcounts)
In [ ]:
# Lookup table from repo name to its star count.
starmap = Dict()
for (name, nstars) in zip(stardf[:repo], starcounts)
    setindex!(starmap, nstars, name)
end
length(starmap)
In [ ]:
# Assign package contributors a "Star Value" score: the stars of each repo are
# divided among its contributors in proportion to their number of commits, and
# the shares are summed per user over all repos.
userstars = Dict{UTF8String,Float64}()
for row = 1:size(contribdf,1)
    # extract the package name; only repos whose name ends in ".jl" are counted
    repo = split(contribdf[row,:url], "/")[end]
    # `endswith` also copes with names shorter than three characters (e.g. the
    # empty string left by a trailing "/"), where `repo[end-2:end]` would throw
    endswith(repo, ".jl") || continue
    repo = repo[1:end-3]

    # get a list of users (contributors) and the number of contributions
    users = splitlist(contribdf[row,:users])
    nums = splitlist(contribdf[row,:contribs])
    isempty(nums) && continue
    ncontribs = [parse(Int,c) for c in nums]
    total = sum(ncontribs)
    # nothing to apportion; dividing by zero would put NaN into a user's total
    total == 0 && continue

    # assign the stars to users
    pct = ncontribs ./ total
    stars = get(starmap, repo, 0)
    pctstars = stars * pct
    # zip stops at the shorter list, so a row listing more users than counts
    # cannot index past the end of `pctstars`
    for (user, share) in zip(users, pctstars)
        userstars[user] = get(userstars, user, 0.0) + share
    end
end
# remove if too small
filter!((k,v) -> v>=1, userstars)
In [ ]:
# what does this data look like?
# Three views of the star values: a plain histogram, a 500-bin histogram on a
# log x axis, and the sorted values as a scatter on a log y axis.
vals = collect(values(userstars))
let plain = histogram(vals),
    logbinned = histogram(vals, bins=500, xaxis=(:log,(1,Inf))),
    ranked = scatter(sort(vals), alpha=0.1, yscale=:log),
    panel_layout = @layout [a b; c{0.7h}]
    plot(plain, logbinned, ranked, layout = panel_layout)
end
In [ ]:
# Gadfly and IJulia didn't make it onto the list because of formatting issues in Julia.jl
# Gotta give credit to Daniel and Steven!
# `get` with a 0.0 default lets the credit be added even when the user has no
# entry (never seen, or filtered out earlier); a plain `+=` would throw a KeyError.
userstars["dcjones"] = get(userstars, "dcjones", 0.0) + 450
userstars["stevengj"] = get(userstars, "stevengj", 0.0) + 400
In [ ]:
# Collect the user names and their calculated star values into two vectors.
users = collect(keys(userstars))
starvalues = collect(values(userstars))
(users, starvalues)
In [ ]:
# Looking at the highest values, we'll use series annotations to label datapoints.
# Note how the `text` method takes a string and an arbitrarily ordered list of font attributes.
# Take at most 50 entries so the slice stays in bounds when there are fewer users.
topidx = sortperm(starvalues, rev=true)[1:min(50, end)]
# Label is "user: rounded star value"; the font size grows with the square root of the value.
anns = [text("$(users[i]): $(round(Int,starvalues[i]))", round(Int, 2+0.4sqrt(starvalues[i]))) for i=topidx]
# We cycle through x values 1-4 with invisible markers. We use the `yaxis` "magic arg" which takes values in any order.
scatter(1:4, starvalues[topidx], series_annotations=anns, xlims=(0.5,4.5), yaxis =(:log,"Star Value",(NaN,1000)), alpha=0)
In [ ]:
# Pull the group, subgroup and repo columns out of the stargazer table.
groups = stardf[:group]
subgroups = stardf[:subgroup]
repos = stardf[:repo];
# Sorted distinct group names, transposed into a row; N is how many there are.
groupnames = sort(unique(groups))'
N = length(groupnames)
# A grid of histograms with each group in its own subplot.
p = histogram(
    repos, starcounts,
    group = groups,
    layout = N,
    marker = (6,0.2)
)
In [ ]:
# Tidy up the grid: give titles, remove tick labels, and link all the y axes.
# Note: xlink/ylink take a list of subplot indices which a subplot should link axes with.
# Since it's a vector-type, it applies to ALL the subplots
plot!(
    title = groupnames,
    xlims = (0,100),
    ylink = 1:N,
    ticks = nothing,
    bins = 4
)
In [ ]:
# Not yet an official feature: restrict the data to the AI group and boxplot by subgroup.
# `groupfilter` returns the row indices whose entry in `groups` equals `gname`.
function groupfilter(gname)
    return filter(1:length(groups)) do idx
        groups[idx] == gname
    end
end
boxplot(subgroups, starcounts, markersize = 2, idxfilter = groupfilter("AI"))
In [ ]:
# Easier to read: cap the y range at 100 and pass xrot=90 for the x labels.
plot!(
    ylims = (0, 100),
    xrot = 90
)
In [ ]:
In [ ]:
# cycling inputs, series annotations
# Similar to the contributors above, view the `n` most-starred repos, labelling
# each point with its repo name.
n = 50
# take at most `n` entries so the slice stays in bounds when there are fewer repos
rng = sortperm(starcounts, rev=true)[1:min(n, end)]
plot(1:4, starcounts[rng], xlim=(0,5), w=0, series_annotations=repos[rng], yscale=:log2)
In [ ]:
# Constructing a PlotText: a string plus a font size.
text("Hi", 10)
In [ ]:
# Text attributes: "Plots" gets a large red label, every other repo a small,
# mostly transparent black one.
idx = findfirst(repos, "Plots")
anns = [name != "Plots" ? text(name, 8, RGBA(0,0,0,0.3)) : text(name, 40, :red) for name in repos[rng]]
plot(
    1:4, starcounts[rng],
    xlim = (0,5),
    w = 0,
    yscale = :log2,
    series_annotations = anns
)
In [ ]:
In [ ]:
# Let's rewrite this as a series recipe
# Recipe for a series type named `:annotations`: the incoming `x` values are
# used as the per-point labels, and the points themselves are drawn as fully
# transparent scatter markers at x positions 1:4 on a log2 y axis.
@recipe function f(::Type{Val{:annotations}}, x, y, z)
seriestype := :scatter
yscale := :log2
markeralpha := 0
xlims := (0.5,4.5)
# use the incoming x values as the labels before x itself is overwritten below
series_annotations := x
x := 1:4
()
end
# NOTE(review): presumably defines the `annotations` convenience function called
# on the next line; confirm against the Plots `@shorthands` documentation.
@shorthands annotations
# labels come from the repo names, y values from their star counts
annotations(repos[rng], starcounts[rng])
In [ ]: