MetadataTools.jl Demo

Cambridge Area Julia Users Group, Sept. 4th 2014

Iain Dunning, MIT


In [4]:
using MetadataTools
using Interact


Getting information about packages

MetadataTools defines a PkgMeta type that represents a package's METADATA entry, and contains a PkgMetaVersion for each tagged version.


In [5]:
# Every METADATA entry, as a Dict{String,PkgMeta}
pkgs = get_all_pkg()

# Interactive picker: display the PkgMeta entry for whichever name is selected
demo_names = ["JuMP","DataArrays","BinDeps"]
@manipulate for pkg_name in demo_names
    pkgs[pkg_name]
end


Out[5]:
JuMP   git://github.com/JuliaOpt/JuMP.jl.git
  0.1.1,7339ce,julia 0.2-,MathProgBase 0.0.0 0.3.0-
  0.1.2,83f689,julia 0.2-,MathProgBase 0.0.0 0.3.0-
  0.2.0,c35353,julia 0.2-,MathProgBase 0.0.0 0.3.0-
  0.3.0,6a6f5a,julia 0.2-,MathProgBase 0.0.0 0.3.0-
  0.3.1,d478da,julia 0.2,MathProgBase 0.0.0 0.3.0-
  0.3.2,174fe2,julia 0.2,MathProgBase 0.0.0 0.3.0-
  0.4.0,275953,julia 0.2,MathProgBase 0.0.0 0.3.0-
  0.4.1,4e1ce4,julia 0.2,MathProgBase 0.0.0 0.3.0-
  0.5.0,1337c9,julia 0.2,MathProgBase 0.0.0 0.3.0-,ReverseDiffSparse
  0.5.1,cc3e09,julia 0.2,MathProgBase 0.0.0 0.3.0-,ReverseDiffSparse
  0.5.2,8f165f,julia 0.2,MathProgBase 0.0.0 0.3.0-,ReverseDiffSparse
  0.5.3,1f7793,julia 0.2,MathProgBase 0.0.0 0.3.0-,ReverseDiffSparse
  0.5.4,a2cb72,julia 0.2,MathProgBase 0.0.0 0.3.0-,ReverseDiffSparse
  0.5.5,ae7295,julia 0.2,MathProgBase 0.0.0 0.3.0-,ReverseDiffSparse
  0.5.6,882fee,julia 0.2,MathProgBase 0.3.0- 0.4.0-,ReverseDiffSparse

We can check the maximum supported Julia version using get_upper_limit - useful for checking if a package is deprecated.


In [6]:
# Maximum supported Julia version recorded for the Monads package
get_pkg("Monads") |> get_upper_limit


Out[6]:
v"0.3.0"

In [10]:
# Maximum supported Julia version recorded for the DataFrames package.
# NOTE(review): a result of v"0.0.0" presumably means "no upper limit" - confirm
get_pkg("DataFrames") |> get_upper_limit


Out[10]:
v"0.0.0"

We can also request information about a package from GitHub (or wherever it is hosted - only GitHub is needed right now!)


In [11]:
# Fetch repository information for Gadfly from its host.
gadfly_info = get_pkg_info(get_pkg("Gadfly"))

# `contributors` holds (commit_count, Contributor) tuples. Rank them by commit
# count, breaking ties on username, largest first. An explicit `by` key is used
# instead of adding a Base.isless method for MetadataTools.Contributor, so the
# notebook does not extend a function on a type it does not own.
# The ranking does not depend on the slider, so it is computed once up front.
ranked_contributors = sort(gadfly_info.contributors,
                           by = c -> (c[1], c[2].username), rev=true)

# Show the top `top_x` contributors. The slice is clamped so that slider values
# larger than the number of contributors do not raise a BoundsError.
@manipulate for top_x in 1:20
    ranked_contributors[1:min(top_x, length(ranked_contributors))]
end


git://github.com/dcjones/Gadfly.jl.git
Out[11]:
10-element Array{(Int64,Contributor),1}:
 (428,Contributor("dcjones","https://github.com/dcjones"))        
 (8,Contributor("dchudz","https://github.com/dchudz"))            
 (7,Contributor("darwindarak","https://github.com/darwindarak"))  
 (6,Contributor("timholy","https://github.com/timholy"))          
 (5,Contributor("kleinschmidt","https://github.com/kleinschmidt"))
 (5,Contributor("aviks","https://github.com/aviks"))              
 (5,Contributor("Keno","https://github.com/Keno"))                
 (4,Contributor("jverzani","https://github.com/jverzani"))        
 (4,Contributor("inq","https://github.com/inq"))                  
 (4,Contributor("IainNZ","https://github.com/IainNZ"))            

I pulled all the data a week or so ago and serialized it for later use.


In [12]:
# Load the previously serialized package information. The do-block form of
# `open` closes the file even if `deserialize` throws.
pkg_info = open("20140904_metadatatools.jldata","r") do f
    deserialize(f)
end
# Look at one entry as an example
pkg_info["Dates"]


Out[12]:
PkgInfo("https://github.com/quinnj/Dates.jl","Date/DateTime Implementation for the Julia Language; Successor to Datetime.jl","",5,2,[(2,Contributor("jiahao","https://github.com/jiahao")),(131,Contributor("quinnj","https://github.com/quinnj"))])

In [13]:
# Calculate commit stats per contributor across all packages in `pkg_info`
commit_counts = Dict{String,Int}()  # username => total commits
pkg_counts    = Dict{String,Int}()  # username => number of packages touched

for pkg in values(pkg_info)
    # Each contributor entry is a (commit_count, Contributor) tuple
    for (commits, c) in pkg.contributors
        commit_counts[c.username] = get(commit_counts, c.username, 0) + commits
        pkg_counts[c.username] = get(pkg_counts, c.username, 0) + 1
    end
end

# Turn dicts into (num,username) vectors sorted in descending order
total_pkgs = sort([(n, user) for (user, n) in pkg_counts], rev=true)
total_coms = sort([(n, user) for (user, n) in commit_counts], rev=true)

# Print at most the top 20 of each ranking; the slices are clamped so that a
# data set with fewer than 20 contributors does not raise a BoundsError.
println("Number of packages contributed to")
for entry in total_pkgs[1:min(20, length(total_pkgs))]
    println(entry)
end

println("Number of commits across all packages")
for entry in total_coms[1:min(20, length(total_coms))]
    println(entry)
end


Number of packages contributed to
(51,"timholy")
(45,"johnmyleswhite")
(40,"kmsquire")
(37,"StefanKarpinski")
(35,"Keno")
(34,"lindahua")
(30,"simonster")
(29,"IainNZ")
(25,"mlubin")
(24,"staticfloat")
(24,"aviks")
(21,"vtjnash")
(20,"stevengj")
(20,"ihnorton")
(18,"quinnj")
(17,"tanmaykm")
(17,"dcjones")
(17,"carlobaldassi")
(16,"tkelman")
(16,"powerdistribution")
Number of commits across all packages
(1734,"lindahua")
(1427,"jakebolewski")
(1178,"timholy")
(893,"johnmyleswhite")
(821,"dcjones")
(788,"simonster")
(749,"mlubin")
(678,"milktrader")
(462,"stevengj")
(435,"dmbates")
(415,"nolta")
(402,"one-more-minute")
(398,"quinnj")
(397,"IainNZ")
(372,"joehuchette")
(353,"powerdistribution")
(350,"WestleyArgentum")
(340,"Keno")
(336,"scidom")
(330,"tanmaykm")

Package Ecosystem

MetadataTools has a dependency on Graphs to enable an analysis of how packages rely on each other.


In [14]:
using Graphs
# Get a directed graph where PkgA -> PkgB iff
# PkgA directly requires PkgB.
# Build it from the `pkgs` dictionary loaded earlier instead of calling
# get_all_pkg() again, so the graph and `pkgs` come from the same snapshot.
g = get_pkgs_dep_graph(pkgs)


Out[14]:
Directed Graph (418 vertices, 496 edges)

In [15]:
# Pull the dependency graph for Gadfly out of the full package graph `g`
gadfly_meta = get_pkg("Gadfly")
g_gadfly = get_pkg_dep_graph(gadfly_meta, g)


Out[15]:
Directed Graph (24 vertices, 36 edges)

To plot the dependency graph for a package, we can use my GraphLayout.jl package which uses Compose.jl internally for drawing. I haven't got around to adding Graphs.jl support to GraphLayout.jl just yet though...


In [18]:
using GraphLayout
# Draw the dependency graph of each of a few example packages
for target in ["Gadfly","QuantEcon","JuMP","Twitter"]
    # Dependency graph for this package, taken from the full graph `g`
    dep_graph = get_pkg_dep_graph(get_pkg(target), g)
    # GraphLayout works on an adjacency matrix
    adj = adjacency_matrix(dep_graph)
    # Spring layout gives x and y coordinates for the vertices
    xs, ys = layout_spring_adj(adj)
    # Label each vertex with its package name
    vertex_labels = map(vertices(dep_graph)) do pm
        pm.name
    end
    # Render as an SVG
    draw_layout_adj(adj, xs, ys, labels=vertex_labels)
end


Distances PDMats ArrayViews Reexport SortingAlgorithms GZip DataArrays ImmutableArrays FixedPointNumbers StatsBase Loess KernelDensity JSON Iterators Hexagons Distributions Datetime DataStructures DataFrames Contour Compose Color Codecs Gadfly
JSON SHA URIParser Zlib Homebrew BinDeps Reexport Polynomials DualNumbers Calculus Options StatsBase PDMats ArrayViews DataStructures FactCheck MAT HDF5 DSP Optim Grid Distributions QuantEcon
DataStructures Calculus Graphs DualNumbers ReverseDiffSparse MathProgBase JuMP
LibCURL ArrayViews Zlib LibExpat HTTPClient URLParse SHA Reexport SortingAlgorithms GZip StatsBase DataArrays WinRPM Homebrew BinDeps GnuTLS URIParser HttpParser Dates DataFrames Nettle JSON Requests HttpCommon Codecs Twitter

We can also look at which packages depend on the most packages


In [17]:
# Pair the vertex count of each package's dependency graph with the package
# name, then order the pairs largest-first.
num_pkg_req = sort(
    [(num_vertices(get_pkg_dep_graph(p, g)), p.name) for p in values(pkgs)],
    rev=true)

println("Top 10 packages by number of packages depended on:")
for rank in 1:10
    nverts, name = num_pkg_req[rank]
    # One is subtracted from the vertex count - presumably because the graph
    # includes the package itself
    println(rpad(name,20," "), nverts-1)
end


Top 10 packages by number of packages depended on:
RobustStats         30
MachineLearning     30
Quandl              26
Twitter             25
Lumira              24
Gadfly              23
QuantEcon           22
ProfileView         22
ImageView           21
Etcd                21

We can also reverse the graph - now an arc from PkgA to PkgB means PkgB requires PkgA


In [19]:
# Same package graph with every arc reversed
g_rev = get_pkgs_dep_graph(pkgs, reverse=true)

# Pair the vertex count of each package's graph within `g_rev` with the
# package name, then order the pairs largest-first.
num_pkg_req = sort(
    [(num_vertices(get_pkg_dep_graph(p, g_rev)), p.name) for p in values(pkgs)],
    rev=true)

println("Top 10 packages by number of packages that depend on them:")
for rank in 1:10
    nverts, name = num_pkg_req[rank]
    # One is subtracted from the vertex count - presumably because the graph
    # includes the package itself
    println(rpad(name,20," "), nverts-1)
end


Top 10 packages by number of packages that depend on them:
URIParser           89
SHA                 88
BinDeps             87
ArrayViews          76
JSON                71
StatsBase           66
Homebrew            58
Zlib                49
URLParse            40
Reexport            40

In [ ]: