In [2]:
import pandas as pd
import numpy as np
from ggplot import *
import time
import datetime as dt
import random
import scipy as sc
import scipy.stats as ss
import statsmodels.api as smapi
import statsmodels.graphics as smgraphics 
from statsmodels.formula.api import ols
from scipy.optimize import curve_fit
from scipy.interpolate import interp1d
import json

Load data


In [4]:
with open ('data/deps-size-all.ldjson', "r") as datafile:
    text = '[%s]' % ',\n'.join(datafile.read().strip().split('\n'))
    d_packages = pd.read_json(text)

d_packages.columns


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-f2f55c1e4451> in <module>()
      2 with open ('data/deps-size-all.ldjson', "r") as datafile:
      3     text = '[%s]' % ',\n'.join(datafile.read().strip().split('\n'))
----> 4     d_packages = pd.read_json(text)
      5 
      6 d_packages.columns

/Users/anand/data/npm/env/lib/python2.7/site-packages/pandas/io/json.pyc in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit)
    196         obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
    197                           keep_default_dates, numpy, precise_float,
--> 198                           date_unit).parse()
    199 
    200     if typ == 'series' or obj is None:

/Users/anand/data/npm/env/lib/python2.7/site-packages/pandas/io/json.pyc in parse(self)
    264 
    265         else:
--> 266             self._parse_no_numpy()
    267 
    268         if self.obj is None:

/Users/anand/data/npm/env/lib/python2.7/site-packages/pandas/io/json.pyc in _parse_no_numpy(self)
    481         if orient == "columns":
    482             self.obj = DataFrame(
--> 483                 loads(json, precise_float=self.precise_float), dtype=None)
    484         elif orient == "split":
    485             decoded = dict((str(k), v)

ValueError: Expected object or value

Calculated Columns


In [26]:
d_packages['num_dependencies'] = d_packages['dependencies'].map(len)
d_packages['num_devdependencies'] = d_packages['devDependencies'].map(len)

First Look


In [35]:
d_packages.describe()


Out[35]:
latest size num_dependencies num_devdependencies
count 5524 5504.000000 5524.000000 5524.000000
mean 0.160572 331497.266897 4.238595 2.807748
std 0.367169 2109101.184107 6.067839 3.698237
min False 165.000000 0.000000 0.000000
25% 0 4507.750000 0.000000 0.000000
50% 0 14104.500000 2.000000 1.000000
75% 0 83817.250000 5.000000 4.000000
max True 27056268.000000 32.000000 28.000000

In [36]:
d_packages[d_packages['size'] > d_packages['size'].quantile(.9)][d_packages['latest']]


Out[36]:
dependencies description devDependencies latest name size version num_dependencies num_devdependencies
28 [ejs, grunt, grunt-contrib-clean, grunt-contri... 1:1 ticket and laptop system [mocha] True 1-1-help-desk-system 306666 0.0.7 18 1
52 [qs, debug, mime, gm, ws, musicmetadata, mmmag... 10er10 is an HTML5 audio jukebox. It works on ... [] True 10er10 22254752 0.23.0 11 0
62 [express, mongoskin, mustache, jade, sugar, va... CRUD over express and mongodb [] True 10tcl 1393901 0.0.10 8 0
142 [express, static-favicon, morgan, cookie-parse... None [] True 2048.io 330707 0.0.1 7 0
147 [] A node.js module for querying genome data in t... [] True 23query 8261189 0.0.1 0 0
334 [] 3kenizer [mocha, jamjs] True 3kenizer 1347501 0.1.2 0 2
795 [async, azure, bcrypt-nodejs, cloudinary, conn... 47Pages fork of the Node.js CMS: Web Applicati... [browserify, chalk, fs-extra, gulp, gulp-cover... True 47pages-keystone 1729805 0.0.5 23 21
906 [promise, request, ws] Node.js module for 6px [] True 6px 477885 0.1.6 3 0
1058 [acorn-6to5, ast-types, chokidar, commander, e... Turn ES6 code into readable vanilla ES5 with s... [browserify, chai, istanbul, jshint, jshint-st... True 6to5 469274 1.15.0 15 9
1175 [shelljs] An all-in-one package for 7zip [] True 7zjs 532583 1.0.1 1 0
1252 [underscore, backbone] One Collection different models, mapped easy v... [grunt] True Backbone.Chosen 7479381 0.1.1 2 1
1404 [coffee-script] Hierarchical time-series axis for charts with ... [coffeedoctest] True ChartTime 332943 0.1.0 1 1
1562 [underscore, underscore-data, simple-mime, coo... Simple middleware [] True F 6403985 0.0.4 10 0
1580 [] Generate massive amounts of fake contextual data [jshint, istanbul, mocha, node-minify, optimis... True Faker 277301 0.7.2 0 6
1622 [async, coffee-script, datejs, discount, expre... Friggeri.net [] True Friggeri.net 5048320 0.2.6 8 0
1642 [azure, step, options] Github hook for node.js apps hosted in Microso... [mocha, should] True GitAzure 407924 0.0.7 3 2
1655 [tape, kdtree] Convert GPS latitude/longitude pairs to US zip... [] True Gps2zip 784590 1.0.0 2 0
1667 [mongodb] GridFS made easy. [] True GridFS 273140 0.3.0 1 0
1735 [nopt, generic-pool, iconv, ipaddr.js, semver,... An SMTP Server project. [nodeunit] True Haraka 325500 2.5.0 14 1
1852 [] CSS-like selectors for JSON [] True JSONSelect 245760 0.4.0 0 0
1890 [irrklang] a real-time multiuser audio sequencer [] True JSONloops 2095966 0.1.0 1 0
1977 [] A JavaScript implementation of a extendable, f... [] True JSV 1031217 4.0.2 0 0
2160 [tztime, jsduckify, JSON2] Illuminating the forest AND the trees in your ... [coffee-script, coffeedoctest, uglify-js, node... True Lumenize 1408413 0.7.2 3 12
2178 [express, nohm, node-native-zip] Ultra lightweight MVC for Node.js [] True Monorail.js 440742 1.0.1 3 0
2242 [es5-shim, shelf.js, commander, smoosh, JSUS, ... Powerful and versatile 100% javascript object ... [docker, mocha, should] True NDDB 229784 0.9.8 7 3
2291 [request, mime, nomnom, xml2js] Generates an NFO for a video file [] True NFO-Generator 27056268 1.0.0beta25 4 0
2354 [debug, ffi, ref, ref-struct] The Node.js ⇆ Objective-C bridge [libxmljs, dox, memwatch, highlight.js, jade, ... True NodObjC 541861 1.0.0 4 6
2362 [express, socket.io, coffee-script] Project to create an keynote node presenter [] True NodeKeynote 453519 0.0.1 3 0
2564 [underscore, underscore.string, mongodb, redis... [stylus, less, uglify-js, pathfinder, async, m... True Rajas 695798 0.0.1 8 15
2653 [canvas] Charting and graphing node.js module [] True TeeChart 390703 0.1.4 1 0
2705 [] None [] True Thimble 1232864 0.0.1 0 0
2828 [tar] WordNet 3.1 Database files [] True WNdb 9907442 3.1.1 1 0
2837 [express] A selectively caching proxy for repeatable cli... [] True WebParrot 3434439 0.0.3 1 0
2958 [cli-color, duplex] A short node workshop [] True a-taste-of-node 819299 2.1.0 2 0
2999 [marked, mdash, highlight.js] HTML Academy Developer Tools [] True a2 9147496 0.0.1 3 0
3154 [] Jots and tittles and JSON [express, mounce-koine-dictionary, lexham-engl... True aaronshaf-bible-data 3134412 2.2.1 0 4
3174 [] AngularJS A/B Test Service and Directives for ... [gulp, gulp-complexity, gulp-jshint, gulp-ngmi... True ab-test 769976 0.0.1 0 13
3619 [xml2js, request, commander] Unofficial node module for calling ABBYY's OCR... [mocha] True abbyy-ocr 418410 0.0.4 3 1
3645 [lodash, express, socket.io, async, dargs, mkd... Core module for abc-web [] True abc-web-core 4815543 0.0.5 9 0
3662 [glob] A Protractor JS pre-processor to load ABE Spec... [abe-spec, bower, grunt, grunt-cli, grunt-cont... True abe-protractor 418235 0.1.3 1 16
4152 [level] Autocomplete [chai, coveralls, istanbul, mocha] True ac 3770564 0.0.2 1 4
4547 [split, sext, minimist, through, mysql, csv-st... php light FE/BE framework [tape] True aced 1229268 0.0.12 10 1
4579 [coffee-script, async, mongoskin, j3] ERROR: No README.md file found! [packflow] True acf 239597 0.2.1 4 1
4916 [] ECMAScript parser [regenerate, unicode-7.0.0] True acorn 422735 0.10.0 0 2
4930 [] Acorn fork used by 6to5 [regenerate, unicode-7.0.0] True acorn-6to5 729655 0.9.1-14 0 2
4932 [] ECMAScript parser [regenerate, unicode-7.0.0] True acorn-bind-operator 416746 0.7.1 0 2
4956 [] Alternative React JSX parser [regenerate, unicode-7.0.0] True acorn-jsx 690564 0.9.1-7 0 2
4962 [] ECMAScript parser [] True acorn-semicolon 252206 0.1.1-semicolon 0 0
4968 [concat-stream, fpcalc, hyperquest] Get music metadata from AcoustID Web Service [tape] True acoustid 3093882 1.2.1 3 1
5114 [express, socket.io, underscore, node-uuid] Use Socket.io to drive CasperJS [mocha, expect.js, cucumber, http-server] True action-at-a-distance 1703754 1.0.20 4 4
5167 [grunt, browser_fingerprint, node-resque, asyn... actionhero.js is a multi-transport API Server ... [mocha, should, request, redis-sentinel-client] True actionhero 230400 9.4.1 15 4
5226 [] Define your Class with JavaScript. [grunt, grunt-contrib-clean, grunt-contrib-con... True active-class 268797 0.0.1 0 18
5260 [cli, fs-extra, showdown, underscore, undersco... A tool for generating reactive documents from ... [backbone, browserify, codemirror, coffee-scri... True active-markdown 473986 0.3.2 5 11
5289 [any-db, mysql, pg, sqlite3] Rails ActiveRecord inspired for Nodejs. [any-db, mysql, pg, sqlite3] True active_record 275688 0.1.4 4 4
5502 [] Functions to create, process and test objects [grunt, grunt-contrib-jshint, grunt-mocha-cli,... True adam 562403 0.2.0 0 9
5517 [bootstrap] Base css and assets for use across projects [] True adamrneary-base-css 9876362 0.0.1 1 0
5520 [async, colors, glob, iconv-lite, lazy, rimraf... A light weight native JavaScript implementatio... [nodeunit] True adamvr-geoip-lite 19006605 1.2.0 7 1

In [21]:
# distribution of sizes
hist_size = ggplot(aes(x='size'), data=d_packages) + geom_histogram()
hist_size.draw()


Out[21]:

In [18]:
# distribution of dependency count
hist_deps = ggplot(aes(x=d_packages.dependencies.map(len)), data=d_packages) + geom_histogram() + xlab('dependencies')
hist_deps.draw()


Out[18]:

In [37]:
# distribution of devDependency count
hist_devdeps = ggplot(aes(x='size'), data=d_packages) + geom_histogram() + xlab('dev dependencies')
hist_devdeps.draw()


Out[37]:

In [41]:
scat = ggplot(aes('num_dependencies','num_devdependencies'), data=d_packages) + geom_point()
scat.draw()


Out[41]:

Remove outliers


In [ ]:
# remove some outliers
subset = d_packages


print( len(subset)/float(len(d_packages)))

In [5]:
subset.columns


Out[5]:
Index([u'id', u'latest_version', u'version_major', u'version_minor', u'version_patch', u'created', u'modified', u'maintainer_count', u'version_count', u'keywords', u'createdAt', u'modifiedAt', u'deltaSinceModified', u'age', u'delta', u'log_delta', u'deltaSinceModifiedDays'], dtype='object')

Questions


In [12]: