In [2]:
    
import pandas as pd
import numpy as np
from ggplot import *
import time
import datetime as dt
import random
import scipy as sc
import scipy.stats as ss
import statsmodels.api as smapi
import statsmodels.graphics as smgraphics 
from statsmodels.formula.api import ols
from scipy.optimize import curve_fit
from scipy.interpolate import interp1d
import json
    
In [4]:
    
with open ('data/deps-size-all.ldjson', "r") as datafile:
    text = '[%s]' % ',\n'.join(datafile.read().strip().split('\n'))
    d_packages = pd.read_json(text)
d_packages.columns
    
    
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-f2f55c1e4451> in <module>()
      2 with open ('data/deps-size-all.ldjson', "r") as datafile:
      3     text = '[%s]' % ',\n'.join(datafile.read().strip().split('\n'))
----> 4     d_packages = pd.read_json(text)
      5 
      6 d_packages.columns
/Users/anand/data/npm/env/lib/python2.7/site-packages/pandas/io/json.pyc in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit)
    196         obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
    197                           keep_default_dates, numpy, precise_float,
--> 198                           date_unit).parse()
    199 
    200     if typ == 'series' or obj is None:
/Users/anand/data/npm/env/lib/python2.7/site-packages/pandas/io/json.pyc in parse(self)
    264 
    265         else:
--> 266             self._parse_no_numpy()
    267 
    268         if self.obj is None:
/Users/anand/data/npm/env/lib/python2.7/site-packages/pandas/io/json.pyc in _parse_no_numpy(self)
    481         if orient == "columns":
    482             self.obj = DataFrame(
--> 483                 loads(json, precise_float=self.precise_float), dtype=None)
    484         elif orient == "split":
    485             decoded = dict((str(k), v)
ValueError: Expected object or value
In [26]:
    
d_packages['num_dependencies'] = d_packages['dependencies'].map(len)
d_packages['num_devdependencies'] = d_packages['devDependencies'].map(len)
    
In [35]:
    
d_packages.describe()
    
    Out[35]:
  
    
       
      latest 
      size 
      num_dependencies 
      num_devdependencies 
     
  
  
    
      count 
           5524 
           5504.000000 
       5524.000000 
       5524.000000 
     
    
      mean 
       0.160572 
         331497.266897 
          4.238595 
          2.807748 
     
    
      std 
       0.367169 
        2109101.184107 
          6.067839 
          3.698237 
     
    
      min 
          False 
            165.000000 
          0.000000 
          0.000000 
     
    
      25% 
              0 
           4507.750000 
          0.000000 
          0.000000 
     
    
      50% 
              0 
          14104.500000 
          2.000000 
          1.000000 
     
    
      75% 
              0 
          83817.250000 
          5.000000 
          4.000000 
     
    
      max 
           True 
       27056268.000000 
         32.000000 
         28.000000 
     
  
In [36]:
    
d_packages[d_packages['size'] > d_packages['size'].quantile(.9)][d_packages['latest']]
    
    Out[36]:
  
    
       
      dependencies 
      description 
      devDependencies 
      latest 
      name 
      size 
      version 
      num_dependencies 
      num_devdependencies 
     
  
  
    
      28   
       [ejs, grunt, grunt-contrib-clean, grunt-contri... 
                            1:1 ticket and laptop system 
                                                 [mocha] 
       True 
       1-1-help-desk-system 
         306666 
                 0.0.7 
       18 
        1 
     
    
      52   
       [qs, debug, mime, gm, ws, musicmetadata, mmmag... 
       10er10 is an HTML5 audio jukebox. It works on ... 
                                                      [] 
       True 
                     10er10 
       22254752 
                0.23.0 
       11 
        0 
     
    
      62   
       [express, mongoskin, mustache, jade, sugar, va... 
                           CRUD over express and mongodb 
                                                      [] 
       True 
                      10tcl 
        1393901 
                0.0.10 
        8 
        0 
     
    
      142  
       [express, static-favicon, morgan, cookie-parse... 
                                                    None 
                                                      [] 
       True 
                    2048.io 
         330707 
                 0.0.1 
        7 
        0 
     
    
      147  
                                                      [] 
       A node.js module for querying genome data in t... 
                                                      [] 
       True 
                    23query 
        8261189 
                 0.0.1 
        0 
        0 
     
    
      334  
                                                      [] 
                                                3kenizer 
                                          [mocha, jamjs] 
       True 
                   3kenizer 
        1347501 
                 0.1.2 
        0 
        2 
     
    
      795  
       [async, azure, bcrypt-nodejs, cloudinary, conn... 
       47Pages fork of the Node.js CMS: Web Applicati... 
       [browserify, chalk, fs-extra, gulp, gulp-cover... 
       True 
           47pages-keystone 
        1729805 
                 0.0.5 
       23 
       21 
     
    
      906  
                                  [promise, request, ws] 
                                  Node.js module for 6px 
                                                      [] 
       True 
                        6px 
         477885 
                 0.1.6 
        3 
        0 
     
    
      1058 
       [acorn-6to5, ast-types, chokidar, commander, e... 
       Turn ES6 code into readable vanilla ES5 with s... 
       [browserify, chai, istanbul, jshint, jshint-st... 
       True 
                       6to5 
         469274 
                1.15.0 
       15 
        9 
     
    
      1175 
                                               [shelljs] 
                          An all-in-one package for 7zip 
                                                      [] 
       True 
                       7zjs 
         532583 
                 1.0.1 
        1 
        0 
     
    
      1252 
                                  [underscore, backbone] 
       One Collection different models, mapped easy v... 
                                                 [grunt] 
       True 
            Backbone.Chosen 
        7479381 
                 0.1.1 
        2 
        1 
     
    
      1404 
                                         [coffee-script] 
       Hierarchical time-series axis for charts with ... 
                                         [coffeedoctest] 
       True 
                  ChartTime 
         332943 
                 0.1.0 
        1 
        1 
     
    
      1562 
       [underscore, underscore-data, simple-mime, coo... 
                                       Simple middleware 
                                                      [] 
       True 
                          F 
        6403985 
                 0.0.4 
       10 
        0 
     
    
      1580 
                                                      [] 
        Generate massive amounts of fake contextual data 
       [jshint, istanbul, mocha, node-minify, optimis... 
       True 
                      Faker 
         277301 
                 0.7.2 
        0 
        6 
     
    
      1622 
       [async, coffee-script, datejs, discount, expre... 
                                            Friggeri.net 
                                                      [] 
       True 
               Friggeri.net 
        5048320 
                 0.2.6 
        8 
        0 
     
    
      1642 
                                  [azure, step, options] 
       Github hook for node.js apps hosted in Microso... 
                                         [mocha, should] 
       True 
                   GitAzure 
         407924 
                 0.0.7 
        3 
        2 
     
    
      1655 
                                          [tape, kdtree] 
       Convert GPS latitude/longitude pairs to US zip... 
                                                      [] 
       True 
                    Gps2zip 
         784590 
                 1.0.0 
        2 
        0 
     
    
      1667 
                                               [mongodb] 
                                       GridFS made easy. 
                                                      [] 
       True 
                     GridFS 
         273140 
                 0.3.0 
        1 
        0 
     
    
      1735 
       [nopt, generic-pool, iconv, ipaddr.js, semver,... 
                                 An SMTP Server project. 
                                              [nodeunit] 
       True 
                     Haraka 
         325500 
                 2.5.0 
       14 
        1 
     
    
      1852 
                                                      [] 
                             CSS-like selectors for JSON 
                                                      [] 
       True 
                 JSONSelect 
         245760 
                 0.4.0 
        0 
        0 
     
    
      1890 
                                              [irrklang] 
                   a real-time multiuser audio sequencer 
                                                      [] 
       True 
                  JSONloops 
        2095966 
                 0.1.0 
        1 
        0 
     
    
      1977 
                                                      [] 
       A JavaScript implementation of a extendable, f... 
                                                      [] 
       True 
                        JSV 
        1031217 
                 4.0.2 
        0 
        0 
     
    
      2160 
                              [tztime, jsduckify, JSON2] 
       Illuminating the forest AND the trees in your ... 
       [coffee-script, coffeedoctest, uglify-js, node... 
       True 
                   Lumenize 
        1408413 
                 0.7.2 
        3 
       12 
     
    
      2178 
                        [express, nohm, node-native-zip] 
                       Ultra lightweight MVC for Node.js 
                                                      [] 
       True 
                Monorail.js 
         440742 
                 1.0.1 
        3 
        0 
     
    
      2242 
       [es5-shim, shelf.js, commander, smoosh, JSUS, ... 
       Powerful and versatile 100% javascript object ... 
                                 [docker, mocha, should] 
       True 
                       NDDB 
         229784 
                 0.9.8 
        7 
        3 
     
    
      2291 
                         [request, mime, nomnom, xml2js] 
                       Generates an NFO for a video file 
                                                      [] 
       True 
              NFO-Generator 
       27056268 
           1.0.0beta25 
        4 
        0 
     
    
      2354 
                           [debug, ffi, ref, ref-struct] 
                        The Node.js ⇆ Objective-C bridge 
       [libxmljs, dox, memwatch, highlight.js, jade, ... 
       True 
                    NodObjC 
         541861 
                 1.0.0 
        4 
        6 
     
    
      2362 
                     [express, socket.io, coffee-script] 
             Project to create an keynote node presenter 
                                                      [] 
       True 
                NodeKeynote 
         453519 
                 0.0.1 
        3 
        0 
     
    
      2564 
       [underscore, underscore.string, mongodb, redis... 
                                                         
       [stylus, less, uglify-js, pathfinder, async, m... 
       True 
                      Rajas 
         695798 
                 0.0.1 
        8 
       15 
     
    
      2653 
                                                [canvas] 
                    Charting and graphing node.js module 
                                                      [] 
       True 
                   TeeChart 
         390703 
                 0.1.4 
        1 
        0 
     
    
      2705 
                                                      [] 
                                                    None 
                                                      [] 
       True 
                    Thimble 
        1232864 
                 0.0.1 
        0 
        0 
     
    
      2828 
                                                   [tar] 
                              WordNet 3.1 Database files 
                                                      [] 
       True 
                       WNdb 
        9907442 
                 3.1.1 
        1 
        0 
     
    
      2837 
                                               [express] 
       A selectively caching proxy for repeatable cli... 
                                                      [] 
       True 
                  WebParrot 
        3434439 
                 0.0.3 
        1 
        0 
     
    
      2958 
                                     [cli-color, duplex] 
                                   A short node workshop 
                                                      [] 
       True 
            a-taste-of-node 
         819299 
                 2.1.0 
        2 
        0 
     
    
      2999 
                           [marked, mdash, highlight.js] 
                            HTML Academy Developer Tools 
                                                      [] 
       True 
                         a2 
        9147496 
                 0.0.1 
        3 
        0 
     
    
      3154 
                                                      [] 
                               Jots and tittles and JSON 
       [express, mounce-koine-dictionary, lexham-engl... 
       True 
       aaronshaf-bible-data 
        3134412 
                 2.2.1 
        0 
        4 
     
    
      3174 
                                                      [] 
       AngularJS A/B Test Service and Directives for ... 
       [gulp, gulp-complexity, gulp-jshint, gulp-ngmi... 
       True 
                    ab-test 
         769976 
                 0.0.1 
        0 
       13 
     
    
      3619 
                            [xml2js, request, commander] 
       Unofficial node module for calling ABBYY's OCR... 
                                                 [mocha] 
       True 
                  abbyy-ocr 
         418410 
                 0.0.4 
        3 
        1 
     
    
      3645 
       [lodash, express, socket.io, async, dargs, mkd... 
                                 Core module for abc-web 
                                                      [] 
       True 
               abc-web-core 
        4815543 
                 0.0.5 
        9 
        0 
     
    
      3662 
                                                  [glob] 
       A Protractor JS pre-processor to load ABE Spec... 
       [abe-spec, bower, grunt, grunt-cli, grunt-cont... 
       True 
             abe-protractor 
         418235 
                 0.1.3 
        1 
       16 
     
    
      4152 
                                                 [level] 
                                            Autocomplete 
                      [chai, coveralls, istanbul, mocha] 
       True 
                         ac 
        3770564 
                 0.0.2 
        1 
        4 
     
    
      4547 
       [split, sext, minimist, through, mysql, csv-st... 
                               php light FE/BE framework 
                                                  [tape] 
       True 
                       aced 
        1229268 
                0.0.12 
       10 
        1 
     
    
      4579 
                   [coffee-script, async, mongoskin, j3] 
                         ERROR: No README.md file found! 
                                              [packflow] 
       True 
                        acf 
         239597 
                 0.2.1 
        4 
        1 
     
    
      4916 
                                                      [] 
                                       ECMAScript parser 
                             [regenerate, unicode-7.0.0] 
       True 
                      acorn 
         422735 
                0.10.0 
        0 
        2 
     
    
      4930 
                                                      [] 
                                 Acorn fork used by 6to5 
                             [regenerate, unicode-7.0.0] 
       True 
                 acorn-6to5 
         729655 
              0.9.1-14 
        0 
        2 
     
    
      4932 
                                                      [] 
                                       ECMAScript parser 
                             [regenerate, unicode-7.0.0] 
       True 
        acorn-bind-operator 
         416746 
                 0.7.1 
        0 
        2 
     
    
      4956 
                                                      [] 
                            Alternative React JSX parser 
                             [regenerate, unicode-7.0.0] 
       True 
                  acorn-jsx 
         690564 
               0.9.1-7 
        0 
        2 
     
    
      4962 
                                                      [] 
                                       ECMAScript parser 
                                                      [] 
       True 
            acorn-semicolon 
         252206 
       0.1.1-semicolon 
        0 
        0 
     
    
      4968 
                     [concat-stream, fpcalc, hyperquest] 
            Get music metadata from AcoustID Web Service 
                                                  [tape] 
       True 
                   acoustid 
        3093882 
                 1.2.1 
        3 
        1 
     
    
      5114 
             [express, socket.io, underscore, node-uuid] 
                         Use Socket.io to drive CasperJS 
               [mocha, expect.js, cucumber, http-server] 
       True 
       action-at-a-distance 
        1703754 
                1.0.20 
        4 
        4 
     
    
      5167 
       [grunt, browser_fingerprint, node-resque, asyn... 
       actionhero.js is a multi-transport API Server ... 
         [mocha, should, request, redis-sentinel-client] 
       True 
                 actionhero 
         230400 
                 9.4.1 
       15 
        4 
     
    
      5226 
                                                      [] 
                      Define your Class with JavaScript. 
       [grunt, grunt-contrib-clean, grunt-contrib-con... 
       True 
               active-class 
         268797 
                 0.0.1 
        0 
       18 
     
    
      5260 
       [cli, fs-extra, showdown, underscore, undersco... 
       A tool for generating reactive documents from ... 
       [backbone, browserify, codemirror, coffee-scri... 
       True 
            active-markdown 
         473986 
                 0.3.2 
        5 
       11 
     
    
      5289 
                            [any-db, mysql, pg, sqlite3] 
                 Rails ActiveRecord inspired for Nodejs. 
                            [any-db, mysql, pg, sqlite3] 
       True 
              active_record 
         275688 
                 0.1.4 
        4 
        4 
     
    
      5502 
                                                      [] 
           Functions to create, process and test objects 
       [grunt, grunt-contrib-jshint, grunt-mocha-cli,... 
       True 
                       adam 
         562403 
                 0.2.0 
        0 
        9 
     
    
      5517 
                                             [bootstrap] 
             Base css and assets for use across projects 
                                                      [] 
       True 
        adamrneary-base-css 
        9876362 
                 0.0.1 
        1 
        0 
     
    
      5520 
       [async, colors, glob, iconv-lite, lazy, rimraf... 
       A light weight native JavaScript implementatio... 
                                              [nodeunit] 
       True 
          adamvr-geoip-lite 
       19006605 
                 1.2.0 
        7 
        1 
     
  
In [21]:
    
# distribution of sizes
hist_size = ggplot(aes(x='size'), data=d_packages) + geom_histogram()
hist_size.draw()
    
    Out[21]:
 
In [18]:
    
# distribution of dependency count
hist_deps = ggplot(aes(x=d_packages.dependencies.map(len)), data=d_packages) + geom_histogram() + xlab('dependencies')
hist_deps.draw()
    
    Out[18]:
 
In [37]:
    
# distribution of devDependency count
hist_devdeps = ggplot(aes(x='size'), data=d_packages) + geom_histogram() + xlab('dev dependencies')
hist_devdeps.draw()
    
    Out[37]:
 
In [41]:
    
scat = ggplot(aes('num_dependencies','num_devdependencies'), data=d_packages) + geom_point()
scat.draw()
    
    Out[41]:
 
In [ ]:
    
# remove some outliers
subset = d_packages
print( len(subset)/float(len(d_packages)))
    
In [5]:
    
subset.columns
    
    Out[5]:
Index([u'id', u'latest_version', u'version_major', u'version_minor', u'version_patch', u'created', u'modified', u'maintainer_count', u'version_count', u'keywords', u'createdAt', u'modifiedAt', u'deltaSinceModified', u'age', u'delta', u'log_delta', u'deltaSinceModifiedDays'], dtype='object')
In [12]:
    
    
Content source: anandthakker/npm-data-exploration
Similar notebooks: