In [4]:
# Show the kernel's working directory; the relative data path built in the next
# cell is resolved against it.
%pwd


Out[4]:
u'C:\\ARGO\\talks\\split-apply-combine'

In [5]:
import os
import gzip
import json
import yaml
import pandas as pd
import numpy as np
import blaze as bz
print(bz.__version__)

# Location of the gzipped event file, relative to the working directory.
# Built with os.path.join so the separator is right on every platform
# (on Windows this still yields 'data\github_archive').
directory = os.path.join('data', 'github_archive')
filename = '2015-01-01-0.json.gz'
path = os.path.join(directory, filename)

# Pretty-print the first record as YAML to see what a single event looks like.
with gzip.open(path) as f:
    # The line is parsed with the YAML loader. safe_load is used because the
    # archive is external data and plain yaml.load may construct arbitrary
    # Python objects from tagged input.
    print(yaml.dump(yaml.safe_load(f.readline()), default_flow_style=False))


0.8.3
actor:
  avatar_url: https://avatars.githubusercontent.com/u/9152315?
  gravatar_id: ''
  id: 9152315
  login: davidjhulse
  url: https://api.github.com/users/davidjhulse
created_at: '2015-01-01T00:00:00Z'
id: '2489368070'
payload:
  before: 86ffa724b4d70fce46e760f8cc080f5ec3d7d85f
  commits:
  - author:
      email: david.hulse@live.com
      name: davidjhulse
    distinct: true
    message: 'Altered BingBot.jar


      Fixed issue with multiple account support'
    sha: a9b22a6d80c1e0bb49c1cf75a3c075b642c28f81
    url: https://api.github.com/repos/davidjhulse/davesbingrewardsbot/commits/a9b22a6d80c1e0bb49c1cf75a3c075b642c28f81
  distinct_size: 1
  head: a9b22a6d80c1e0bb49c1cf75a3c075b642c28f81
  push_id: 536740396
  ref: refs/heads/master
  size: 1
public: true
repo:
  id: 28635890
  name: davidjhulse/davesbingrewardsbot
  url: https://api.github.com/repos/davidjhulse/davesbingrewardsbot
type: PushEvent


In [6]:
from collections import Counter
import ujson
# Tally how many events of each type the archive contains, decoding one line
# at a time so the whole file never has to be held in memory.
counts = Counter()
with gzip.open(path) as f:
    # str() turns the decoded type name into a plain string before counting.
    counts.update(str(ujson.loads(raw_line)['type']) for raw_line in f)
print(yaml.dump(dict(counts), default_flow_style=False))


CommitCommentEvent: 56
CreateEvent: 815
DeleteEvent: 141
ForkEvent: 213
GollumEvent: 90
IssueCommentEvent: 650
IssuesEvent: 373
MemberEvent: 16
PublicEvent: 2
PullRequestEvent: 315
PullRequestReviewCommentEvent: 85
PushEvent: 4280
ReleaseEvent: 24
WatchEvent: 642


In [7]:
def _events_of_type(all_events, event_type):
    """Return the events whose 'type' field equals event_type, in file order."""
    return [ev for ev in all_events if ev['type'] == event_type]


# Decode every line of the archive into a dict and keep them all in memory.
with gzip.open(path) as f:
    events = [ujson.loads(line) for line in f]

# Per-type subsets reused by the cells below.
creates = _events_of_type(events, 'CreateEvent')
pushes = _events_of_type(events, 'PushEvent')
pull_requests = _events_of_type(events, 'PullRequestEvent')

# Sizes, in the order: all events, creates, pushes, pull requests.
for subset in (events, creates, pushes, pull_requests):
    print(len(subset))


7702
815
4280
315

In [8]:
# Ten most frequent pull-request authors, then the ten repositories that appear
# most often in pull-request events. most_common(10) selects the top ten
# directly instead of sorting every entry and slicing afterwards.
print(Counter(pr['actor']['login'] for pr in pull_requests).most_common(10))
print(Counter(pr['repo']['name'] for pr in pull_requests).most_common(10))


[(u'tridge', 5), (u'interactivellama', 4), (u'we555', 4), (u'fishulla', 4), (u'vegitron', 4), (u'tianon', 4), (u'mike-spainhower', 4), (u'rohanbk', 3), (u'mdamt', 3), (u'cspray', 3)]
[(u'selfhub/selfhub', 5), (u'tridge/cuav', 5), (u'docker-library/docs', 4), (u'eclipse/birt', 4), (u'Merchello/Merchello', 4), (u'TeamGabriel/gabriel', 4), (u'twbs/bootstrap', 4), (u'uw-it-aca/jira-hook', 4), (u'fishulla/Torque3D', 4), (u'we555/hello-world', 4)]

In [9]:
# Count events of any type per user login and show the ten busiest users.
# most_common(10) avoids sorting the full counter just to keep ten entries.
actors = Counter(event['actor']['login'] for event in events)
print(actors.most_common(10))


[(u'KenanSulayman', 79), (u'opencm', 66), (u'mirror-updates', 42), (u'cm-gerrit', 35), (u'westurner', 30), (u'qdm', 29), (u'NikhilSingh19', 27), (u'greatfire', 24), (u'ejoseca', 24), (u'tianon', 21)]

In [10]:
# Count events of any type per repository name and show the ten busiest repos.
# most_common(10) avoids sorting the full counter just to keep ten entries.
repos = Counter(event['repo']['name'] for event in events)
print(repos.most_common(10))


[(u'KenanSulayman/heartbeat', 79), (u'sakai-mirror/melete', 30), (u'qdm/qdm.github.io', 29), (u'ejoseca/great-validator', 24), (u'square/okhttp', 23), (u'westurner/sphinxcontrib-srclinks', 23), (u'captainkirkby/Gears', 21), (u'apache/spark', 21), (u'docker-library/docs', 21), (u'NikhilSingh19/NikhilSingh19.github.io', 20)]

In [11]:
# Peek at the payload dict of the first five CreateEvents.
for event in creates[:5]:
    payload = event['payload']
    print(payload)


{u'master_branch': u'master', u'ref_type': u'branch', u'ref': u'master', u'description': u'Making an HTML5 game from scratch.', u'pusher_type': u'user'}
{u'master_branch': u'master', u'ref_type': u'branch', u'ref': u'master2', u'description': u'Deploy and Manage Docker Container', u'pusher_type': u'user'}
{u'master_branch': u'master', u'ref_type': u'repository', u'ref': None, u'description': u'An ansible role for setting up Wordpress Nginx website', u'pusher_type': u'user'}
{u'master_branch': u'master', u'ref_type': u'tag', u'ref': u'0.0.184', u'description': None, u'pusher_type': u'user'}
{u'master_branch': u'master', u'ref_type': u'tag', u'ref': u'0.1.32', u'description': None, u'pusher_type': u'user'}

In [12]:
# Dump the first five events in full, indented for readability.
# NOTE(review): the dumped records include commit author names and e-mail
# addresses; consider clearing this output before sharing the notebook.
for event in events[:5]:
    pretty = json.dumps(event, indent=4)
    print(pretty)


{
    "payload": {
        "size": 1, 
        "head": "a9b22a6d80c1e0bb49c1cf75a3c075b642c28f81", 
        "commits": [
            {
                "distinct": true, 
                "sha": "a9b22a6d80c1e0bb49c1cf75a3c075b642c28f81", 
                "message": "Altered BingBot.jar\n\nFixed issue with multiple account support", 
                "url": "https://api.github.com/repos/davidjhulse/davesbingrewardsbot/commits/a9b22a6d80c1e0bb49c1cf75a3c075b642c28f81", 
                "author": {
                    "email": "david.hulse@live.com", 
                    "name": "davidjhulse"
                }
            }
        ], 
        "distinct_size": 1, 
        "push_id": 536740396, 
        "ref": "refs/heads/master", 
        "before": "86ffa724b4d70fce46e760f8cc080f5ec3d7d85f"
    }, 
    "created_at": "2015-01-01T00:00:00Z", 
    "actor": {
        "url": "https://api.github.com/users/davidjhulse", 
        "login": "davidjhulse", 
        "avatar_url": "https://avatars.githubusercontent.com/u/9152315?", 
        "id": 9152315, 
        "gravatar_id": ""
    }, 
    "id": "2489368070", 
    "repo": {
        "url": "https://api.github.com/repos/davidjhulse/davesbingrewardsbot", 
        "id": 28635890, 
        "name": "davidjhulse/davesbingrewardsbot"
    }, 
    "type": "PushEvent", 
    "public": true
}
{
    "payload": {
        "size": 1, 
        "head": "56688cc528224d40679b7e83c105b27367443a8c", 
        "commits": [
            {
                "distinct": true, 
                "sha": "56688cc528224d40679b7e83c105b27367443a8c", 
                "message": "Can be run from outside of bin. Fixed multi-line issue", 
                "url": "https://api.github.com/repos/jmoon018/rshell-unit-tester/commits/56688cc528224d40679b7e83c105b27367443a8c", 
                "author": {
                    "email": "jmoon018@ucr.edu", 
                    "name": "Jamal Moon"
                }
            }
        ], 
        "distinct_size": 1, 
        "push_id": 536740397, 
        "ref": "refs/heads/master", 
        "before": "d5a69a84e4d95c1c2cbc0a828172f9ce94db35c6"
    }, 
    "created_at": "2015-01-01T00:00:00Z", 
    "actor": {
        "url": "https://api.github.com/users/jmoon018", 
        "login": "jmoon018", 
        "avatar_url": "https://avatars.githubusercontent.com/u/5581438?", 
        "id": 5581438, 
        "gravatar_id": ""
    }, 
    "id": "2489368072", 
    "repo": {
        "url": "https://api.github.com/repos/jmoon018/rshell-unit-tester", 
        "id": 26392647, 
        "name": "jmoon018/rshell-unit-tester"
    }, 
    "type": "PushEvent", 
    "public": true
}
{
    "payload": {
        "master_branch": "master", 
        "ref_type": "branch", 
        "ref": "master", 
        "description": "Making an HTML5 game from scratch.", 
        "pusher_type": "user"
    }, 
    "created_at": "2015-01-01T00:00:01Z", 
    "actor": {
        "url": "https://api.github.com/users/christoferpeterson", 
        "login": "christoferpeterson", 
        "avatar_url": "https://avatars.githubusercontent.com/u/6352424?", 
        "id": 6352424, 
        "gravatar_id": ""
    }, 
    "id": "2489368089", 
    "repo": {
        "url": "https://api.github.com/repos/christoferpeterson/Vadek", 
        "id": 28677542, 
        "name": "christoferpeterson/Vadek"
    }, 
    "type": "CreateEvent", 
    "public": true
}
{
    "payload": {
        "size": 8, 
        "head": "bc9aa8a0c999232d2675cf5ba60e9c9d03f2a806", 
        "commits": [
            {
                "distinct": false, 
                "sha": "ec572201658b566373845657a8aca7bc6d6214a4", 
                "message": "Remove unused method.", 
                "url": "https://api.github.com/repos/square/okhttp/commits/ec572201658b566373845657a8aca7bc6d6214a4", 
                "author": {
                    "email": "jw@squareup.com", 
                    "name": "Jake Wharton"
                }
            }, 
            {
                "distinct": false, 
                "sha": "bfa25f05bd5786c990fccb513f56375a4f98d95b", 
                "message": "Merge pull request #1262 from square/jw/unused\n\nRemove unused method.", 
                "url": "https://api.github.com/repos/square/okhttp/commits/bfa25f05bd5786c990fccb513f56375a4f98d95b", 
                "author": {
                    "email": "jesse@swank.ca", 
                    "name": "Jesse Wilson"
                }
            }, 
            {
                "distinct": false, 
                "sha": "6909923b89d98fef39600a170aa50e566dbb730a", 
                "message": "Update CHANGELOG.md", 
                "url": "https://api.github.com/repos/square/okhttp/commits/6909923b89d98fef39600a170aa50e566dbb730a", 
                "author": {
                    "email": "christian.becker.1987@gmail.com", 
                    "name": "Christian Becker"
                }
            }, 
            {
                "distinct": false, 
                "sha": "2ca6dfd055d480a4ef3520af39ad4b7ba02d5491", 
                "message": "Merge pull request #1263 from ChristianBecker/patch-1\n\nUpdate CHANGELOG.md", 
                "url": "https://api.github.com/repos/square/okhttp/commits/2ca6dfd055d480a4ef3520af39ad4b7ba02d5491", 
                "author": {
                    "email": "jesse@swank.ca", 
                    "name": "Jesse Wilson"
                }
            }, 
            {
                "distinct": false, 
                "sha": "5527a1764d973d293faffe31faddf205e3f37641", 
                "message": "We now fall back to TLS 1.0.", 
                "url": "https://api.github.com/repos/square/okhttp/commits/5527a1764d973d293faffe31faddf205e3f37641", 
                "author": {
                    "email": "jwilson@squareup.com", 
                    "name": "Jesse Wilson"
                }
            }, 
            {
                "distinct": false, 
                "sha": "31ec212f84edbe04b9b705f521ed7dc3bd7ddc3a", 
                "message": "Skip buffer allocation when not needed.", 
                "url": "https://api.github.com/repos/square/okhttp/commits/31ec212f84edbe04b9b705f521ed7dc3bd7ddc3a", 
                "author": {
                    "email": "jw@squareup.com", 
                    "name": "Jake Wharton"
                }
            }, 
            {
                "distinct": false, 
                "sha": "7756c4e4fe1cd703cff07d218da2fecb30f916c6", 
                "message": "Merge pull request #1264 from square/jw/kill-two-allocations\n\nSkip buffer allocation when not needed.", 
                "url": "https://api.github.com/repos/square/okhttp/commits/7756c4e4fe1cd703cff07d218da2fecb30f916c6", 
                "author": {
                    "email": "jesse@swank.ca", 
                    "name": "Jesse Wilson"
                }
            }, 
            {
                "distinct": true, 
                "sha": "bc9aa8a0c999232d2675cf5ba60e9c9d03f2a806", 
                "message": "Add a web socket call concept for connecting.\n\nSimilar to HTTP and Call, the WebSocketCall is a representation of a pending HTTP request and subsequent upgrade to speak web sockets. Upon synchronous execution you are handed a WebSocket instance for synchronous writing and also pass in a WebSocketListener for async callbacks due to reading.\n\nThe API changes in this commits also generalize WebSocket such that it's agnostic to being a client or server peer.", 
                "url": "https://api.github.com/repos/square/okhttp/commits/bc9aa8a0c999232d2675cf5ba60e9c9d03f2a806", 
                "author": {
                    "email": "jw@squareup.com", 
                    "name": "Jake Wharton"
                }
            }
        ], 
        "distinct_size": 1, 
        "push_id": 536740405, 
        "ref": "refs/heads/jw/websocket-call", 
        "before": "8590ddf9fe54fe27d24f25dd66729993e63a5d61"
    }, 
    "created_at": "2015-01-01T00:00:01Z", 
    "actor": {
        "url": "https://api.github.com/users/JakeWharton", 
        "login": "JakeWharton", 
        "avatar_url": "https://avatars.githubusercontent.com/u/66577?", 
        "id": 66577, 
        "gravatar_id": ""
    }, 
    "id": "2489368095", 
    "repo": {
        "url": "https://api.github.com/repos/square/okhttp", 
        "id": 5152285, 
        "name": "square/okhttp"
    }, 
    "org": {
        "url": "https://api.github.com/orgs/square", 
        "login": "square", 
        "avatar_url": "https://avatars.githubusercontent.com/u/82592?", 
        "id": 82592, 
        "gravatar_id": ""
    }, 
    "type": "PushEvent", 
    "public": true
}
{
    "payload": {
        "size": 1, 
        "head": "97a402b54bfd562b8cefd11f1361cd5af41b0535", 
        "commits": [
            {
                "distinct": true, 
                "sha": "97a402b54bfd562b8cefd11f1361cd5af41b0535", 
                "message": "init", 
                "url": "https://api.github.com/repos/git4ruby/movie_review1/commits/97a402b54bfd562b8cefd11f1361cd5af41b0535", 
                "author": {
                    "email": "chandanamohit@gmail.com", 
                    "name": "Mohit"
                }
            }
        ], 
        "distinct_size": 1, 
        "push_id": 536740413, 
        "ref": "refs/heads/master", 
        "before": "bc6e2f2a7d2095f8881d44d737506c14c1e07561"
    }, 
    "created_at": "2015-01-01T00:00:03Z", 
    "actor": {
        "url": "https://api.github.com/users/git4ruby", 
        "login": "git4ruby", 
        "avatar_url": "https://avatars.githubusercontent.com/u/9221683?", 
        "id": 9221683, 
        "gravatar_id": ""
    }, 
    "id": "2489368104", 
    "repo": {
        "url": "https://api.github.com/repos/git4ruby/movie_review1", 
        "id": 28520835, 
        "name": "git4ruby/movie_review1"
    }, 
    "type": "PushEvent", 
    "public": true
}

In [13]:
# Collect the events whose raw text mentions "language" anywhere.
lang = []
with gzip.open(path) as f:
    for line in f:
        # Substring test on the undecoded line: it matches the word wherever it
        # occurs in the record, and lets non-matching lines skip JSON decoding.
        if "language" not in line:
            continue
        lang.append(ujson.loads(line))
print(len(lang))


640

In [14]:
# Print the language of the head repository for the first five pull requests,
# skipping records where it is absent or null.
for pr in pull_requests[:5]:
    try:
        language = pr['payload']['pull_request']['head']['repo']['language']
    except (KeyError, TypeError):
        # KeyError: one of the nested keys is missing.
        # TypeError: an intermediate value is None (JSON null) and cannot be
        # subscripted. Anything else is a real error and should surface.
        continue
    if language is not None:
        print(language)


F#
Java
PHP
C++

In [15]:
def _event_row(event):
    """Flatten one event dict into a (type, user, repo, created_at) tuple."""
    return (
        event['type'],
        event['actor']['login'],
        event['repo']['name'],
        pd.Timestamp(event['created_at']),
    )


# Tabulate the first 100 events, one row per event.
df = pd.DataFrame.from_records(
    (_event_row(event) for event in events[:100]),
    columns=['type', 'user', 'repo', 'created_at']
)
df.head(10)


Out[15]:
type user repo created_at
0 PushEvent davidjhulse davidjhulse/davesbingrewardsbot 2015-01-01 00:00:00+00:00
1 PushEvent jmoon018 jmoon018/rshell-unit-tester 2015-01-01 00:00:00+00:00
2 CreateEvent christoferpeterson christoferpeterson/Vadek 2015-01-01 00:00:01+00:00
3 PushEvent JakeWharton square/okhttp 2015-01-01 00:00:01+00:00
4 PushEvent git4ruby git4ruby/movie_review1 2015-01-01 00:00:03+00:00
5 PushEvent tlgkccampbell tlgkccampbell/ultraviolet 2015-01-01 00:00:03+00:00
6 PushEvent Vilyan01 Vilyan01/ILP 2015-01-01 00:00:03+00:00
7 PushEvent xndcn xndcn/d-statistics 2015-01-01 00:00:03+00:00
8 PushEvent team3cord team3cord/mc-dotfiles 2015-01-01 00:00:03+00:00
9 CreateEvent greyia greyia/port2container 2015-01-01 00:00:04+00:00

In [16]:
# Wrap the pandas frame in a blaze Data object so the following cells can build
# blaze expressions against it; the bare name displays it.
ds = bz.Data(df)
ds


Out[16]:
type user repo created_at
0 PushEvent davidjhulse davidjhulse/davesbingrewardsbot 2015-01-01 00:00:00+00:00
1 PushEvent jmoon018 jmoon018/rshell-unit-tester 2015-01-01 00:00:00+00:00
2 CreateEvent christoferpeterson christoferpeterson/Vadek 2015-01-01 00:00:01+00:00
3 PushEvent JakeWharton square/okhttp 2015-01-01 00:00:01+00:00
4 PushEvent git4ruby git4ruby/movie_review1 2015-01-01 00:00:03+00:00
5 PushEvent tlgkccampbell tlgkccampbell/ultraviolet 2015-01-01 00:00:03+00:00
6 PushEvent Vilyan01 Vilyan01/ILP 2015-01-01 00:00:03+00:00
7 PushEvent xndcn xndcn/d-statistics 2015-01-01 00:00:03+00:00
8 PushEvent team3cord team3cord/mc-dotfiles 2015-01-01 00:00:03+00:00
9 CreateEvent greyia greyia/port2container 2015-01-01 00:00:04+00:00
10 PushEvent slurp-logs piscisaureus/slurp-logs 2015-01-01 00:00:05+00:00

In [17]:
# Number of distinct users among the CreateEvent rows; bz.compute evaluates the
# blaze expression and returns the result.
bz.compute(ds[ds.type=='CreateEvent'].user.distinct().count())


Out[17]:
9

In [18]:
# Show the WatchEvent rows of the sample frame.
df.loc[df['type'] == 'WatchEvent']


Out[18]:
type user repo created_at
65 WatchEvent jchristi LinuxStandardBase/lsb 2015-01-01 00:00:18+00:00
72 WatchEvent tategakibunko inf0rmer/blanket 2015-01-01 00:00:24+00:00
91 WatchEvent alfateam123 parrt/cs652 2015-01-01 00:00:35+00:00

In [19]:
# Group the blaze table by event type and request, per type, the row count
# (total) and the number of distinct users and repos.
# NOTE(review): the result displayed for this cell disagrees with the other
# computations in this notebook (e.g. CreateEvent shows 12 users here, while the
# standalone distinct-user count above and the pandas group-bys below give 9).
# Confirm how this blaze version evaluates distinct().count() inside by().
bz.by(ds.type, total=ds.type.count(), user=ds.user.distinct().count(), repo=ds.repo.distinct().count())


Out[19]:
type repo total user
0 CreateEvent 12 12 12
1 DeleteEvent 1 1 1
2 ForkEvent 1 1 1
3 IssueCommentEvent 14 14 13
4 IssuesEvent 2 2 1
5 PullRequestEvent 2 2 2
6 PullRequestReviewCommentEvent 1 1 1
7 PushEvent 56 64 54
8 WatchEvent 2 3 2

In [20]:
# Distinct users and repos per event type, computed with pandas.
# Series.nunique counts distinct non-null values, which is what the hand-written
# drop_duplicates().count() lambda did, without the intermediate de-duplicated Series.
df.groupby('type')[['user', 'repo']].agg(pd.Series.nunique)


Out[20]:
user repo
type
CreateEvent 9 12
DeleteEvent 1 1
ForkEvent 1 1
IssueCommentEvent 14 14
IssuesEvent 2 2
PullRequestEvent 2 2
PullRequestReviewCommentEvent 1 1
PushEvent 60 64
WatchEvent 3 3

In [21]:
# Same per-type distinct counts, this time via numpy.
# np.unique already returns the sorted unique values, so pre-sorting the group
# in Python is redundant work.
df.groupby('type')[['user', 'repo']].agg(lambda x: len(np.unique(x)))


Out[21]:
user repo
type
CreateEvent 9 12
DeleteEvent 1 1
ForkEvent 1 1
IssueCommentEvent 14 14
IssuesEvent 2 2
PullRequestEvent 2 2
PullRequestReviewCommentEvent 1 1
PushEvent 60 64
WatchEvent 3 3