Analyze participants

By Ben Welsh

This notebook analyzes the activity of participants in the California Civic Data Coalition's open-source projects on GitHub.

Import Python tools


In [27]:
import os
import math
import time
import folium
import numpy as np
import pandas as pd
import seaborn as sns
from selenium import webdriver
import matplotlib.pyplot as plt
from IPython.core.display import Image

In [28]:
pd.options.display.max_rows = 5000
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [29]:
sns.set(color_codes=True)

In [30]:
%matplotlib inline

Import data


In [31]:
this_dir = os.getcwd()
output_dir = os.path.join(this_dir, 'output')

In [32]:
participants = pd.read_csv(os.path.join(output_dir, "participants.csv"))
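
To confirm what the file contains before analyzing it, a minimal sketch that prints the frame's columns, row count and non-null counts:

participants.info()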

Identify gaps in the data


In [33]:
sorted(participants[participants.name.isnull()].login)


Out[33]:
['cecht',
 'charles-difazio',
 'danachinn',
 'doxrealm',
 'hackymcgee',
 'ianvonseggern1',
 'jayelle-o',
 'karkinosw',
 'katbuchholz',
 'malon',
 'mb10',
 'mjlorda',
 'mmhirsch',
 'pumadegit',
 'python-for-data-journalists',
 'rbhlibi',
 'regirob831',
 'samlo78',
 'soomilee',
 'soorinkimmm',
 'trinieic']

In [34]:
sorted(participants[participants.company.isnull()].login)


Out[34]:
['4ndygu',
 'achou11',
 'agneskc',
 'aidanconnolly',
 'brizandrew',
 'cecht',
 'charles-difazio',
 'daguar',
 'danachinn',
 'djbridges',
 'doxrealm',
 'dwillis',
 'hackymcgee',
 'ianvonseggern1',
 'jayelle-o',
 'karkinosw',
 'katbuchholz',
 'malon',
 'mazhang2718',
 'mb10',
 'mbeveridge',
 'mjlorda',
 'mmhirsch',
 'patrickvossler18',
 'pumadegit',
 'python-for-data-journalists',
 'rbhlibi',
 'regirob831',
 'samlo78',
 'soomilee',
 'soorinkimmm',
 'trinieic']

In [35]:
sorted(participants[participants.location.isnull()].login)


Out[35]:
['aidanconnolly',
 'cecht',
 'charles-difazio',
 'danachinn',
 'djbridges',
 'dwillis',
 'hackymcgee',
 'hodgesmr',
 'ianvonseggern1',
 'jayelle-o',
 'karkinosw',
 'katbuchholz',
 'malon',
 'mazhang2718',
 'mb10',
 'mjlorda',
 'mmhirsch',
 'patrickvossler18',
 'pumadegit',
 'python-for-data-journalists',
 'rbhlibi',
 'regirob831',
 'samlo78',
 'soomilee',
 'soorinkimmm',
 'trinieic']

In [36]:
sorted(participants[participants.email.isnull()].login)


Out[36]:
['agneskc',
 'andyroberson',
 'annkiha',
 'brizandrew',
 'burtherman',
 'carloslemos',
 'caseymm',
 'cecht',
 'chagan',
 'charles-difazio',
 'chrislkeller',
 'danachinn',
 'danmit',
 'djbridges',
 'doxrealm',
 'drtortoise',
 'ebonymarieb',
 'elainewong',
 'emamd',
 'fagerlise',
 'frnsys',
 'hackymcgee',
 'hancush',
 'hodgesmr',
 'ianvonseggern1',
 'jayelle-o',
 'jennbrandel',
 'karkinosw',
 'katbuchholz',
 'kavyasukumar',
 'lilymihalik',
 'livlab',
 'malon',
 'mazhang2718',
 'mb10',
 'mbeveridge',
 'mhkeller',
 'mijebner',
 'mjlorda',
 'mmhirsch',
 'patrickvossler18',
 'pumadegit',
 'python-for-data-journalists',
 'qstin',
 'rbhlibi',
 'rdmurphy',
 'regirob831',
 'roncampbell',
 'samlo78',
 'soomilee',
 'soorinkimmm',
 'tocateunvals',
 'trinieic']
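
Those four lists can be collapsed into a single per-field count of missing values, which makes the relative size of each gap easier to compare. A minimal sketch using the same frame and the same fields as the cells above:

participants[['name', 'company', 'location', 'email']].isnull().sum()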

GitHub totals


In [37]:
# Limit the analysis to participants with at least one recorded contribution
unique_contributors = participants[participants.contributions > 0]

In [38]:
total_contributors = len(unique_contributors)
total_contributors


Out[38]:
166

In [39]:
total_contributions = unique_contributors.contributions.sum()
total_contributions


Out[39]:
6631

In [40]:
unique_contributors.contributions.describe()


Out[40]:
count    166.00
mean      39.95
std      315.54
min        1.00
25%        1.00
50%        2.00
75%        4.00
max     3669.00
Name: contributions, dtype: float64
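
The summary above shows a heavily skewed distribution: the median contributor has made two contributions, while the maximum is 3,669. A quick sketch of how much of the overall total that single top account represents, using the totals computed above:

top_share = unique_contributors.contributions.max() / float(total_contributions)
top_share  # roughly 0.55, consistent with the leading share in the name ranking below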

In [41]:
fig = plt.figure()
sns.distplot(unique_contributors.contributions, kde=False)
fig.suptitle("Number of contributions made by GitHub users")
plt.ylabel('Contributors')
plt.xlabel('Contributions')


[Chart: Number of contributions made by GitHub users]

GitHub rankings


In [42]:
def rank_by_contributors(table, field):
    # Count how many distinct contributors fall into each value of the field
    grouped = table.groupby(field, as_index=False)
    summed = grouped.login.count()
    summed.columns = [field, 'count']
    return summed

In [43]:
def rank_by_contributions(table, field):
    # Total the contributions made within each value of the field
    grouped = table.groupby(field, as_index=False)
    return grouped.contributions.sum()

In [44]:
def create_ranking(table, field):
    # Merge the contributor counts with the contribution totals, then add
    # each group's share of the overall totals calculated earlier
    count = rank_by_contributors(table, field)
    summed = rank_by_contributions(table, field)
    merged = count.merge(summed)
    merged['count_percent'] = merged['count'] / total_contributors
    merged['contributions_percent'] = merged['contributions'] / total_contributions
    return merged.sort_values("contributions", ascending=False)

In [45]:
top_names = create_ranking(unique_contributors, 'name')[['name', 'contributions', 'contributions_percent']]
top_names.head(20)


Out[45]:
name contributions contributions_percent
23 Ben Welsh 3669 0.55
66 James Gordon 1748 0.26
0 Aaron Williams 379 0.06
129 Sahil Chinoy 130 0.02
2 Agustin Armendariz 69 0.01
25 Bill Chambers 47 0.01
19 Anthony Pesce 41 0.01
29 Casey Miller 34 0.01
78 Juan Elosua 29 0.00
94 Luciana Godoy 25 0.00
31 Charley Bodkin 24 0.00
99 Mark Beveridge 23 0.00
61 Francis Tseng 21 0.00
109 Michael Keller 19 0.00
81 Justin Myers 18 0.00
79 Julia Smith 12 0.00
27 Burt Herman 11 0.00
43 Dan Hill 11 0.00
93 Livia Labate 11 0.00
32 Cheryl Phillips 10 0.00

In [46]:
fig = plt.figure()
sns.barplot(y="name", x="contributions", data=top_names.head(10))
fig.suptitle("Top GitHub contributors")
plt.ylabel('')
plt.xlabel('Contributions')


[Chart: Top GitHub contributors]

In [47]:
top_companies = create_ranking(unique_contributors, 'company')
top_companies.head(20)


Out[47]:
company count contributions count_percent contributions_percent
52 Los Angeles Times 15 3755 0.09 0.57
20 California Civic Data Coalition 1 1748 0.01 0.26
84 Washington Post 4 389 0.02 0.06
77 UC Berkeley 1 130 0.01 0.02
58 New York Times 4 74 0.02 0.01
5 @databricks 1 47 0.01 0.01
53 Mozilla OpenNews 7 47 0.04 0.01
82 Vox Media 2 37 0.01 0.01
55 NPR 1 29 0.01 0.00
85 tocateunvals 1 25 0.01 0.00
12 Associated Press 2 19 0.01 0.00
16 Bloomberg 1 19 0.01 0.00
47 Institute for Nonprofit News 2 19 0.01 0.00
14 Austin American-Statesman 3 13 0.02 0.00
61 Northwestern 4 11 0.02 0.00
44 Hacks/Hackers 1 11 0.01 0.00
69 Stanford University 2 11 0.01 0.00
74 Texas Tribune 3 10 0.02 0.00
72 Tampa Bay Times 1 9 0.01 0.00
67 Sourcefabric 1 9 0.01 0.00

In [48]:
fig = plt.figure()
sns.barplot(y="company", x="contributions", data=top_companies.head(10))
fig.suptitle("Top companies of GitHub contributions")
plt.ylabel('')
plt.xlabel('Contributions')


[Chart: Top companies by GitHub contributions]

In [49]:
top_locations = create_ranking(unique_contributors, 'location')
top_locations.head(20)


Out[49]:
location count contributions count_percent contributions_percent
31 Los Angeles, CA 18 3761 0.11 0.57
14 Columbia, MO 2 1749 0.01 0.26
56 Washington, DC 18 440 0.11 0.07
7 Berkeley, CA 4 180 0.02 0.03
36 New York, NY 19 168 0.11 0.03
10 Buenos Aires, Argentina 5 30 0.03 0.00
44 San Francisco, CA 9 29 0.05 0.00
12 Chicago, IL 4 25 0.02 0.00
9 Bristol, UK 1 23 0.01 0.00
3 Austin, TX 5 22 0.03 0.00
38 Omaha, NE 1 12 0.01 0.00
39 Palo Alto, CA 2 11 0.01 0.00
26 Kigali, Rwanda 1 11 0.01 0.00
41 Prague, Czech Republic 1 9 0.01 0.00
51 Tampa, FL 1 9 0.01 0.00
20 Evanston, IL 3 7 0.02 0.00
15 Columbus, OH 1 7 0.01 0.00
47 Spokane, WA 1 6 0.01 0.00
2 Atlanta, GA 2 6 0.01 0.00
23 Houston, TX 2 6 0.01 0.00

In [50]:
fig = plt.figure()
sns.barplot(y="location", x="contributions", data=top_locations.head(10))
fig.suptitle("Top locations of GitHub contributions")
plt.ylabel('')
plt.xlabel('Contributions')


[Chart: Top locations by GitHub contributions]

In [51]:
location_map_data = top_locations.merge(
    unique_contributors.groupby(['location', 'location_x', 'location_y']).size().reset_index()
)

In [52]:
def calculate_radius(column, value, scale=300000):
    # Scale each circle so its area, not its radius, is proportional to the
    # value, which keeps the largest locations from overwhelming the map
    max_value = location_map_data[column].max()
    return math.sqrt(float(value) / max_value) * scale

In [53]:
def screenshot_map(name):
    # Open the saved HTML map in Firefox via Selenium, wait for the tiles
    # to load and save a PNG screenshot alongside it
    url = "file://{}/{}.html".format(output_dir, name)
    outfn = os.path.join(output_dir, "{}.png".format(name))
    browser = webdriver.Firefox()
    browser.set_window_size(1000, 750)
    browser.get(url)
    time.sleep(2)
    browser.save_screenshot(outfn)
    browser.quit()

In [54]:
contributor_map = folium.Map(location=[35, -102], zoom_start=2, tiles="Mapbox Bright")
for i, row in location_map_data.iterrows():
    radius = calculate_radius('count', row['count'])
    folium.CircleMarker([row.location_y, row.location_x],
        radius=radius,
        popup=row.location.decode("utf-8"),
        color='#3186cc',
        fill_color='#3186cc',
        fill_opacity=0.25,
    ).add_to(contributor_map)

In [55]:
contributor_map.save(os.path.join(output_dir, "contributor_map.html"))

In [56]:
screenshot_map("contributor_map")

In [57]:
Image(os.path.join(output_dir, "contributor_map.png"))


[Map image: contributor_map.png]

In [58]:
contributions_map = folium.Map(location=[35, -102], zoom_start=2, tiles="Mapbox Bright")
for i, row in location_map_data.iterrows():
    radius = calculate_radius('contributions', row['contributions'])
    folium.CircleMarker([row.location_y, row.location_x],
        radius=radius,
        popup=row.location.decode("utf-8"),
        color='#3186cc',
        fill_color='#3186cc',
        fill_opacity=0.25,
    ).add_to(contributions_map)

In [59]:
contributions_map.save(os.path.join(output_dir, "contributions_map.html"))

In [60]:
screenshot_map("contributions_map")

In [61]:
Image(os.path.join(output_dir, "contributions_map.png"))


[Map image: contributions_map.png]

In [62]:
create_ranking(unique_contributors, 'in_coalition').head(20)


Out[62]:
in_coalition count contributions count_percent contributions_percent
1 True 7 6007 0.04 0.91
0 False 159 624 0.96 0.09

In [63]:
create_ranking(unique_contributors, 'in_california').head(20)


Out[63]:
in_california count contributions count_percent contributions_percent
1 True 41 3996 0.25 0.60
0 False 125 2635 0.75 0.40

In [64]:
create_ranking(unique_contributors, 'in_usa').head(20)


Out[64]:
in_usa count contributions count_percent contributions_percent
1 True 117 6511 0.70 0.98
0 False 17 54 0.10 0.01

In [65]:
create_ranking(unique_contributors, 'state').head(20)


Out[65]:
state count contributions count_percent contributions_percent
2 CA 41 3996 0.25 0.60
10 MO 3 1750 0.02 0.26
4 DC 18 440 0.11 0.07
15 NY 19 168 0.11 0.03
9 IL 7 32 0.04 0.00
20 UK 2 29 0.01 0.00
19 TX 8 29 0.05 0.00
13 NE 1 12 0.01 0.00
5 FL 2 10 0.01 0.00
23 WA 2 9 0.01 0.00
16 OH 1 7 0.01 0.00
0 AZ 4 7 0.02 0.00
6 GA 2 6 0.01 0.00
12 NC 1 6 0.01 0.00
11 Moscow 1 4 0.01 0.00
21 UT 1 4 0.01 0.00
8 IA 1 2 0.01 0.00
7 Greater NYC Area 1 2 0.01 0.00
17 Sacramento 1 2 0.01 0.00
1 Barcelona 1 1 0.01 0.00

In [66]:
top_countries = create_ranking(unique_contributors, 'country')
top_countries.head(20)


Out[66]:
country count contributions count_percent contributions_percent
17 United States of America 116 6510 0.70 0.98
0 Argentina 5 30 0.03 0.00
11 Rwanda 1 11 0.01 0.00
5 Czech Republic 1 9 0.01 0.00
10 Norway 4 4 0.02 0.00
9 Moscow 1 4 0.01 0.00
2 Brazil 1 2 0.01 0.00
3 Canada 1 2 0.01 0.00
8 Greater NYC Area 1 2 0.01 0.00
12 Sacramento 1 2 0.01 0.00
4 Chile 1 1 0.01 0.00
6 Denmark 1 1 0.01 0.00
7 Germany 1 1 0.01 0.00
1 Barcelona 1 1 0.01 0.00
13 TX - New York 1 1 0.01 0.00
14 Thailand 1 1 0.01 0.00
15 USA 1 1 0.01 0.00
16 United States 1 1 0.01 0.00

In [67]:
fig = plt.figure()
sns.barplot(y="country", x="contributions", data=top_countries.head(10))
fig.suptitle("Top countries of GitHub contributions")
plt.ylabel('')
plt.xlabel('Contributions')


[Chart: Top countries by GitHub contributions]