By Ben Welsh
Analyzes the activity of participants in the California Civic Data Coalition's open-source projects.
In [27]:
import os
import math
import time
import folium
import numpy as np
import pandas as pd
import seaborn as sns
from selenium import webdriver
import matplotlib.pyplot as plt
from IPython.core.display import Image
In [28]:
# Show up to 5000 rows and render floats with two decimal places.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.float_format', '{:.2f}'.format)
In [29]:
# Configure seaborn's plot styling for the charts drawn below.
sns.set(color_codes=True)
In [30]:
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
In [31]:
# Paths are resolved relative to the directory the notebook is run from;
# every input and generated file lives in its "output" subdirectory.
this_dir = os.getcwd()
output_dir = os.path.join(this_dir, 'output')
In [32]:
# Load the participant table from participants.csv in the output directory.
participants = pd.read_csv(os.path.join(output_dir, "participants.csv"))
In [33]:
# Logins of participants whose name field is empty.
sorted(participants.loc[participants['name'].isnull(), 'login'])
Out[33]:
In [34]:
# Logins of participants whose company field is empty.
sorted(participants.loc[participants['company'].isnull(), 'login'])
Out[34]:
In [35]:
# Logins of participants whose location field is empty.
sorted(participants.loc[participants['location'].isnull(), 'login'])
Out[35]:
In [36]:
# Logins of participants whose email field is empty.
sorted(participants.loc[participants['email'].isnull(), 'login'])
Out[36]:
In [37]:
# Keep only participants with at least one recorded contribution.
unique_contributors = participants.loc[participants['contributions'] > 0]
In [38]:
# Number of rows in the contributor table; used as the denominator in create_ranking.
total_contributors = unique_contributors.shape[0]
total_contributors
Out[38]:
In [39]:
# Sum of all contributions; used as the denominator in create_ranking.
total_contributions = unique_contributors['contributions'].sum()
total_contributions
Out[39]:
In [40]:
# Summary statistics for contributions per contributor.
unique_contributors['contributions'].describe()
Out[40]:
In [41]:
# Histogram (no density curve) of how many contributions each contributor made.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the
# installed version before upgrading.
fig = plt.figure()
sns.distplot(unique_contributors['contributions'], kde=False)
fig.suptitle("Number of contributions made by GitHub users")
plt.ylabel('Contributors')
plt.xlabel('Contributions')
Out[41]:
In [42]:
def rank_by_contributors(table, field):
    """Count the non-null logins in each ``field`` group of ``table``.

    Returns a two-column frame: the grouping ``field`` and ``count``.
    """
    counts = table.groupby(field, as_index=False)['login'].count()
    # The aggregated column comes back named after its source column; relabel it.
    counts.columns = [field, 'count']
    return counts
In [43]:
def rank_by_contributions(table, field):
    """Total the ``contributions`` column within each ``field`` group of ``table``.

    Returns a frame holding the grouping ``field`` and the summed ``contributions``.
    """
    return table.groupby(field, as_index=False)['contributions'].sum()
In [44]:
def create_ranking(table, field):
    """Rank the values of ``field`` by total contributions, largest first.

    Joins the per-group contributor count with the per-group contribution
    total, then adds each group's share of the module-level
    ``total_contributors`` and ``total_contributions``.
    """
    ranking = rank_by_contributors(table, field).merge(
        rank_by_contributions(table, field)
    )
    # Shares use the notebook-wide totals, not totals of the table passed in.
    ranking['count_percent'] = ranking['count'] / total_contributors
    ranking['contributions_percent'] = ranking['contributions'] / total_contributions
    return ranking.sort_values(by="contributions", ascending=False)
In [45]:
# Rank individual contributors, keeping only the columns shown in the table and chart.
top_names = create_ranking(unique_contributors, 'name').loc[
    :, ['name', 'contributions', 'contributions_percent']
]
top_names.head(n=20)
Out[45]:
In [46]:
# Horizontal bar chart of the ten highest-ranked contributors.
fig = plt.figure()
sns.barplot(x="contributions", y="name", data=top_names.head(n=10))
fig.suptitle("Top GitHub contributors")
plt.ylabel('')
plt.xlabel('Contributions')
Out[46]:
In [47]:
# Rank companies by the contributions of the people who list them.
top_companies = create_ranking(table=unique_contributors, field='company')
top_companies.head(n=20)
Out[47]:
In [48]:
# Horizontal bar chart of the ten highest-ranked companies.
fig = plt.figure()
sns.barplot(x="contributions", y="company", data=top_companies.head(n=10))
fig.suptitle("Top companies of GitHub contributions")
plt.ylabel('')
plt.xlabel('Contributions')
Out[48]:
In [49]:
# Rank locations by the contributions of the people who list them.
top_locations = create_ranking(table=unique_contributors, field='location')
top_locations.head(n=20)
Out[49]:
In [50]:
# Horizontal bar chart of the ten highest-ranked locations.
fig = plt.figure()
sns.barplot(x="contributions", y="location", data=top_locations.head(n=10))
fig.suptitle("Top locations of GitHub contributions")
plt.ylabel('')
plt.xlabel('Contributions')
Out[50]:
In [51]:
# Attach each location's coordinate columns to its ranking row. The merge key
# is whatever columns the two frames share.
location_map_data = pd.merge(
    top_locations,
    unique_contributors.groupby(['location', 'location_x', 'location_y']).size().reset_index(),
)
In [52]:
def calculate_radius(column, value, scale=300000):
    """Return a circle radius for ``value`` relative to the largest entry in ``column``.

    The radius is ``scale`` times the square root of ``value`` divided by the
    maximum of ``location_map_data[column]``, so the row holding the maximum
    gets a radius of exactly ``scale``.
    """
    largest = location_map_data[column].max()
    share = float(value) / largest
    return scale * math.sqrt(share)
In [53]:
def screenshot_map(name):
    """Open ``<output_dir>/<name>.html`` in Firefox and save a PNG screenshot.

    The screenshot is written to ``<output_dir>/<name>.png``. ``output_dir``
    is the module-level output directory.

    The browser is always shut down, even if loading the page or writing the
    screenshot raises, so a failure does not leave a Firefox process running.
    """
    url = "file://{}/{}.html".format(output_dir, name)
    outfn = os.path.join(output_dir, "{}.png".format(name))
    browser = webdriver.Firefox()
    try:
        browser.set_window_size(1000, 750)
        browser.get(url)
        # Fixed pause before capturing; presumably gives the map tiles time to load.
        time.sleep(2)
        browser.save_screenshot(outfn)
    finally:
        browser.quit()
In [54]:
# Contributor map: one circle per location, sized by that location's 'count' value.
contributor_map = folium.Map(location=[35, -102], zoom_start=2, tiles="Mapbox Bright")
for i, row in location_map_data.iterrows():
    radius = calculate_radius('count', row['count'])
    popup_text = row.location
    # Decode only byte strings: text strings have no .decode() on Python 3 and
    # go through an implicit ASCII encode on Python 2, which fails on
    # non-ASCII place names.
    if isinstance(popup_text, bytes):
        popup_text = popup_text.decode("utf-8")
    folium.CircleMarker(
        [row.location_y, row.location_x],
        radius=radius,
        popup=popup_text,
        color='#3186cc',
        fill_color='#3186cc',
        fill_opacity=0.25,
    ).add_to(contributor_map)
In [55]:
# Write the contributor map to contributor_map.html in the output directory.
contributor_map.save(os.path.join(output_dir, "contributor_map.html"))
In [56]:
# Capture the saved HTML map as contributor_map.png.
screenshot_map("contributor_map")
In [57]:
# Show the screenshot PNG as this cell's output.
Image(os.path.join(output_dir, "contributor_map.png"))
Out[57]:
In [58]:
# Contributions map: one circle per location, sized by that location's
# 'contributions' value.
contributions_map = folium.Map(location=[35, -102], zoom_start=2, tiles="Mapbox Bright")
for i, row in location_map_data.iterrows():
    radius = calculate_radius('contributions', row['contributions'])
    popup_text = row.location
    # Decode only byte strings: text strings have no .decode() on Python 3 and
    # go through an implicit ASCII encode on Python 2, which fails on
    # non-ASCII place names.
    if isinstance(popup_text, bytes):
        popup_text = popup_text.decode("utf-8")
    folium.CircleMarker(
        [row.location_y, row.location_x],
        radius=radius,
        popup=popup_text,
        color='#3186cc',
        fill_color='#3186cc',
        fill_opacity=0.25,
    ).add_to(contributions_map)
In [59]:
# Write the contributions map to contributions_map.html in the output directory.
contributions_map.save(os.path.join(output_dir, "contributions_map.html"))
In [60]:
# Capture the saved HTML map as contributions_map.png.
screenshot_map("contributions_map")
In [61]:
# Show the screenshot PNG as this cell's output.
Image(os.path.join(output_dir, "contributions_map.png"))
Out[61]:
In [62]:
# Contributors and contributions grouped by the in_coalition column.
create_ranking(table=unique_contributors, field='in_coalition').head(n=20)
Out[62]:
In [63]:
# Contributors and contributions grouped by the in_california column.
create_ranking(table=unique_contributors, field='in_california').head(n=20)
Out[63]:
In [64]:
# Contributors and contributions grouped by the in_usa column.
create_ranking(table=unique_contributors, field='in_usa').head(n=20)
Out[64]:
In [65]:
# Contributors and contributions grouped by the state column.
create_ranking(table=unique_contributors, field='state').head(n=20)
Out[65]:
In [66]:
# Rank countries by the contributions of the people located in them.
top_countries = create_ranking(table=unique_contributors, field='country')
top_countries.head(n=20)
Out[66]:
In [67]:
# Horizontal bar chart of the ten highest-ranked countries.
fig = plt.figure()
sns.barplot(x="contributions", y="country", data=top_countries.head(n=10))
fig.suptitle("Top countries of GitHub contributions")
plt.ylabel('')
plt.xlabel('Contributions')
Out[67]: