By Ben Welsh
Seeds a master list of California Civic Data Coalition participants with open-source contributors drawn from the GitHub API. Last harvested on Dec. 18, 2016, using a Python script that interacts with GitHub's API.
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Load the contributor records (harvested from the GitHub API) into a DataFrame.
table = pd.read_csv("./input/contributors.csv")
In [3]:
# Print a summary of the frame (columns, dtypes, non-null counts) as a sanity check.
table.info()
In [4]:
# Swap missing values for empty strings, in place, so the str.strip calls
# that follow operate on strings rather than on float NaN.
table.replace(np.nan, "", inplace=True)
In [5]:
# Trim surrounding whitespace from the free-text columns. Logins are
# additionally lower-cased; they are used as the merge key later on.
table["login"] = table["login"].map(str.strip).str.lower()
for text_column in ("company", "location", "avatar_url"):
    table[text_column] = table[text_column].map(str.strip)
In [6]:
# Load the corrections file; it is joined onto the contributors by login below.
corrections = pd.read_csv("./input/contributors-corrections.csv")
In [7]:
# Left join so every contributor row is kept; rows without a matching
# login get NaN in the columns that come from the corrections file.
table = table.merge(corrections, on="login", how="left")
In [8]:
# Prefer the corrected value wherever one exists; otherwise keep the
# value that was already in the table.
for field in ("name", "company", "location", "email"):
    table[field] = table["corrected_" + field].fillna(table[field])
In [9]:
# Remove the corrected_* helper columns from the table in a single call.
table.drop(
    [
        "corrected_name",
        "corrected_company",
        "corrected_location",
        "corrected_email",
    ],
    axis=1,
    inplace=True,
)
In [10]:
# Normalise free-form locations to a canonical "City, ST" / "City, Country"
# spelling. Keys are the canonical form; values are the raw variants that
# should be rewritten to it. Matching is exact (whole-value) only.
location_fixes = {
    'Los Angeles, CA': ['Los Angeles', 'Los Angeles, California'],
    'Washington, DC': ['Washington D.C.', 'District of Columbia', 'Washington, D.C.'],
    'Chicago, IL': ['Chicago'],
    'San Francisco, CA': ['San Francisco'],
    'Palo Alto, CA': ['Palo Alto'],
    'Spokane, WA': ['Spokane, Wash.'],
    'London, UK': ['Hackney, London'],
    'New York, NY': ['Brooklyn', 'Brooklyn NY', 'Brooklyn, NY', 'NYC', 'New York'],
    'Columbia, MO': ['Columbia, Missouri'],
    'Tucson, AZ': ['Tucson, Arizona'],
    'Toronto, Canada': ['Toronto'],
    'Salt Lake City, UT': ['Salt Lake City, Utah'],
    'Houston, TX': ['Houston'],
    # Previously rewritten to 'Houston, TX' by a copy-paste slip, which
    # moved Orange County contributors out of California.
    'Orange County, CA': ['Orange County, Calif.'],
}
for canonical, variants in location_fixes.items():
    table.loc[table["location"].isin(variants), "location"] = canonical
In [11]:
# Drop a leading article only ("The New York Times" -> "New York Times").
# The previous str.replace("The ", "") removed the substring wherever it
# appeared inside a name, and its regex default differs between pandas
# versions; a plain prefix check avoids both problems.
table["company"] = table["company"].map(
    lambda name: name[len("The "):]
    if isinstance(name, str) and name.startswith("The ")
    else name
)

# Exact-match rewrites from a raw spelling to the preferred organisation name.
company_fixes = {
    'Sunnmorsposten': 'Sunnmørsposten',
    'Wall Street Journal.': 'Wall Street Journal',
    'Northwestern University Knight Lab': 'Northwestern',
    'Investigative News Network': 'Institute for Nonprofit News',
    'Stanford': 'Stanford University',
    'Missouri School of Journalism': 'University of Missouri',
    'University of Iowa School of Journalism': 'University of Iowa',
    'Knight-Mozilla fellow 2015': 'Mozilla OpenNews',
    'Knight-Mozilla Fellow': 'Mozilla OpenNews',
}
for raw_name, preferred_name in company_fixes.items():
    table.loc[table["company"] == raw_name, "company"] = preferred_name
In [12]:
# Collapse rows that share the same identifying fields into a single row,
# adding their contribution counts together.
columns = ["login", "name", "email", "company", "location", "bio", "avatar_url"]
unique_contributors = table.groupby(columns, as_index=False)["contributions"].sum()
In [13]:
# Logins that should be marked True in the in_coalition column.
login_list = [
    'palewire', 'gordonje', 'sahilchinoy', 'aboutaaron',
    'armendariz', 'cephillips', 'jlagetz',
]
unique_contributors["in_coalition"] = unique_contributors["login"].isin(login_list)
In [14]:
# A contributor counts as Californian when the location ends in ", CA".
ca_mask = unique_contributors["location"].str.endswith(", CA")
unique_contributors["in_california"] = False
unique_contributors.loc[ca_mask, "in_california"] = True
In [15]:
# Flag whether each contributor appears to be in the United States:
#   - empty location            -> NaN (unknown)
#   - ends in ", XX" (2 chars)  -> True
#   - ends in ", XXX..." (3+)   -> False
# Locations matching none of these are left as NaN.
#
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed, and
# the patterns are raw strings so "\w" is not an invalid escape sequence.
#
# NOTE(review): any two-character suffix is treated as a US state, so a
# value such as "London, UK" is flagged True here -- confirm whether a
# real list of state codes should be used instead.
unique_contributors.loc[unique_contributors["location"] == "", "in_usa"] = np.nan
unique_contributors.loc[
    unique_contributors["location"].str.contains(r", \w{2}$"), "in_usa"
] = True
unique_contributors.loc[
    unique_contributors["location"].str.contains(r", \w{3,}$"), "in_usa"
] = False
In [20]:
def split_state(val):
    """Extract a two-character state code from a "City, ST" location.

    Args:
        val: A location string, or a missing value (NaN/None).

    Returns:
        - ``val`` unchanged when it is missing.
        - NaN when ``val`` is the empty string.
        - ``val`` unchanged when it contains no ", " separator.
        - The second ", "-separated component when it is exactly two
          characters long; NaN otherwise.
    """
    # NaN never compares equal to anything (including itself), so the old
    # `val == np.NaN` test could not succeed and a NaN input crashed on
    # `.split`. pd.isnull detects it correctly.
    if pd.isnull(val):
        return val
    if val == "":
        return np.nan

    pieces = val.split(", ")
    if len(pieces) < 2:
        # NOTE(review): a location with no comma is returned whole as the
        # "state" -- kept from the original; confirm this is intended.
        return val

    parent = pieces[1]
    return parent if len(parent) == 2 else np.nan
In [21]:
# Derive the state column by running split_state over every location value.
unique_contributors['state'] = unique_contributors['location'].apply(split_state)
In [28]:
def split_country(val):
    """Derive a country name from a "City, Region" location.

    Args:
        val: A location string, or a missing value (NaN/None).

    Returns:
        - ``val`` unchanged when it is missing.
        - NaN when ``val`` is the empty string.
        - ``val`` unchanged when it contains no ", " separator.
        - "United States of America" when the second ", "-separated
          component is exactly two characters long.
        - That component itself when it is longer than two characters.
        - NaN when that component is shorter than two characters.
    """
    # NaN never compares equal to anything (including itself), so the old
    # `val == np.NaN` test could not succeed and a NaN input crashed on
    # `.split`. pd.isnull detects it correctly.
    if pd.isnull(val):
        return val
    if val == "":
        return np.nan

    pieces = val.split(", ")
    if len(pieces) < 2:
        # No separator: the whole value is returned as the country.
        return val

    parent = pieces[1]
    # NOTE(review): every two-character suffix is assumed to be a US state,
    # so "London, UK" yields "United States of America" -- confirm.
    if len(parent) == 2:
        return "United States of America"
    if len(parent) > 2:
        return parent
    return np.nan
In [29]:
# Derive the country column by running split_country over every location value.
unique_contributors['country'] = unique_contributors['location'].apply(split_country)
In [30]:
# Write the de-duplicated, annotated participant list to disk, omitting the row index.
unique_contributors.to_csv("./output/participants.csv", index=False)