Seed contributors

By Ben Welsh

Seeds a master list of California Civic Data Coalition participants with open-source contributors drawn from the GitHub API. Last harvested on Dec. 18, 2016, using a Python script that interacts with GitHub's API.


In [1]:
import pandas as pd
import numpy as np

Load in the data


In [2]:
# Read the harvested contributor rows from the input CSV into a DataFrame.
table = pd.read_csv("./input/contributors.csv")

In [3]:
# Print each column's dtype and non-null count for the loaded frame.
table.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 9 columns):
repo             225 non-null object
login            225 non-null object
name             175 non-null object
email            108 non-null object
company          115 non-null object
location         145 non-null object
bio              55 non-null object
avatar_url       225 non-null object
contributions    225 non-null int64
dtypes: int64(1), object(8)
memory usage: 15.9+ KB

Clean up strings


In [4]:
# Swap every missing value for an empty string so later string operations
# always receive text.
table = table.replace(np.nan, "")

In [5]:
# Trim surrounding whitespace from the free-text columns.
for column in ("login", "company", "location", "avatar_url"):
    table[column] = table[column].map(str.strip)

# Logins are lowercased in addition to being trimmed.
table["login"] = table["login"].str.lower()

Merge in corrections


In [6]:
# Read the corrections CSV; later cells join it on "login" and read its
# corrected_name, corrected_company, corrected_location and corrected_email columns.
corrections = pd.read_csv("./input/contributors-corrections.csv")

In [7]:
# Left join keeps every contributor row, attaching corrections where a login matches.
table = pd.merge(table, corrections, how="left", on="login")

In [8]:
# For each field, take the correction when one exists and keep the harvested
# value otherwise.
for field in ("name", "company", "location", "email"):
    table[field] = table["corrected_" + field].fillna(table[field])

In [9]:
# The correction columns have been folded into the main fields; remove them.
table = table.drop(
    ['corrected_name', 'corrected_company', 'corrected_location', 'corrected_email'],
    axis=1,
)

Merge some common variations


In [10]:
# Map each spelling variant seen in the data to one canonical location.
# Convention relied on by the later state/country cells: U.S. places end in a
# two-letter state code, everything else ends in a spelled-out country name.
location_aliases = {
    'Los Angeles': 'Los Angeles, CA',
    'Los Angeles, California': 'Los Angeles, CA',
    'Washington D.C.': 'Washington, DC',
    'District of Columbia': 'Washington, DC',
    'Washington, D.C.': 'Washington, DC',
    'Chicago': 'Chicago, IL',
    'San Francisco': 'San Francisco, CA',
    'Palo Alto': 'Palo Alto, CA',
    'Spokane, Wash.': 'Spokane, WA',
    # Spelled out rather than "UK": a two-letter suffix would be read as a
    # U.S. state code by the in_usa / state / country cells.
    'Hackney, London': 'London, United Kingdom',
    'Brooklyn': 'New York, NY',
    'Brooklyn NY': 'New York, NY',
    'Brooklyn, NY': 'New York, NY',
    'NYC': 'New York, NY',
    'New York': 'New York, NY',
    'Columbia, Missouri': 'Columbia, MO',
    'Tucson, Arizona': 'Tucson, AZ',
    'Toronto': 'Toronto, Canada',
    'Salt Lake City, Utah': 'Salt Lake City, UT',
    'Houston': 'Houston, TX',
    # Previously mapped to 'Houston, TX' by a copy-paste slip.
    'Orange County, Calif.': 'Orange County, CA',
}

# Exact-match replacement; no canonical value is itself a variant, so the
# order of application does not matter.
for variant, canonical in location_aliases.items():
    table.loc[table.location == variant, 'location'] = canonical

In [11]:
# Strip the article "The " before matching so names compare without it.
table.company = table.company.str.replace("The ", "")

# Exact-match spelling variants and the canonical name each collapses to.
company_aliases = {
    'Sunnmorsposten': 'Sunnmørsposten',
    'Wall Street Journal.': 'Wall Street Journal',
    'Northwestern University Knight Lab': 'Northwestern',
    'Investigative News Network': 'Institute for Nonprofit News',
    'Stanford': 'Stanford University',
    'Missouri School of Journalism': 'University of Missouri',
    'University of Iowa School of Journalism': 'University of Iowa',
    'Knight-Mozilla fellow 2015': 'Mozilla OpenNews',
    'Knight-Mozilla Fellow': 'Mozilla OpenNews',
}
for variant, canonical in company_aliases.items():
    table.loc[table.company == variant, 'company'] = canonical

Output unique list


In [12]:
# Everything that identifies a person; rows sharing all of these are the same
# contributor appearing under different repos.
columns = ["login", "name", "email", "company", "location", "bio", "avatar_url"]

# Collapse to one row per contributor, totalling contributions across repos.
unique_contributors = table.groupby(columns, as_index=False)["contributions"].sum()

In [13]:
# Logins flagged as coalition members.
login_list = [
    'palewire', 'gordonje', 'sahilchinoy', 'aboutaaron',
    'armendariz', 'cephillips', 'jlagetz',
]
unique_contributors['in_coalition'] = unique_contributors['login'].isin(login_list)

California v. everybody


In [14]:
# True when the normalized location ends with the ", CA" state suffix.
unique_contributors['in_california'] = unique_contributors['location'].str.endswith(", CA")

Count the different states and countries


In [15]:
# Classify each location by its trailing token: a two-character suffix is
# treated as a U.S. state code (True), a longer one as a foreign country
# (False). Blank locations and ones with no ", " suffix stay NaN.
#
# The flags are assembled in an object-dtype Series so that booleans and NaN
# can coexist; writing True/False into a float NaN column depends on a silent
# upcast that recent pandas versions deprecate.
in_usa = pd.Series(np.nan, index=unique_contributors.index, dtype=object)

# Raw strings keep `\w` a regex escape rather than an invalid Python string escape.
in_usa.loc[unique_contributors.location.str.contains(r", \w{2}$")] = True
in_usa.loc[unique_contributors.location.str.contains(r", \w{3,}$")] = False

unique_contributors['in_usa'] = in_usa

In [20]:
def split_state(val):
    """
    Extract the two-character state code from a "City, ST" location string.

    Parameters
    ----------
    val : str or NaN
        A location value.

    Returns
    -------
    str or NaN
        * the input unchanged when it is missing (NaN/None);
        * NaN when it is an empty string;
        * the input unchanged when it contains no ", " separator;
        * the second ", "-separated part when that part is exactly two
          characters long, otherwise NaN.
    """
    # NaN never compares equal to anything (including itself), so an `==`
    # check cannot detect it; use pandas' null test instead.
    if pd.isnull(val):
        return val
    if val == "":
        return np.nan

    parts = val.split(", ")
    if len(parts) < 2:
        # NOTE(review): a bare place name is passed through as the "state" --
        # confirm this is intended rather than NaN.
        return val

    parent = parts[1]
    return parent if len(parent) == 2 else np.nan

In [21]:
# Derive the state column by running split_state over every location value.
unique_contributors['state'] = unique_contributors.location.map(split_state)

In [28]:
def split_country(val):
    """
    Derive a country name from a "City, Region" location string.

    Parameters
    ----------
    val : str or NaN
        A location value.

    Returns
    -------
    str or NaN
        * the input unchanged when it is missing (NaN/None);
        * NaN when it is an empty string;
        * the input unchanged when it contains no ", " separator;
        * "United States of America" when the second ", "-separated part is
          exactly two characters long (treated as a state code);
        * that second part itself when it is longer than two characters;
        * NaN when it is shorter than two characters.
    """
    # NaN never compares equal to anything (including itself), so an `==`
    # check cannot detect it; use pandas' null test instead.
    if pd.isnull(val):
        return val
    if val == "":
        return np.nan

    parts = val.split(", ")
    if len(parts) < 2:
        # No separator: the whole value is passed through as the country.
        return val

    parent = parts[1]
    if len(parent) == 2:
        return "United States of America"
    if len(parent) > 2:
        return parent
    return np.nan

In [29]:
# Derive the country column by running split_country over every location value.
unique_contributors['country'] = unique_contributors.location.map(split_country)

Output data


In [30]:
# Write the de-duplicated contributor table to the output CSV, omitting the row index.
unique_contributors.to_csv("./output/participants.csv", index=False)