Last updated: August 4, 2015
I am not a statistician by trade; far from it. I did take a few stats & econometrics courses in college, but I wouldn't even consider myself an armchair statistician here.
I am not making any claims about causation; I'm merely exploring what the Meetup API has to offer.
This also isn't how I code in general, but I love IPython (Jupyter) Notebooks, and I wanted an excuse to use one with Pandas (my first time using Pandas, too!).
This data was used in my EuroPython 2015 talk, Diversity: We're not done yet. (Slides, video soon)
In [98]:
from __future__ import print_function
from collections import defaultdict
import json
import os
import time
import requests
This repository includes all the data files that I used (latest update: Aug 4, 2015). You may skip this part if you don't want to call the Meetup API to get new/fresh data.
Take a look at Meetup's API Console; I used it when forming API requests, as well as to get a sense of pagination for some requests.
We'll call a few different endpoints from the Meetup API and save the data locally in JSON files to use later.
To get your own Meetup API key, you'll need a regular Meetup user account. Once you're logged in, you can navigate to the API Key portion of the API docs to reveal your API key.
API Endpoint docs:
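If you want to verify your key before doing the full pull below, here's a minimal sketch (not one of the original notebook cells): hit the /2/groups endpoint with your key and check the status code. The placeholder key is yours to fill in.
# Hypothetical sanity check (not part of the original notebook): confirm the key
# works against the /2/groups endpoint before the full paginated pull.
resp = requests.get(
    "https://api.meetup.com/2/groups",
    params={"signed": True, "key": "<your-api-key>", "topic": "python", "page": 1},
)
print(resp.status_code)  # 200 means the key works; 401 usually means it doesn't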
In [15]:
def save_output(data, output_file):
with open(output_file, "w") as f:
json.dump(data, f)
In [6]:
# Set some global variables
MEETUP_API_KEY = "yeah right"
MEETUP_GROUPS_URL = "https://api.meetup.com/2/groups"
PARAMS = {
"signed": True,
"key": MEETUP_API_KEY,
"topic": "python",
"category_id": 34, # 34 = Tech, there are only ~35 categories
"order": "members",
"page": 200, # max allowed
"omit": "group_photo" # no need for photos in response
}
TOTAL_PAGES = 6 # looked on the API console, 1117 meetup groups as of 7/17, 200 groups per page = 6 pages
The Meetup API rate-limits requests, but their documentation isn't exactly helpful about the specifics. Looking at the response headers, I saw that I was limited to 30 requests per 10 seconds, so I'll sleep 1 second between requests to be safe.
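For reference, here's a minimal sketch of how you could peek at those rate-limit headers on any response; the exact X-RateLimit-* header names are an assumption on my part, so print resp.headers to see what your responses actually include.
# Rough sketch: inspect the rate-limit headers on a single request.
# The X-RateLimit-* names are an assumption; dump resp.headers if they come back as None.
resp = requests.get(MEETUP_GROUPS_URL, params=PARAMS)
print(resp.headers.get("X-RateLimit-Limit"))      # requests allowed per window
print(resp.headers.get("X-RateLimit-Remaining"))  # requests left in this window
print(resp.headers.get("X-RateLimit-Reset"))      # seconds until the window resets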
In [16]:
def get_meetup_groups():
meetup_groups = []
for i in xrange(TOTAL_PAGES):
PARAMS["offset"] = i
print("GROUPS: Getting page {0} of {1}".format(i+1, TOTAL_PAGES+1))
response = requests.get(MEETUP_GROUPS_URL, params=PARAMS)
if response.ok:
meetup_groups.extend(response.json().get("results"))
time.sleep(1) # don't bombard the Meetup API
print("GROUPS: Collected {0} Meetup groups".format(len(meetup_groups)))
return meetup_groups
In [17]:
meetup_groups = get_meetup_groups()
In [20]:
# Create a directory to save everything
data_dir = "meetup_data"
if not os.path.exists(data_dir):
os.makedirs(data_dir)
# Save meetup groups data
output = os.path.join(data_dir, "meetup_groups.json")
save_output(meetup_groups, output)
In [21]:
# inspect one for funsies
meetup_groups[0]
Out[21]:
We got a lot back from searching the /groups endpoint with just the "python" topic, so we should narrow it down a bit, as well as separate out the PyLadies groups.
My approach is to filter on the group's actual name (e.g. python, py, django, etc.).
Spot checking the results will definitely be needed, but that comes a bit later.
In [44]:
search = ["python", "pydata", "pyramid", "py", "django", "flask", "plone"]
omit = ["happy"] # I realize that a group could be called "happy python user group" or something...
def is_pug(group):
"""
Return `True` if in `search` key words and not in `omit` keywords.
"""
group_name = group.get("name").lower()
for o in omit:
if o in group_name:
return False
for s in search:
if s in group_name:
return True
return False
def sort_groups(groups):
"""
Sort groups by 'pyladies' and 'python user groups'.
"""
pyladies = []
user_groups = []
for g in groups:
if "pyladies" in g.get("name").lower():
pyladies.append(g)
else:
if is_pug(g):
user_groups.append(g)
return user_groups, pyladies
In [42]:
user_groups, pyladies = sort_groups(meetup_groups)
In [47]:
# Let's spot check the UGs to see if what we're left with makes sense
# Note: I took a peek at a few (not shown here) and for the most part,
# all seems okay
for g in user_groups:
print(g.get("name"))
I've adapted this from a Java implementation of the haversine formula to check whether a point is within a given radius of another point. Geo-math is hard.
In [48]:
from math import sin, cos, asin, degrees, radians, atan2, sqrt
In [49]:
RADIUS = 3958.75 # Earth's radius in miles
In [50]:
def is_within_50_miles(pyladies_coords, python_coords):
pyladies_lat, pyladies_lon = pyladies_coords[0], pyladies_coords[1]
python_lat, python_lon = python_coords[0], python_coords[1]
d_lat = radians(pyladies_lat - python_lat)
d_lon = radians(pyladies_lon - python_lon)
sin_d_lat = sin(d_lat / 2)
sin_d_lon = sin(d_lon / 2)
    # haversine: only the longitude term is scaled by the cosines of the two latitudes
    a = sin_d_lat ** 2 + (sin_d_lon ** 2) * cos(radians(pyladies_lat)) * cos(radians(python_lat))
c = 2 * atan2(sqrt(a), sqrt(1-a))
dist = RADIUS * c
return dist <= 50
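A quick sanity check with a couple of well-known coordinate pairs (hand-picked, not from the Meetup data) helps confirm the math behaves: San Francisco and Oakland are roughly ten miles apart, while San Francisco and Los Angeles are a few hundred.
# Hypothetical sanity check with hand-picked coordinates:
sf = (37.7749, -122.4194)       # San Francisco
oakland = (37.8044, -122.2712)  # ~10 miles from SF
la = (34.0522, -118.2437)       # ~350 miles from SF
print(is_within_50_miles(sf, oakland))  # expect True
print(is_within_50_miles(sf, la))       # expect False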
In [53]:
def get_coords(group):
return group.get("lat"), group.get("lon")
def get_nearby_python_groups(pyl, collect):
pyl_coords = get_coords(pyl)
nearby = []
for group in user_groups:
pyt_coords = get_coords(group)
if is_within_50_miles(pyl_coords, pyt_coords):
nearby.append(group)
collect[pyl.get("name")] = nearby
return collect
In [54]:
collect = {}
for pylady in pyladies:
collect = get_nearby_python_groups(pylady, collect)
In [57]:
for name, nearby_groups in collect.items():
    print(name, len(nearby_groups))
In [82]:
# Save data into pyladies-specific directories
def pylady_dir(pyl):
_dir = pyl.split()
_dir = "".join(_dir)
outdir = os.path.join(data_dir, _dir)
if not os.path.exists(outdir):
os.makedirs(outdir)
return _dir
def save_pyladies():
for pylady in pyladies:
name = pylady.get("name")
subdir = pylady_dir(name)
outputdir = os.path.join(data_dir, subdir)
output = os.path.join(outputdir, subdir + ".json")
save_output(pylady, output)
groups = collect.get(name)
for g in groups:
            group_link = g.get("link")
            # take the URL path after ".com/" and drop the trailing slash
            group_name = group_link.split(".com/")[1][:-1]
outfile = group_name + ".json"
ug_output = os.path.join(outputdir, outfile)
save_output(g, ug_output)
In [83]:
save_pyladies()
Sanity check (I have a tree command installed via brew install tree):
In [89]:
!tree
If the members endpoint returns 0 results even though the group data reports a positive member count, then the group is set to private and its member list is accessible only to members (you could join the group to get access to that data, but I did not; I already have too much email).
There's also a "pseudo" race condition: the member count in the group data may be one number, but you actually receive a slightly different number of members (+/- ~3). That's (probably) due to people joining or leaving the group between the group API call and the members API call. I sketch a rough check for both cases after the get_members function below.
API endpoint docs:
In [114]:
MEETUP_MEMBER_URL = "https://api.meetup.com/2/members"
PARAMS = {
"signed": True,
"key": MEETUP_API_KEY,
}
In [115]:
def get_members(group):
PARAMS["group_id"] = group.get("id")
members_count = group.get("members")
print(u"MEMBERS: Getting {0} members for group {1}".format(members_count, group.get("name")))
    pages = members_count / 200  # Python 2 integer division: number of full 200-member pages
remainder = members_count % 200
if remainder > 0:
pages += 1
members = []
for i in xrange(pages):
print("MEMBERS: Iteration {0} out of {1}".format(i+1, pages+1))
PARAMS["offset"] = i
resp = requests.get(MEETUP_MEMBER_URL, PARAMS)
if resp.ok:
results = resp.json().get("results")
members.extend(results)
time.sleep(1)
print("MEMBERS: Got {0} members".format(len(members)))
return members
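To flag the two caveats above (private groups returning 0 members, and small count drift), here's a hypothetical check you could run on each group after fetching its members; it isn't part of the original pipeline.
# Hypothetical helper (not in the original notebook): compare the member count
# reported in the group data with the number of members actually fetched.
def check_member_counts(group, members):
    reported = group.get("members")
    received = len(members)
    if reported > 0 and received == 0:
        print(u"{0}: likely a private group (reported {1}, got 0)".format(
            group.get("name"), reported))
    elif reported != received:
        print(u"{0}: reported {1}, got {2} (probably joins/leaves mid-pull)".format(
            group.get("name"), reported, received))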
In [118]:
def get_members_collection(pylady, groups):
pylady_members = get_members(pylady)
pug_members = defaultdict(list)
for g in groups:
pg_mbrs = get_members(g)
pug_members[g.get("name")].append(pg_mbrs)
return pylady_members, pug_members
In [120]:
# NOTE: this takes *FOREVER*.
start = time.time()
for i, item in enumerate(collect.items()):
print("COLLECTING: {0} out of {1}".format(i+1, len(collect)+1))
pylady = [p for p in pyladies if p.get("name") == item[0]][0]
pylady_members, pug_members = get_members_collection(pylady, item[1])
print("COLLECTING: Saving all the data!")
pylady_name = pylady.get("name")
outdir = pylady_dir(pylady_name)
outdir = os.path.join(data_dir, outdir)
outfile = os.path.join(outdir, "pyladies_members.json")
save_output(pylady_members, outfile)
outfile = os.path.join(outdir, "pug_members.json")
save_output(pug_members, outfile)
end = time.time()
delta_s = end - start
delta_m = delta_s / 60
print("**DONE**")
print("Completed in {:.0f} minutes".format(delta_m))