%config InlineBackend.figure_format = 'retina'
# %config InlineBackend.figure_format = 'png'
%load_ext autoreload
%autoreload 2

import pandas as pd
import sklearn as sk
import json

import numpy as np
import pandas as pd
import toolz
import seaborn.apionly as sns
import matplotlib.pyplot as plt
import subsample.algorithms as subsample
%matplotlib inline

import pymongo
DB = pymongo.MongoClient('localhost', 27017)

from datetime import datetime

# import toyplot

import folium

import mplleaflet
import geopandas as gpd

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

meetup_cities = pd.read_csv("data/meetup_cities.csv")

cities1000 = pd.read_csv("data/cities1000.csv")
# Dedupe cities by selecting the highest population entry.
# `idxmax()` returns an index *label*, so the row is fetched with the
# label-based `.loc`; the old `.ix` indexer has been removed from pandas.
cities1000 = cities1000.groupby('ansiname', as_index=False, group_keys=False)\
                       .apply(lambda x: x.loc[x.population.idxmax()])

# Rename the Meetup entry so it matches the name used on the cities1000 side
# of the join below. The row mask had been lost from this statement; it
# selects the rows whose 'city' is "New York".
meetup_cities.loc[meetup_cities.city == "New York", 'city'] = 'New York City'  # Name consistency for join
# Left join: every Meetup city is kept, with NaN population when unmatched.
cities = pd.merge(meetup_cities, cities1000, how='left', left_on='city', right_on='name')

cities_gb = cities[cities['country'] == 'gb']

# Meetup members against city population, all layers on one set of axes:
# the first 10 rows of `cities` (green), every remaining row (faint), and
# the 'gb' rows (orange). Highlighted points get a 4-letter label.
fig, ax = plt.subplots()
scatter_kw = dict(kind='scatter', x='population', y='members', ax=ax)
top10 = cities.head(10)

top10.plot(loglog=True, alpha=1, color='lightgreen', label="Top 10", **scatter_kw)
# tail(-10) is every row except the first 10.
cities.tail(-10).plot(loglog=False, alpha=0.2, linewidths=0, **scatter_kw)
cities_gb.plot(loglog=True, alpha=1, color='orange', label="UK", **scatter_kw)

for _, row in toolz.concatv(cities_gb.iterrows(), top10.iterrows()):
    # Nudge the label 10% above the point so it does not cover the marker.
    label_xy = (row['population'], row['members']*1.1)
    ax.annotate(row['city'][:4], xy=label_xy,
                horizontalalignment='center', verticalalignment='bottom')

ax.set_xlabel("City population")
ax.set_ylabel("Number of meetup members")
ax.legend(loc='upper left', scatterpoints=1)

# Load the crawled groups, keeping one row per group id.
groups = pd.read_csv("crawler/groups.csv.gz")
groups = groups.drop_duplicates(subset='id')

# 'topics' holds JSON text; decode the non-null cells, then flatten each
# decoded list into a space-separated string of its 'urlkey' values.
groups['topics'] = groups['topics'].dropna().apply(json.loads)
groups['topics_urlkeys'] = groups['topics'].dropna().apply(
    lambda topic_list: ' '.join(topic['urlkey'] for topic in topic_list))

# Same treatment for 'category': decode the JSON, then pull out its 'id'.
groups['category'] = groups['category'].dropna().apply(json.loads)
groups['category_id'] = groups['category'].dropna().apply(lambda cat: cat['id'])

Topic modelling

def topic_modeling(text_data, max_df=0.95, min_df=0.02, max_features=1000, stop_words="english", n_topics=10,
                   n_top_words=20):
    """Fit a TF-IDF + NMF topic model on *text_data* and print each topic's top terms.

    Parameters
    ----------
    text_data : iterable of str
        One document per element.
    max_df, min_df, max_features, stop_words
        Forwarded to ``TfidfVectorizer``.
    n_topics : int
        Number of NMF components, i.e. topics.
    n_top_words : int
        Number of highest-weighted terms printed for each topic.

    Returns
    -------
    None
        The topics are printed, not returned.
    """
    # NOTE(review): the original definition was truncated (unclosed signature,
    # vectorizer call and make_pipeline call). `n_top_words` is required by
    # the body and passed by the caller; its default of 20 is an assumption.
    # `max_df`, `min_df` and `stop_words` are now forwarded instead of being
    # ignored; results with default arguments are unchanged for max_df/min_df.
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, max_features=max_features,
                                 stop_words=stop_words)
    nmf = NMF(n_components=n_topics, random_state=1)
    # make_pipeline does not clone its steps, so fitting the pipeline fits
    # `vectorizer` and `nmf` in place and both can be inspected afterwards.
    pipeline_topic_modeling = make_pipeline(vectorizer, nmf)
    pipeline_topic_modeling.fit(text_data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back on older releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = get_names()
    for topic_idx, topic in enumerate(nmf.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending, so the reversed slice yields the indices of
        # the n_top_words largest weights, heaviest first.
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print("")

topic_modeling(groups.query('category_id == 10').topics_urlkeys.dropna(), n_topics=5, n_top_words=30)

Topic #0:
self meditation spirituality improvement spiritual healing empowerment consciousness exploration growth personal wellness lifetransform yoga energy stress mindfulness transformation health development living relief life healthy coaching medicine alternative holistic lawofattraction support

Topic #1:
business networking marketing professional entrepreneurship strategy startup small women businesses entrepreneur smallbiz entrepreneurs referral owners womens startups prodev professionals technology online coaching internet media leadership socialnetwork creative innovation education young

Topic #2:
social singles fun newintown times socialnetwork diningout friends music nightlife women culture dating london dancing pubs 30s language new bars drinking professional livemusic relationships group 20s travel coffee young talking

Topic #3:
fitness outdoor sports excercise training healthy outdoors living group wellness hiking walkers health adventures womens nutrition yoga socials adventure travel beginners walks support dance weekend social classes lessons holistic arts

Topic #4:
development web programming technology computer softwaredev mobile data newtech opensource management startups design internet startup personal learning innovation science entrepreneurship media games businesses professionals coaching group community leadership life creative


# Collect the events of every group in category 34 (the "tech" category: the
# frame built here is plotted further down under the label "Tech event").
t_groups_ids = list(groups.loc[groups.category_id == 34,'id'])
# NOTE(review): this statement looks truncated (unbalanced parentheses, empty
# filter key) — presumably a MongoDB find() through `DB`, filtered on the
# group id; restore it from the original notebook before running.
t_events = list({'': {'$in': t_groups_ids}}))
for e in t_events:
    # 'time' is divided by 1000 before conversion, so it is presumably epoch
    # milliseconds — TODO confirm against the crawler output.
    e['datetime'] = datetime.utcfromtimestamp(e['time']/1000)
tedf = pd.DataFrame(t_events)

#{'': {'$in': t_groups_ids}}).count()


# Same as the tech-events cell, but for a sample of the groups whose category
# is not 34. NOTE(review): the 0.1 passed to subsample.approximate_sample is
# presumably a ~10% sampling rate — confirm against the subsample package.
nt_groups_ids = list(subsample.approximate_sample(groups.loc[groups.category_id != 34,'id'], 0.1))
# NOTE(review): this statement looks truncated (unbalanced parentheses, empty
# filter key) — presumably a MongoDB find() through `DB`, filtered on the
# group id; restore it from the original notebook before running.
nt_events = list({'': {'$in': nt_groups_ids}}))
for e in nt_events:
    # 'time' is divided by 1000 before conversion, so it is presumably epoch
    # milliseconds — TODO confirm against the crawler output.
    e['datetime'] = datetime.utcfromtimestamp(e['time']/1000)
ntedf = pd.DataFrame(nt_events)

# Share of events per month, weekday and hour of day: `tedf` (tech events) in
# the top row of axes, `ntedf` (non-tech sample) in the bottom row.
fig, ax = plt.subplots(figsize=(16,6), ncols=3, nrows=2)
# Each grouper maps an events frame to the datetime component to group on.
month_grouper = lambda x: x.datetime.dt.month
weekday_grouper = lambda x: x.datetime.dt.weekday
hour_grouper = lambda x: x.datetime.dt.hour

for i, grouper in enumerate((month_grouper, weekday_grouper, hour_grouper)):
    grouped = tedf.groupby(grouper(tedf))
    grouped2 = ntedf.groupby(grouper(ntedf))
    # Bar height = rows with a non-null 'venue' in the bucket, divided by the
    # total number of rows in the frame.
    a = sns.barplot(grouped.groups.keys(), (grouped.venue.count()/len(tedf)).tolist(), lw=0, color='lightgreen', ax=ax[0][i], alpha=0.8, label="Tech event")
    b = sns.barplot(grouped2.groups.keys(), (grouped2.venue.count()/len(ntedf)).tolist(), lw=0, color='lightblue', ax=ax[1][i], alpha=0.8, label="Non tech event")
#     sns.barplot('Mo,Tu,We,Th,Fr,Sa,Su'.split(","), adf.groupby(adf.datetime.dt.weekday).venue.count().tolist(), lw=0, color='lightgreen', ax=ax[1])
#     sns.barplot(df.index.tolist(), adf.groupby(adf.datetime.dt.hour).venue.count().tolist(), lw=0, color='lightgreen', ax=ax[2])
# NOTE(review): the two loop headers below have lost their bodies in this
# export (presumably per-axes label/tick formatting); restore them from the
# original notebook before running.
for i in (0, 1):
for ax_i in ax.ravel():
# Merge the legend entries of both rows into one legend on the top-right axes.
h1, l1 = ax[0][0].get_legend_handles_labels()
h2, l2 = ax[1][0].get_legend_handles_labels()
ax[0][2].legend(h1+h2, l1+l2, loc='upper left')
# ax[0].set_ylabel("")

Events occur most frequently on weekday evenings, and less frequently in the months of August and December.

m2 = folium.Map()

In [139]:
# NOTE(review): this cell is incomplete as exported — the `try:` matching the
# `except KeyError:` below, the body of the outlier `if`, the rest of the
# Popup(...) call, the `except` body and the remaining folium.Map(...)
# arguments are all missing. Presumably it drew a sample of 1000 tech events
# as clustered markers; restore it from the original notebook before running.
mc = folium.MarkerCluster()
for event in subsample.reservoir_sample(t_events, 1000):
        lon, lat = event['venue']['lon'], event['venue']['lat']
        if lat == 0 and lon == 0: # Outliers
        # folium expects (lat, lon) order, hence the swap from the line above.
        marker = folium.Marker((lat, lon))
        folium.Popup(u'Event "{}" by "{}" on {}'.format(event['name'],
    except KeyError:
m = folium.Map(tiles = 'CartoDB positron',

Most events (~75%) are in the Greater London area, and ~16% are in Shoreditch.

# NOTE(review): this cell is incomplete as exported — the `try:` matching the
# `except KeyError:` below, the body of the outlier `if`, the rest of the
# Popup(...) call, the `except` body and the remaining folium.Map(...)
# arguments are all missing. Presumably it drew a sample of 1000 non-tech
# events as clustered markers; restore it from the original notebook.
mc = folium.MarkerCluster()
for event in subsample.reservoir_sample(nt_events, 1000):
        lon, lat = event['venue']['lon'], event['venue']['lat']
        if lat == 0 and lon == 0: # Outliers
        # folium expects (lat, lon) order, hence the swap from the line above.
        marker = folium.Marker((lat, lon))
        folium.Popup(u'Event "{}" by "{}" on {}'.format(event['name'],
    except KeyError:
m = folium.Map(tiles = 'CartoDB positron',

# NOTE(review): this cell is incomplete as exported — `events` is not defined
# in the visible part of the notebook, the `try:` for the `except KeyError:`
# is missing, and nothing appends to `lons` / `lats`. Presumably it collected
# the venue coordinates of the first 5000 events; restore before running.
fig, ax = plt.subplots()
lons = []
lats = []
for event in events[:5000]:
        lon, lat = event['venue']['lon'], event['venue']['lat']
    except KeyError:
# ax.scatter(lons,lats, marker='.', alpha=0.1)
# mplleaflet.display(fig=fig, tiles=positron)

gb = gpd.read_file('data/lad.json')

fig, ax = plt.subplots()
# Declare the source CRS (WGS84 longitude/latitude) on the frame: to_crs()
# needs a CRS to reproject from. The `gb.crs` assignment target had been
# lost from this statement, leaving an invalid chained assignment.
original = dict(ellps='WGS84', datum='WGS84', proj='longlat')
gb.crs = original
# Reproject to EPSG:27700 (British National Grid) and draw the outlines.
gb.to_crs(epsg=27700).geometry.plot(facecolor='white', edgecolor='grey', linewidth=0.1, ax=ax)
# ax.scatter(lons, lats, marker='.', alpha=1)

fig, ax = plt.subplots()
# Declare the source CRS (WGS84 longitude/latitude) on the frame: to_crs()
# needs a CRS to reproject from. The `gb.crs` assignment target had been
# lost from this statement, leaving an invalid chained assignment.
original = dict(ellps='WGS84', datum='WGS84', proj='longlat')
gb.crs = original
gb.to_crs(epsg=27700).geometry.plot(facecolor='white', linewidth=0.1, ax=ax)
# NOTE(review): the outlines are drawn in EPSG:27700 (metres) while
# lons/lats are presumably still in degrees, so the points will not line up
# with the map unless they are reprojected too — confirm.
ax.scatter(lons, lats, marker='.', alpha=1)

for e in events:

# Make a plot of the number of groups created in each category

# 'created' is parsed as a count of milliseconds since the epoch.
groups['created_dt'] = pd.to_datetime(groups.created, unit='ms')

# Print the shortname of every category id in [0, 45) that has at least one
# group, space-separated on one line.
category_names = []
for i in range(45):
    in_category = groups.query("category_id == {}".format(i))
    # Not every id in the range is in use; `.iloc[0]` on an empty selection
    # raises IndexError, so ids without groups are skipped.
    if len(in_category):
        category_names.append(in_category.iloc[0].category['shortname'])
# A single parenthesised argument prints the same under Python 2 and 3.
print(' '.join(category_names))

arts-culture career-business cars-motorcycles community-environment dancing education-learning fashion-beauty fitness food-drink games lgbt government-politics health-wellbeing hobbies-crafts language lifestyle book-clubs movies-film music new-age-spirituality outdoors-adventure paranormal parents-family pets-animals photography religion-beliefs sci-fi-fantasy singles socializing sports-recreation support tech writing

gg = groups.sort_values(by='created_dt')
# gg_tech = gg.query('category_id == 34')
# gg_arts = gg.query('category_id == 1')
# print gg_tech.groupby(gg_tech.created_dt.dt.year).count()[['category']]
# print gg_arts.groupby(gg_arts.created_dt.dt.year).count()[['category']]

Unnamed: 0 category city country created description group_photo id join_mode lat ... state timezone topics urlname utc_offset visibility who topics_urlkeys category_id created_dt
0 0 {u'shortname': u'language', u'name': u'languag... London GB 1205260880000 <p>Es un intercambio de ingles y espanol. Haz ... {"thumb_link": " 1058118 closed 51.549999 ... 17 Europe/London [{u'name': u'Spanish Language', u'urlkey': u's... spanish-896 0 public_limited Members spanish language-exchange spanish-language-london 16 2008-03-11 18:41:20
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
520 rows × 26 columns

# NOTE(review): this statement looks truncated (unbalanced parentheses, `x`
# is unbound) — presumably it counted groups per category and creation
# period. If `year + month` is used as the period key, distinct months
# collide (2010 + 12 == 2011 + 11) — confirm and restore before running.
a = gg.groupby('category_id').apply((x.created_dt.dt.year + x.created_dt.dt.month)).count()[['category_id']])

# Tidy the per-category counts: name the count column, turn the index levels
# back into columns and treat missing counts as zero.
renamed = a.rename(columns={'category_id': 'meetup_groups'})
timeseries = renamed.reset_index().fillna(0)

# One line per category of interest: id 1 in red, id 34 in blue.
colors = ('red', 'blue')
for i, g in enumerate((1, 34)):
    per_category = timeseries.query('category_id == {}'.format(g))
    sns.tsplot(per_category.meetup_groups, color=colors[i])

# NUTS level-1 regions, restricted to those whose NUTS_ID contains 'UK'.
nuts = gpd.GeoDataFrame.from_file('data/nuts_rg_60m_2010_lvl_1.geojson')
uk_nuts = nuts[nuts.NUTS_ID.apply(lambda x: 'UK' in x)]

import shapely
import shapely.geometry
# One point per group; shapely points take (x, y), i.e. (lon, lat).
groups['geometry'] = groups.apply(lambda x: shapely.geometry.Point(x['lon'], x['lat']), axis=1)

# The `groups_gdf.crs` assignment target had been lost from this statement,
# leaving an invalid chained assignment. The points are tagged as EPSG:4326.
groups_gdf = gpd.GeoDataFrame(groups)
groups_gdf.crs = {'init': 'epsg:4326'}

Unnamed: 0 category city country created description group_photo id join_mode lat ... state timezone topics urlname utc_offset visibility who topics_urlkeys category_id geometry
0 0 {u'shortname': u'language', u'name': u'languag... London GB 1205260880000 <p>Es un intercambio de ingles y espanol. Haz ... {"thumb_link": " 1058118 closed 51.549999 ... 17 Europe/London [{u'name': u'Spanish Language', u'urlkey': u's... spanish-896 0 public_limited Members spanish language-exchange spanish-language-london 16 POINT (-0.230000004172 51.5499992371)
1 1 {u'shortname': u'hobbies-crafts', u'name': u'h... London GB 1205962371000 <p>Formed in 2008, the Croydon Filmmaking meet... {"thumb_link": " 1070873 approval 51.520000 ... 17 Europe/London [{u'name': u'Film Industry', u'urlkey': u'film... filmind-351 0 members Film People filmind dv screenwriters indiefilm video film-... 15 POINT (-0.10000000149 51.52000045779999)
2 2 {u'shortname': u'socializing', u'name': u'soci... London GB 1206119861000 <p style="text-align : justify">A WALK WELCOME... {"thumb_link": " 1073622 open 51.520000 ... 17 Europe/London [{u'name': u'Fitness', u'urlkey': u'fitness', ... Londons-Social-Urban-Walks 0 public London Strollers fitness singles hiking walkers newintown self-... 31 POINT (-0.10000000149 51.52000045779999)
3 3 {u'shortname': u'socializing', u'name': u'soci... Twickenham GB 1206365647000 <p>Meet other people who are interested in wid... {"thumb_link": " 1077504 open 51.450001 ... 17 Europe/London [{u'name': u'New In Town', u'urlkey': u'newint... twickenham-social-meetup 0 public_limited Twickers Locals newintown rugby neighbors socialnetwork social... 31 POINT (-0.330000013113 51.4500007629)
4 4 {u'shortname': u'tech', u'name': u'tech', u'id... London GB 1206457505000 Meet other locals who earn their living from t... {"thumb_link": " 1079363 open 51.520000 ... 17 Europe/London [{u'name': u'Web Design', u'urlkey': u'webdesi... internetpro-56 0 public Internet Professionals webdesign media internetpro web 34 POINT (-0.10000000149 51.52000045779999)

5 rows × 26 columns

Index([u'name_right', u'id'], dtype='object')

london = gpd.read_file('data/london.geojson')
# Spatial join of group points to London polygons. The head of the sjoin
# call had been lost from this statement. Column names present in both
# frames are suffixed, which is where 'name_right' comes from.
# NOTE(review): `groups_gdf` as the left frame is inferred — confirm.
groups_gdf_london = gpd.sjoin(groups_gdf, london, how='inner')
# Number of groups per London area; after count() the 'id' column holds it.
groups_gdf_london_count = groups_gdf_london.groupby(by='name_right')[['id']].count().reset_index()
# Left merge keeps areas without any group; their count is filled with 0.
london_withcounts = london.merge(groups_gdf_london_count, how='left', left_on="name", right_on="name_right").fillna(0)
# uk_nuts_merge2 = uk_nuts_merge2.to_crs(epsg='27700')

fig, ax = plt.subplots()
# Choropleth of groups per London area ('id' holds the count). The head of
# this call had been lost; it is restored to mirror the plot_dataframe call
# used for the UK regions later in the notebook.
# NOTE(review): `london_withcounts` as the plotted frame is inferred — confirm.
gpd.plotting.plot_dataframe(london_withcounts,
    ax = ax, column='id', colormap='OrRd', legend=True)


fig, ax = plt.subplots(figsize=(12,2.5), ncols=2)

# --- Left panel: groups per UK NUTS level-1 region -------------------------
# The head of the sjoin call had been lost from this statement.
# NOTE(review): `groups_gdf` as the left frame is inferred — confirm.
pointInPolys2 = gpd.sjoin(groups_gdf, uk_nuts, how='inner')
# NOTE(review): rename() without `columns=` relabels the index, not the
# columns, so the count column is still called 'id' — the name the plot call
# below relies on. Left as is to preserve behaviour.
gpointSumByPoly2 = pointInPolys2.groupby(by='NUTS_ID')[['id']].count().reset_index().rename({"id": "groupcount"})
# Left merge keeps regions without any group; their count is filled with 0.
uk_nuts_merge2 = uk_nuts.merge(gpointSumByPoly2, how='left', left_on="NUTS_ID", right_on="NUTS_ID").fillna(0)
uk_nuts_merge2 = uk_nuts_merge2.to_crs(epsg='27700')
# The visible imports bind geopandas as `gpd` only, so the call goes through
# that alias.
gpd.plotting.plot_dataframe(uk_nuts_merge2, ax=ax[0], column='id', colormap='OrRd', legend=True)

# --- Right panel: groups per London area -----------------------------------
london = gpd.read_file('data/london.geojson')
# Same truncation repaired; shared column names get the '_right' suffix.
groups_gdf_london = gpd.sjoin(groups_gdf, london, how='inner')
groups_gdf_london_count = groups_gdf_london.groupby(by='name_right')[['id']].count().reset_index()
london_withcounts = london.merge(groups_gdf_london_count, how='left', left_on="name", right_on="name_right").fillna(0)
# uk_nuts_merge2 = uk_nuts_merge2.to_crs(epsg='27700')

# The head of this call had been lost; restored to mirror the left panel.
# NOTE(review): `london_withcounts` as the plotted frame is inferred — confirm.
gpd.plotting.plot_dataframe(london_withcounts,
    ax=ax[1], column='id', colormap='OrRd', legend=True)

for ax_i in ax.ravel():
# plt.savefig("figure/maps.pdf")

