In [1]:
%pylab inline

%load_ext autoreload
%autoreload 2


Populating the interactive namespace from numpy and matplotlib

Libraries


In [2]:
import numpy as np
from numpy.random import randn

import pandas as pd

#time
from datetime import datetime
from datetime import timedelta

#counting
from collections import Counter

In [3]:
# good old matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

#high-level based on matplotlib
import seaborn as sns

#dynamic and interactive
import highcharts
from highcharts.charts import chart

Init


In [4]:
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})
np.random.seed(1234)

In [5]:
#loading highcharts javascript
highcharts.init()


Out[5]:

Reading data

Checkins


In [6]:
dfc = pd.read_csv('../../datasets/loc-gowalla_totalCheckins.txt', sep='\t', header=False)
dfc.columns = ['uid','utc','lat','lon','vid']

dfc['utc'] = dfc['utc'].astype('datetime64[ms]')

dfc.head()


Out[6]:
uid utc lat lon vid
0 0 2010-10-18 22:17:43 30.269103 -97.749395 420315
1 0 2010-10-17 23:42:03 30.255731 -97.763386 316637
2 0 2010-10-17 19:26:05 30.263418 -97.757597 16516
3 0 2010-10-16 18:50:42 30.274292 -97.740523 5535878
4 0 2010-10-12 23:58:03 30.261599 -97.758581 15372

Data munging: cleanup, time parsing, etc.

New York, New York!

  • bounding box: (40.4774, -74.2589), (40.9176, -73.7004)

In [7]:
ny = (dfc['lat']>=40.4774) & (dfc['lat']<=40.9176) & (dfc['lon']>=-74.2589) & (dfc['lon']<=-73.7004)
dfc = dfc[ny]

Time columns


In [8]:
dfc['year']     = dfc['utc'].apply(lambda x: x.date().year)
dfc['month']    = dfc['utc'].apply(lambda x: x.date().month)
dfc['day']      = dfc['utc'].apply(lambda x: x.date().day)

dfc['date']     = dfc['utc'].apply(lambda x: x.date())

def time_in_seconds(x):
    return ((x.hour)*60+x.minute)*60+x.second

dfc['time']     = dfc['utc'].apply(lambda x: time_in_seconds(x.time()))

dfc['isotime']  = dfc['utc'].apply(lambda x: x.isoformat() +'Z')

dfc = dfc[dfc['date']<=datetime(2010,10,19).date()]
dfc[['uid','utc','lat','lon','vid', 'time', 'isotime']].head()


Out[8]:
uid utc lat lon vid time isotime
9 0 2010-10-12 00:21:28 40.643885 -73.782806 23261 1288 2010-10-12T00:21:28Z
10 0 2010-10-11 20:21:20 40.741374 -73.988105 16907 73280 2010-10-11T20:21:20Z
11 0 2010-10-11 20:20:42 40.741388 -73.989455 12973 73242 2010-10-11T20:20:42Z
12 0 2010-10-11 00:06:30 40.724910 -73.994621 341255 390 2010-10-11T00:06:30Z
13 0 2010-10-10 22:00:37 40.729768 -73.998535 260957 79237 2010-10-10T22:00:37Z

Venues


In [9]:
dfv = pd.read_csv('../../datasets/spots.txt', sep='\t', header=False)
dfv.columns = ['vid','name','loc']

coords = dfv['loc'].replace('[^0-9. -]+', '',regex=True)
coords = coords.apply(lambda x: x.split())

dfv['v_lat'] = coords.apply(lambda x: float(x[1]))
dfv['v_lon'] = coords.apply(lambda x: float(x[0]))

dfv = dfv.drop('loc', 1)

ny = (dfv['v_lat']>=40.4774) & (dfv['v_lat']<=40.9176) & (dfv['v_lon']>=-74.2589) & (dfv['v_lon']<=-73.7004)
dfv = dfv[ny]

dfv.head()


Out[9]:
vid name v_lat v_lon
0 1391604 Conference House Park 40.501759 -74.252343
1 1391611 Almer G. Russell Pavilion 40.502265 -74.254264
2 3612422 Conference House 40.500064 -74.249042
3 3612431 Billop House 40.500064 -74.249042
4 1391499 Biddle House 40.505483 -74.254146

Merge venue names and checkins


In [10]:
df = pd.merge(dfc, dfv[['vid', 'name']], how='left', on='vid')

Missing venue names on the available checkins:


In [11]:
print "missing venue names: {}% of available checkins".format(len(df[pd.isnull(df['name'])])*100 / len(df))


missing venue names: 19% of available checkins

Data Exploration


In [12]:
d = df.groupby('date').size()

In [13]:
chart({
            'chart': {
                'type': 'line',
                'marginRight': 30,
                'marginBottom': 50
            },
            'title': {
                'text':'#checkins per day in New York City'
            },
            'yAxis': {
                'type': 'linear',
                'title': {'text':'#checkins'},
                
            },
            'xAxis': {
                'categories': [str(x) for x in d.index.tolist()]
            },
            'series': [{
                'name': 'date',
                'data': d.tolist()
            }]
})


Out[13]:
Re-run cell if chart is not shown ...

Clustering


In [14]:
from sklearn.cluster import KMeans

In [15]:
start = datetime(2010,1,1,0,0) 
dfw = df[df['utc']>start]

In [16]:
cl = min(200, len(dfw)/8)
ml = KMeans(n_clusters=cl)
ml.fit(dfw[['lat', 'lon']])


Out[16]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=200, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [17]:
labels = Counter(ml.labels_)
clusters = [ [ ml.cluster_centers_[x][1], ml.cluster_centers_[x][0], labels[x] ] for x in labels]
clusters


Out[17]:
[[-74.015022195584038, 40.7773550104367, 188],
 [-73.954193674962369, 40.724490581561632, 821],
 [-74.186784259077911, 40.70475409981043, 163],
 [-73.846494514159318, 40.75277626556737, 472],
 [-73.986113898674375, 40.757698173454251, 4348],
 [-73.78159774862776, 40.644712698468545, 2489],
 [-73.97636617816309, 40.684118386381357, 810],
 [-74.001176818362623, 40.733972952163867, 1929],
 [-73.944019096102366, 40.833846889171262, 254],
 [-74.117684847208693, 40.902364348180434, 46],
 [-74.165217257175456, 40.579376068555, 220],
 [-73.965345762047363, 40.779527330572265, 2073],
 [-73.830434887443076, 40.892368988221541, 65],
 [-74.044491571007413, 40.692955382649693, 324],
 [-74.15744540907788, 40.83261594067308, 104],
 [-73.979921972212793, 40.57563029619137, 336],
 [-73.99937101315578, 40.705846352126642, 762],
 [-74.046972083791857, 40.879787544687211, 86],
 [-73.809094904919064, 40.70044378605953, 257],
 [-73.872682901571764, 40.774579438142617, 2100],
 [-73.773137119515283, 40.76514339794192, 229],
 [-74.030182223027737, 40.738503848740329, 786],
 [-74.072505598567062, 40.643791352991016, 167],
 [-74.214768074891197, 40.814481937084039, 307],
 [-73.990519660306447, 40.735673537326313, 3508],
 [-74.249122427942552, 40.630588630330315, 188],
 [-73.874778700139046, 40.850277149204288, 210],
 [-74.161563201140297, 40.753715491202982, 67],
 [-73.91251967976045, 40.774507764210732, 177],
 [-73.973377660929884, 40.764035047681375, 3136],
 [-74.02887444893689, 40.625064909081559, 385],
 [-74.047343847722161, 40.786150970579996, 185],
 [-73.996162006359896, 40.722934611217646, 2805],
 [-73.920403351045167, 40.625337016290324, 31],
 [-73.73757483969149, 40.692865504755318, 94],
 [-74.110677711781747, 40.632102356874206, 252],
 [-73.929589773045294, 40.760413497213044, 276],
 [-73.955420981606309, 40.782191186104804, 1269],
 [-73.962997867750133, 40.806520935331321, 1488],
 [-73.803723056639996, 40.818093227381539, 130],
 [-74.231749559104443, 40.524343283401109, 90],
 [-74.100351383854075, 40.825061537520916, 196],
 [-74.17939248110757, 40.691548448987852, 1902],
 [-73.993059064576656, 40.750933769800589, 3409],
 [-73.893265606031107, 40.890520045757775, 90],
 [-73.886739327672544, 40.746460388292157, 255],
 [-74.161962646415176, 40.733323373068316, 606],
 [-73.996304580648669, 40.682023721448417, 791],
 [-73.971792593102109, 40.750890464259292, 710],
 [-73.976669096545891, 40.783699141415219, 1741],
 [-73.990409632201235, 40.728832419689965, 2361],
 [-74.010890712896867, 40.709841341268955, 1820],
 [-73.9068112611007, 40.70137504033007, 143],
 [-74.075681154813694, 40.91499025984524, 168],
 [-73.973633774499092, 40.856733613100907, 110],
 [-74.064547691166325, 40.732049551601705, 294],
 [-74.204338479474188, 40.873375560377418, 62],
 [-73.865679013492311, 40.708873406921796, 78],
 [-73.959226885630031, 40.666286161233607, 363],
 [-73.949914171912809, 40.745049037588139, 531],
 [-73.796080083333337, 40.589665967243334, 30],
 [-74.243429308161112, 40.750388364007406, 54],
 [-73.786828386573944, 40.909922373775352, 142],
 [-73.940853099074246, 40.805719659958683, 334],
 [-73.927042488687661, 40.826600485433282, 640],
 [-73.989760576469493, 40.703402725144066, 1298],
 [-74.003202999687176, 40.720593596619018, 1388],
 [-73.707305175058821, 40.745086171541175, 34],
 [-73.982838908351013, 40.763421002649466, 2719],
 [-74.245928502744448, 40.891905066308333, 72],
 [-74.034919817246731, 40.715573839100657, 306],
 [-73.986905080467707, 40.74809339735922, 2896],
 [-73.981478778712031, 40.674289234029239, 1139],
 [-73.996808958747366, 40.608842504609022, 133],
 [-74.005286819326983, 40.741180527930695, 2642],
 [-73.951642088550201, 40.8516742432668, 253],
 [-73.830065370117879, 40.760583054946352, 274],
 [-73.947749656281488, 40.58786713437901, 81],
 [-74.198312602678257, 40.638567116784785, 46],
 [-73.827567246187101, 40.824911567658063, 62],
 [-73.801028478855656, 40.745311315873046, 115],
 [-73.978648420039448, 40.760133372029422, 3572],
 [-73.829883472524003, 40.664726168817332, 75],
 [-73.716061867551787, 40.659130890607145, 56],
 [-74.21455725887273, 40.667155565794808, 154],
 [-74.074856700229034, 40.76248557090161, 186],
 [-74.164861628367817, 40.610829737421838, 87],
 [-73.930605095668497, 40.702983630122738, 365],
 [-73.912988309938598, 40.870890603949711, 171],
 [-74.068897717311344, 40.85938195913505, 97],
 [-74.072006828421053, 40.80917911926938, 209],
 [-73.951690698805024, 40.803196898573184, 358],
 [-73.966517719311412, 40.762860368324255, 1410],
 [-73.73434184957307, 40.590834666265387, 26],
 [-73.980163724067893, 40.736337431106143, 570],
 [-74.120457519791671, 40.794904517611108, 36],
 [-74.118050119729631, 40.564814225322223, 27],
 [-73.991221441668245, 40.757145825400038, 2497],
 [-73.985436419506499, 40.691298087347981, 446],
 [-74.009907559280563, 40.762759663471734, 283],
 [-73.920973205230354, 40.730783454333931, 56],
 [-74.176994783080474, 40.7424308026905, 379],
 [-73.867965285322967, 40.735546437805183, 135],
 [-74.177555896645657, 40.914380193636958, 46],
 [-73.969144699326819, 40.794369887927843, 686],
 [-74.122093648852456, 40.743054767785246, 61],
 [-73.891676703181076, 40.820459434602704, 37],
 [-73.977329654361455, 40.719566771283439, 314],
 [-73.971658910152144, 40.773422964575424, 2637],
 [-73.946407190523516, 40.714518967333404, 931],
 [-74.206138372460288, 40.538350365380886, 68],
 [-74.023177285508226, 40.819271798912325, 73],
 [-74.013345843674415, 40.703297295845402, 1141],
 [-74.010214558575669, 40.639164237772299, 148],
 [-73.794722730577675, 40.797394958512506, 112],
 [-73.894543821092228, 40.666481567493335, 90],
 [-73.970976975487886, 40.756550365455858, 1511],
 [-73.986412976829712, 40.812955335022465, 138],
 [-73.983284007809374, 40.743220274387582, 1578],
 [-74.03005072382058, 40.906626367628228, 209],
 [-73.988581026104058, 40.741997198662929, 2684],
 [-73.885536471334049, 40.864004085514182, 141],
 [-73.979403991667056, 40.891081175021178, 85],
 [-73.964308569515666, 40.64153124362673, 217],
 [-73.96749700608278, 40.676359849681106, 598],
 [-73.901875051574123, 40.745646230688109, 143],
 [-73.993259988224608, 40.69142876715339, 1032],
 [-73.966295088343301, 40.690832884628868, 388],
 [-73.99932090428679, 40.759889605064814, 810],
 [-73.827586864068834, 40.790110187975323, 77],
 [-73.959747031344563, 40.715416141868708, 1681],
 [-73.956147357932281, 40.772712582138261, 988],
 [-74.030967700424327, 40.748452297497749, 444],
 [-74.188623826446829, 40.790971726618984, 79],
 [-73.855815553577969, 40.680375085132205, 59],
 [-73.830412220623401, 40.710980385175887, 141],
 [-74.245010738972724, 40.693442719165908, 44],
 [-73.704219446062496, 40.838636422631254, 16],
 [-74.173653599658422, 40.669767510109899, 101],
 [-74.05821731463223, 40.914913540147779, 90],
 [-74.048532775601984, 40.605933072807922, 101],
 [-73.983245852574953, 40.753585373625342, 2060],
 [-73.796092661778104, 40.720943824367659, 201],
 [-73.996218649724185, 40.730133226621597, 1278],
 [-73.777872564098487, 40.700300955456818, 132],
 [-74.033880562833176, 40.728026687251443, 624],
 [-73.789111594948636, 40.643206135561606, 586],
 [-73.987909536120611, 40.720544707397764, 2324],
 [-74.18664505711952, 40.833979937365854, 41],
 [-74.211039152834417, 40.834646280766236, 154],
 [-74.044199934801313, 40.720339010798469, 456],
 [-73.796446862532378, 40.664491377969526, 105],
 [-73.946983673228289, 40.778480957416264, 707],
 [-74.14539437374286, 40.787151842392859, 84],
 [-73.955555201455184, 40.763137495284852, 482],
 [-74.174675887120458, 40.856298254186363, 44],
 [-73.936114828143218, 40.850388608527133, 199],
 [-74.007033752727864, 40.734431735281255, 811],
 [-73.981212068272981, 40.776547800779241, 1373],
 [-73.922462529010815, 40.743826225318919, 222],
 [-73.852791831372144, 40.837135431489287, 140],
 [-73.997968611565454, 40.717216996792637, 1100],
 [-73.987924778915058, 40.666193761462345, 571],
 [-74.158119237328236, 40.890538254855294, 85],
 [-74.082422119698762, 40.723715470235803, 81],
 [-74.012915782002963, 40.675009685125744, 202],
 [-73.953791761287647, 40.792147474238838, 672],
 [-73.984051881074208, 40.728272527382067, 2303],
 [-74.003754692860525, 40.862364056323685, 76],
 [-73.990653403167812, 40.764933924096781, 1491],
 [-74.136351406411492, 40.625119638741381, 87],
 [-73.905263256912505, 40.850290297732499, 120],
 [-74.005310267192641, 40.748788280341344, 774],
 [-73.976033358451517, 40.744640485819936, 988],
 [-73.976339530145609, 40.662537715672514, 342],
 [-73.827562687029996, 40.85855158135417, 120],
 [-73.738062257453578, 40.635459290103569, 28],
 [-74.097684287565514, 40.60601977751552, 58],
 [-73.864164222737031, 40.770366140746233, 478],
 [-73.95497551248684, 40.818975744804938, 243],
 [-74.230844566599998, 40.586220492319995, 10],
 [-73.932601708885215, 40.866605195577392, 115],
 [-73.919415977299181, 40.766242583601091, 367],
 [-73.819151374145463, 40.594317819987275, 55],
 [-73.747117591878947, 40.731990550489478, 38],
 [-74.245824357865004, 40.803990452137498, 40],
 [-73.977277498472887, 40.752492468923244, 2508],
 [-74.009631140012786, 40.716226009696435, 1205],
 [-73.847820724619851, 40.722799273694385, 267],
 [-73.995193319918116, 40.743108726997008, 2170],
 [-74.099780160552172, 40.683019916013045, 23],
 [-73.926445936912131, 40.795391574889592, 173],
 [-73.889038363488226, 40.574834704605884, 17],
 [-74.138498158002534, 40.828802564069541, 197],
 [-73.937044913898632, 40.676824579891779, 146],
 [-74.007690217841414, 40.725296728576112, 1147],
 [-74.091018692383329, 40.882939647541669, 36],
 [-73.982667665885799, 40.769101025403266, 2570],
 [-74.000885557052101, 40.727333307656302, 1737],
 [-73.72810829610286, 40.774284143015713, 70]]

In [18]:
delta=0.15
chart({
            'chart': {
                'type': 'bubble',
                'zoomType': 'xy',
                'width':800,
                'height':800,
                'plotBackgroundImage':'https://maps.googleapis.com/maps/api/staticmap?center=40.8,-74.0&zoom=11&size=800x800&maptype=roadmap'
            },
            'plotOptions': {
              'bubble': {
                'maxSize':'5%',
                'minSize':'1%'
               }
             },
            'legend': { 
              'enabled': False
            },
            'yAxis': {
              'min':40.8-delta,
              'max':40.8+delta
            },
            'xAxis': {
              'min':-74.0-delta,
              'max':-74.0+delta
            },
            'series': [{'data': clusters, 'color':"#FF0000"}]
}, '800px', '800px')


Out[18]:
Re-run cell if chart is not shown ...

Write data to csv file

checkins


In [19]:
cols = ['year','month','day','time', 'isotime','uid','lat','lon','vid']
dfc[cols].to_csv('../../datasets/checkins.csv', header=False, index=False)

venues


In [20]:
dfv[['vid','name','v_lat','v_lon']].to_csv('../../datasets/venues.csv', header=False, index=False)

In [ ]: