In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2
In [2]:
import numpy as np
from numpy.random import randn
import pandas as pd
#time
from datetime import datetime
from datetime import timedelta
#counting
from collections import Counter
In [3]:
# good old matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
#high-level based on matplotlib
import seaborn as sns
#dynamic and interactive
import highcharts
from highcharts.charts import chart
In [4]:
# Fixed seed so any stochastic step below (e.g. KMeans initialisation via
# numpy's global RNG) is repeatable across runs.
np.random.seed(1234)
# Plot styling: slightly desaturated "deep" palette, wide default figures.
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})
In [5]:
# Load the Highcharts JavaScript assets into the notebook output so the
# chart() calls further down can render; must run before any chart() cell.
highcharts.init()
Out[5]:
In [6]:
# Load the Gowalla check-ins dump. The file has NO header row, so pass
# header=None — header=False is invalid (modern pandas rejects bools for
# header, and treating it as header=0 would silently eat the first data row)
# — and assign the column names ourselves.
dfc = pd.read_csv('../../datasets/loc-gowalla_totalCheckins.txt', sep='\t', header=None)
dfc.columns = ['uid','utc','lat','lon','vid']
# Parse the UTC timestamp strings into real datetimes; pd.to_datetime is
# robust across pandas versions, unlike astype('datetime64[ms]') on an
# object column.
dfc['utc'] = pd.to_datetime(dfc['utc'])
dfc.head()
Out[6]:
New York, New York! Restrict the check-ins to the New York City bounding box.
In [7]:
# Keep only check-ins inside the New York City bounding box
# (lat 40.4774..40.9176, lon -74.2589..-73.7004). Series.between is
# inclusive on both ends, matching the original >= / <= comparisons.
in_nyc = dfc['lat'].between(40.4774, 40.9176) & dfc['lon'].between(-74.2589, -73.7004)
dfc = dfc[in_nyc]
Time columns: derive year, month, day, date, seconds-since-midnight and an ISO timestamp from each check-in's UTC time, then truncate the dataset at 2010-10-19.
In [8]:
# Derive per-check-in time features. The vectorized .dt accessor produces
# the same values as the original row-wise .apply(lambda ...) calls but is
# orders of magnitude faster on a frame this size.
dfc['year'] = dfc['utc'].dt.year
dfc['month'] = dfc['utc'].dt.month
dfc['day'] = dfc['utc'].dt.day
dfc['date'] = dfc['utc'].dt.date

def time_in_seconds(x):
    """Return seconds since midnight for a datetime.time-like value."""
    return ((x.hour)*60+x.minute)*60+x.second

# Seconds since midnight, computed vectorized from the datetime components
# (equivalent to applying time_in_seconds to each row's time()).
dfc['time'] = (dfc['utc'].dt.hour * 60 + dfc['utc'].dt.minute) * 60 + dfc['utc'].dt.second
# ISO-8601 string with an explicit 'Z' (UTC) suffix, consumed by the JS
# charts / exports below.
dfc['isotime'] = dfc['utc'].apply(lambda x: x.isoformat() +'Z')
# Truncate the study window: keep only check-ins on or before 2010-10-19.
dfc = dfc[dfc['date']<=datetime(2010,10,19).date()]
dfc[['uid','utc','lat','lon','vid', 'time', 'isotime']].head()
Out[8]:
In [9]:
# Load the venues ("spots") file — again no header row, so header=None
# (header=False is invalid for read_csv; see the check-ins cell above).
dfv = pd.read_csv('../../datasets/spots.txt', sep='\t', header=None)
dfv.columns = ['vid','name','loc']
# Strip everything except digits, dots, minus signs and spaces from 'loc',
# then split into the two coordinate tokens.
coords = dfv['loc'].replace('[^0-9. -]+', '',regex=True)
coords = coords.apply(lambda x: x.split())
# NOTE(review): the source apparently stores longitude first and latitude
# second — hence x[1] for lat and x[0] for lon. Confirm against the raw file.
dfv['v_lat'] = coords.apply(lambda x: float(x[1]))
dfv['v_lon'] = coords.apply(lambda x: float(x[0]))
# drop(label, 1) with a positional axis argument was removed in pandas 2.0;
# use the explicit keyword form.
dfv = dfv.drop(columns='loc')
# Keep only venues inside the same NYC bounding box used for check-ins.
ny = (dfv['v_lat']>=40.4774) & (dfv['v_lat']<=40.9176) & (dfv['v_lon']>=-74.2589) & (dfv['v_lon']<=-73.7004)
dfv = dfv[ny]
dfv.head()
Out[9]:
In [10]:
df = pd.merge(dfc, dfv[['vid', 'name']], how='left', on='vid')
Missing venue names on the available checkins:
In [11]:
print "missing venue names: {}% of available checkins".format(len(df[pd.isnull(df['name'])])*100 / len(df))
In [12]:
d = df.groupby('date').size()
In [13]:
# Interactive Highcharts line chart: number of check-ins per day in NYC.
# (Same configuration dict as before, reformatted with conventional
# indentation — the notebook export had flattened it.)
chart({
    'chart': {'type': 'line', 'marginRight': 30, 'marginBottom': 50},
    'title': {'text': '#checkins per day in New York City'},
    'yAxis': {'type': 'linear', 'title': {'text': '#checkins'}},
    # Dates rendered as plain category labels along the x axis.
    'xAxis': {'categories': [str(x) for x in d.index.tolist()]},
    'series': [{'name': 'date', 'data': d.tolist()}]
})
Out[13]:
In [14]:
from sklearn.cluster import KMeans
In [15]:
# Restrict the clustering input to check-ins after 2010-01-01 00:00.
start = datetime(2010, 1, 1, 0, 0)
dfw = df.loc[df['utc'] > start]
In [16]:
# Roughly one cluster per 8 check-ins, capped at 200. Floor division (//)
# keeps n_clusters an int on Python 3 — plain / would return a float there
# and make KMeans raise.
cl = min(200, len(dfw) // 8)
ml = KMeans(n_clusters=cl)
# Cluster on raw (lat, lon) degrees — fine at city scale.
ml.fit(dfw[['lat', 'lon']])
Out[16]:
In [17]:
# Build bubble-chart rows [lon, lat, cluster size]: the chart below puts
# longitude on the x axis and latitude on the y axis, and sizes each bubble
# by the number of check-ins assigned to that cluster.
labels = Counter(ml.labels_)
clusters = []
for label in labels:
    center = ml.cluster_centers_[label]
    clusters.append([center[1], center[0], labels[label]])
clusters
Out[17]:
In [18]:
# Bubble map of the K-means cluster centers drawn over a Google static map
# of NYC. Axis limits crop the view to a 2*delta-degree window around
# (lat 40.8, lon -74.0), the same center as the background map tile.
# (Same configuration as before, reformatted — the notebook export had
# flattened the indentation.)
delta = 0.15
chart({
    'chart': {
        'type': 'bubble',
        'zoomType': 'xy',
        'width': 800,
        'height': 800,
        'plotBackgroundImage': 'https://maps.googleapis.com/maps/api/staticmap?center=40.8,-74.0&zoom=11&size=800x800&maptype=roadmap'
    },
    'plotOptions': {
        'bubble': {'maxSize': '5%', 'minSize': '1%'}
    },
    'legend': {'enabled': False},
    # y = latitude, x = longitude (matches the [lon, lat, size] rows above).
    'yAxis': {'min': 40.8 - delta, 'max': 40.8 + delta},
    'xAxis': {'min': -74.0 - delta, 'max': -74.0 + delta},
    'series': [{'data': clusters, 'color': "#FF0000"}]
}, '800px', '800px')
Out[18]:
In [19]:
# Persist the cleaned NYC check-ins (selected columns, no header row or
# index) for downstream use. Note: header=False IS valid for to_csv.
cols = ['year','month','day','time', 'isotime','uid','lat','lon','vid']
dfc[cols].to_csv('../../datasets/checkins.csv', header=False, index=False)
In [20]:
dfv[['vid','name','v_lat','v_lon']].to_csv('../../datasets/venues.csv', header=False, index=False)
In [ ]: