In [1]:
import pandas as pd
import numpy as np
import sklearn
from pygeocoder import Geocoder
import time
import datetime
import networkx
from sklearn.svm import SVC
from sklearn import neighbors, cross_validation, mixture
import scipy.stats
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
%pylab inline
In [2]:
# Load the tracking data. The same path is written back by a later
# `data.to_csv('11700TrackingData.csv')` cell, so this file is both input and output.
data = pd.read_csv('11700TrackingData.csv')
In [3]:
# Drop the 'Unnamed: 0' column.
# NOTE(review): presumably the index column written by an earlier to_csv — confirm.
del data['Unnamed: 0']
Part of the pre-processing was performed manually.
In [4]:
# Preview the first rows of the loaded frame.
data.head()
Out[4]:
In [46]:
# Assign the working column names used by the rest of the notebook.
# Note the order: 'long' comes before 'lat'.
# NOTE(review): assumes the frame has exactly these seven columns at this point — confirm.
data.columns = ['date','time','long','lat','Google Location','Address','week day']
data.head()
Out[46]:
In [43]:
def reverseGeo(lat1, long1):
    """Reverse-geocode one coordinate pair with pygeocoder.

    lat1, long1 -- latitude and longitude, passed unchanged to
    ``Geocoder.reverse_geocode``.

    Returns whatever ``Geocoder.reverse_geocode`` returns; callers in this
    notebook read ``.city``, ``.route`` and ``.formatted_address`` from it.
    """
    # Example: Geocoder.reverse_geocode(37.875741, -122.260363)
    geocoded = Geocoder.reverse_geocode(lat1, long1)
    return geocoded
In [44]:
# Smoke-test the geocoder on one fixed coordinate. The lookup is performed once
# and reused for both fields instead of issuing two identical API requests.
sampleResult = reverseGeo(37.864022, -122.274711)
print('%s %s' % (sampleResult.city, sampleResult.route))
In [7]:
# Preview the first rows of the frame again.
data.head()
Out[7]:
In [8]:
# Drop the 'Unnamed: 0' column only if it is still present. An earlier cell
# already deletes it, so an unconditional `del` raises KeyError when the
# notebook is run top to bottom.
if 'Unnamed: 0' in data.columns:
    del data['Unnamed: 0']
In [41]:
# (lat, long) pairs as a plain 2-D array, one row per tracking sample.
locArray = data[['lat', 'long']].values
In [18]:
# Set three columns to empty strings.
# NOTE(review): this blanks any existing 'time' values, and it creates a
# 'Full Address' column while the geocoding loop writes to 'Address' — confirm intent.
data['Google Location'] = ''
data['Full Address'] = ''
data['time'] = ''
Geo-Coding is done in steps due to Google API limitation
In [47]:
# Reverse-geocode the next batch of coordinates and store the results in `data`.
BATCH_SIZE = 2500  # rows geocoded per run of this cell

# `currentIndex` is the batch cursor (first row not yet geocoded). No visible
# cell initialises it, so start from row 0 when it does not exist yet.
if 'currentIndex' not in globals():
    currentIndex = 0

start = currentIndex
end = currentIndex + BATCH_SIZE
for index, locPair in enumerate(locArray[start:end]):
    time.sleep(0.1)  # pause between requests to the geocoding service
    print(index)
    row = start + index
    result = reverseGeo(locPair[0], locPair[1])
    # Write with .loc: chained indexing (data[col][row] = ...) may assign to a
    # temporary copy and silently leave `data` unchanged.
    if result.route is not None:
        clear_output(wait=True)
        print('city')
        print(result.route)
        data.loc[row, 'Google Location'] = str(result.city) + ' ' + str(result.route)
    else:
        # No street name returned: fall back to the full formatted address.
        data.loc[row, 'Google Location'] = result.formatted_address
    data.loc[row, 'Address'] = result.formatted_address
# Report the range that was actually processed instead of a fixed message.
print('done geocoding rows %d to %d' % (start, min(end, len(locArray))))
In [48]:
# Advance the batch cursor by one batch, clamped to the number of rows so that
# later `range(0, currentIndex)` row lookups cannot run past the end of `data`.
# Note: this cell is not idempotent; run it once per geocoding batch.
currentIndex = min(currentIndex + 2500, len(data))
currentIndex
Out[48]:
In [264]:
# Load the raw tracking export; the next cells copy its second column into `data`.
dataWithTIme = pd.read_csv('RawTrackingData.csv')
In [283]:
# Second column of the raw export, selected by position. `.iloc` is used
# because `.ix` is deprecated and ambiguous between labels and positions.
timeColumn = dataWithTIme.iloc[:, 1]
In [284]:
# Overwrite the second column of `data` with the raw time values. Column
# assignment aligns `timeColumn` on the row index, as the old `.ix` setter did.
data[data.columns[1]] = timeColumn
In [78]:
# Show the current value of the batch cursor.
currentIndex
Out[78]:
In [49]:
# Write the frame back to the same CSV path it was loaded from, then show the
# last rows of the slice [0, currentIndex).
data.to_csv('11700TrackingData.csv')
data[0:currentIndex].tail()
Out[49]:
In [50]:
# Derive the weekday (datetime.weekday(): Monday == 0 ... Sunday == 6) from the
# 'date' strings, which are parsed as month/day/two-digit-year.
# The column is built in a single assignment; the previous per-element chained
# assignment (data['week day'][index] = ...) can write to a temporary copy.
data['week day'] = [
    datetime.datetime.strptime(element, '%m/%d/%y').weekday()
    for element in data['date']
]
In [51]:
# Plot the raw trajectory for the rows in [0, currentIndex).
geocodedRows = data[0:currentIndex]

plt.figure()
plt.plot(geocodedRows['lat'], geocodedRows['long'], 'o-', color='r')
plt.xlabel('lat')
plt.ylabel('long')
plt.title('Lat versus Long')
# A second, empty figure is opened after the plot.
plt.figure()
Out[51]:
In [52]:
# 'Google Location' of the row labelled 0 (`.loc` replaces the deprecated `.ix`).
data.loc[0, 'Google Location']
Out[52]:
In [53]:
import datetime

# Collapse the samples in [0, currentIndex) into "stays": maximal runs of
# consecutive rows that share the same 'Address'. One output row per stay:
#   Brief Location  - address of the run
#   TimeIn          - 'time' of the first sample of the run
#   TimeOut         - 'time' of the first sample of the NEXT run (for the last
#                     run, the 'time' of its own last sample)
#   deltaT          - TimeOut - TimeIn
#   MeanLat/MeanLong- mean coordinates over the run
#   WeekDay         - 'week day' of the first sample of the run
#
# Changes from the previous loop:
#   * the first sample is no longer counted twice in the first mean;
#   * the final run is emitted instead of being dropped;
#   * durations that cross midnight no longer come out negative;
#   * rows are collected in a list and the frame is built once.
FMT = '%I:%M:%S %p'
stayColumns = ['Brief Location', 'TimeIn', 'TimeOut', 'deltaT',
               'MeanLat', 'MeanLong', 'WeekDay']

geocoded = data[0:currentIndex]
addresses = geocoded['Address'].values
sampleTimes = geocoded['time'].values
sampleLats = geocoded['lat'].values
sampleLongs = geocoded['long'].values
sampleWeekDays = geocoded['week day'].values

stayRows = []
runStart = 0
for i in range(1, len(geocoded) + 1):
    isPastEnd = (i == len(geocoded))
    if not isPastEnd and addresses[i] == addresses[runStart]:
        continue  # still inside the current run

    # The run covers rows runStart .. i-1.
    startTime = sampleTimes[runStart]
    endTime = sampleTimes[i - 1] if isPastEnd else sampleTimes[i]
    deltaTime = (datetime.datetime.strptime(endTime, FMT)
                 - datetime.datetime.strptime(startTime, FMT))
    # Times carry no date, so a stay crossing midnight parses as negative.
    # Assumes a single stay is shorter than 24 hours.
    if deltaTime < datetime.timedelta(0):
        deltaTime += datetime.timedelta(days=1)

    stayRows.append([addresses[runStart], startTime, endTime, deltaTime,
                     np.mean(sampleLats[runStart:i]),
                     np.mean(sampleLongs[runStart:i]),
                     sampleWeekDays[runStart]])
    runStart = i

locationS = pd.DataFrame(stayRows, columns=stayColumns)
locationS
Out[53]:
In [54]:
# Keep `datetime` bound to the module: importing the `datetime` class here would
# shadow it and break every `datetime.datetime.strptime(...)` call in other
# cells if they are re-run afterwards. `timedelta` is still imported by name.
import datetime
from datetime import timedelta

# Keep only the stays that lasted longer than 15 minutes.
a = locationS[locationS.deltaT > timedelta(minutes=15)]
In [55]:
# Preview the stays that passed the 15-minute filter.
a.head()
Out[55]:
In [56]:
import datetime

# Merge consecutive rows of `a` (the filtered stays) that share the same
# 'Brief Location' into a single stay. One output row per merged group:
#   TimeIn   - TimeIn of the first row of the group
#   TimeOut  - TimeOut of the last row of the group
#   deltaT   - TimeOut - TimeIn
#   MeanLat/MeanLong - unweighted mean of the group's per-stay means
#   week day - WeekDay of the first row of the group
#
# Changes from the previous loop:
#   * the state is seeded from `a` itself, not from row 0 of the unfiltered
#     `locationS` (which could emit a stay that the filter had removed, or
#     count the first stay twice);
#   * the last group is always emitted, also when it differs from the one
#     before it;
#   * durations that cross midnight no longer come out negative;
#   * rows are collected in a list and the frame is built once.
FMT = '%I:%M:%S %p'
mergedColumns = ['Brief Location', 'TimeIn', 'TimeOut', 'deltaT',
                 'MeanLat', 'MeanLong', 'week day']

briefLocations = a['Brief Location'].values
mergedRows = []
groupStart = 0
for i in range(1, len(a) + 1):
    if i < len(a) and briefLocations[i] == briefLocations[groupStart]:
        continue  # same place as the current group

    # The group covers positions groupStart .. i-1 of `a`.
    group = a.iloc[groupStart:i]
    currentLocation = briefLocations[groupStart]
    startTime = group['TimeIn'].iloc[0]
    endTime = group['TimeOut'].iloc[-1]
    wk = group['WeekDay'].iloc[0]
    deltaTime = (datetime.datetime.strptime(endTime, FMT)
                 - datetime.datetime.strptime(startTime, FMT))
    # Times carry no date; assumes a merged stay is shorter than 24 hours.
    if deltaTime < datetime.timedelta(0):
        deltaTime += datetime.timedelta(days=1)
    meanLat = np.mean(group['MeanLat'].tolist())
    meanLong = np.mean(group['MeanLong'].tolist())

    mergedRows.append([currentLocation, startTime, endTime, deltaTime,
                       meanLat, meanLong, wk])
    groupStart = i

locationSM = pd.DataFrame(mergedRows, columns=mergedColumns)
locationSM
Out[56]:
In [57]:
import random
def r(minimum, maximum):
    """Return a random float drawn from the interval starting at ``minimum``
    and spanning ``maximum - minimum``, using one ``random.random()`` draw."""
    span = maximum - minimum
    return minimum + span * random.random()
In [59]:
# Trajectory through the merged stays (mean coordinates per stay).
mergedLat = locationSM['MeanLat']
mergedLong = locationSM['MeanLong']
plt.plot(mergedLat, mergedLong, 'o-', color='r')
plt.xlabel('lat')
plt.ylabel('long')
plt.title('Lat versus Long')
Out[59]:
In [60]:
# Trajectory through the merged stays, with each stay labelled by its address.
latValues = locationSM['MeanLat']
longValues = locationSM['MeanLong']

plt.figure()
plt.plot(latValues, longValues, 'o-', color='r')
plt.xlabel('lat')
plt.ylabel('long')
plt.title('Lat versus Long')

# Midpoints of the coordinate ranges; they decide the sign and scale of each
# label's offset.
meanX = (max(latValues) + min(latValues)) / 2
meanY = (max(longValues) + min(longValues)) / 2

for label, x, y in zip(locationSM['Brief Location'], latValues, longValues):
    sX = 4 if x > meanX else -1
    sY = 1 if y > meanY else -1
    # The vertical offset is randomised through r(-5, 5), so the layout
    # differs between runs.
    plt.annotate(
        label,
        xy=(x, y), xytext=(sX*100, r(-5,5)*sY*100),
        textcoords='offset points', ha='right', va='bottom',
        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
# A new, empty figure is opened after the annotated plot.
fig = plt.figure()
In [61]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

# Cluster the merged stays by their mean coordinates with DBSCAN and plot the
# raw track, the merged stays and the resulting clusters.
# `np.vstack` is called explicitly; the bare `vstack` name only exists when the
# `%pylab` star import has been run.
DBSCAN_EPS = 0.001       # neighbourhood radius, in the units of lat/long
DBSCAN_MIN_SAMPLES = 2   # minimum points needed to form a cluster

X = np.vstack([locationSM['MeanLat'], locationSM['MeanLong']]).T

# Compute DBSCAN
db = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise (label -1) if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)

##############################################################################
plt.figure()
plt.plot(data[0:currentIndex]['lat'], data[0:currentIndex]['long'], 'o-', color='r')
plt.xlabel('lat')
plt.ylabel('long')
plt.title('Raw Data')

plt.figure()
plt.plot(locationSM['MeanLat'], locationSM['MeanLong'], 'o-', color='r')
plt.xlabel('lat')
plt.ylabel('long')
plt.title('Reverse Encoding Clustering| %d clusters' % len(locationSM))
plt.show()

# Plot result: one colour per cluster label; black is reserved for noise.
# `colors` is kept as a notebook-level name because a later cell reuses it.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
plt.figure()
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels == k)

    # Core samples of this cluster.
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

    # Non-core samples of this cluster (drawn with the same marker style).
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.xlabel('lat')
plt.ylabel('long')
plt.title('DB Scan Clustering | %d clusters' % n_clusters_)
plt.show()
In [62]:
import datetime

# Merge consecutive rows of `locationSM` that received the same DBSCAN label
# into one stay per run. Runs labelled -1 (noise) are skipped.
# One output row per run:
#   Brief Location - 'Brief Location' of the first row of the run
#   Cluster Number - DBSCAN label of the run
#   TimeIn/TimeOut - TimeIn of the first row / TimeOut of the last row
#   deltaT         - TimeOut - TimeIn
#   MeanLat/MeanLong - unweighted mean over the run
#   weekday        - 'week day' of the first row of the run
#
# Changes from the previous loop:
#   * the first row is no longer counted twice in the first mean;
#   * the final run is emitted instead of being dropped;
#   * durations that cross midnight no longer come out negative;
#   * rows are collected in a list and the frame is built once.
FMT = '%I:%M:%S %p'
clusterColumns = ['Brief Location', 'Cluster Number', 'TimeIn', 'TimeOut',
                  'deltaT', 'MeanLat', 'MeanLong', 'weekday']

clusterLabels = db.labels_
clusterRows = []
groupStart = 0
for i in range(1, len(locationSM) + 1):
    if i < len(locationSM) and clusterLabels[i] == clusterLabels[groupStart]:
        continue  # same cluster as the current run

    # The run covers positions groupStart .. i-1 of `locationSM`.
    currentLabel = clusterLabels[groupStart]
    if currentLabel != -1:
        group = locationSM.iloc[groupStart:i]
        currentLocation = group['Brief Location'].iloc[0]
        startTime = group['TimeIn'].iloc[0]
        endTime = group['TimeOut'].iloc[-1]
        wk = group['week day'].iloc[0]
        deltaTime = (datetime.datetime.strptime(endTime, FMT)
                     - datetime.datetime.strptime(startTime, FMT))
        # Times carry no date; assumes a run is shorter than 24 hours.
        if deltaTime < datetime.timedelta(0):
            deltaTime += datetime.timedelta(days=1)
        meanLat = np.mean(group['MeanLat'].tolist())
        meanLong = np.mean(group['MeanLong'].tolist())
        clusterRows.append([currentLocation, currentLabel, startTime, endTime,
                            deltaTime, meanLat, meanLong, wk])
    groupStart = i

locationSM2 = pd.DataFrame(clusterRows, columns=clusterColumns)
locationSM2
Out[62]:
Inputs: time in, time out
Output: label of the most anticipated location
In [63]:
from datetime import date, datetime, timedelta
import datetime
import time
inc = 0
def datespan(startTimestr, endTimestr, delta=timedelta(days=1)):
    """Yield the hour-of-day values covered between two clock times.

    startTimestr, endTimestr -- strings in '%I:%M:%S %p' format
        (e.g. '01:05:00 PM'). Both are truncated to the whole hour.
    delta -- positive step between yielded values.

    Starting at the start hour, yields ``.hour`` of each step while it is
    strictly before the end hour. When the start time is later than the end
    time, the span is taken to wrap past midnight (start on the previous day).
    Two times in the same hour yield nothing.

    Raises ValueError (on first iteration) if ``delta`` is not positive,
    which would otherwise loop forever.
    """
    if delta <= timedelta(0):
        raise ValueError('delta must be a positive timedelta')

    fmt = '%I:%M:%S %p'
    startTimeD = datetime.datetime.strptime(startTimestr, fmt)
    endTimeD = datetime.datetime.strptime(endTimestr, fmt)

    # Anchor both times on fixed reference dates; only the hour matters.
    # A start later than the end means the span crosses midnight, so the
    # start is placed one day earlier.
    startDay = 19 if startTimeD > endTimeD else 20
    currentDate = datetime.datetime(2007, 3, startDay, startTimeD.hour, 0)
    endTimeD = datetime.datetime(2007, 3, 20, endTimeD.hour, 0)

    while currentDate < endTimeD:
        yield currentDate.hour
        currentDate += delta
In [64]:
# Expand every clustered stay into one training row per hour it covers:
# (Cluster Number, hour of day, weekday).
#
# Changes from the previous loop:
#   * rows are collected in a list and the frame is built once, instead of
#     growing the frame row by row through `.ix`;
#   * when a stay crosses midnight, the hours after midnight are assigned to
#     the following weekday instead of the weekday the stay started on.
splitColumns = ['Cluster Number', 'hour', 'weekday']
splitRows = []
for i in range(len(locationSM2)):
    element = locationSM2.iloc[i]
    firstHour = None
    for timestamp in datespan(element['TimeIn'],
                              element['TimeOut'],
                              delta=timedelta(hours=1)):
        if firstHour is None:
            firstHour = timestamp
        # datespan yields hours in chronological order, so an hour smaller
        # than the first one lies after midnight (stays shorter than 24 h).
        dayShift = 1 if timestamp < firstHour else 0
        splitRows.append([element['Cluster Number'],
                          timestamp,
                          (element['weekday'] + dayShift) % 7])

locationSplitted_States = pd.DataFrame(splitRows, columns=splitColumns)
In [20]:
# Display the full per-hour training table.
locationSplitted_States[:]
Out[20]:
In [21]:
def trainSupportVectorMachineGivenXandY(X,Y):
    """Fit an ``SVC`` with probability estimates enabled on (X, Y) and return it.

    NOTE(review): an identical definition appears again in a later cell.
    """
    # fit() returns the fitted estimator itself.
    return SVC(probability=True).fit(X, Y)
In [65]:
import datetime
def trainSupportVectorMachineGivenXandY(X,Y):
    """Fit an ``SVC`` with probability estimates enabled on (X, Y) and return it.

    NOTE(review): this repeats a definition from an earlier cell.
    """
    # fit() returns the fitted estimator itself.
    return SVC(probability=True).fit(X, Y)
def getLabelOfAddress(address):
    """Return the integer label of ``address``.

    The label is the position of ``address`` among the unique values of the
    sixth column of ``data`` (position 5), in order of first appearance. This
    is the same column ``getAddressFromLabel`` reads, so the two functions are
    inverses of each other; previously this one read position 4 and the labels
    did not round-trip.

    Returns the string "No match found!" when the address is not present.
    """
    a = data.iloc[:, 5].unique().flatten()
    for i, candidate in enumerate(a):
        if candidate == address:
            return i
    return "No match found!"
def getAddressFromLabel(label):
    """Return the address for an integer ``label``.

    The label indexes the unique values of the sixth column of ``data``
    (position 5), in order of first appearance. ``.iloc`` replaces the
    deprecated ``.ix`` for this positional lookup.
    """
    a = data.iloc[:, 5].unique().flatten()
    return a[label]
def convertTimeDeltaToSeconds(timeD):
    """Return the duration ``timeD`` as a number of seconds
    (the result of its ``total_seconds()`` method)."""
    seconds = timeD.total_seconds()
    return seconds
def convertToHour(timeD):
    """Return the 24-hour hour-of-day of a '%I:%M:%S %p' time string."""
    return datetime.datetime.strptime(timeD, '%I:%M:%S %p').hour
In [66]:
# Feature matrix (weekday, hour) and target (cluster number) for the SVM.
# `np.vstack` is called explicitly; the bare `vstack` name only exists when the
# `%pylab` star import has been run.
X_SVM_Split_wk = locationSplitted_States['weekday']
X_SVM_Split_hr = locationSplitted_States['hour']
Y_SVM_Split_CN = locationSplitted_States['Cluster Number']
# One row per sample, columns = [weekday, hour].
X_SVM_split = np.vstack([X_SVM_Split_wk, X_SVM_Split_hr]).T
In [76]:
from sklearn.cross_validation import KFold

# 50-fold cross-validation of the SVM on the (weekday, hour) -> cluster data.
# The shuffle is seeded so that the reported score is reproducible, and
# `np.mean` is called explicitly instead of relying on the `%pylab` star import.
kf = KFold(len(X_SVM_split), n_folds=50, shuffle=True, random_state=0)

# Despite its name, `errorArray` holds the fraction of CORRECT predictions per fold.
errorArray = []
for trainIdx, testIdx in kf:
    myModel = trainSupportVectorMachineGivenXandY(X_SVM_split[trainIdx],
                                                  Y_SVM_Split_CN[trainIdx])
    predicted = myModel.predict(X_SVM_split[testIdx])
    actual = Y_SVM_Split_CN[testIdx]
    errorArray.append((1 * (predicted == actual)).sum() / float(len(actual)))

print('Mean Percentage Success: ')
np.mean(errorArray)*100
Out[76]:
In [71]:
# Build the weekly "spatial calendar": a 7 x 24 table holding the most probable
# stay point (cluster number) for every (weekday, hour) combination.
# The stray `i += 1` and the no-op `7*24` expression were leftovers and are removed.
myModel = trainSupportVectorMachineGivenXandY(X_SVM_split, Y_SVM_Split_CN)

# All 7 * 24 feature rows, ordered by day and then by hour.
X_array = [[day, hour] for day in range(7) for hour in range(24)]

spatialCalendar = myModel.predict(X_array)
# One array of 24 predictions per weekday.
Calendar = np.split(spatialCalendar, 7)
spatialCalendar
Out[71]:
In [72]:
# Turn each day's 24 hourly predictions into colour-bar segments.
# BigArray[day] = [bounds, segmentColours], where `bounds` are hour boundaries
# starting at 0 and ending at the number of hours, and segmentColours has one
# entry per segment (len(bounds) - 1).
#
# `colors` is the palette assigned in the DBSCAN plotting cell.
# NOTE(review): the palette is indexed here by cluster number — confirm this
# matches the colour each cluster got in the DBSCAN plot.
#
# Changes from the previous loop:
#   * the running class is reset at the start of every day; carrying it over
#     from the previous day could insert a boundary at -1;
#   * a change seen at hour `index` puts the boundary at `index`, not
#     `index - 1`: the previous class covers the hours up to `index`, and the
#     old value produced a duplicate 0 boundary for a change at hour 1.
clr = colors
BigArray = []
for element in Calendar:
    currentClass = element[0]
    currentArray = [0]
    colorArray = []
    for index, cclass in enumerate(element):
        if cclass != currentClass:
            currentArray.append(index)
            colorArray.append(clr[int(currentClass)])
            currentClass = cclass
    # Close the last segment at the end of the day.
    currentArray.append(len(element))
    colorArray.append(clr[int(currentClass)])
    BigArray.append([currentArray, colorArray])
In [73]:
# Inspect the per-day [bounds, colours] pairs.
BigArray
Out[73]:
In [85]:
# Draw the weekly spatial calendar: one horizontal colour bar per weekday,
# split into segments by BigArray[day] = [bounds, colours].
from matplotlib import pyplot
import matplotlib as mpl

# Index i must match datetime.weekday(), which the 'week day' feature comes
# from: Monday == 0 ... Saturday == 5, Sunday == 6. The previous list had
# 'Sunday' and 'Saturday' swapped, mislabelling the last two bars.
daysDictionary = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

# Make a figure and axes with dimensions as desired.
fig = pyplot.figure(figsize=(8,3))
for i in range(7):
    ax = fig.add_axes([0.05, 0.3*i, 0.9, 0.15])

    # A ListedColormap needs one colour per segment; the bounds list must be
    # one longer than the colour list and monotonically increasing.
    cmap = mpl.colors.ListedColormap(BigArray[i][1])
    bounds = BigArray[i][0]
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
    cb1 = mpl.colorbar.ColorbarBase(ax, cmap=cmap,
                                    norm=norm, spacing='proportional',
                                    orientation='horizontal')
    cb1.set_label(daysDictionary[i])
print('Aha!!!!')