In [1]:
import os
import time
import glob
import LatLon
import numpy as np
import pandas as pd
from sklearn import metrics
from datetime import datetime
from matplotlib.dates import date2num
pd.set_option('display.max_rows', 10)
%matplotlib inline
import pylab
import geoplotlib
import seaborn as sns
sns.set_style("whitegrid")
from mpl_toolkits import basemap
from pysurvey.plot import setup, legend, icolorbar, density, text
from sklearn import cluster
In [2]:
# Load CLIWOC 1.5: climate observations digitized from historical ship logbooks.
# NOTE(review): hardcoded absolute local path -- point this at your own copy.
shipdata = pd.read_csv('/Users/ajmendez/tmp/ships/climate-data-from-ocean-ships/CLIWOC15.csv')
print('Number of records: {:,d}'.format(len(shipdata)))
In [3]:
# Fill missing coordinates with sentinel values at the map edge.
# NOTE(review): -180/-90 are sentinels, not real positions -- confirm that
# downstream plotting/clustering treats them as "missing" rather than data.
shipdata['lon'] = shipdata['Lon3'].fillna(-180)
shipdata['lat'] = shipdata['Lat3'].fillna(-90)
In [4]:
# Normalize nationality: some records carry a trailing space ('British ').
# Series.replace with two scalars matches whole cell values, not substrings.
shipdata['nat'] = shipdata['Nationality'].replace('British ', 'British')
# Parenthesized print is valid under both Python 2 and Python 3
# (the original bare `print expr` statement is a SyntaxError on py3).
print(np.unique(shipdata['nat']))
In [5]:
def _datetime(*args, **kwargs):
'''This ends up being the same as UTC, but it does not include the hour'''
try:
return datetime(*args, **kwargs)
except Exception as e:
# Broken dates where the day extends past the number of days in the month
if args[2] in [31,29]:
print args
return datetime(args[0], args[1], args[2]-1)
print args
raise
# list() keeps this correct under both Python 2 (map already returns a list)
# and Python 3 (map returns a lazy iterator, which pandas cannot use as a column).
shipdata['date'] = list(map(_datetime, shipdata['Year'], shipdata['Month'], shipdata['Day']))
# Matplotlib date numbers make the rows sortable/plottable on a time axis.
shipdata['datenum'] = shipdata['date'].apply(date2num)
In [6]:
def clean(x):
    '''Normalize a free-text port name so near-duplicates collapse together.

    Lowercases, keeps only the text before the first '(' or ',', and strips
    common junk tokens ('roads'/'road', ' uk', periods, apostrophes).
    Non-ASCII characters are replaced with '?' as in the original.

    The original used the Python-2-only ``str.decode``/``encode`` round
    trip; this version accepts either bytes or str and works on both
    Python 2 and 3.
    '''
    if isinstance(x, bytes):
        # py2 str / py3 bytes: decode first, replacing undecodable bytes
        x = x.decode('ascii', 'replace')
    x = x.strip().lower().split('(')[0].split(',')[0].strip()
    # Order matters: 'roads' must be removed before 'road'
    for junk in ('roads', 'road', ' uk', '.', "'"):
        x = x.replace(junk, '')
    x = x.strip()
    # Squash any remaining non-ASCII characters to '?', as the original did
    return x.encode('ascii', 'replace').decode('ascii')
# Normalize the free-text departure/destination names so near-duplicate
# spellings collapse together; astype('str') guards against NaN entries.
shipdata['from'] = shipdata['VoyageFrom'].astype('str').map(clean)
shipdata['to'] = shipdata['VoyageTo'].astype('str').map(clean)
print('Found {} and {} unique locations'.format(len(np.unique(shipdata['from'])), len(np.unique(shipdata['to']))))
In [7]:
def get_named(min_records=10):
    '''from/to have bundles of similar names so use lat/lon location of the
    end of the trip to match them up.  First build up a dataframe of these
    locations.

    Parameters
    ----------
    min_records : int
        A name is kept only if it has more than this many records with
        valid coordinates (was a hard-coded 10).

    Returns
    -------
    DataFrame with columns ``name``, ``lat``, ``lon``, ``number``.

    Note: the original used ``pd.DataFrame.from_items`` and dict
    ``.iteritems()``, both of which are gone in modern pandas / Python 3;
    building from a list of records is equivalent.
    '''
    records = []
    for item in ('from', 'to'):
        arr = shipdata[item]
        # First logged point approximates the origin, last the destination
        ind = 0 if (item == 'from') else -1
        for n in np.unique(arr):
            tmp = shipdata[(arr == n) &
                           shipdata['Lat3'].notnull() &
                           shipdata['Lon3'].notnull()].sort_values(by='datenum')
            if len(tmp) > min_records:
                records.append(dict(name=n,
                                    lat=tmp['Lat3'].iloc[ind],
                                    lon=tmp['Lon3'].iloc[ind],
                                    number=len(tmp)))
    return pd.DataFrame(records)
named_places = get_named()
# BUG FIX: the original printed len(named_places['name']) as the "unique"
# count, which is just the total row count again; nunique() counts distinct names.
print('Found {} total, and {} unique locations'.format(len(named_places), named_places['name'].nunique()))
In [10]:
def get_cluster(bandwidth=0.1, places=None):
    '''Group named places into spatial clusters with MeanShift.

    Parameters
    ----------
    bandwidth : float
        MeanShift kernel bandwidth in degrees -- small (~0.1) groups at
        roughly city scale, large (~10) at roughly country scale.
    places : DataFrame, optional
        Locations to cluster (needs lon/lat/name/number columns); defaults
        to the module-level ``named_places`` as in the original.

    Returns
    -------
    (MeanShift, DataFrame)
        The fitted estimator and a copy of ``places`` with added columns:
        ``label`` (name of the cluster member with the most records) and
        ``clon``/``clat`` (cluster center coordinates).
    '''
    places = (named_places if places is None else places).copy()
    # print cluster.estimate_bandwidth(named_places[['lon','lat']].as_matrix(), quantile=0.01)
    ms = cluster.MeanShift(bandwidth=bandwidth)
    ms.fit(places[['lon', 'lat']])
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    # print() form is valid under both Python 2 and 3
    # (the original bare `print expr` statement is a SyntaxError on py3).
    print('Found: {:,d} location groups'.format(n_clusters_))
    for k, label in enumerate(labels_unique):
        isgood = (labels == label)
        # Name each cluster after its member place with the most records
        best_label = places.loc[isgood, ['name', 'number']].sort_values(by='number')['name'].iloc[-1]
        places.loc[isgood, 'label'] = best_label
        center = cluster_centers[k]
        places.loc[isgood, 'clon'] = center[0]
        places.loc[isgood, 'clat'] = center[1]
    return ms, places
# small bandwidth for city scales
# ms, meanshift_places = get_cluster(0.1)
# large bandwidth for more country scales
# More useful to track where things are going.
ms, meanshift_places = get_cluster(10.0)
In [11]:
def map_clusters(places, arrname):
    '''Copy cluster label/center columns from ``places`` onto ``shipdata``.

    For every unique value of ``shipdata[arrname]`` ('from' or 'to'), look
    up its row in ``places`` and write ``<arrname>_label``, ``<arrname>_lon``
    and ``<arrname>_lat`` back onto the matching shipdata rows.
    '''
    arr = shipdata[arrname]
    uname = np.unique(arr)
    for name in uname:
        tmp = places[places['name'] == name]
        # BUG FIX: the original skipped names with a single match
        # (len(tmp) <= 1), so singleton names never received cluster labels;
        # only names absent from `places` should be skipped.
        if len(tmp) == 0:
            continue
        # A name can appear twice (once as 'from', once as 'to').
        # NOTE(review): taking the first row assumes both land in the same
        # cluster -- TODO confirm for names whose endpoints differ.
        tmp = tmp.iloc[0]
        # assert len(tmp) == 1, IndexError('Found multiple: {}\n {}'.format(name, tmp))
        isname = (arr == name)
        shipdata.loc[isname, arrname + '_label'] = tmp['label']
        shipdata.loc[isname, arrname + '_lon'] = tmp['clon']
        shipdata.loc[isname, arrname + '_lat'] = tmp['clat']
# Attach cluster labels/centers for both endpoints of each voyage
map_clusters(meanshift_places, 'from')
map_clusters(meanshift_places, 'to')
In [12]:
# Cleaned up version
# NOTE(review): hardcoded absolute local output path -- adjust for your machine.
filename = '/Users/ajmendez/tmp/ships/ships_clean.csv'
shipdata.to_csv(filename)
# Round-trip reload for later sessions:
# shipdata = pd.read_csv(filename)
In [ ]: