In [1]:
import os
import time
import glob
import LatLon 
import numpy as np
import pandas as pd
from sklearn import metrics
from datetime import datetime
from matplotlib.dates import date2num
pd.set_option('display.max_rows', 10)

%matplotlib inline
import pylab
import geoplotlib
import seaborn as sns
sns.set_style("whitegrid")
from mpl_toolkits import basemap
from pysurvey.plot import setup, legend, icolorbar, density, text
from sklearn import cluster

Load Data


In [2]:
shipdata = pd.read_csv('/Users/ajmendez/tmp/ships/climate-data-from-ocean-ships/CLIWOC15.csv')
print('Number of records: {:,d}'.format(len(shipdata)))


Number of records: 280,280
/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (5,6,7,8,11,13,18,19,23,24,25,26,28,29,30,34,35,38,43,44,46,73,77,81,82,84,85,87,88,94,96,97,98,99,111,114,116,119,120,122,124,125,127,129,131,133,135,137,140) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

Simple lat/lon

Not the greatest approach, but some libraries break on non-numeric values, so fill the missing coordinates with placeholders. The poles are not shown, so parking those records at (-90, -180) is fine.


In [3]:
shipdata['lon'] = shipdata['Lon3'].fillna(-180)
shipdata['lat'] = shipdata['Lat3'].fillna(-90)
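
The south-pole/antimeridian placeholders are easy to audit; a quick sketch of how many records needed the fill:


In [ ]:
nmissing = int(shipdata['Lat3'].isnull().sum())
print('Missing lat/lon: {:,d} of {:,d} records'.format(nmissing, len(shipdata)))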

Clean up Nationality


In [4]:
shipdata['nat'] = shipdata['Nationality'].replace('British ', 'British')  # fold the trailing-space variant into 'British'
print(np.unique(shipdata['nat']))


['American' 'British' 'Danish' 'Dutch' 'French' 'Hamburg' 'Spanish'
 'Swedish']

Add some better dates


In [5]:
def _datetime(*args, **kwargs):
    '''Effectively the same as the dataset's UTC column, but without the hour.'''
    try:
        return datetime(*args, **kwargs)
    except Exception:
        # Broken dates where the day extends past the number of days in the
        # month: back off one day (e.g. Sep 31 -> Sep 30, Feb 29 -> Feb 28).
        if args[2] in [31, 29]:
            print(args)
            return datetime(args[0], args[1], args[2]-1)
        print(args)
        raise
shipdata['date'] = map(_datetime, shipdata['Year'], shipdata['Month'], shipdata['Day'])
shipdata['datenum'] = shipdata['date'].apply(date2num)


(1764, 9, 31)
(1788, 6, 31)
(1841, 6, 31)
(1850, 11, 31)
(1773, 2, 29)
(1794, 6, 31)
(1845, 9, 31)
(1769, 2, 29)
(1774, 9, 31)
(1779, 6, 31)
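
The try/except above only handles days 29 and 31; a more general fallback (a sketch) would clamp any overshoot to the month's real length:


In [ ]:
import calendar

def clamp_datetime(year, month, day):
    '''Snap an out-of-range day to the last real day of its month.'''
    last_day = calendar.monthrange(year, month)[1]  # e.g. 28 for Feb 1773
    return datetime(year, month, min(day, last_day))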

Add from/to for network graph


In [6]:
def clean(x):
    # Normalize: strip whitespace and case, drop anything after '(' or ','
    x = x.decode('ascii', 'replace').strip().lower().split('(')[0].split(',')[0].strip()
    # Drop common suffixes and punctuation ('roads', 'road', ' uk', '.', "'")
    x = x.replace('roads', '').replace('road', '').replace(' uk', '').replace('.', '').replace("'", '').strip()
    return x.encode('ascii', 'replace')
shipdata['from'] = shipdata['VoyageFrom'].astype('str').map(clean)
shipdata['to'] = shipdata['VoyageTo'].astype('str').map(clean)
print('Found {} and {} unique locations'.format(len(np.unique(shipdata['from'])), len(np.unique(shipdata['to']))))


Found 864 and 900 unique locations
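
A few hypothetical raw values sketch what clean() collapses:


In [ ]:
for raw in ['Portsmouth (England)', "St. Helena's Road", 'DOWNS, UK']:
    print('{!r:>25} -> {!r}'.format(raw, clean(raw)))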

In [7]:
def get_named():
    '''from/to contain bundles of similar names, so use the lat/lon of the
    start/end of each voyage to match them up.  First build a dataframe of
    these locations.'''
    lat, lon, name, number = [], [], [], []
    for item in ['from', 'to']:
        arr = shipdata[item]
        # first record of a voyage for 'from', last record for 'to'
        ind = 0 if (item == 'from') else -1
        for n in np.unique(arr):
            tmp = shipdata[(arr == n) &
                           shipdata['Lat3'].notnull() &
                           shipdata['Lon3'].notnull()].sort_values(by='datenum')
            if len(tmp) > 10:
                lat.append(tmp['Lat3'].iloc[ind])
                lon.append(tmp['Lon3'].iloc[ind])
                name.append(n)
                number.append(len(tmp))
    return pd.DataFrame(dict(lat=lat, lon=lon, number=number, name=name))

named_places = get_named()
print('Found {} named endpoints'.format(len(named_places)))


Found 1449 named endpoints
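
The same port can appear under both from and to, and under spelling variants, which is exactly what the clustering below irons out. A quick look at the duplicated names (sketch):


In [ ]:
counts = named_places['name'].value_counts()
print(counts[counts > 1].head())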

In [10]:
def get_cluster(bandwidth=0.1):
    places = named_places.copy()
#     print cluster.estimate_bandwidth(named_places[['lon','lat']].as_matrix(), quantile=0.01)
    ms = cluster.MeanShift(bandwidth=bandwidth)
    ms.fit(places[['lon','lat']])

    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print('Found: {:,d} location groups'.format(n_clusters_))

    for k, label in enumerate(labels_unique):
        isgood = (labels == label)

        # Name each cluster after its most-visited member port
        best_label = places.loc[isgood, ['name','number']].sort_values(by='number')['name'].iloc[-1]
        places.loc[isgood, 'label'] = best_label

        center = cluster_centers[k]
        places.loc[isgood, 'clon'] = center[0]
        places.loc[isgood, 'clat'] = center[1]

    return ms, places

# small bandwidth (~0.1 deg) groups at city scale
# ms, meanshift_places = get_cluster(0.1)

# large bandwidth (~10 deg) groups at more of a country scale, which is
# more useful for tracking where voyages are going
ms, meanshift_places = get_cluster(10.0)


Found: 55 location groups
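
As a sanity check (sketch), summing the record counts per cluster label should put the major ports on top:


In [ ]:
totals = meanshift_places.groupby('label')['number'].sum()
print(totals.sort_values(ascending=False).head(10))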

In [11]:
def map_clusters(places, arrname):
    arr = shipdata[arrname]
    for name in np.unique(arr):
        tmp = places[places['name'] == name]
        if len(tmp) == 0:
            # name never had >10 well-located records in get_named()
            continue
        # A name can appear twice (once for 'from', once for 'to');
        # the first match is enough for labeling
        tmp = tmp.iloc[0]
        isname = (arr == name)
        shipdata.loc[isname, arrname+'_label'] = tmp['label']
        shipdata.loc[isname, arrname+'_lon'] = tmp['clon']
        shipdata.loc[isname, arrname+'_lat'] = tmp['clat']


map_clusters(meanshift_places, 'from')
map_clusters(meanshift_places, 'to')
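
With both endpoints labeled, the edge list for the network graph is one groupby away (a sketch; 'weight' is just the number of logged records per labeled pair):


In [ ]:
grouped = shipdata.dropna(subset=['from_label', 'to_label']).groupby(['from_label', 'to_label'])
edges = grouped.size().reset_index()
edges.columns = ['from_label', 'to_label', 'weight']
print(edges.sort_values('weight', ascending=False).head())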

In [12]:
# Cleaned up version
filename = '/Users/ajmendez/tmp/ships/ships_clean.csv'
shipdata.to_csv(filename)
# shipdata = pd.read_csv(filename)
