In [3]:
import os
import glob
import LatLon 
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)

# plot
%matplotlib inline
import matplotlib.pyplot as plt
import pylab
import seaborn as sns
sns.set_style("whitegrid")
from pysurvey.plot import setup, legend, icolorbar, density, minmax

import geoplotlib
import geoplotlib.colors

# date
from dateutil import parser
from matplotlib.dates import date2num

# database
import dataset
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

import sompy as SOM

In [2]:
clean = pd.DataFrame.from_csv('/Users/ajmendez/tmp/flight/flight_clean_3.csv')

In [2]:
before = pd.DataFrame.from_csv('/Users/ajmendez/tmp/flight/flight_before.csv')
after = pd.DataFrame.from_csv('/Users/ajmendez/tmp/flight/flight_after.csv')
during = pd.DataFrame.from_csv('/Users/ajmendez/tmp/flight/flight_during.csv')
clean = pd.DataFrame.from_csv('/Users/ajmendez/tmp/flight/flight_clean_2.csv')

In [3]:
during


Out[3]:
flight date time alt lat lon flightnum datenum hour weekday normtime heading_deg distance heading x y
1204068 A03510 2016/01/23 02:24:24.331 43000 39.70312 -77.79019 470 8.000032 0.000757 1 0.486940 292.706645 108.467407 -1.174491 -100.359872 -41.146986
1204069 A03510 2016/01/23 02:24:24.911 43000 39.70249 -77.79211 470 8.000038 0.000919 1 0.486940 292.639920 108.594032 -1.175655 -100.428990 -41.312005
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1271894 C0331E 2016/01/25 01:17:51.032 34975 39.02911 -78.22713 2947 9.953813 22.891508 2 0.605861 256.874318 142.582569 -1.799883 -75.291829 -121.082325
1271895 C0331E 2016/01/25 01:17:59.707 35000 39.04485 -78.22906 2947 9.953913 22.893918 2 0.605867 257.572947 142.328326 -1.787689 -76.625721 -119.941032

67827 rows × 16 columns


In [4]:
from sklearn import manifold, datasets, cluster

n_neighbors = 5
n_components = 2
n_points=10000

In [242]:
input_tags = ['lat', 'lon', 'alt', 'hour', 'flightnum']

def make_data(cat):
    np.random.seed(0)
    ii = np.random.choice(before.index, n_points)
    X = np.zeros( (n_points, len(input_tags)) )
    for i,tag in enumerate(input_tags):
        tmp = before[tag][ii]
        X[:, i] = (tmp-tmp.mean()) / tmp.std()
    color = np.array(before['flightnum'][ii])
    return X, color
X, YModel = make_data(during)
print X.shape


(10000, 5)

In [248]:
isomap = manifold.Isomap(n_neighbors, n_components)
Y = isomap.fit_transform(X)
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)


Out[248]:
<matplotlib.collections.PathCollection at 0x139e00a90>

In [249]:
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
Y = tsne.fit_transform(X)
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)


Out[249]:
<matplotlib.collections.PathCollection at 0x139f9bed0>

In [ ]:
se = manifold.SpectralEmbedding(n_components=n_components, affinity='rbf', n_neighbors=n_neighbors, random_state=0)
Y = se.fit_transform(X)
plt.scatter(Y[:, 0], Y[:, 1], c=color, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)

In [252]:
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)


Out[252]:
<matplotlib.collections.PathCollection at 0x1347ded10>

In [ ]:
mds = manifold.MDS(n_components, max_iter=100, n_init=1, random_state=0)
Y = mds.fit_transform(X)
plt.scatter(Y[:, 0], Y[:, 1], c=color, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)

In [254]:
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)


Out[254]:
<matplotlib.collections.PathCollection at 0x139140b10>

Clustering


In [261]:
colors = np.array([x for x in 'bgrcmybgrcmybgrcmybgrcmy'])
colors = np.hstack([colors] * 40)

def run_clustering(model, **kwargs):
    alg = model(**kwargs)
    alg.fit(X)
    if hasattr(alg, 'labels_'):
        y_pred = alg.labels_.astype(np.int)
    else:
        y_pred = alg.predict(X)
    return y_pred

def plot_clustering(y_pred):
    pylab.figure(figsize=(12,6))
    pylab.subplot(121)
    pylab.title('From Flight Numbers')
    plt.scatter(X[:, 0], X[:, 1], c=YModel, s=10, lw=0, cmap=pylab.cm.jet)
    pylab.subplot(122)
    pylab.title('From Clusters')
    plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=10, lw=0, cmap=pylab.cm.jet)

In [262]:
bandwidth = cluster.estimate_bandwidth(X, quantile=0.009)
y_pred = run_clustering(cluster.MeanShift, bandwidth=bandwidth, bin_seeding=True)
plot_clustering(y_pred)
print minmax(y_pred), bandwidth, len(np.unique(y_pred)), len(np.unique(Ymodel))


(0, 77) 0.911446051951 78 685

In [263]:
y_pred = run_clustering(cluster.DBSCAN, eps=0.3)
plot_clustering(y_pred)
print minmax(y_pred), bandwidth, len(np.unique(y_pred)), len(np.unique(Ymodel))


(-1, 425) 0.911446051951 427 685

In [264]:
from sklearn.neighbors import kneighbors_graph

In [265]:
n_clusters = len(np.unique(Ymodel))
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)
y_pred = run_clustering(cluster.AgglomerativeClustering, 
                        n_clusters=n_clusters, linkage='ward',
                        connectivity=connectivity)
plot_clustering(y_pred)
print minmax(y_pred), bandwidth, len(np.unique(y_pred)), len(np.unique(Ymodel))


(0, 684) 0.911446051951 685 685

In [319]:
during.hist('flightnum', bins=np.unique(during['flightnum'])-0.5, )


Out[319]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x14072d210>]], dtype=object)

In [5]:
# tmp = during.groupby('flightnum', as_index=True).count().sort('x')
tmp = clean.groupby(['weekday', 'flightnum', 'flightindex'], as_index=True).count().sort('x')
index = tmp.index[-2]
flight = clean[(clean['flightnum'] == index[1]) & (clean['flightpoints'] > 100)]
# color = pylab.cm.jet(flight['flightindex']*1.0/flight['flightindex'].max())
colors = geoplotlib.colors.create_set_cmap(flight['flightindex'], pylab.cm.jet)

# flight.plot('lon', 'lat', 'scatter')
print index, len(flight)

geoplotlib.tiles_provider('darkmatter')
for fi in np.unique(flight['flightindex']):
    geoplotlib.scatter(flight[flight['flightindex'] == fi], color=colors[fi])
bbox = geoplotlib.utils.BoundingBox(40.5,-78.0,38.5,-76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)


(2, 726, 1) 3543
/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app

In [8]:
dtime = lambda x: (x - x.shift(1))
tmp = np.array((flight['datenum'] - flight['datenum'].shift()).fillna(0))*24

In [4]:
clean.loc[:, 'flightindex'] = 0

In [3]:
uflightnum = np.unique(clean['flightnum'])
for i, flightnum in enumerate(uflightnum):
    isgood = (clean['flightnum'] == flightnum)
    tmp = clean.loc[isgood, 'datenum']
    delta = np.array((tmp - tmp.shift()).fillna(0))*24.0
    clean.loc[isgood, 'flightindex'] = np.where(delta > 0.5, 1,0)
    clean.loc[isgood, 'flightindex'] = np.cumsum(clean.loc[isgood, 'flightindex'])
    for fi in np.unique(clean.loc[isgood, 'flightindex']):
        isfi = (clean.loc[isgood, 'flightindex'] == fi)
        clean.loc[isgood&isfi, 'flightpoints'] = len(np.where(isfi)[0])
#     break
    if (i%(len(uflightnum)/20.0) == 0) and (i > 0):
        print i
    if i > 10:
        break
# clean.loc[clean['flightindex'] > 0, 'flightindex'] = 1

# for name, group in clean.groupby('flightnum'):
#     tmp = np.array((group['datenum'] - group['datenum'].shift()).fillna(0))*24
#     clean.loc(clean['flightnum']==name, 'deltatime') = tmp
#     break

In [7]:
clean.hist('flightnum', bins=100, lw=0)


Out[7]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x118469490>]], dtype=object)

In [20]:
clean.where(clean['flightpoints'] > 10 ).hist('flightpoints', bins=100, lw=0)


Out[20]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11d89a050>]], dtype=object)

In [ ]:
flight = clean.where(clean['flightpoints'] == clean['flightpoints'].max())
colors = geoplotlib.colors.create_set_cmap(flight['flightindex'], pylab.cm.jet)

geoplotlib.tiles_provider('darkmatter')
for fi in np.unique(flight['flightindex']):
    geoplotlib.scatter(flight[flight['flightindex'] == fi], color=colors[fi])
bbox = geoplotlib.utils.BoundingBox(40.5,-78.0,38.5,-76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)

In [29]:
colors = geoplotlib.colors.create_set_cmap(clean['flightnum'], pylab.cm.jet)

geoplotlib.tiles_provider('darkmatter')
for fi in np.unique(clean['flightnum']):
    geoplotlib.scatter(clean[clean['flightnum'] == fi], color=colors[fi])
bbox = geoplotlib.utils.BoundingBox(40.5,-78.0,38.5,-76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)



In [30]:
clean.plot('flightnum', 'datenum', c='flightindex', kind='scatter', 
           marker='.', alpha=0.5, lw=0, cmap=pylab.cm.jet, colorbar=False)


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x116932c90>
/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [33]:
clean[['flightnum', 'flightindex']].describe()


Out[33]:
flightnum flightindex
count 1860239.000000 1860239.000000
mean 1656.891684 0.000876
std 850.311453 0.038299
min 1.000000 0.000000
25% 956.000000 0.000000
50% 1719.000000 0.000000
75% 2366.000000 0.000000
max 3066.000000 3.000000

In [23]:
from sklearn import metrics

In [33]:
clean


Out[33]:
flight date time alt lat lon flightnum datenum hour weekday normtime heading_deg distance heading x y px py flightindex
0 AA9249 2016/01/15 02:24:21.604 8625 39.12570 -77.53583 2115 0.000000 0.000000 0 0.000000e+00 254.091033 82.037152 -1.848460 -39.886351 -71.688028 206.711111 99.627886 0
1 AA9249 2016/01/15 02:24:22.969 8675 39.12730 -77.53491 2115 0.000016 0.000379 0 9.616173e-07 254.194387 81.910595 -1.846656 -39.953869 -71.505481 206.711111 99.627886 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1860972 4CA4B3 2016/01/29 01:30:01.789 33975 39.60013 -77.68341 320 13.962271 23.094496 6 8.498446e-01 288.421558 95.955069 -1.249279 -85.814791 -42.932469 193.711111 155.627886 0
1860973 4CA4B3 2016/01/29 01:30:03.232 33975 39.59938 -77.68634 320 13.962287 23.094897 6 8.498456e-01 288.328603 96.169497 -1.250902 -85.936638 -43.167886 192.711111 155.627886 0

1860239 rows × 19 columns


In [4]:
from sklearn import metrics

In [33]:
A = LatLon.LatLon(39.17659, -77.44556)
B = LatLon.LatLon(39.17659, -77.44556)
A.distance(B)


Out[33]:
0.0

In [43]:
def latlon_distance(a,b):
    A = LatLon.LatLon(a[0], a[1])
    B = LatLon.LatLon(b[0], b[1])
#     return A.distance(B)
    try:
        return A.distance(B)
        return 0.0
    except Exception as e:
        print A
        print B
        raise

flight = clean[(clean['flight'] == 'AA9249') & (clean['flightpoints'] > 100) & (clean['datenum'] < 1)]
dist = metrics.pairwise_distances(flight[['lat','lon']].fillna(0), metric=latlon_distance)
# dist = metrics.pairwise.euclidean_distances(flight[['lat','lon']].fillna(0))


[[  0.           0.19462991   0.27115516 ...,  57.19184667  62.36616692
   65.52921374]
 [  0.19462991   0.           0.07656745 ...,  57.09767812  62.27352506
   65.43737097]
 [  0.27115516   0.07656745   0.         ...,  57.06342908  62.23987834
   65.40403919]
 ..., 
 [ 57.19184667  57.09767812  57.06342908 ...,   0.           5.20252135
    8.37995702]
 [ 62.36616692  62.27352506  62.23987834 ...,   5.20252135   0.
    3.17744014]
 [ 65.52921374  65.43737097  65.40403919 ...,   8.37995702   3.17744014
    0.        ]]

In [48]:
clean['flightid'] = clean['flight'] + '.' + clean['flightindex'].map(str)

In [49]:
flight = clean[(clean['flight'] == 'AA9249') & (clean['flightpoints'] > 100) & (clean['datenum'] < 1)]
flightindex = flight['flightindex']
for fi in np.unique(flightindex):
    ii = np.where(flightindex == fi)[0]
    print dist[np.meshgrid(ii, ii)].shape, dist.shape
    break


(213, 213) (417, 417)

In [14]:
flight.plot('lon', 'lat', kind='scatter', c='flightindex', lw=0, cmap=pylab.cm.jet)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x119edb550>

In [42]:
pylab.imshow(dist, cmap=pylab.cm.jet)


Out[42]:
<matplotlib.image.AxesImage at 0x1167acf90>

In [ ]:


In [60]:
flightid = np.array(clean['flightid'])
uflightid = np.unique(flightid)
dist = np.zeros((len(uflightid), len(uflightid)))
for i, fi in enumerate(uflightid):
    ii = flightid == fi
    for j, fj in enumerate(uflightid):
        jj = flightid == fj
        d = metrics.pairwise_distances(clean.loc[ii,['lat','lon']].fillna(0), 
                                       clean.loc[jj,['lat','lon']].fillna(0), 
                                       metric=latlon_distance)
        dist[i, j] = np.mean(d)
    break


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-60-541543c67aff> in <module>()
      8         d = metrics.pairwise_distances(clean.loc[ii,['lat','lon']].fillna(0), 
      9                                        clean.loc[jj,['lat','lon']].fillna(0),
---> 10                                        metric=latlon_distance)
     11         dist[i, j] = np.mean(d)
     12     break

/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/sklearn/metrics/pairwise.pyc in pairwise_distances(X, Y, metric, n_jobs, **kwds)
   1205         func = partial(distance.cdist, metric=metric, **kwds)
   1206 
-> 1207     return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1208 
   1209 

/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/sklearn/metrics/pairwise.pyc in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1052     if n_jobs == 1:
   1053         # Special case to avoid picklability checks in delayed
-> 1054         return func(X, Y, **kwds)
   1055 
   1056     # TODO: in some cases, backend='threading' may be appropriate

/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/sklearn/metrics/pairwise.pyc in _pairwise_callable(X, Y, metric, **kwds)
   1090         iterator = itertools.product(range(X.shape[0]), range(Y.shape[0]))
   1091         for i, j in iterator:
-> 1092             out[i, j] = metric(X[i], Y[j], **kwds)
   1093 
   1094     return out

<ipython-input-43-726c7968c86d> in latlon_distance(a, b)
      1 
      2 def latlon_distance(a,b):
----> 3     A = LatLon.LatLon(a[0], a[1])
      4     B = LatLon.LatLon(b[0], b[1])
      5 #     return A.distance(B)

/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/LatLon/lat_lon.pyc in __init__(self, lat, lon, name)
    352                 raise AttributeError
    353         except AttributeError:
--> 354             self.lon = Longitude(lon)
    355         self.name = name
    356 

/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/LatLon/lat_lon.pyc in __init__(self, degree, minute, second)
    229     '''
    230     def __init__(self, degree = 0, minute = 0, second = 0):
--> 231         super(Longitude, self).__init__(degree, minute, second) # Initialize the GeoCoord
    232         decimal_degree = self.range180() # Make sure that longitudes are reported in the range -180 to 180
    233         self.degree, self.minute, self.decimal_minute, self.second = self._calc_degreeminutes(decimal_degree)

/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/LatLon/lat_lon.pyc in __init__(self, degree, minute, second)
     26     __metaclass__ = abc.ABCMeta
     27 
---> 28     def __init__(self, degree = 0, minute = 0, second = 0):
     29         '''
     30         Initialize a GeoCoord object

KeyboardInterrupt: 

In [ ]:
pylab.imshow(d)

In [ ]:
clean[['flightid','lat','lon']].groupby('flightid').plot('lat','lon', 
                                                         kind='scatter', lw=0, alpha=0.5)

In [ ]: