In [3]:
    
import os
import glob
import LatLon 
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
# plot
%matplotlib inline
import matplotlib.pyplot as plt
import pylab
import seaborn as sns
sns.set_style("whitegrid")
from pysurvey.plot import setup, legend, icolorbar, density, minmax
import geoplotlib
import geoplotlib.colors
# date
from dateutil import parser
from matplotlib.dates import date2num
# database
import dataset
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import sompy as SOM
    
In [2]:
    
# Load flight_clean_3.csv into `clean`.
# DataFrame.from_csv no longer exists in current pandas; read_csv with
# index_col=0 and parse_dates=True reproduces its defaults (first column
# becomes the index, and the index is parsed as dates where possible).
clean = pd.read_csv('/Users/ajmendez/tmp/flight/flight_clean_3.csv',
                    index_col=0, parse_dates=True)
    
In [2]:
    
def _read_flight_csv(name, directory='/Users/ajmendez/tmp/flight'):
    """Read one flight CSV from `directory`, using its first column as the index.

    Equivalent to the removed ``pd.DataFrame.from_csv(path)``: ``index_col=0``
    and ``parse_dates=True`` were that constructor's defaults.
    """
    return pd.read_csv(os.path.join(directory, name), index_col=0, parse_dates=True)


before = _read_flight_csv('flight_before.csv')
after = _read_flight_csv('flight_after.csv')
during = _read_flight_csv('flight_during.csv')
# NOTE(review): this binds `clean` to flight_clean_2.csv while the preceding
# cell loads flight_clean_3.csv into the same name -- confirm which revision
# the rest of the notebook expects.
clean = _read_flight_csv('flight_clean_2.csv')
    
In [3]:
    
# Preview the `during` table; the display.max_rows option set in the imports
# cell limits the rendering to 10 rows.
during
    
    Out[3]:
In [4]:
    
from sklearn import manifold, datasets, cluster
# Shared settings for the manifold-learning cells below.
n_neighbors = 5  # passed to Isomap and SpectralEmbedding
n_components = 2  # output dimensionality of every embedding (plotted as x/y)
n_points=10000  # number of rows make_data draws into the feature matrix
    
In [242]:
    
# Columns that make up the feature matrix built by make_data.
input_tags = ['lat', 'lon', 'alt', 'hour', 'flightnum']


def make_data(cat, n=None, tags=None):
    """Draw a reproducible random sample of `cat` and standardize its columns.

    The previous version ignored `cat` and always sampled the global `before`
    table; the sample is now taken from the table that is passed in.

    Parameters
    ----------
    cat : pandas.DataFrame
        Table to sample. Must contain every column in `tags` plus 'flightnum'.
    n : int, optional
        Number of rows to draw (with replacement, as np.random.choice does by
        default). Defaults to the notebook-level `n_points`.
    tags : sequence of str, optional
        Feature columns, in output order. Defaults to `input_tags`.

    Returns
    -------
    X : numpy.ndarray, shape (n, len(tags))
        Each column is (value - mean) / std of the sampled rows.
    color : numpy.ndarray, shape (n,)
        The 'flightnum' value of every sampled row.
    """
    n = n_points if n is None else n
    tags = input_tags if tags is None else tags
    # Fixed seed so repeated calls pick the same rows.
    np.random.seed(0)
    # Sample row positions rather than index labels so that a non-unique
    # index cannot expand the selection beyond n rows.
    positions = np.random.choice(len(cat), n)
    sample = cat.iloc[positions]
    X = np.zeros((n, len(tags)))
    for i, tag in enumerate(tags):
        column = sample[tag]
        X[:, i] = ((column - column.mean()) / column.std()).values
    color = np.array(sample['flightnum'])
    return X, color
# Build the standardized feature matrix `X` and the matching flight-number
# labels `YModel` used by every embedding / clustering cell below.
X, YModel = make_data(during)
print(X.shape)
    
    
In [248]:
    
# Isomap embedding of X, coloured by flight number.
# The estimator settings are passed by keyword: current scikit-learn releases
# reject positional constructor arguments.
isomap = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components)
Y = isomap.fit_transform(X)
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)
    
    Out[248]:
    
In [249]:
    
# t-SNE embedding of X (PCA initialisation, fixed random_state), coloured by
# flight number.
tsne = manifold.TSNE(
    n_components=n_components,
    init='pca',
    random_state=0,
)
Y = tsne.fit_transform(X)
plt.scatter(
    Y[:, 0], Y[:, 1],
    c=YModel, cmap=plt.cm.Spectral,
    s=10, alpha=0.7, lw=0,
)
    
    Out[249]:
    
In [ ]:
    
# Spectral embedding of X with an RBF affinity, coloured by flight number.
se = manifold.SpectralEmbedding(n_components=n_components, affinity='rbf', n_neighbors=n_neighbors, random_state=0)
Y = se.fit_transform(X)
# The labels returned by make_data are bound to `YModel`; the name `color`
# used here before is never defined at notebook level and raised a NameError.
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)
    
In [252]:
    
# Scatter the current embedding `Y`, one point per sampled row, coloured by
# flight number.
plt.scatter(
    Y[:, 0], Y[:, 1],
    c=YModel, cmap=plt.cm.Spectral,
    s=10, alpha=0.7, lw=0,
)
    
    Out[252]:
    
In [ ]:
    
# Metric MDS embedding of X (single initialisation, at most 100 iterations),
# coloured by flight number.
# n_components is passed by keyword because current scikit-learn releases
# reject positional constructor arguments.
mds = manifold.MDS(n_components=n_components, max_iter=100, n_init=1, random_state=0)
Y = mds.fit_transform(X)
# `color` is not defined at notebook level; the labels live in `YModel`.
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)
    
In [254]:
    
# Scatter the current embedding `Y`, one point per sampled row, coloured by
# flight number.
plt.scatter(
    Y[:, 0], Y[:, 1],
    c=YModel, cmap=plt.cm.Spectral,
    s=10, alpha=0.7, lw=0,
)
    
    Out[254]:
    
In [261]:
    
# Cyclic palette of single-letter colour codes: 'bgrcmy' repeated to 960 entries.
colors = np.tile(np.array(list('bgrcmy' * 4)), 40)
def run_clustering(model, **kwargs):
    """Fit a clustering estimator on the notebook-level matrix `X`.

    Parameters
    ----------
    model : callable
        Estimator class (or factory); it is instantiated as ``model(**kwargs)``.
    **kwargs
        Forwarded unchanged to `model`.

    Returns
    -------
    numpy.ndarray
        One cluster label per row of `X`: the fitted ``labels_`` attribute cast
        to integers when the estimator exposes it, otherwise ``predict(X)``.
    """
    alg = model(**kwargs)
    alg.fit(X)
    if hasattr(alg, 'labels_'):
        # Builtin int instead of the np.int alias, which NumPy 1.24 removed.
        return alg.labels_.astype(int)
    return alg.predict(X)
def plot_clustering(y_pred):
    """Draw two side-by-side scatters of the first two columns of `X`.

    The left panel is coloured by the notebook-level flight-number labels
    (`YModel`), the right panel by the cluster labels `y_pred`.
    """
    panels = (
        ('From Flight Numbers', YModel),
        ('From Clusters', y_pred),
    )
    pylab.figure(figsize=(12,6))
    for position, (title, labels) in enumerate(panels, 121):
        pylab.subplot(position)
        pylab.title(title)
        plt.scatter(X[:, 0], X[:, 1], c=labels, s=10, lw=0, cmap=pylab.cm.jet)
    
In [262]:
    
# Mean-shift clustering with a bandwidth estimated from X.
bandwidth = cluster.estimate_bandwidth(X, quantile=0.009)
y_pred = run_clustering(cluster.MeanShift, bandwidth=bandwidth, bin_seeding=True)
plot_clustering(y_pred)
# Summary: label range, bandwidth, number of clusters found, number of distinct
# flight numbers.  The labels are bound to `YModel`; the `Ymodel` spelling used
# here before raised a NameError.
print('%s %s %s %s' % (minmax(y_pred), bandwidth,
                       len(np.unique(y_pred)), len(np.unique(YModel))))
    
    
    
In [263]:
    
# DBSCAN clustering of X with eps=0.3.
y_pred = run_clustering(cluster.DBSCAN, eps=0.3)
plot_clustering(y_pred)
# Same summary line as the mean-shift cell.  `bandwidth` is whatever the
# mean-shift cell left behind; DBSCAN itself does not use it.
# Fixed the `Ymodel` typo (NameError) -> `YModel`.
print('%s %s %s %s' % (minmax(y_pred), bandwidth,
                       len(np.unique(y_pred)), len(np.unique(YModel))))
    
    
    
In [264]:
    
from sklearn.neighbors import kneighbors_graph
    
In [265]:
    
# Ward agglomerative clustering constrained to a symmetrised 10-nearest-
# neighbour graph, asking for as many clusters as there are distinct flight
# numbers in the sample.
# Fixed the `Ymodel` typo (NameError) -> `YModel` in both places it was used.
n_clusters = len(np.unique(YModel))
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
# Average with the transpose so the connectivity matrix is symmetric.
connectivity = 0.5 * (connectivity + connectivity.T)
y_pred = run_clustering(cluster.AgglomerativeClustering,
                        n_clusters=n_clusters, linkage='ward',
                        connectivity=connectivity)
plot_clustering(y_pred)
# `bandwidth` is left over from the mean-shift cell and is printed unchanged.
print('%s %s %s %s' % (minmax(y_pred), bandwidth,
                       len(np.unique(y_pred)), len(np.unique(YModel))))
    
    
    
In [319]:
    
# Histogram of samples per flight number in `during`.
# Bin edges sit 0.5 below every observed flight number.  The original edge
# list stopped at (largest - 0.5), so rows carrying the largest flight number
# fell outside the last bin and were not counted; one extra edge at
# (largest + 0.5) keeps them.
flightnums = np.unique(during['flightnum'])
edges = np.append(flightnums - 0.5, flightnums[-1] + 0.5)
during.hist('flightnum', bins=edges)
    
    Out[319]:
    
In [5]:
    
# Rank (weekday, flightnum, flightindex) groups by their non-null count of the
# 'x' column and take the second-largest group.
# DataFrame.sort no longer exists in current pandas; sort_values is the
# direct replacement.
tmp = (clean.groupby(['weekday', 'flightnum', 'flightindex'], as_index=True)
            .count()
            .sort_values('x'))
index = tmp.index[-2]
# Keep every sufficiently long (> 100 points) segment of that flight number.
flight = clean[(clean['flightnum'] == index[1]) & (clean['flightpoints'] > 100)]
colors = geoplotlib.colors.create_set_cmap(flight['flightindex'], pylab.cm.jet)
print('%s %s' % (index, len(flight)))
geoplotlib.tiles_provider('darkmatter')
# One scatter layer per segment so each gets its own colour.
for fi in np.unique(flight['flightindex']):
    geoplotlib.scatter(flight[flight['flightindex'] == fi], color=colors[fi])
# Fixed map extent (presumably north, west, south, east -- confirm against geoplotlib).
bbox = geoplotlib.utils.BoundingBox(40.5,-78.0,38.5,-76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)
    
    
    
    
In [8]:
    
def dtime(x):
    """Return `x` minus `x` shifted down by one position (first entry has no predecessor)."""
    return x - x.shift(1)
# Step in 'datenum' between consecutive rows of `flight`, scaled by 24
# (presumably days -> hours; confirm the unit of 'datenum').  The first row
# has no predecessor and is given a step of 0.
tmp = 24 * np.array(dtime(flight['datenum']).fillna(0))
    
In [4]:
    
# Set the 'flightindex' column to 0 on every row; the segmentation cell that
# follows fills it in per flight number.
clean.loc[:, 'flightindex'] = 0
    
In [3]:
    
def label_flight_segments(frame, max_gap_hours=0.5):
    """Split each flight number's rows into separate flights at long time gaps.

    Rows are taken in their existing order within each 'flightnum'.  A new
    segment starts whenever the 'datenum' step from the previous row of the
    same flight number, multiplied by 24, exceeds `max_gap_hours`.  (The
    factor 24 suggests 'datenum' is measured in days -- TODO confirm.)

    Two columns are added to (or overwritten on) `frame` in place:

    flightindex
        0-based running segment number within the flight number.
    flightpoints
        Number of rows in that (flightnum, flightindex) segment.

    This replaces a per-flight Python loop that rescanned the whole frame for
    every flight number, stopped after the first 12 flight numbers because of
    a leftover debugging ``break``, and reported progress through a
    float-modulo test that practically never fired.  All flight numbers are
    now labelled in one grouped pass.

    Returns
    -------
    pandas.DataFrame
        `frame` itself, for convenience.
    """
    flightnum = frame['flightnum']
    # Step since the previous row of the same flight number; the first row of
    # every flight number has no predecessor and counts as "no gap".
    gap_hours = frame.groupby(flightnum)['datenum'].diff().fillna(0) * 24.0
    starts_segment = (gap_hours > max_gap_hours).astype(int)
    # Running count of segment starts within each flight number.  Rows whose
    # flight number is missing belong to no group and stay at 0.
    frame['flightindex'] = (starts_segment.groupby(flightnum).cumsum()
                                          .fillna(0).astype(int))
    frame['flightpoints'] = (frame.groupby(['flightnum', 'flightindex'])['flightindex']
                                  .transform('count'))
    return frame


clean = label_flight_segments(clean)
    
In [7]:
    
# Distribution of rows over flight numbers, in 100 bins.
clean.hist(column='flightnum', bins=100, lw=0)
    
    Out[7]:
    
In [20]:
    
# Distribution of segment sizes, restricted to segments with more than 10 points.
has_enough_points = clean['flightpoints'] > 10
clean.where(has_enough_points).hist(column='flightpoints', bins=100, lw=0)
    
    Out[20]:
    
In [ ]:
    
# Map the rows that belong to the largest segment(s), i.e. those whose
# 'flightpoints' equals the maximum.
# Boolean indexing replaces DataFrame.where here: `where` keeps every
# non-matching row as an all-NaN row, so NaN showed up among the
# 'flightindex' values handed to create_set_cmap / np.unique below.
is_longest = clean['flightpoints'] == clean['flightpoints'].max()
flight = clean[is_longest]
colors = geoplotlib.colors.create_set_cmap(flight['flightindex'], pylab.cm.jet)
geoplotlib.tiles_provider('darkmatter')
# One scatter layer per segment so each gets its own colour.
for fi in np.unique(flight['flightindex']):
    geoplotlib.scatter(flight[flight['flightindex'] == fi], color=colors[fi])
bbox = geoplotlib.utils.BoundingBox(40.5,-78.0,38.5,-76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)
    
In [29]:
    
# Map every row of `clean`, one scatter layer (and colour) per flight number.
colors = geoplotlib.colors.create_set_cmap(clean['flightnum'], pylab.cm.jet)
geoplotlib.tiles_provider('darkmatter')
for fi in np.unique(clean['flightnum']):
    rows_of_flight = clean['flightnum'] == fi
    geoplotlib.scatter(clean[rows_of_flight], color=colors[fi])
# Same fixed map extent as the other map cells.
bbox = geoplotlib.utils.BoundingBox(40.5, -78.0, 38.5, -76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)
    
    
In [30]:
    
# Flight number against datenum, coloured by segment index.
clean.plot(
    x='flightnum', y='datenum', kind='scatter',
    c='flightindex', cmap=pylab.cm.jet, colorbar=False,
    marker='.', alpha=0.5, lw=0,
)
    
    Out[30]:
    
    
In [33]:
    
# Summary statistics of the flight-number and segment-index columns.
clean.loc[:, ['flightnum', 'flightindex']].describe()
    
    Out[33]:
In [23]:
    
from sklearn import metrics
    
In [33]:
    
# Preview the `clean` table; the display.max_rows option set in the imports
# cell limits the rendering to 10 rows.
clean
    
    Out[33]:
In [4]:
    
from sklearn import metrics
    
In [33]:
    
# Sanity check: distance between two LatLon objects built from the same
# coordinates.
same_point = (39.17659, -77.44556)
A = LatLon.LatLon(*same_point)
B = LatLon.LatLon(*same_point)
A.distance(B)
    
    Out[33]:
In [43]:
    
def latlon_distance(a,b):
    """Distance between two points as computed by ``LatLon.distance``.

    Written as a metric callable for ``metrics.pairwise_distances``.

    Parameters
    ----------
    a, b : sequence
        Each holds latitude at position 0 and longitude at position 1.

    Returns
    -------
    Whatever ``LatLon.LatLon(...).distance(...)`` returns for the two points.

    Raises
    ------
    Exception
        Any error raised by ``distance`` is re-raised unchanged after the two
        points have been printed.
    """
    A = LatLon.LatLon(a[0], a[1])
    B = LatLon.LatLon(b[0], b[1])
    try:
        return A.distance(B)
    except Exception:
        # Show the two points that failed, then let the original error propagate.
        print(A)
        print(B)
        raise
# Long (> 100 point) segments of flight AA9249 with datenum below 1, and the
# point-to-point distance matrix between all of their samples.
selection = (
    (clean['flight'] == 'AA9249')
    & (clean['flightpoints'] > 100)
    & (clean['datenum'] < 1)
)
flight = clean[selection]
# Missing coordinates are replaced by 0 before the distances are computed.
coords = flight[['lat', 'lon']].fillna(0)
dist = metrics.pairwise_distances(coords, metric=latlon_distance)
    
    
In [48]:
    
# Unique identifier per flight segment: "<flight>.<flightindex>".
segment_suffix = clean['flightindex'].map(str)
clean['flightid'] = clean['flight'] + '.' + segment_suffix
    
In [49]:
    
# Re-select the same AA9249 rows that `dist` was computed from, then report
# the shape of the distance sub-block belonging to the first segment.
flight = clean[(clean['flight'] == 'AA9249') & (clean['flightpoints'] > 100) & (clean['datenum'] < 1)]
flightindex = flight['flightindex']
for fi in np.unique(flightindex):
    ii = np.where(flightindex == fi)[0]
    # np.ix_ builds the open mesh that selects rows ii x columns ii.  Indexing
    # with the sequence returned by np.meshgrid is no longer treated as a
    # per-axis index by current NumPy.
    print('%s %s' % (dist[np.ix_(ii, ii)].shape, dist.shape))
    # Only the first segment is inspected.
    break
    
    
In [14]:
    
# Track of the selected flight: longitude against latitude, coloured by segment index.
flight.plot(
    x='lon', y='lat', kind='scatter',
    c='flightindex', cmap=pylab.cm.jet, lw=0,
)
    
    Out[14]:
    
In [42]:
    
# Render the distance matrix `dist` as an image with the jet colormap.
pylab.imshow(dist, cmap='jet')
    
    Out[42]:
    
In [ ]:
    
    
In [60]:
    
# Mean point-to-point distance between every pair of flight segments.
# dist[i, j] holds the mean of all pairwise latlon_distance values between the
# samples of segment i and the samples of segment j.
flightid = np.array(clean['flightid'])
uflightid = np.unique(flightid)
dist = np.zeros((len(uflightid), len(uflightid)))
for i, fi in enumerate(uflightid):
    # Boolean mask of the rows belonging to segment i.
    ii = flightid == fi
    for j, fj in enumerate(uflightid):
        jj = flightid == fj
        # Missing coordinates are replaced by 0 before the distances are computed.
        d = metrics.pairwise_distances(clean.loc[ii,['lat','lon']].fillna(0), 
                                       clean.loc[jj,['lat','lon']].fillna(0), 
                                       metric=latlon_distance)
        dist[i, j] = np.mean(d)
    # NOTE(review): this break stops after the first segment, so only row 0 of
    # `dist` is filled and `d` is left holding the matrix for (first, last)
    # segment -- presumably a time-saving shortcut; confirm before relying on `dist`.
    break
    
    
In [ ]:
    
# Show `d`, the last pairwise point-distance matrix left behind by the loop
# in the previous cell.
pylab.imshow(d)
    
In [ ]:
    
# One scatter figure per flight segment.
# NOTE(review): latitude is on the x axis here, whereas the single-flight
# track plot above puts longitude on x -- confirm which orientation is wanted.
tracks = clean[['flightid', 'lat', 'lon']]
tracks.groupby('flightid').plot(x='lat', y='lon', kind='scatter', lw=0, alpha=0.5)
    
In [ ]: