In [3]:
import os
import glob
import LatLon
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
# plot
%matplotlib inline
import matplotlib.pyplot as plt
import pylab
import seaborn as sns
sns.set_style("whitegrid")
from pysurvey.plot import setup, legend, icolorbar, density, minmax
import geoplotlib
import geoplotlib.colors
# date
from dateutil import parser
from matplotlib.dates import date2num
# database
import dataset
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import sompy as SOM
In [2]:
# Load the cleaned flight table; the first CSV column becomes the index.
# pd.DataFrame.from_csv has been removed from pandas. read_csv with
# index_col=0 and parse_dates=True reproduces the defaults it used.
clean = pd.read_csv('/Users/ajmendez/tmp/flight/flight_clean_3.csv',
                    index_col=0, parse_dates=True)
In [2]:
# Load the flight tables; the first CSV column of each file becomes the index.
# pd.DataFrame.from_csv has been removed from pandas. read_csv with
# index_col=0 and parse_dates=True reproduces the defaults it used.
before = pd.read_csv('/Users/ajmendez/tmp/flight/flight_before.csv',
                     index_col=0, parse_dates=True)
after = pd.read_csv('/Users/ajmendez/tmp/flight/flight_after.csv',
                    index_col=0, parse_dates=True)
during = pd.read_csv('/Users/ajmendez/tmp/flight/flight_during.csv',
                     index_col=0, parse_dates=True)
# NOTE(review): another cell assigns `clean` from flight_clean_3.csv;
# whichever of the two cells runs last decides which file is used.
clean = pd.read_csv('/Users/ajmendez/tmp/flight/flight_clean_2.csv',
                    index_col=0, parse_dates=True)
In [3]:
# Show the `during` frame as this cell's output.
during
Out[3]:
In [4]:
from sklearn import manifold, datasets, cluster
# Shared settings for the manifold-learning cells.
n_neighbors = 5  # neighbourhood size handed to Isomap and SpectralEmbedding
n_components = 2  # dimensionality of every embedding
n_points=10000  # number of rows make_data samples
In [242]:
# Columns that make up the feature matrix, in column order.
input_tags = ['lat', 'lon', 'alt', 'hour', 'flightnum']


def make_data(cat, tags=None, size=None, seed=0):
    """Draw a random, standardised sample of rows from ``cat``.

    The previous version ignored ``cat`` and always sampled the global
    ``before`` frame, so ``make_data(during)`` did not use ``during`` at all.

    Parameters
    ----------
    cat : pandas.DataFrame
        Table to sample from. It must contain every column named in ``tags``
        plus a ``flightnum`` column.
    tags : list of str, optional
        Columns to turn into features. Defaults to the module-level
        ``input_tags``.
    size : int, optional
        Number of rows to draw (with replacement). Defaults to the
        module-level ``n_points``.
    seed : int, optional
        Seed given to ``np.random.seed`` before sampling, so repeated calls
        return the same sample.

    Returns
    -------
    X : numpy.ndarray, shape (size, len(tags))
        One column per tag, each shifted by its sample mean and divided by its
        sample standard deviation.
    color : numpy.ndarray, shape (size,)
        The ``flightnum`` value of every sampled row.
    """
    tags = input_tags if tags is None else tags
    size = n_points if size is None else size

    np.random.seed(seed)
    # Sample row positions rather than index labels: a label that occurs more
    # than once in the index would otherwise select several rows at a time.
    rows = np.random.choice(len(cat), size)
    sample = cat.iloc[rows]

    X = np.zeros((size, len(tags)))
    for i, tag in enumerate(tags):
        column = sample[tag]
        X[:, i] = np.asarray((column - column.mean()) / column.std())
    color = np.array(sample['flightnum'])
    return X, color
# Feature matrix and per-row flight numbers for the `during` table.
X, YModel = make_data(during)
print(X.shape)
In [248]:
# Embed the sample with Isomap and colour the points by YModel.
# The settings are passed by keyword because current scikit-learn releases no
# longer accept them positionally.
isomap = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components)
Y = isomap.fit_transform(X)
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)
Out[248]:
In [249]:
# Embed the same sample with t-SNE (PCA initialisation, fixed random_state)
# and colour the points by YModel.
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
Y = tsne.fit_transform(X)
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)
Out[249]:
In [ ]:
# Spectral embedding of the sample, coloured by YModel.
se = manifold.SpectralEmbedding(n_components=n_components, affinity='rbf', n_neighbors=n_neighbors, random_state=0)
Y = se.fit_transform(X)
# The colours come from YModel: `color` only exists inside make_data, so
# using it here raised a NameError.
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)
In [252]:
# Draw the embedding currently held in Y, coloured by YModel.
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)
Out[252]:
In [ ]:
# Multidimensional scaling of the sample, coloured by YModel.
mds = manifold.MDS(n_components=n_components, max_iter=100, n_init=1, random_state=0)
Y = mds.fit_transform(X)
# The colours come from YModel: `color` only exists inside make_data, so
# using it here raised a NameError.
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)
In [254]:
# Draw the embedding currently held in Y, coloured by YModel.
plt.scatter(Y[:, 0], Y[:, 1], c=YModel, s=10, cmap=plt.cm.Spectral, alpha=0.7, lw=0)
Out[254]:
In [261]:
# Array of one-letter colour codes: the 24-character pattern repeated 40 times.
colors = np.tile(np.array(list('bgrcmybgrcmybgrcmybgrcmy')), 40)
def run_clustering(model, **kwargs):
    """Fit a clustering estimator on the global sample ``X`` and return labels.

    Parameters
    ----------
    model : callable
        Estimator class (or factory); it is called as ``model(**kwargs)``.
    **kwargs
        Passed to ``model`` unchanged.

    Returns
    -------
    The fitted estimator's ``labels_`` cast to integers when it has that
    attribute, otherwise the result of ``predict(X)``.
    """
    alg = model(**kwargs)
    alg.fit(X)
    if hasattr(alg, 'labels_'):
        # np.int was just an alias of the builtin int and has been removed
        # from NumPy; the builtin gives the same dtype.
        return alg.labels_.astype(int)
    return alg.predict(X)
def plot_clustering(y_pred):
    """Show the first two columns of ``X`` twice, side by side.

    The left panel is coloured by the global ``YModel``, the right panel by
    ``y_pred``.
    """
    pylab.figure(figsize=(12,6))
    panels = (
        (121, 'From Flight Numbers', YModel),
        (122, 'From Clusters', y_pred),
    )
    for position, heading, point_colors in panels:
        pylab.subplot(position)
        pylab.title(heading)
        plt.scatter(X[:, 0], X[:, 1], c=point_colors, s=10, lw=0, cmap=pylab.cm.jet)
In [262]:
# Mean-shift clustering with a bandwidth estimated from the sample itself.
bandwidth = cluster.estimate_bandwidth(X, quantile=0.009)
y_pred = run_clustering(cluster.MeanShift, bandwidth=bandwidth, bin_seeding=True)
plot_clustering(y_pred)
# Label range, bandwidth, clusters found, distinct values in YModel.
# (`Ymodel` was a misspelling of `YModel` and raised a NameError.)
print('%s %s %s %s' % (minmax(y_pred), bandwidth,
                       len(np.unique(y_pred)), len(np.unique(YModel))))
In [263]:
# DBSCAN clustering of the same sample.
y_pred = run_clustering(cluster.DBSCAN, eps=0.3)
plot_clustering(y_pred)
# NOTE(review): `bandwidth` is not a DBSCAN setting; it is whatever value an
# earlier cell left behind. Confirm it is still wanted in this summary.
# (`Ymodel` was a misspelling of `YModel` and raised a NameError.)
print('%s %s %s %s' % (minmax(y_pred), bandwidth,
                       len(np.unique(y_pred)), len(np.unique(YModel))))
In [264]:
from sklearn.neighbors import kneighbors_graph
In [265]:
# Ward agglomerative clustering, asking for as many clusters as there are
# distinct values in YModel.
# (`Ymodel` was a misspelling of `YModel` and raised a NameError.)
n_clusters = len(np.unique(YModel))
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
# Average the graph with its transpose so the connectivity is symmetric.
connectivity = 0.5 * (connectivity + connectivity.T)
y_pred = run_clustering(cluster.AgglomerativeClustering,
                        n_clusters=n_clusters, linkage='ward',
                        connectivity=connectivity)
plot_clustering(y_pred)
# NOTE(review): `bandwidth` is not set in this cell; it is whatever value an
# earlier cell left behind.
print('%s %s %s %s' % (minmax(y_pred), bandwidth,
                       len(np.unique(y_pred)), len(np.unique(YModel))))
In [319]:
# One histogram bin centred on every distinct flight number. A histogram
# needs one more edge than it has bins; without the closing edge the largest
# flight number lies beyond the last bin and is left out of the plot.
flightnum_values = np.unique(during['flightnum'])
flightnum_edges = np.append(flightnum_values - 0.5, flightnum_values[-1] + 0.5)
during.hist('flightnum', bins=flightnum_edges)
Out[319]:
In [5]:
# Count the rows in every (weekday, flightnum, flightindex) group and order
# the groups by the count in column 'x'. DataFrame.sort() no longer exists
# in pandas; sort_values() is its replacement.
tmp = clean.groupby(['weekday', 'flightnum', 'flightindex'], as_index=True).count().sort_values('x')
index = tmp.index[-2]  # key of the second-largest group
# All rows flown under that group's flight number whose segment has more than
# 100 points.
flight = clean[(clean['flightnum'] == index[1]) & (clean['flightpoints'] > 100)]
colors = geoplotlib.colors.create_set_cmap(flight['flightindex'], pylab.cm.jet)
print('%s %s' % (index, len(flight)))

# One scatter layer per flight segment, each in its own colour.
geoplotlib.tiles_provider('darkmatter')
for fi in np.unique(flight['flightindex']):
    geoplotlib.scatter(flight[flight['flightindex'] == fi], color=colors[fi])
bbox = geoplotlib.utils.BoundingBox(40.5,-78.0,38.5,-76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)
In [8]:
def dtime(x):
    """Return each element of ``x`` minus the element one row before it."""
    return x - x.shift(1)


# Row-to-row datenum difference within `flight`, first row set to 0, times 24
# (presumably days converted to hours -- TODO confirm the unit of datenum).
tmp = np.array(dtime(flight['datenum']).fillna(0)) * 24
In [4]:
# Set the flightindex column to 0 on every row of `clean`.
clean.loc[:, 'flightindex'] = 0
In [3]:
def label_flight_segments(frame, max_gap_hours=0.5):
    """Number the separate flights recorded under each flight number.

    Rows are taken in their existing order within each ``flightnum``. A new
    segment starts whenever ``datenum`` increases by more than
    ``max_gap_hours`` from the previous row of the same flight number
    (``datenum`` differences are multiplied by 24 before the comparison).

    This replaces a row-by-row loop that stopped after the first twelve
    flight numbers because of a leftover debugging ``break``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``flightnum`` and ``datenum`` columns.
    max_gap_hours : float, optional
        Gap that separates two segments.

    Returns
    -------
    flightindex : pandas.Series
        0-based running segment number within each flight number.
    flightpoints : pandas.Series
        Number of rows in the segment each row belongs to.
    """
    # Plain arrays are used as group keys so that the result does not depend
    # on the index labels being unique.
    flightnum = frame['flightnum'].values

    gap_hours = frame.groupby('flightnum')['datenum'].diff().fillna(0) * 24.0
    starts_segment = (gap_hours > max_gap_hours).astype(int)
    # Rows without a flight number fall outside every group; give them 0.
    flightindex = starts_segment.groupby(flightnum).cumsum().fillna(0).astype(int)
    flightpoints = flightindex.groupby([flightnum, flightindex.values]).transform('count')
    return flightindex, flightpoints


_flightindex, _flightpoints = label_flight_segments(clean)
clean['flightindex'] = _flightindex.values
clean['flightpoints'] = _flightpoints.values
In [7]:
# Histogram of the flightnum column in 100 bins.
clean.hist('flightnum', bins=100, lw=0)
Out[7]:
In [20]:
# Histogram of flightpoints for rows with more than 10 points; where()
# replaces the values of all other rows with NaN.
clean.where(clean['flightpoints'] > 10 ).hist('flightpoints', bins=100, lw=0)
Out[20]:
In [ ]:
# Rows of the flight segment(s) with the most recorded points.
# Boolean indexing is used instead of DataFrame.where(): where() keeps every
# row and blanks the non-matching ones to NaN, so NaN then appeared as a
# "flightindex" value in the colour map and in the loop below.
flight = clean[clean['flightpoints'] == clean['flightpoints'].max()]
colors = geoplotlib.colors.create_set_cmap(flight['flightindex'], pylab.cm.jet)

# One scatter layer per flight segment, each in its own colour.
geoplotlib.tiles_provider('darkmatter')
for fi in np.unique(flight['flightindex']):
    geoplotlib.scatter(flight[flight['flightindex'] == fi], color=colors[fi])
bbox = geoplotlib.utils.BoundingBox(40.5,-78.0,38.5,-76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)
In [29]:
# Map every row of `clean`, one scatter layer and colour per flight number.
colors = geoplotlib.colors.create_set_cmap(clean['flightnum'], pylab.cm.jet)
geoplotlib.tiles_provider('darkmatter')
for flight_number in np.unique(clean['flightnum']):
    same_number = clean['flightnum'] == flight_number
    geoplotlib.scatter(clean[same_number], color=colors[flight_number])
bbox = geoplotlib.utils.BoundingBox(40.5, -78.0, 38.5, -76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)
In [30]:
# Scatter of datenum against flightnum, coloured by flightindex.
clean.plot('flightnum', 'datenum', c='flightindex', kind='scatter',
marker='.', alpha=0.5, lw=0, cmap=pylab.cm.jet, colorbar=False)
Out[30]:
In [33]:
# Summary statistics of the flightnum and flightindex columns.
clean[['flightnum', 'flightindex']].describe()
Out[33]:
In [23]:
from sklearn import metrics
In [33]:
# Show the `clean` frame as this cell's output.
clean
Out[33]:
In [4]:
from sklearn import metrics
In [33]:
# Try LatLon.distance on two points built from the same coordinates.
A = LatLon.LatLon(39.17659, -77.44556)
B = LatLon.LatLon(39.17659, -77.44556)
A.distance(B)
Out[33]:
In [43]:
def latlon_distance(a,b):
    """Distance between two points as computed by ``LatLon.distance``.

    Parameters
    ----------
    a, b : indexable
        Element 0 is passed to ``LatLon.LatLon`` as the first argument and
        element 1 as the second (latitude, longitude in the calling cells).

    Returns
    -------
    Whatever ``LatLon.distance`` returns for the two points.

    Raises
    ------
    Exception
        Any error from ``distance`` is re-raised after both points have been
        printed.
    """
    A = LatLon.LatLon(a[0], a[1])
    B = LatLon.LatLon(b[0], b[1])
    try:
        # The old `return 0.0` that followed this line could never run.
        return A.distance(B)
    except Exception:
        # Show the offending pair, then let the original error propagate.
        print(A)
        print(B)
        raise
# Rows of flight 'AA9249' whose flightpoints exceed 100 and whose datenum is
# below 1.
flight = clean[(clean['flight'] == 'AA9249') & (clean['flightpoints'] > 100) & (clean['datenum'] < 1)]
# Row-by-row distance matrix using latlon_distance; missing lat/lon values are
# replaced by 0 before the distances are computed.
dist = metrics.pairwise_distances(flight[['lat','lon']].fillna(0), metric=latlon_distance)
# dist = metrics.pairwise.euclidean_distances(flight[['lat','lon']].fillna(0))
In [48]:
# Identifier of the form "<flight>.<flightindex>" for every row.
clean['flightid'] = clean['flight'] + '.' + clean['flightindex'].map(str)
In [49]:
# Same selection that the pairwise matrix `dist` was built from.
flight = clean[(clean['flight'] == 'AA9249') & (clean['flightpoints'] > 100) & (clean['datenum'] < 1)]
flightindex = flight['flightindex']
for fi in np.unique(flightindex):
    ii = np.where(flightindex == fi)[0]
    # np.ix_ selects the square sub-block of `dist` for the rows and columns
    # in ii. Indexing with the list returned by np.meshgrid is no longer
    # treated as a pair of index arrays by NumPy.
    print('%s %s' % (dist[np.ix_(ii, ii)].shape, dist.shape))
    # Only the first segment is inspected.
    break
In [14]:
# Scatter of lat against lon for `flight`, coloured by flightindex.
flight.plot('lon', 'lat', kind='scatter', c='flightindex', lw=0, cmap=pylab.cm.jet)
Out[14]:
In [42]:
# Show the matrix `dist` as an image.
pylab.imshow(dist, cmap=pylab.cm.jet)
Out[42]:
In [ ]:
In [60]:
# Fill a flight-by-flight matrix: entry (i, j) is the mean of all
# point-to-point latlon_distance values between flight i and flight j.
# This reassigns `dist`, replacing the point-by-point matrix of an earlier cell.
flightid = np.array(clean['flightid'])
uflightid = np.unique(flightid)
dist = np.zeros((len(uflightid), len(uflightid)))
for i, fi in enumerate(uflightid):
# Boolean mask of the rows that belong to flight fi.
ii = flightid == fi
for j, fj in enumerate(uflightid):
# Boolean mask of the rows that belong to flight fj.
jj = flightid == fj
# Missing lat/lon values are replaced by 0 before the distances are computed.
d = metrics.pairwise_distances(clean.loc[ii,['lat','lon']].fillna(0),
clean.loc[jj,['lat','lon']].fillna(0),
metric=latlon_distance)
dist[i, j] = np.mean(d)
# NOTE(review): the nesting level of this break cannot be determined from this
# export -- confirm whether it ends the inner or the outer loop. Either way
# the matrix is only partly filled.
break
In [ ]:
# Show `d`, the most recently computed pairwise distance matrix, as an image.
pylab.imshow(d)
In [ ]:
# One scatter plot of lon against lat per flightid group.
clean[['flightid','lat','lon']].groupby('flightid').plot('lat','lon',
kind='scatter', lw=0, alpha=0.5)
In [ ]: