In [1]:
import os
import glob
import LatLon 
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)

# plot
%matplotlib inline
import matplotlib.pyplot as plt
import pylab
import seaborn as sns
sns.set_style("whitegrid")
from pysurvey.plot import setup, legend, icolorbar, density, minmax

import geoplotlib
import geoplotlib.colors

In [2]:
# Load the cleaned flight-track table.
# `DataFrame.from_csv` is deprecated; `read_csv` with `index_col=0` and
# `parse_dates=True` reproduces its defaults (first column becomes the
# index, and the index is parsed as dates where possible).
clean = pd.read_csv('/Users/ajmendez/tmp/flight/flight_clean_3.csv',
                    index_col=0, parse_dates=True)

In [52]:
# Build a "<flight>.<flightindex>" identifier for every row of `clean`.
# The columns are taken from `clean` itself rather than from `flights`:
# `flights` is only created in a later cell (so this cell fails on a fresh
# kernel) and is a filtered subset of `clean`, which would leave every row
# outside that subset without an id after index alignment.
clean['flightid'] = clean['flight'] + '.' + clean['flightindex'].astype(str)

In [35]:
# Per-flight row counts, restricted to rows whose `flightpoints` exceeds 100,
# ordered by the count in the `date` column (ascending).
# `sort_values(by=...)` replaces the deprecated `DataFrame.sort(...)`.
(clean[clean['flightpoints'] > 100]
    .groupby(['flight'], as_index=False)
    .count()
    .sort_values(by='date'))


/Users/ajmendez/.local/canopy/User/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
Out[35]:
flight date time alt lat lon flightnum datenum hour weekday normtime heading_deg distance heading x y px py flightindex flightpoints
163 40621B 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101
1743 AB7F24 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102
21 06A053 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102
876 A4F6DD 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102
715 A380F1 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1396 A9AD36 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672 5672
1077 A719AA 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432 6432
1411 A9C737 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474 6474
670 A313B4 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631 7631
168 406696 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546 8546

2289 rows × 20 columns


In [72]:
# Keep only the rows belonging to the two flight ids under study.
# Other ids that were tried and left disabled: 'A9C737', 'A313B4'.
selected_ids = ['A719AA', '406696']
isgood = clean['flight'].isin(selected_ids)
flights = clean[isgood]
print(len(flights))

# Scatter of lon vs lat, coloured by the `datenum` column.
# (A flightindex-vs-flightpoints scatter was also tried here and disabled.)
flights.plot(x='lon', y='lat', kind='scatter', c='datenum',
             cmap=pylab.cm.Spectral, lw=0, alpha=0.5)

# pylab.axhline(10)


15371
Out[72]:
<matplotlib.axes._subplots.AxesSubplot at 0x12b1ded90>

In [39]:
# Dark basemap tiles for the interactive map.
geoplotlib.tiles_provider('darkmatter')

# Colour lookup keyed by flightnum, drawn from the jet colormap.
colors = geoplotlib.colors.create_set_cmap(flights['flightnum'], pylab.cm.jet)

# Add one scatter layer per distinct flightnum so each gets its own colour.
for flightnum in np.unique(flights['flightnum']):
    is_this_flightnum = flights['flightnum'] == flightnum
    geoplotlib.scatter(flights[is_this_flightnum], color=colors[flightnum])

# Fixed viewport; arguments are passed in the order north, west, south, east.
north, west, south, east = 40.5, -78.0, 38.5, -76
bbox = geoplotlib.utils.BoundingBox(north, west, south, east)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)



In [84]:
import sys

# Make the local checkout of sklearn-expertsys importable.
sys.path.append('/Users/ajmendez/tmp/sklearn-expertsys/')
import RuleListClassifier
# Re-execute the module so that edits to the local checkout are picked up
# without restarting the kernel.
reload(RuleListClassifier)
# NOTE(review): later cells call `RuleListClassifier(...)` as a constructor,
# so this star import presumably rebinds the name from the module to the
# class of the same name -- confirm before replacing it with explicit names.
from RuleListClassifier import *
from sklearn.datasets.mldata import fetch_mldata
# `sklearn.cross_validation` is deprecated and absent from newer scikit-learn
# releases; prefer `model_selection` and fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [78]:
# Display names for the eight features, passed to the rule-list classifier.
# NOTE(review): assumed to follow the column order of `data.data` -- confirm.
feature_labels = [
    "#Pregnant",
    "Glucose concentration test",
    "Blood pressure(mmHg)",
    "Triceps skin fold thickness(mm)",
    "2-Hour serum insulin (mu U/ml)",
    "Body mass index",
    "Diabetes pedigree function",
    "Age (years)",
]
data = fetch_mldata("diabetes") # get dataset

# Shift and halve the raw target.
# NOTE(review): presumably maps -1/+1 labels onto 0/1 -- confirm.
y = (data.target+1)/2

# Fixed seed so the train/test partition, and therefore every score reported
# downstream, is reproducible across kernel restarts.
Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y, random_state=0) # split

In [ ]:
clf = RuleListClassifier(max_iter=100, class1label="diabetes", verbose=False)
clf.fit2(Xtrain, ytrain, feature_labels=feature_labels)
print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf

In [80]:
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)
print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf
print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest)


RuleListClassifier Accuracy: 0.697916666667 Learned interpretable model:
Trained RuleListClassifier for detecting diabetes
==================================================
IF Glucose concentration test : -inf_to_99.5 THEN probability of diabetes: 8.4% (4.6%-13.3%)
ELSE IF Body mass index : -inf_to_27.8499995 THEN probability of diabetes: 15.5% (9.4%-22.7%)
ELSE IF 2-Hour serum insulin (mu U/ml) : 36.5_to_119.5 THEN probability of diabetes: 20.0% (9.8%-32.7%)
ELSE IF Glucose concentration test : 167.0_to_inf THEN probability of diabetes: 92.9% (84.9%-98.0%)
ELSE IF #Pregnant : 6.5_to_inf THEN probability of diabetes: 75.4% (64.0%-85.3%)
ELSE IF Glucose concentration test : 99.5_to_130.5 THEN probability of diabetes: 32.3% (23.4%-41.9%)
ELSE probability of diabetes: 54.4% (42.6%-66.0%)
=================================================

RandomForestClassifier Accuracy: 0.734375

In [73]:
# Columns of `flights` used as features; the same names double as labels
# in the printed rule list.
feature_labels = ['lat', 'lon', 'alt', 'datenum']
Xtrain = flights.loc[:, feature_labels]

# Boolean target: True for rows whose flight id is 'A719AA'.
ytrain = flights['flight'].eq('A719AA')

rulelist_params = dict(max_iter=1000, class1label="flight", verbose=False)
clf = RuleListClassifier(**rulelist_params)

# Left as the final expression so the fitted estimator is displayed.
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)


Out[73]:
RuleListClassifier(alpha=array([ 1.,  1.]), class1label='flight',
          listlengthprior=3, listwidthprior=1, max_iter=1000,
          maxcardinality=2, minsupport=10, n_chains=3, verbose=False)

In [76]:
print "RuleListClassifier Accuracy:", clf.score(Xtrain, ytrain), "Learned interpretable model:\n", clf
print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtrain, ytrain)


 RuleListClassifier Accuracy: 0.982499512068 Learned interpretable model:
Trained RuleListClassifier for detecting flight
================================================
IF alt : -inf_to_32962.5 THEN probability of flight: 0.0% (0.0%-0.1%)
ELSE IF lon : -77.015515_to_-76.948455 THEN probability of flight: 100.0% (99.8%-100.0%)
ELSE IF datenum : 11.4166205208_to_11.8940003819 THEN probability of flight: 0.6% (0.0%-2.0%)
ELSE IF lon : -76.948455_to_-76.8153 THEN probability of flight: 100.0% (99.9%-100.0%)
ELSE IF datenum : 13.6763937904_to_13.8865028008 AND lat : 39.474705_to_39.782185 THEN probability of flight: 0.3% (0.0%-1.2%)
ELSE IF datenum : 4.07091896405_to_4.9111564004 THEN probability of flight: 0.4% (0.0%-1.4%)
ELSE IF datenum : 8.23224320595_to_9.6481313657 THEN probability of flight: 99.8% (99.3%-100.0%)
ELSE IF datenum : 3.72189303814_to_3.91504727426 THEN probability of flight: 0.5% (0.0%-1.9%)
ELSE IF lon : -77.204215_to_-77.039575 THEN probability of flight: 99.8% (99.2%-100.0%)
ELSE IF datenum : 1.40955446754_to_3.01352711225 THEN probability of flight: 99.8% (99.2%-100.0%)
ELSE IF alt : 36962.5_to_37012.5 THEN probability of flight: 99.9% (99.7%-100.0%)
ELSE IF lon : -77.51456_to_-77.25565 THEN probability of flight: 0.6% (0.0%-2.4%)
ELSE IF lon : -77.808515_to_-77.51456 THEN probability of flight: 0.7% (0.0%-2.5%)
ELSE IF lat : 39.474705_to_39.782185 THEN probability of flight: 99.1% (96.9%-100.0%)
ELSE IF lat : 39.327735_to_39.407935 THEN probability of flight: 1.5% (0.0%-5.4%)
ELSE IF lon : -76.639945_to_inf THEN probability of flight: 50.0% (2.5%-97.5%)
ELSE IF datenum : 13.1880469386_to_13.6763937904 THEN probability of flight: 92.3% (73.5%-99.8%)
ELSE IF datenum : 6.19013145251_to_6.87653632521 THEN probability of flight: 95.5% (83.9%-99.9%)
ELSE probability of flight: 52.9% (41.2%-64.4%)
===============================================

RandomForestClassifier Accuracy: 1.0

In [ ]: