In [1]:
# --- Standard library ---
import os
import glob
# --- Third-party numerical / data-frame stack ---
import LatLon
import numpy as np
import pandas as pd
# Cap DataFrame reprs at 10 rows so displayed frames stay short.
pd.set_option('display.max_rows', 10)
# --- Plotting ---
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import pylab
import seaborn as sns
# Use seaborn's "whitegrid" style for subsequent matplotlib figures.
sns.set_style("whitegrid")
from pysurvey.plot import setup, legend, icolorbar, density, minmax
# --- Map plotting ---
import geoplotlib
import geoplotlib.colors
In [2]:
# Load the cleaned flight records into `clean`.
# `DataFrame.from_csv` is deprecated (and removed in later pandas releases);
# `read_csv` with `index_col=0, parse_dates=True` reproduces its defaults:
# the first CSV column becomes the index and is parsed as dates.
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
clean = pd.read_csv('/Users/ajmendez/tmp/flight/flight_clean_3.csv',
                    index_col=0, parse_dates=True)
In [52]:
# Add a string id of the form "<flight>.<flightindex>" to every row of `clean`.
# The columns are read from `clean` itself: the previous version read them from
# `flights`, which is only created in a later cell as a filtered subset of
# `clean`, so a fresh Restart & Run All hit a NameError here, and with a stale
# `flights` any row outside that subset would be left as NaN by index alignment.
clean['flightid'] = clean['flight'] + '.' + clean['flightindex'].apply(str)
In [35]:
# Keep rows with more than 100 flight points, count non-null entries per
# 'flight', and order the result by the (count-valued) 'date' column.
# `DataFrame.sort` was removed from pandas; `sort_values` is its replacement.
(clean[clean['flightpoints'] > 100]
 .groupby(['flight'], as_index=False)
 .count()
 .sort_values('date'))
Out[35]:
In [72]:
# Keep only the rows whose 'flight' value is one of the ids below.
# Other ids that can be added to the list: 'A9C737', 'A313B4'.
wanted_flights = ['A719AA', '406696']
isgood = clean['flight'].isin(wanted_flights)
flights = clean[isgood]
print(len(flights))

# Scatter of 'lat' against 'lon', coloured by 'datenum'; this is the cell's
# last expression, so the returned axes object is the cell output.
flights.plot(x='lon', y='lat', kind='scatter', c='datenum',
             cmap=pylab.cm.Spectral, lw=0, alpha=0.5)
Out[72]:
In [39]:
# One colour per distinct 'flightnum' value, taken from the jet colormap.
colors = geoplotlib.colors.create_set_cmap(flights['flightnum'], pylab.cm.jet)
geoplotlib.tiles_provider('darkmatter')

# Add one scatter layer per flight number so each gets its own colour.
for flight_num in np.unique(flights['flightnum']):
    one_flight = flights[flights['flightnum'] == flight_num]
    geoplotlib.scatter(one_flight, color=colors[flight_num])

# Fix the viewport and render inline.
# NOTE(review): arguments presumably are (north, west, south, east) -- confirm
# against the geoplotlib BoundingBox signature.
bbox = geoplotlib.utils.BoundingBox(40.5, -78.0, 38.5, -76)
geoplotlib.set_bbox(bbox)
geoplotlib.inline(800)
In [84]:
import sys

# Make the local checkout of sklearn-expertsys importable.
# NOTE(review): absolute, user-specific path.
sys.path.append('/Users/ajmendez/tmp/sklearn-expertsys/')
import RuleListClassifier
# Re-execute the module so edits to it are picked up without restarting the
# kernel (`reload` is a builtin in Python 2 only).
reload(RuleListClassifier)
# After this star import the name `RuleListClassifier` is rebound to whatever
# the module exports under that name; later cells call it as a constructor.
from RuleListClassifier import *
# NOTE(review): `fetch_mldata` was deprecated in scikit-learn 0.20 and removed
# in 0.22; migrating would also require changes in the cell that loads the data.
from sklearn.datasets.mldata import fetch_mldata
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer `model_selection`; fall back only on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
In [78]:
# Human-readable names for the columns of the diabetes data; passed to the
# classifier as `feature_labels` in later cells.
feature_labels = [
    "#Pregnant",
    "Glucose concentration test",
    "Blood pressure(mmHg)",
    "Triceps skin fold thickness(mm)",
    "2-Hour serum insulin (mu U/ml)",
    "Body mass index",
    "Diabetes pedigree function",
    "Age (years)",
]
data = fetch_mldata("diabetes")  # get dataset
# Rescale the targets with (t + 1) / 2.
# NOTE(review): presumably the raw labels are -1/+1, mapping to 0/1 -- confirm.
y = (data.target + 1) / 2
# Fixed seed so the train/test partition is the same on every run; without it
# each execution scores the models on a different random split.
Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y, random_state=0)
In [ ]:
clf = RuleListClassifier(max_iter=100, class1label="diabetes", verbose=False)
clf.fit2(Xtrain, ytrain, feature_labels=feature_labels)
print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf
In [80]:
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)
print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf
print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest)
In [73]:
# Rebind feature_labels / Xtrain / ytrain to the flight data: the features are
# the four listed columns of `flights`; the target is True where 'flight'
# equals 'A719AA'.
feature_labels = ['lat', 'lon', 'alt', 'datenum']
Xtrain = flights.loc[:, feature_labels]
ytrain = flights['flight'].eq('A719AA')

clf = RuleListClassifier(max_iter=1000, class1label="flight", verbose=False)
# Last expression of the cell, so the value returned by `fit` is displayed.
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)
Out[73]:
In [76]:
print "RuleListClassifier Accuracy:", clf.score(Xtrain, ytrain), "Learned interpretable model:\n", clf
print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtrain, ytrain)
In [ ]: