In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas.io.sql as pd_sql

%matplotlib inline

Explore Dataset


In [2]:
df = pd.read_csv("computations_12122015.csv")

In [3]:
df.head(5)


Out[3]:
id totalcost totalduration airport flightid flightcost parkingcost drivingcost timeleavehome flightdeparture flightduration atairporttime drivingduration airline costperhour
0 1 180.81 552 BWI 2ciqOzND4aTUuEtKvqnX82002 143.1 35.6 2.11 2016-01-25T04:15 2016-01-25T07:40-05:00 360 120 72 AS 20.090000
1 2 189.31 698 BWI 2ciqOzND4aTUuEtKvqnX82007 151.6 35.6 2.11 2016-01-25T10:00 2016-01-25T13:20-05:00 511 120 67 UA 17.210000
2 3 189.31 821 BWI 2ciqOzND4aTUuEtKvqnX8200d 151.6 35.6 2.11 2016-01-25T05:00 2016-01-25T08:12-05:00 629 120 72 UA 14.562308
3 4 194.31 795 BWI 2ciqOzND4aTUuEtKvqnX8200E 156.6 35.6 2.11 2016-01-25T14:00 2016-01-25T17:15-05:00 607 120 68 AS 14.946923
4 5 194.31 775 BWI 2ciqOzND4aTUuEtKvqnX8200D 156.6 35.6 2.11 2016-01-25T14:00 2016-01-25T17:15-05:00 587 120 68 AS 16.192500

In [16]:
list(df.columns.values)


Out[16]:
['id',
 'totalcost',
 'totalduration',
 'airport',
 'flightid',
 'flightcost',
 'parkingcost',
 'drivingcost',
 'timeleavehome',
 'flightdeparture',
 'flightduration',
 'atairporttime',
 'drivingduration',
 'airline',
 'costperhour']

In [17]:
FEATURES  = [
    "flightcost",
    "parkingcost",
    "drivingcost",
    "flightduration",
    "atairporttime",
    "drivingduration",
    "airline",
    "costperhour"
]

LABEL_MAP = {
    1: "IAD",
    2: "DCA",
    3: "BWI",
}

In [18]:
for k,v in LABEL_MAP.items():
    df.ix[df.airport == k, 'airport'] = v

# Describe the dataset
print df.describe()


                id    totalcost  totalduration   flightcost  parkingcost  \
count  1316.000000  1316.000000    1316.000000  1316.000000  1316.000000   
mean    658.500000   406.321345     880.164894   355.275479    49.581763   
std     380.040787   132.397859     120.487133   133.837318    14.521403   
min       1.000000   165.810000     470.000000   128.100000    35.600000   
25%     329.750000   324.060000     809.000000   276.850000    40.000000   
50%     658.500000   389.010000     884.000000   327.600000    40.000000   
75%     987.250000   461.310000     956.000000   408.852500    68.000000   
max    1316.000000  1624.310000    1289.000000  1586.600000    68.000000   

       drivingcost  flightduration  atairporttime  drivingduration  \
count  1316.000000     1316.000000    1316.000000      1316.000000   
mean      1.464103      731.861702     102.902736        45.400456   
std       0.825445      111.163350      21.850017        19.918770   
min       0.410000      350.000000      75.000000        21.000000   
25%       0.410000      668.000000      75.000000        21.000000   
50%       2.110000      741.000000     120.000000        52.000000   
75%       2.110000      802.000000     120.000000        57.000000   
max       2.110000     1097.000000     120.000000        72.000000   

       costperhour  
count  1316.000000  
mean     29.352719  
std      11.269272  
min      12.967143  
25%      22.952333  
50%      27.154308  
75%      32.417500  
max     135.359167  

In [19]:
sum(df['flightid'].isnull())


Out[19]:
0

In [20]:
df.describe()


Out[20]:
id totalcost totalduration flightcost parkingcost drivingcost flightduration atairporttime drivingduration costperhour
count 1316.000000 1316.000000 1316.000000 1316.000000 1316.000000 1316.000000 1316.000000 1316.000000 1316.000000 1316.000000
mean 658.500000 406.321345 880.164894 355.275479 49.581763 1.464103 731.861702 102.902736 45.400456 29.352719
std 380.040787 132.397859 120.487133 133.837318 14.521403 0.825445 111.163350 21.850017 19.918770 11.269272
min 1.000000 165.810000 470.000000 128.100000 35.600000 0.410000 350.000000 75.000000 21.000000 12.967143
25% 329.750000 324.060000 809.000000 276.850000 40.000000 0.410000 668.000000 75.000000 21.000000 22.952333
50% 658.500000 389.010000 884.000000 327.600000 40.000000 2.110000 741.000000 120.000000 52.000000 27.154308
75% 987.250000 461.310000 956.000000 408.852500 68.000000 2.110000 802.000000 120.000000 57.000000 32.417500
max 1316.000000 1624.310000 1289.000000 1586.600000 68.000000 2.110000 1097.000000 120.000000 72.000000 135.359167

Explore Dataset (airport)


In [21]:
# Determine the shape of the data
print "{} instances with {} features\n".format(*df.shape)

# Determine the frequency of each class
print df.groupby('airport')['airport'].count()


1316 instances with 15 features

airport
BWI    316
DCA    500
IAD    500
Name: airport, dtype: int64

In [22]:
fig, ax = plt.subplots()
df['airport'].value_counts().plot(ax=ax, color=['r', 'g', 'b'],kind='bar')


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fe957f0>

Explore Dataset (airline)


In [23]:
print "{} instances with {} features\n".format(*df.shape)

# Determine the frequency of each airline
print df.groupby('airline')['airline'].count()


1316 instances with 15 features

airline
AC      15
AS      15
B6      39
F9       1
NK       3
UA    1232
VX      11
Name: airline, dtype: int64

In [24]:
fig, ax = plt.subplots()
df['airline'].value_counts().plot(ax=ax, color=['r', 'g', 'b'],kind='bar')


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fe824a8>

Flight Cost


In [25]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df['flightcost'], bins = 40, range = (df['flightcost'].min(),df['totalcost'].max()))
plt.title('flight cost distribution')
plt.xlabel('flightcost')
plt.ylabel('Count of Flights with same price range')
plt.show()


Total Cost (flight cost, real time driving cost and parking cost)


In [26]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df['totalcost'], bins = 40, range = (df['flightcost'].min(),df['totalcost'].max()))
plt.title('totalcost distribution')
plt.xlabel('totalcost')
plt.ylabel('Count of Flights with same total cost range')
plt.show()


Flight Duration


In [27]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df['flightduration'], bins = 40, range = (df['flightduration'].min(),df['totalduration'].max()))
plt.title('flight duration distribution')
plt.xlabel('flightduration')
plt.ylabel('Count of Flights with same duration range')
plt.show()


Total Duration (flight duration, time at airport and real time driving duration)


In [28]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df['totalduration'], bins = 40, range = (df['flightduration'].min(),df['totalduration'].max()))
plt.title('total duration distribution')
plt.xlabel('totalduration')
plt.ylabel('Count of Flights with same total duration range')
plt.show()


User data


In [6]:
userdata = pd.read_csv("userdata.csv")

In [5]:
userdata.head(5)


Out[5]:
id flightid airline flightnum departime cost airport flightdur costperhour
0 13 NaN AA 245 17:15 181 DCA 355 31
1 14 NaN AA 245 17:15 181 DCA 355 31
2 15 NaN AA 245 17:15 181 DCA 355 31
3 16 NaN VX 89 9:00 181 IAD 350 31
4 17 RSETozIzLjLQdvO1tO9iRU001 AS 5 9:15 181 DCA 374 29

In [31]:
# Set some variables
numberOfSelectedFlights = userdata.shape[0] 
selectedFlightForIAD = len(userdata[userdata.airport == 'IAD'])
selectedFlightForDCA = len(userdata[userdata.airport == 'DCA'])
selectedFlightForBWI = len(userdata[userdata.airport == 'BWI'])

print 'the number of selectedflights is %d.' % numberOfSelectedFlights
print 'selectedFlightForIAD is %d.' % selectedFlightForIAD
print 'selectedFlightForDCA is %d.' % selectedFlightForDCA
print 'selectedFlightForBWI is %d.' % selectedFlightForBWI


the number of selectedflights is 34.
selectedFlightForIAD is 5.
selectedFlightForDCA is 17.
selectedFlightForBWI is 12.

In [32]:
fig, ax = plt.subplots()
userdata['airport'].value_counts().plot(ax=ax, color=['r', 'g', 'b'],kind='bar')


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x20642f28>

In [33]:
numberOfSelectedFlights = userdata.shape[0] 
selectedFlightForAA = len(userdata[userdata.airline == 'AA'])
selectedFlightForVX = len(userdata[userdata.airline == 'VX'])
selectedFlightForAS = len(userdata[userdata.airline == 'AS'])
selectedFlightForUA = len(userdata[userdata.airline == 'UA'])
selectedFlightForNK = len(userdata[userdata.airline == 'NK'])
selectedFlightForF9 = len(userdata[userdata.airline == 'F9'])
selectedFlightForAK = len(userdata[userdata.airline == 'AK'])

print 'the number of selectedflights is %d.' % numberOfSelectedFlights
print 'selectedFlightForAA is %d.' % selectedFlightForAA
print 'selectedFlightForVX is %d.' % selectedFlightForVX
print 'selectedFlightForAS is %d.' % selectedFlightForAS
print 'selectedFlightForUA is %d.' % selectedFlightForUA
print 'selectedFlightForNK is %d.' % selectedFlightForNK
print 'selectedFlightForF9 is %d.' % selectedFlightForF9
print 'selectedFlightForAK is %d.' % selectedFlightForAK


the number of selectedflights is 34.
selectedFlightForAA is 7.
selectedFlightForVX is 4.
selectedFlightForAS is 9.
selectedFlightForUA is 1.
selectedFlightForNK is 11.
selectedFlightForF9 is 1.
selectedFlightForAK is 1.

In [34]:
fig, ax = plt.subplots()
userdata['airline'].value_counts().plot(ax=ax, color=['r', 'g', 'b'],kind='bar')


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x205c9a58>

cost per hour distribution


In [12]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(userdata['costperhour'], bins = 20, range = (userdata['costperhour'].min(),userdata['costperhour'].max()))
plt.title('cost per hour distribution')
plt.xlabel('cost per hour')
plt.ylabel('Count of cost per hour with same bucket')
plt.show()


Modeling (K-means clustering)


In [20]:
from sklearn.cluster import KMeans

In [21]:
import csv

In [26]:
x=[]
y=[]

In [27]:
with open('userdata.csv', 'rb') as csvf:
    reader = csv.reader(csvf, delimiter=',')
    headers = next(reader)
    for row in reader:
        try:
            x.append(float(row[5]))
            y.append(float(row[7]))
        except ValueError,e:
            print "error",e,"on line",row

In [24]:
data=[]
for i in range(0,34):
  data.append([x[i],y[i]])

In [11]:
plt.figure(figsize=(6,6))

plt.xlabel("cost",fontsize=14)
plt.ylabel("duration", fontsize=14)

plt.title("Before Clustering ", fontsize=20)

plt.plot(x, y, 'k.', color='#0080ff', markersize=30, alpha=0.6)

plt.show()



In [12]:
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10)

# kmeans = KMeans(init='random', n_clusters=3, n_init=10)

kmeans.fit(data)


Out[12]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [43]:
plt.figure(figsize=(6,6))

plt.xlabel("cost",fontsize=14)
plt.ylabel("duration", fontsize=14)

plt.title("After K-Means Clustering", fontsize=20)

plt.plot(x, y, 'k.', color='#ffaaaa', markersize=45, alpha=0.6)

# Plot the centroids as a blue X
centroids = kmeans.cluster_centers_

plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200,
  linewidths=3, color='b', zorder=10)
plt.show()


Clustering (DBSCAN)


In [13]:
from sklearn.cluster import DBSCAN

In [14]:
dbscan = DBSCAN(random_state=111)

In [15]:
dbscan


Out[15]:
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    min_samples=5, p=None, random_state=111)

In [16]:
dbscan.fit(data)


C:\Users\faye\Anaconda2\lib\site-packages\sklearn\cluster\dbscan_.py:106: DeprecationWarning: The parameter random_state is deprecated in 0.16 and will be removed in version 0.18. DBSCAN is deterministic except for rare border cases.
  category=DeprecationWarning)
Out[16]:
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    min_samples=5, p=None, random_state=111)

In [17]:
dbscan.labels_


Out[17]:
array([ 0,  0,  0, -1, -1, -1,  1, -1,  1,  1,  0,  1,  1,  1, -1, -1, -1,
       -1, -1,  1, -1, -1,  2,  2, -1,  0, -1, -1,  2,  2, -1, -1,  2, -1], dtype=int64)

In [18]:
for i in range(0, 34):
    if dbscan.labels_[i] == 0:
        c1 = plt.scatter(data[i][0],data[i][1],c='r',marker='+', s=200)
    elif dbscan.labels_[i] == 1:
        c2 = plt.scatter(data[i][0],data[i][1],c='g',marker='o', s=200)
    elif dbscan.labels_[i] == 2:
        c3 = plt.scatter(data[i][0],data[i][1],c='y',marker='x', s=200)
    elif dbscan.labels_[i] == -1:
        c4 = plt.scatter(data[i][0],data[i][1],c='b',marker='*', s=200)

plt.legend([c1, c2, c3, c4], ['Cluster 1', 'Cluster 2','Cluster 3','Noise'])
plt.title('DBSCAN finds 3 clusters and noise')
plt.show()


C:\Users\faye\Anaconda2\lib\site-packages\matplotlib\collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

group user data by cost per hour


In [28]:
from sklearn.cluster import MeanShift, estimate_bandwidth

In [31]:
z=[]
with open('userdata.csv', 'rb') as csvf:
    reader = csv.reader(csvf, delimiter=',')
    headers = next(reader)
    for row in reader:
        try:
            z.append(float(row[8]))
        except ValueError,e:
            print "error",e,"on line",row

In [32]:
z = np.array(zip(z,np.zeros(len(z))), dtype=np.int)
bandwidth = estimate_bandwidth(z, quantile=0.2)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(z)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

for k in range(n_clusters_):
    my_members = labels == k
    print "cluster {0}: {1}".format(k, z[my_members, 0])


cluster 0: [17 17 17 17 17 17 17 15 18 21 17 22 20 20 20 20]
cluster 1: [31 31 31 31 29 31 31 29 31 27 27 30 27 27 29 27]
cluster 2: [60 63]

In [13]:
#Some current studies show the following 3 major factors in purchasing flight:
#Price (43%)
#Schedule and convenient flight time (21%)
#Frequent Flyer Program (13%)
#costWeightFactor
#durationWeightFactor
#our user data shows people weight time as 57% and weight cost as 43%
costWeightFactor=0.21
durationWeightFactor=0.43
selectedIndexWithWeight=0
selectedIndexAsIs=0
selectedOneWithWeight=df['totalcost'][0]*costWeightFactor+df['totalduration'][0]*durationWeightFactor
selectedOneAsIs=df['totalcost'][0]+df['totalduration'][0]
for index, row in df.iterrows():
    costFunctionWithWeight=row['totalcost']*costWeightFactor+row['totalduration']*durationWeightFactor
    costFunctionAsIs=row['totalcost']+row['totalduration']
    if costFunctionWithWeight < selectedOneWithWeight:
        selectedOneWithWeight=costFunctionWithWeight
        selectedIndexWithWeight=index
        selectedCostPerHourWithWeight=row['costperhour']
    if costFunctionAsIs < selectedOneAsIs:
        selectedOneAsIs=costFunctionAsIs
        selectedIndexAsIs=index 
        selectedCostPerHourAsIs=row['costperhour']

recommended flight (totalcost and total duration have equal weight)


In [45]:
print "This is the recommended flight without weight factor:"
print "airport,      flightid,   flightcost, flightduration, costperhour"
df['airport'][selectedIndexAsIs],df['flightid'][selectedIndexAsIs],float(df['flightcost'][selectedIndexAsIs]),df['flightduration'][selectedIndexAsIs],float(df['costperhour'][selectedIndexAsIs])


This is the recommended flight without weight factor:
airport,      flightid,   flightcost, flightduration, costperhour
Out[45]:
('BWI', '2ciqOzND4aTUuEtKvqnX82001', 128.1, 352, 18.423333333333332)

recommended flight (totalcost and totalduration have different weight factor)


In [14]:
print "This is the recommended flight with weight factor (the weight factor is based on a research):"
print "airport,      flightid,   flightcost, flightduration, costperhour"
df['airport'][selectedIndexWithWeight],df['flightid'][selectedIndexWithWeight],float(df['flightcost'][selectedIndexWithWeight]),df['flightduration'][selectedIndexWithWeight],df['costperhour'][selectedIndexWithWeight]


This is the recommended flight with weight factor (the weight factor is based on a research):
airport,      flightid,   flightcost, flightduration, costperhour
Out[14]:
('DCA', 'RSETozIzLjLQdvO1tO9iRU001', 180.1, 374, 35.501428571428569)

In [ ]: