In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas.io.sql as pd_sql

%matplotlib inline

Explore Dataset



In [2]:

    
df = pd.read_csv("computations_12122015.csv")



In [3]:

    
df.head(5)









    Out[3]:






  
    
      
      id
      totalcost
      totalduration
      airport
      flightid
      flightcost
      parkingcost
      drivingcost
      timeleavehome
      flightdeparture
      flightduration
      atairporttime
      drivingduration
      airline
      costperhour
    
  
  
    
      0
      1
      180.81
      552
      BWI
      2ciqOzND4aTUuEtKvqnX82002
      143.1
      35.6
      2.11
      2016-01-25T04:15
      2016-01-25T07:40-05:00
      360
      120
      72
      AS
      20.090000
    
    
      1
      2
      189.31
      698
      BWI
      2ciqOzND4aTUuEtKvqnX82007
      151.6
      35.6
      2.11
      2016-01-25T10:00
      2016-01-25T13:20-05:00
      511
      120
      67
      UA
      17.210000
    
    
      2
      3
      189.31
      821
      BWI
      2ciqOzND4aTUuEtKvqnX8200d
      151.6
      35.6
      2.11
      2016-01-25T05:00
      2016-01-25T08:12-05:00
      629
      120
      72
      UA
      14.562308
    
    
      3
      4
      194.31
      795
      BWI
      2ciqOzND4aTUuEtKvqnX8200E
      156.6
      35.6
      2.11
      2016-01-25T14:00
      2016-01-25T17:15-05:00
      607
      120
      68
      AS
      14.946923
    
    
      4
      5
      194.31
      775
      BWI
      2ciqOzND4aTUuEtKvqnX8200D
      156.6
      35.6
      2.11
      2016-01-25T14:00
      2016-01-25T17:15-05:00
      587
      120
      68
      AS
      16.192500



In [16]:

    
list(df.columns.values)









    Out[16]:





['id',
 'totalcost',
 'totalduration',
 'airport',
 'flightid',
 'flightcost',
 'parkingcost',
 'drivingcost',
 'timeleavehome',
 'flightdeparture',
 'flightduration',
 'atairporttime',
 'drivingduration',
 'airline',
 'costperhour']



In [17]:

    
FEATURES  = [
    "flightcost",
    "parkingcost",
    "drivingcost",
    "flightduration",
    "atairporttime",
    "drivingduration",
    "airline",
    "costperhour"
]

LABEL_MAP = {
    1: "IAD",
    2: "DCA",
    3: "BWI",
}



In [18]:

    
for k,v in LABEL_MAP.items():
    df.ix[df.airport == k, 'airport'] = v

# Describe the dataset
print df.describe()









    



                id    totalcost  totalduration   flightcost  parkingcost  \
count  1316.000000  1316.000000    1316.000000  1316.000000  1316.000000   
mean    658.500000   406.321345     880.164894   355.275479    49.581763   
std     380.040787   132.397859     120.487133   133.837318    14.521403   
min       1.000000   165.810000     470.000000   128.100000    35.600000   
25%     329.750000   324.060000     809.000000   276.850000    40.000000   
50%     658.500000   389.010000     884.000000   327.600000    40.000000   
75%     987.250000   461.310000     956.000000   408.852500    68.000000   
max    1316.000000  1624.310000    1289.000000  1586.600000    68.000000   

       drivingcost  flightduration  atairporttime  drivingduration  \
count  1316.000000     1316.000000    1316.000000      1316.000000   
mean      1.464103      731.861702     102.902736        45.400456   
std       0.825445      111.163350      21.850017        19.918770   
min       0.410000      350.000000      75.000000        21.000000   
25%       0.410000      668.000000      75.000000        21.000000   
50%       2.110000      741.000000     120.000000        52.000000   
75%       2.110000      802.000000     120.000000        57.000000   
max       2.110000     1097.000000     120.000000        72.000000   

       costperhour  
count  1316.000000  
mean     29.352719  
std      11.269272  
min      12.967143  
25%      22.952333  
50%      27.154308  
75%      32.417500  
max     135.359167



In [19]:

    
sum(df['flightid'].isnull())









    Out[19]:





0



In [20]:

    
df.describe()









    Out[20]:






  
    
      
      id
      totalcost
      totalduration
      flightcost
      parkingcost
      drivingcost
      flightduration
      atairporttime
      drivingduration
      costperhour
    
  
  
    
      count
      1316.000000
      1316.000000
      1316.000000
      1316.000000
      1316.000000
      1316.000000
      1316.000000
      1316.000000
      1316.000000
      1316.000000
    
    
      mean
      658.500000
      406.321345
      880.164894
      355.275479
      49.581763
      1.464103
      731.861702
      102.902736
      45.400456
      29.352719
    
    
      std
      380.040787
      132.397859
      120.487133
      133.837318
      14.521403
      0.825445
      111.163350
      21.850017
      19.918770
      11.269272
    
    
      min
      1.000000
      165.810000
      470.000000
      128.100000
      35.600000
      0.410000
      350.000000
      75.000000
      21.000000
      12.967143
    
    
      25%
      329.750000
      324.060000
      809.000000
      276.850000
      40.000000
      0.410000
      668.000000
      75.000000
      21.000000
      22.952333
    
    
      50%
      658.500000
      389.010000
      884.000000
      327.600000
      40.000000
      2.110000
      741.000000
      120.000000
      52.000000
      27.154308
    
    
      75%
      987.250000
      461.310000
      956.000000
      408.852500
      68.000000
      2.110000
      802.000000
      120.000000
      57.000000
      32.417500
    
    
      max
      1316.000000
      1624.310000
      1289.000000
      1586.600000
      68.000000
      2.110000
      1097.000000
      120.000000
      72.000000
      135.359167

Explore Dataset (airport)



In [21]:

    
# Determine the shape of the data
print "{} instances with {} features\n".format(*df.shape)

# Determine the frequency of each class
print df.groupby('airport')['airport'].count()









    



1316 instances with 15 features

airport
BWI    316
DCA    500
IAD    500
Name: airport, dtype: int64



In [22]:

    
fig, ax = plt.subplots()
df['airport'].value_counts().plot(ax=ax, color=['r', 'g', 'b'],kind='bar')









    Out[22]:





<matplotlib.axes._subplots.AxesSubplot at 0x1fe957f0>

Explore Dataset (airline)



In [23]:

    
print "{} instances with {} features\n".format(*df.shape)

# Determine the frequency of each airline
print df.groupby('airline')['airline'].count()









    



1316 instances with 15 features

airline
AC      15
AS      15
B6      39
F9       1
NK       3
UA    1232
VX      11
Name: airline, dtype: int64



In [24]:

    
fig, ax = plt.subplots()
df['airline'].value_counts().plot(ax=ax, color=['r', 'g', 'b'],kind='bar')









    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x1fe824a8>

Flight Cost



In [25]:

    
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df['flightcost'], bins = 40, range = (df['flightcost'].min(),df['totalcost'].max()))
plt.title('flight cost distribution')
plt.xlabel('flightcost')
plt.ylabel('Count of Flights with same price range')
plt.show()

Total Cost (flight cost, real time driving cost and parking cost)



In [26]:

    
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df['totalcost'], bins = 40, range = (df['flightcost'].min(),df['totalcost'].max()))
plt.title('totalcost distribution')
plt.xlabel('totalcost')
plt.ylabel('Count of Flights with same total cost range')
plt.show()

Flight Duration



In [27]:

    
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df['flightduration'], bins = 40, range = (df['flightduration'].min(),df['totalduration'].max()))
plt.title('flight duration distribution')
plt.xlabel('flightduration')
plt.ylabel('Count of Flights with same duration range')
plt.show()

Total Duration (flight duration, time at airport and real time driving duration)



In [28]:

    
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(df['totalduration'], bins = 40, range = (df['flightduration'].min(),df['totalduration'].max()))
plt.title('total duration distribution')
plt.xlabel('totalduration')
plt.ylabel('Count of Flights with same total duration range')
plt.show()

User data



In [6]:

    
userdata = pd.read_csv("userdata.csv")



In [5]:

    
userdata.head(5)









    Out[5]:






  
    
      
      id
      flightid
      airline
      flightnum
      departime
      cost
      airport
      flightdur
      costperhour
    
  
  
    
      0
      13
      NaN
      AA
      245
      17:15
      181
      DCA
      355
      31
    
    
      1
      14
      NaN
      AA
      245
      17:15
      181
      DCA
      355
      31
    
    
      2
      15
      NaN
      AA
      245
      17:15
      181
      DCA
      355
      31
    
    
      3
      16
      NaN
      VX
      89
      9:00
      181
      IAD
      350
      31
    
    
      4
      17
      RSETozIzLjLQdvO1tO9iRU001
      AS
      5
      9:15
      181
      DCA
      374
      29



In [31]:

    
# Set some variables
numberOfSelectedFlights = userdata.shape[0] 
selectedFlightForIAD = len(userdata[userdata.airport == 'IAD'])
selectedFlightForDCA = len(userdata[userdata.airport == 'DCA'])
selectedFlightForBWI = len(userdata[userdata.airport == 'BWI'])

print 'the number of selectedflights is %d.' % numberOfSelectedFlights
print 'selectedFlightForIAD is %d.' % selectedFlightForIAD
print 'selectedFlightForDCA is %d.' % selectedFlightForDCA
print 'selectedFlightForBWI is %d.' % selectedFlightForBWI









    



the number of selectedflights is 34.
selectedFlightForIAD is 5.
selectedFlightForDCA is 17.
selectedFlightForBWI is 12.



In [32]:

    
fig, ax = plt.subplots()
userdata['airport'].value_counts().plot(ax=ax, color=['r', 'g', 'b'],kind='bar')









    Out[32]:





<matplotlib.axes._subplots.AxesSubplot at 0x20642f28>



In [33]:

    
numberOfSelectedFlights = userdata.shape[0] 
selectedFlightForAA = len(userdata[userdata.airline == 'AA'])
selectedFlightForVX = len(userdata[userdata.airline == 'VX'])
selectedFlightForAS = len(userdata[userdata.airline == 'AS'])
selectedFlightForUA = len(userdata[userdata.airline == 'UA'])
selectedFlightForNK = len(userdata[userdata.airline == 'NK'])
selectedFlightForF9 = len(userdata[userdata.airline == 'F9'])
selectedFlightForAK = len(userdata[userdata.airline == 'AK'])

print 'the number of selectedflights is %d.' % numberOfSelectedFlights
print 'selectedFlightForAA is %d.' % selectedFlightForAA
print 'selectedFlightForVX is %d.' % selectedFlightForVX
print 'selectedFlightForAS is %d.' % selectedFlightForAS
print 'selectedFlightForUA is %d.' % selectedFlightForUA
print 'selectedFlightForNK is %d.' % selectedFlightForNK
print 'selectedFlightForF9 is %d.' % selectedFlightForF9
print 'selectedFlightForAK is %d.' % selectedFlightForAK









    



the number of selectedflights is 34.
selectedFlightForAA is 7.
selectedFlightForVX is 4.
selectedFlightForAS is 9.
selectedFlightForUA is 1.
selectedFlightForNK is 11.
selectedFlightForF9 is 1.
selectedFlightForAK is 1.



In [34]:

    
fig, ax = plt.subplots()
userdata['airline'].value_counts().plot(ax=ax, color=['r', 'g', 'b'],kind='bar')









    Out[34]:





<matplotlib.axes._subplots.AxesSubplot at 0x205c9a58>

cost per hour distribution



In [12]:

    
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(userdata['costperhour'], bins = 20, range = (userdata['costperhour'].min(),userdata['costperhour'].max()))
plt.title('cost per hour distribution')
plt.xlabel('cost per hour')
plt.ylabel('Count of cost per hour with same bucket')
plt.show()

Modeling (K-means clustering)



In [20]:

    
from sklearn.cluster import KMeans



In [21]:

    
import csv



In [26]:

    
x=[]
y=[]



In [27]:

    
with open('userdata.csv', 'rb') as csvf:
    reader = csv.reader(csvf, delimiter=',')
    headers = next(reader)
    for row in reader:
        try:
            x.append(float(row[5]))
            y.append(float(row[7]))
        except ValueError,e:
            print "error",e,"on line",row



In [24]:

    
data=[]
for i in range(0,34):
  data.append([x[i],y[i]])



In [11]:

    
plt.figure(figsize=(6,6))

plt.xlabel("cost",fontsize=14)
plt.ylabel("duration", fontsize=14)

plt.title("Before Clustering ", fontsize=20)

plt.plot(x, y, 'k.', color='#0080ff', markersize=30, alpha=0.6)

plt.show()



In [12]:

    
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10)

# kmeans = KMeans(init='random', n_clusters=3, n_init=10)

kmeans.fit(data)









    Out[12]:





KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)



In [43]:

    
plt.figure(figsize=(6,6))

plt.xlabel("cost",fontsize=14)
plt.ylabel("duration", fontsize=14)

plt.title("After K-Means Clustering", fontsize=20)

plt.plot(x, y, 'k.', color='#ffaaaa', markersize=45, alpha=0.6)

# Plot the centroids as a blue X
centroids = kmeans.cluster_centers_

plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200,
  linewidths=3, color='b', zorder=10)
plt.show()

Clustering (DBSCAN)



In [13]:

    
from sklearn.cluster import DBSCAN



In [14]:

    
dbscan = DBSCAN(random_state=111)



In [15]:

    
dbscan









    Out[15]:





DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    min_samples=5, p=None, random_state=111)



In [16]:

    
dbscan.fit(data)









    



C:\Users\faye\Anaconda2\lib\site-packages\sklearn\cluster\dbscan_.py:106: DeprecationWarning: The parameter random_state is deprecated in 0.16 and will be removed in version 0.18. DBSCAN is deterministic except for rare border cases.
  category=DeprecationWarning)






    Out[16]:





DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    min_samples=5, p=None, random_state=111)



In [17]:

    
dbscan.labels_









    Out[17]:





array([ 0,  0,  0, -1, -1, -1,  1, -1,  1,  1,  0,  1,  1,  1, -1, -1, -1,
       -1, -1,  1, -1, -1,  2,  2, -1,  0, -1, -1,  2,  2, -1, -1,  2, -1], dtype=int64)



In [18]:

    
for i in range(0, 34):
    if dbscan.labels_[i] == 0:
        c1 = plt.scatter(data[i][0],data[i][1],c='r',marker='+', s=200)
    elif dbscan.labels_[i] == 1:
        c2 = plt.scatter(data[i][0],data[i][1],c='g',marker='o', s=200)
    elif dbscan.labels_[i] == 2:
        c3 = plt.scatter(data[i][0],data[i][1],c='y',marker='x', s=200)
    elif dbscan.labels_[i] == -1:
        c4 = plt.scatter(data[i][0],data[i][1],c='b',marker='*', s=200)

plt.legend([c1, c2, c3, c4], ['Cluster 1', 'Cluster 2','Cluster 3','Noise'])
plt.title('DBSCAN finds 3 clusters and noise')
plt.show()









    



C:\Users\faye\Anaconda2\lib\site-packages\matplotlib\collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

group user data by cost per hour



In [28]:

    
from sklearn.cluster import MeanShift, estimate_bandwidth



In [31]:

    
z=[]
with open('userdata.csv', 'rb') as csvf:
    reader = csv.reader(csvf, delimiter=',')
    headers = next(reader)
    for row in reader:
        try:
            z.append(float(row[8]))
        except ValueError,e:
            print "error",e,"on line",row



In [32]:

    
z = np.array(zip(z,np.zeros(len(z))), dtype=np.int)
bandwidth = estimate_bandwidth(z, quantile=0.2)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(z)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

for k in range(n_clusters_):
    my_members = labels == k
    print "cluster {0}: {1}".format(k, z[my_members, 0])









    



cluster 0: [17 17 17 17 17 17 17 15 18 21 17 22 20 20 20 20]
cluster 1: [31 31 31 31 29 31 31 29 31 27 27 30 27 27 29 27]
cluster 2: [60 63]



In [13]:

    
#Some current studies show the following 3 major factors in purchasing flight:
#Price (43%)
#Schedule and convenient flight time (21%)
#Frequent Flyer Program (13%)
#costWeightFactor
#durationWeightFactor
#our user data shows people weight time as 57% and weight cost as 43%
costWeightFactor=0.21
durationWeightFactor=0.43
selectedIndexWithWeight=0
selectedIndexAsIs=0
selectedOneWithWeight=df['totalcost'][0]*costWeightFactor+df['totalduration'][0]*durationWeightFactor
selectedOneAsIs=df['totalcost'][0]+df['totalduration'][0]
for index, row in df.iterrows():
    costFunctionWithWeight=row['totalcost']*costWeightFactor+row['totalduration']*durationWeightFactor
    costFunctionAsIs=row['totalcost']+row['totalduration']
    if costFunctionWithWeight < selectedOneWithWeight:
        selectedOneWithWeight=costFunctionWithWeight
        selectedIndexWithWeight=index
        selectedCostPerHourWithWeight=row['costperhour']
    if costFunctionAsIs < selectedOneAsIs:
        selectedOneAsIs=costFunctionAsIs
        selectedIndexAsIs=index 
        selectedCostPerHourAsIs=row['costperhour']

recommended flight (totalcost and total duration have equal weight)



In [45]:

    
print "This is the recommended flight without weight factor:"
print "airport,      flightid,   flightcost, flightduration, costperhour"
df['airport'][selectedIndexAsIs],df['flightid'][selectedIndexAsIs],float(df['flightcost'][selectedIndexAsIs]),df['flightduration'][selectedIndexAsIs],float(df['costperhour'][selectedIndexAsIs])









    



This is the recommended flight without weight factor:
airport,      flightid,   flightcost, flightduration, costperhour






    Out[45]:





('BWI', '2ciqOzND4aTUuEtKvqnX82001', 128.1, 352, 18.423333333333332)

recommended flight (totalcost and totalduration have different weight factor)



In [14]:

    
print "This is the recommended flight with weight factor (the weight factor is based on a research):"
print "airport,      flightid,   flightcost, flightduration, costperhour"
df['airport'][selectedIndexWithWeight],df['flightid'][selectedIndexWithWeight],float(df['flightcost'][selectedIndexWithWeight]),df['flightduration'][selectedIndexWithWeight],df['costperhour'][selectedIndexWithWeight]









    



This is the recommended flight with weight factor (the weight factor is based on a research):
airport,      flightid,   flightcost, flightduration, costperhour






    Out[14]:





('DCA', 'RSETozIzLjLQdvO1tO9iRU001', 180.1, 374, 35.501428571428569)



In [ ]:

	id	totalcost	totalduration	airport	flightid	flightcost	parkingcost	drivingcost	timeleavehome	flightdeparture	flightduration	atairporttime	drivingduration	airline	costperhour
0	1	180.81	552	BWI	2ciqOzND4aTUuEtKvqnX82002	143.1	35.6	2.11	2016-01-25T04:15	2016-01-25T07:40-05:00	360	120	72	AS	20.090000
1	2	189.31	698	BWI	2ciqOzND4aTUuEtKvqnX82007	151.6	35.6	2.11	2016-01-25T10:00	2016-01-25T13:20-05:00	511	120	67	UA	17.210000
2	3	189.31	821	BWI	2ciqOzND4aTUuEtKvqnX8200d	151.6	35.6	2.11	2016-01-25T05:00	2016-01-25T08:12-05:00	629	120	72	UA	14.562308
3	4	194.31	795	BWI	2ciqOzND4aTUuEtKvqnX8200E	156.6	35.6	2.11	2016-01-25T14:00	2016-01-25T17:15-05:00	607	120	68	AS	14.946923
4	5	194.31	775	BWI	2ciqOzND4aTUuEtKvqnX8200D	156.6	35.6	2.11	2016-01-25T14:00	2016-01-25T17:15-05:00	587	120	68	AS	16.192500

	id	totalcost	totalduration	flightcost	parkingcost	drivingcost	flightduration	atairporttime	drivingduration	costperhour
count	1316.000000	1316.000000	1316.000000	1316.000000	1316.000000	1316.000000	1316.000000	1316.000000	1316.000000	1316.000000
mean	658.500000	406.321345	880.164894	355.275479	49.581763	1.464103	731.861702	102.902736	45.400456	29.352719
std	380.040787	132.397859	120.487133	133.837318	14.521403	0.825445	111.163350	21.850017	19.918770	11.269272
min	1.000000	165.810000	470.000000	128.100000	35.600000	0.410000	350.000000	75.000000	21.000000	12.967143
25%	329.750000	324.060000	809.000000	276.850000	40.000000	0.410000	668.000000	75.000000	21.000000	22.952333
50%	658.500000	389.010000	884.000000	327.600000	40.000000	2.110000	741.000000	120.000000	52.000000	27.154308
75%	987.250000	461.310000	956.000000	408.852500	68.000000	2.110000	802.000000	120.000000	57.000000	32.417500
max	1316.000000	1624.310000	1289.000000	1586.600000	68.000000	2.110000	1097.000000	120.000000	72.000000	135.359167

	id	flightid	airline	flightnum	departime	cost	airport	flightdur	costperhour
0	13	NaN	AA	245	17:15	181	DCA	355	31
1	14	NaN	AA	245	17:15	181	DCA	355	31
2	15	NaN	AA	245	17:15	181	DCA	355	31
3	16	NaN	VX	89	9:00	181	IAD	350	31
4	17	RSETozIzLjLQdvO1tO9iRU001	AS	5	9:15	181	DCA	374	29