In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas.io.sql as pd_sql
%matplotlib inline
In [2]:
# Load the pre-computed flight table into the notebook-wide frame `df`.
COMPUTATIONS_CSV = "computations_12122015.csv"
df = pd.read_csv(COMPUTATIONS_CSV)
In [3]:
# Preview the first five rows (five is head()'s default row count).
df.head()
Out[3]:
In [16]:
# Column names of df as a plain Python list.
list(df.columns)
Out[16]:
In [17]:
# Column names of interest, listed in three groups:
# "*cost" columns, duration/time columns, then the remaining two.
FEATURES = (
    ["flightcost", "parkingcost", "drivingcost"]
    + ["flightduration", "atairporttime", "drivingduration"]
    + ["airline", "costperhour"]
)

# Numeric airport code -> three-letter airport identifier.
LABEL_MAP = dict(zip((1, 2, 3), ("IAD", "DCA", "BWI")))
In [18]:
# Swap the numeric airport codes for their identifiers from LABEL_MAP.
# The original used `df.ix[...] = v` inside a loop: `.ix` no longer exists in
# pandas >= 1.0, and writing strings into a numeric column one slice at a time
# is fragile. A single dict-based replace performs the same mapping in one
# step; values that are not keys of LABEL_MAP are left untouched, so re-running
# the cell is a no-op.
df['airport'] = df['airport'].replace(LABEL_MAP)

# Describe the dataset
print(df.describe())
In [19]:
# Number of rows whose flightid is missing.
df['flightid'].isnull().sum()
Out[19]:
In [20]:
# Summary statistics of df, shown as the cell's output.
df.describe()
Out[20]:
In [21]:
# Report the dimensions of df (rows, columns).
n_rows, n_cols = df.shape
print("{} instances with {} features\n".format(n_rows, n_cols))

# Number of rows recorded for each airport.
rows_per_airport = df.groupby('airport')['airport'].count()
print(rows_per_airport)
In [22]:
# Bar chart: how often each airport appears in df.
airport_frequencies = df['airport'].value_counts()
fig, ax = plt.subplots()
airport_frequencies.plot(kind='bar', ax=ax, color=['r', 'g', 'b'])
Out[22]:
In [23]:
# Dimensions of df again, followed by the number of rows per airline.
n_rows, n_cols = df.shape
print("{} instances with {} features\n".format(n_rows, n_cols))

# Determine the frequency of each airline
rows_per_airline = df.groupby('airline')['airline'].count()
print(rows_per_airline)
In [24]:
# Bar chart: how often each airline appears in df.
airline_frequencies = df['airline'].value_counts()
fig, ax = plt.subplots()
airline_frequencies.plot(kind='bar', ax=ax, color=['r', 'g', 'b'])
Out[24]:
In [25]:
# Histogram of flightcost in 40 bins.
# The x-range runs from the cheapest flightcost to the largest totalcost --
# presumably so this plot shares its scale with the totalcost histogram.
cost_range = (df['flightcost'].min(), df['totalcost'].max())
fig, ax = plt.subplots()
ax.hist(df['flightcost'], bins=40, range=cost_range)
ax.set_title('flight cost distribution')
ax.set_xlabel('flightcost')
ax.set_ylabel('Count of Flights with same price range')
plt.show()
In [26]:
# Histogram of totalcost in 40 bins, spanning min(flightcost)..max(totalcost).
cost_range = (df['flightcost'].min(), df['totalcost'].max())
fig, ax = plt.subplots()
ax.hist(df['totalcost'], bins=40, range=cost_range)
ax.set_title('totalcost distribution')
ax.set_xlabel('totalcost')
ax.set_ylabel('Count of Flights with same total cost range')
plt.show()
In [27]:
# Histogram of flightduration in 40 bins.
# The x-range runs from the shortest flightduration to the largest
# totalduration -- presumably to match the totalduration histogram's scale.
duration_range = (df['flightduration'].min(), df['totalduration'].max())
fig, ax = plt.subplots()
ax.hist(df['flightduration'], bins=40, range=duration_range)
ax.set_title('flight duration distribution')
ax.set_xlabel('flightduration')
ax.set_ylabel('Count of Flights with same duration range')
plt.show()
In [28]:
# Histogram of totalduration in 40 bins, spanning
# min(flightduration)..max(totalduration).
duration_range = (df['flightduration'].min(), df['totalduration'].max())
fig, ax = plt.subplots()
ax.hist(df['totalduration'], bins=40, range=duration_range)
ax.set_title('total duration distribution')
ax.set_xlabel('totalduration')
ax.set_ylabel('Count of Flights with same total duration range')
plt.show()
In [6]:
# Load the user-selection table into the notebook-wide frame `userdata`.
USERDATA_CSV = "userdata.csv"
userdata = pd.read_csv(USERDATA_CSV)
In [5]:
# Preview the first five rows (five is head()'s default row count).
userdata.head()
Out[5]:
In [31]:
# Tally the rows of userdata overall and per airport.
numberOfSelectedFlights = len(userdata.index)
selectedFlightForIAD, selectedFlightForDCA, selectedFlightForBWI = [
    len(userdata[userdata.airport == airport_code])
    for airport_code in ('IAD', 'DCA', 'BWI')
]

print('the number of selectedflights is %d.' % numberOfSelectedFlights)
print('selectedFlightForIAD is %d.' % selectedFlightForIAD)
print('selectedFlightForDCA is %d.' % selectedFlightForDCA)
print('selectedFlightForBWI is %d.' % selectedFlightForBWI)
In [32]:
# Bar chart: how often each airport appears in userdata.
selected_airport_frequencies = userdata['airport'].value_counts()
fig, ax = plt.subplots()
selected_airport_frequencies.plot(kind='bar', ax=ax, color=['r', 'g', 'b'])
Out[32]:
In [33]:
# Tally the rows of userdata overall and per airline code.
numberOfSelectedFlights = len(userdata.index)
(
    selectedFlightForAA,
    selectedFlightForVX,
    selectedFlightForAS,
    selectedFlightForUA,
    selectedFlightForNK,
    selectedFlightForF9,
    selectedFlightForAK,
) = [
    len(userdata[userdata.airline == airline_code])
    for airline_code in ('AA', 'VX', 'AS', 'UA', 'NK', 'F9', 'AK')
]

print('the number of selectedflights is %d.' % numberOfSelectedFlights)
print('selectedFlightForAA is %d.' % selectedFlightForAA)
print('selectedFlightForVX is %d.' % selectedFlightForVX)
print('selectedFlightForAS is %d.' % selectedFlightForAS)
print('selectedFlightForUA is %d.' % selectedFlightForUA)
print('selectedFlightForNK is %d.' % selectedFlightForNK)
print('selectedFlightForF9 is %d.' % selectedFlightForF9)
print('selectedFlightForAK is %d.' % selectedFlightForAK)
In [34]:
# Bar chart: how often each airline appears in userdata.
selected_airline_frequencies = userdata['airline'].value_counts()
fig, ax = plt.subplots()
selected_airline_frequencies.plot(kind='bar', ax=ax, color=['r', 'g', 'b'])
Out[34]:
In [12]:
# Histogram of userdata's costperhour in 20 bins across its full min..max span.
cost_per_hour = userdata['costperhour']
fig, ax = plt.subplots()
ax.hist(cost_per_hour, bins=20, range=(cost_per_hour.min(), cost_per_hour.max()))
ax.set_title('cost per hour distribution')
ax.set_xlabel('cost per hour')
ax.set_ylabel('Count of cost per hour with same bucket')
plt.show()
In [20]:
from sklearn.cluster import KMeans
In [21]:
import csv
In [26]:
# Two independent, empty accumulators that the CSV-reading cell fills in.
x, y = [], []
In [27]:
# Read columns 5 and 7 of userdata.csv into the parallel lists x and y.
# The file is opened in binary mode because this notebook runs on Python 2,
# where the csv module expects 'rb'.
with open('userdata.csv', 'rb') as csvf:
    reader = csv.reader(csvf, delimiter=',')
    headers = next(reader)  # consume the header row
    for row in reader:
        try:
            # Parse BOTH fields before touching the lists. The original
            # appended to x first, so a row whose column 7 failed to parse
            # left x one element longer than y and shifted every later pair.
            x_value = float(row[5])
            y_value = float(row[7])
        except ValueError as e:
            print("error %s on line %s" % (e, row))
            continue
        x.append(x_value)
        y.append(y_value)
In [24]:
# Pair up x and y into [x, y] rows for the clustering estimators.
# The original looped over a hard-coded range(0, 34), which raised IndexError
# for shorter inputs and silently dropped rows beyond the 34th. zip() adapts
# to however many values were read (it stops at the shorter list).
data = [[x_value, y_value] for x_value, y_value in zip(x, y)]
In [11]:
# Scatter of the raw (x, y) points before any clustering is applied.
plt.figure(figsize=(6, 6))
plt.title("Before Clustering ", fontsize=20)
plt.xlabel("cost", fontsize=14)
plt.ylabel("duration", fontsize=14)
# Dot markers with no connecting line (what the 'k.' format string selected);
# the explicit colour is the one actually drawn.
plt.plot(
    x, y,
    linestyle='None', marker='.',
    color='#0080ff', markersize=30, alpha=0.6,
)
plt.show()
In [12]:
# Fit a 3-cluster k-means model on the [x, y] pairs in `data`.
# k-means++ seeding is randomised, so without a fixed random_state the
# centroids can differ between runs; the seed makes the result reproducible
# on "Restart & Run All". (init='random' is the alternative initialisation.)
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10, random_state=111)
kmeans.fit(data)
Out[12]:
In [43]:
# Same scatter as before, now overlaid with the fitted k-means centroids.
plt.figure(figsize=(6, 6))
plt.title("After K-Means Clustering", fontsize=20)
plt.xlabel("cost", fontsize=14)
plt.ylabel("duration", fontsize=14)
# Dot markers with no connecting line (what the 'k.' format string selected).
plt.plot(
    x, y,
    linestyle='None', marker='.',
    color='#ffaaaa', markersize=45, alpha=0.6,
)

# Plot the centroids as a blue X, drawn above the points (zorder=10).
centroids = kmeans.cluster_centers_
centroid_x, centroid_y = centroids[:, 0], centroids[:, 1]
plt.scatter(centroid_x, centroid_y, marker='x', s=200,
            linewidths=3, color='b', zorder=10)
plt.show()
In [13]:
from sklearn.cluster import DBSCAN
In [14]:
# DBSCAN with its default parameters.
# The original passed random_state=111, but DBSCAN does not take a
# random_state argument in current scikit-learn (the parameter was deprecated
# and then removed), so that call raises TypeError.
dbscan = DBSCAN()
In [15]:
# Echo the estimator so the cell output shows its configuration.
dbscan
Out[15]:
In [16]:
# Fit DBSCAN on the [x, y] pairs in `data`; the labels are read from
# dbscan.labels_ in the following cells.
dbscan.fit(data)
Out[16]:
In [17]:
# Per-point labels assigned by DBSCAN (the plotting cell treats -1 as noise).
dbscan.labels_
Out[17]:
In [18]:
# Scatter the points coloured by their DBSCAN label.
#
# Changes from the original cell:
#   * no hard-coded point count (was range(0, 34));
#   * one scatter call per label instead of one per point;
#   * the legend is built only from labels that actually occur. The original
#     referenced c1..c4 unconditionally and raised NameError whenever one of
#     the labels 0, 1, 2 or -1 was absent.
# Labels other than 0, 1, 2 and -1 are not drawn, as before.
dbscan_styles = [
    # (label, legend text, colour, marker)
    (0, 'Cluster 1', 'r', '+'),
    (1, 'Cluster 2', 'g', 'o'),
    (2, 'Cluster 3', 'y', 'x'),
    (-1, 'Noise', 'b', '*'),
]
dbscan_points = np.asarray(data)
dbscan_point_labels = np.asarray(dbscan.labels_)

legend_handles = []
legend_names = []
for cluster_label, legend_name, colour, marker in dbscan_styles:
    members = dbscan_points[dbscan_point_labels == cluster_label]
    if len(members) == 0:
        continue  # label not produced by this fit: nothing to draw or list
    handle = plt.scatter(members[:, 0], members[:, 1], c=colour, marker=marker, s=200)
    legend_handles.append(handle)
    legend_names.append(legend_name)

plt.legend(legend_handles, legend_names)
plt.title('DBSCAN finds 3 clusters and noise')
plt.show()
In [28]:
from sklearn.cluster import MeanShift, estimate_bandwidth
In [31]:
# Collect column 8 of userdata.csv as floats in z; rows whose value cannot be
# parsed are reported and skipped. ('rb' is the Python 2 csv file mode.)
z = []
with open('userdata.csv', 'rb') as csvf:
    reader = csv.reader(csvf, delimiter=',')
    headers = next(reader)  # consume the header row
    for row in reader:
        try:
            value = float(row[8])
        except ValueError as e:
            print("error %s on line %s" % (e, row))
        else:
            z.append(value)
In [32]:
# Mean-shift clustering of the one-dimensional values in z.
#
# Each value is paired with a constant 0 so the estimator receives a 2-column
# array. Changes from the original cell:
#   * dtype=int instead of np.int (an alias that newer NumPy no longer has);
#   * list(zip(...)) so the pairs are materialised on Python 3 as well;
#   * the array gets its own name instead of overwriting z, so the cell can
#     be re-run without first re-reading the CSV.
# NOTE(review): the integer dtype truncates any fractional part of the
# values; kept as in the original -- confirm that this is intended.
z_points = np.array(list(zip(z, np.zeros(len(z)))), dtype=int)

# Bandwidth estimated from the data at the 0.2 quantile.
bandwidth = estimate_bandwidth(z_points, quantile=0.2)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(z_points)

labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# Print the (first-column) values that fell into each cluster.
for k in range(n_clusters_):
    my_members = labels == k
    print("cluster {0}: {1}".format(k, z_points[my_members, 0]))
In [13]:
# Pick the row of df that minimises a combined cost/duration score, once with
# weights and once as a plain sum.
#
# Some current studies show the following 3 major factors in purchasing a flight:
#   Price (43%)
#   Schedule and convenient flight time (21%)
#   Frequent Flyer Program (13%)
# Our user data shows people weight time as 57% and weight cost as 43%.
#
# The original cell had the two study figures swapped (cost 0.21, duration
# 0.43), which contradicts both sources quoted above: each puts cost at 43%.
# The weights below follow the study figures.
# NOTE(review): totalcost and totalduration are combined without any
# rescaling -- confirm that their units make the sum meaningful.
costWeightFactor = 0.43
durationWeightFactor = 0.21

# Vectorised scores for every row (replaces the row-by-row iterrows loop).
weightedScores = df['totalcost'] * costWeightFactor + df['totalduration'] * durationWeightFactor
plainScores = df['totalcost'] + df['totalduration']

# idxmin returns the first index label holding the minimum, matching the
# original loop's strict "<" comparison on ties. Unlike the loop, it does not
# require a row labelled 0, and the costperhour values below are always
# assigned (the loop left them undefined when the first row was the optimum).
selectedIndexWithWeight = weightedScores.idxmin()
selectedOneWithWeight = weightedScores.loc[selectedIndexWithWeight]
selectedCostPerHourWithWeight = df.loc[selectedIndexWithWeight, 'costperhour']

selectedIndexAsIs = plainScores.idxmin()
selectedOneAsIs = plainScores.loc[selectedIndexAsIs]
selectedCostPerHourAsIs = df.loc[selectedIndexAsIs, 'costperhour']
In [45]:
# Show the row chosen by the unweighted score as a tuple of selected fields.
print("This is the recommended flight without weight factor:")
print("airport, flightid, flightcost, flightduration, costperhour")
(
    df.loc[selectedIndexAsIs, 'airport'],
    df.loc[selectedIndexAsIs, 'flightid'],
    float(df.loc[selectedIndexAsIs, 'flightcost']),
    df.loc[selectedIndexAsIs, 'flightduration'],
    float(df.loc[selectedIndexAsIs, 'costperhour']),
)
Out[45]:
In [14]:
# Show the row chosen by the weighted score as a tuple of selected fields.
# costperhour is wrapped in float() like flightcost, so this output has the
# same form as the unweighted recommendation cell; single-step .loc lookups
# replace the chained df[col][idx] indexing.
print("This is the recommended flight with weight factor (the weight factor is based on a research):")
print("airport, flightid, flightcost, flightduration, costperhour")
(
    df.loc[selectedIndexWithWeight, 'airport'],
    df.loc[selectedIndexWithWeight, 'flightid'],
    float(df.loc[selectedIndexWithWeight, 'flightcost']),
    df.loc[selectedIndexWithWeight, 'flightduration'],
    float(df.loc[selectedIndexWithWeight, 'costperhour']),
)
Out[14]:
In [ ]: