In [2]:
import time
import re
from unidecode import unidecode
import pandas as pd
import datetime
from dateutil.parser import parse
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# ======== Task 1 =========
def scrape_data(start_date, from_place, to_place, city_name):
    """
    start_date: a datetime object for the start date that you should use in your query to Google Flight explorer
    from_place: a string with the name of the origin of the flights
    to_place: a string with the name of the regional destination of the flights
    city_name: a string for the name of the city who's data that you should actually scrape
    
    return: a pandas DataFrame object with two columns "Date_of_Flight" and "Price." , and one row for each day
    
    """
    driver = webdriver.Chrome()
    driver.get('https://www.google.com/flights/explore/')
    time.sleep(2)  # wait for the driver to load data
    
    # from_place
    driver.find_elements_by_class_name('LJTSM3-p-a')[0].click()
    ActionChains(driver).send_keys(from_place).perform()
    ActionChains(driver).send_keys(Keys.ENTER).perform()
    time.sleep(1)
    
    # to_place
    driver.find_elements_by_class_name('LJTSM3-p-a')[1].click()
    ActionChains(driver).send_keys(to_place).perform()
    ActionChains(driver).send_keys(Keys.ENTER).perform()
    time.sleep(1)
    
    # start_date
    start_date_str = start_date.strftime("%Y-%m-%d")
    url_date = driver.current_url[:-10] + start_date_str
    driver.get(url_date)
    time.sleep(2)
    
    # use city_name to choose target city
    results_city = driver.find_elements_by_class_name('LJTSM3-v-c')
    city_num = None

    for i in range(len(results_city)):
        city = results_city[i]
        if re.findall(city_name.lower(), unidecode(city.text).lower()):
            city_num = i
    if city_num == None:
        driver.quit()
        raise Exception('City Name not Found!')    
    time.sleep(1)
    
    # get bars of target city
    results = driver.find_elements_by_class_name('LJTSM3-v-d')
    target = results[city_num]
    bars = target.find_elements_by_class_name('LJTSM3-w-x')
    data = []
    time.sleep(2)

    for bar in bars:
        ActionChains(driver).move_to_element(bar).perform()
        time.sleep(0.01)
        data.append((target.find_element_by_class_name('LJTSM3-w-k').find_elements_by_tag_name('div')[0].text,  # get price
               target.find_element_by_class_name('LJTSM3-w-k').find_elements_by_tag_name('div')[1].text))      # get date of flight

    data = [x for x in data if str(x[0]) != '']  # exclude null bars
    
    # convert into dataframe and return
    clean_data = [(float(d[0].replace('$', '').replace(',', '')), parse(d[1].split('-')[0].strip()))  for d in data]
    df = pd.DataFrame(clean_data, columns=['Price', 'Date_of_Flight'])
    driver.quit()
    return df

Task 3 question 2


In [31]:
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

flight_data = data_60
plt.boxplot(flight_data['Price'])
plt.title('Boxplot of Prices')
plt.savefig('task_3_iqr.png')



In [32]:
import numpy as np
q1 = np.percentile(flight_data['Price'], 25)
q3 = np.percentile(flight_data['Price'], 75)
IQR = q3 - q1
low_bound = q1-1.5*IQR
outliers = flight_data[flight_data['Price'] < low_bound]

print low_bound
outliers.shape


720.875
Out[32]:
(0, 2)

In [35]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# ======== Task 3 question 2 =========
def task_3_IQR(flight_data):
    df = flight_data
    plt.boxplot(df['Price'])
    plt.title('Boxplot of Prices')
    plt.savefig('task_3_iqr.png')
    
    q1 = np.percentile(df['Price'], 25)
    q3 = np.percentile(df['Price'], 75)
    IQR = q3 - q1
    low_bound = q1-1.5*IQR
    outliers = df[df['Price'] < low_bound]
    if outliers.shape[0] != 0:
        return outliers
    else:
        raise Exception('There is no outlier price!')

In [36]:
outliers_IQR = task_3_IQR(data_60)
outliers_IQR


---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-36-3b98241d5634> in <module>()
----> 1 outliers_IQR = task_3_IQR(data_60)
      2 outliers_IQR

<ipython-input-35-b510f155a918> in task_3_IQR(flight_data)
     18         return outliers
     19     else:
---> 20         raise Exception('There is no outlier price!')

Exception: There is no outlier price!

Task 3 question 1


In [343]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np

In [374]:
# clustering
df = data_60
start_date = []
for i in range(len(df)):
    start_date.append((df['Date_of_Flight'][i]- df['Date_of_Flight'][0]).days +1)

df['Start_Date'] = pd.Series(start_date).values
X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
db = DBSCAN(eps=.3, min_samples=3).fit(X)
df['dbscan_labels'] = db.labels_

print X
df


[[-1.7034199  -0.37614705]
 [-1.64567685 -0.61527181]
 [-1.5879338  -0.61527181]
 [-1.53019075 -0.61527181]
 [-1.47244771 -1.15045581]
 [-1.41470466 -1.15045581]
 [-1.35696161 -1.15045581]
 [-1.29921857 -1.15045581]
 [-1.24147552 -1.15045581]
 [-1.18373247 -1.15045581]
 [-1.12598942 -1.07074755]
 [-1.06824638 -1.15045581]
 [-1.01050333 -1.15045581]
 [-0.95276028 -1.07074755]
 [-0.89501723 -1.07074755]
 [-0.83727419  0.09071558]
 [-0.77953114  0.45509618]
 [-0.72178809 -0.37614705]
 [-0.66404504 -1.07074755]
 [-0.606302   -1.15045581]
 [-0.54855895 -0.61527181]
 [-0.4908159  -0.61527181]
 [-0.43307286 -0.61527181]
 [-0.37532981 -0.61527181]
 [-0.31758676 -0.61527181]
 [-0.25984371 -0.61527181]
 [-0.20210067 -0.61527181]
 [-0.14435762 -0.05731403]
 [-0.08661457 -0.05731403]
 [-0.02887152 -0.05731403]
 [ 0.02887152 -0.05731403]
 [ 0.08661457 -0.05731403]
 [ 0.14435762 -0.05731403]
 [ 0.20210067  0.20458452]
 [ 0.25984371  0.20458452]
 [ 0.31758676  0.20458452]
 [ 0.37532981 -0.05731403]
 [ 0.43307286  0.4209355 ]
 [ 0.4908159   0.31845345]
 [ 0.54855895  0.92195881]
 [ 0.606302   -0.06870092]
 [ 0.66404504 -0.06870092]
 [ 0.72178809 -0.05731403]
 [ 0.77953114 -0.05731403]
 [ 0.83727419 -0.05731403]
 [ 0.89501723 -0.05731403]
 [ 0.95276028 -0.05731403]
 [ 1.01050333  1.52546416]
 [ 1.06824638  1.115536  ]
 [ 1.12598942  1.38882144]
 [ 1.18373247  2.18590399]
 [ 1.24147552  1.115536  ]
 [ 1.29921857 -0.12563539]
 [ 1.35696161  2.84634381]
 [ 1.41470466  0.9447326 ]
 [ 1.47244771  1.19524425]
 [ 1.53019075  0.9447326 ]
 [ 1.5879338   2.74386177]
 [ 1.64567685  1.19524425]
 [ 1.7034199   2.47057632]]
Out[374]:
Price Date_of_Flight Start_Date dbscan_labels
0 609.0 2017-05-10 1 0
1 588.0 2017-05-11 2 0
2 588.0 2017-05-12 3 0
3 588.0 2017-05-13 4 0
4 541.0 2017-05-14 5 1
5 541.0 2017-05-15 6 1
6 541.0 2017-05-16 7 1
7 541.0 2017-05-17 8 1
8 541.0 2017-05-18 9 1
9 541.0 2017-05-19 10 1
10 548.0 2017-05-20 11 1
11 541.0 2017-05-21 12 1
12 541.0 2017-05-22 13 1
13 548.0 2017-05-23 14 1
14 548.0 2017-05-24 15 1
15 650.0 2017-05-25 16 -1
16 682.0 2017-05-26 17 -1
17 609.0 2017-05-27 18 2
18 548.0 2017-05-28 19 1
19 541.0 2017-05-29 20 1
20 588.0 2017-05-30 21 2
21 588.0 2017-05-31 22 2
22 588.0 2017-06-01 23 2
23 588.0 2017-06-02 24 2
24 588.0 2017-06-03 25 2
25 588.0 2017-06-04 26 2
26 588.0 2017-06-05 27 2
27 637.0 2017-06-06 28 3
28 637.0 2017-06-07 29 3
29 637.0 2017-06-08 30 3
30 637.0 2017-06-09 31 3
31 637.0 2017-06-10 32 3
32 637.0 2017-06-11 33 3
33 660.0 2017-06-12 34 3
34 660.0 2017-06-13 35 3
35 660.0 2017-06-14 36 3
36 637.0 2017-06-15 37 3
37 679.0 2017-06-16 38 3
38 670.0 2017-06-17 39 3
39 723.0 2017-06-18 40 -1
40 636.0 2017-06-19 41 3
41 636.0 2017-06-20 42 3
42 637.0 2017-06-21 43 3
43 637.0 2017-06-22 44 3
44 637.0 2017-06-23 45 3
45 637.0 2017-06-24 46 3
46 637.0 2017-06-25 47 3
47 776.0 2017-06-26 48 4
48 740.0 2017-06-27 49 4
49 764.0 2017-06-28 50 4
50 834.0 2017-06-29 51 -1
51 740.0 2017-06-30 52 4
52 631.0 2017-07-01 53 -1
53 892.0 2017-07-02 54 5
54 725.0 2017-07-03 55 4
55 747.0 2017-07-04 56 4
56 725.0 2017-07-05 57 4
57 883.0 2017-07-06 58 5
58 747.0 2017-07-07 59 4
59 859.0 2017-07-08 60 5

In [345]:
# result of clustering
labels = db.labels_
clusters = len(set(labels))
unique_labels = set(labels)
print labels
print clusters
print unique_labels


[ 1  0  0  0  0  1  1  1  1 -1  1  1  1  1  1 -1  1  1  1  1  2  2 -1  2 -1
  2 -1 -1  2 -1 -1 -1  2  2  2  2  2  2  2  2 -1  2 -1  3 -1  2  3  2  3 -1
  3  3  3  3  3  3  3 -1 -1 -1]
5
set([0, 1, 2, 3, -1])

In [346]:
# plot results of clustering
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

matplotlib.style.use('ggplot')
plt.subplots(figsize=(12,8))

colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
 
for k, c in zip(unique_labels, colors):
    class_member_mask = (labels == k)  # get all the points in class k
    xy = X[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
            markeredgecolor='k', markersize=14)
 
plt.title("Total Clusters: {}".format(clusters), fontsize=14, y=1.01)
#plt.savefig('task_3_dbscan.png')


Out[346]:
<matplotlib.text.Text at 0x11d428110>

In [347]:
# Mean of each cluster & other features of cluster
mean_points = []
for n in unique_labels:
    x = pd.DataFrame(X)[df.dbscan_labels == n]
    mp = x.mean(axis=0)
    #calculate threshold for each cluster
    d = df[df.dbscan_labels == n]
    m = np.mean(d.Price)
    std = np.std(d.Price)
    thrshd = m - max(2*std,50)
    mean_points.append((n, mp[0], mp[1], m, std, thrshd))

mean_points = pd.DataFrame(mean_points)
mean_points.columns=['Cluster','Start_Date_X','Price_X','Mean_Price','Std_Price','Threshold']
mean_points = mean_points[mean_points.Cluster != -1]
mean_points


Out[347]:
Cluster Start_Date_X Price_X Mean_Price Std_Price Threshould
0 0 -1.559062 -0.700911 921.750000 7.395100 871.750000
1 1 -1.059997 -0.300562 960.571429 9.240550 910.571429
2 2 0.223754 -0.697043 922.125000 23.642850 872.125000
3 3 1.229927 1.420900 1127.500000 21.448776 1077.500000

In [348]:
# Outliers after scaling
df_outliers = pd.DataFrame(X)[df.dbscan_labels == -1]
df_outliers.columns=['Start_Date_X', 'Price_X']
df_outliers


Out[348]:
Start_Date_X Price_X
9 -1.183732 -1.141773
15 -0.837274 -1.007710
22 -0.433073 0.023547
24 -0.317587 -0.347705
26 -0.202101 -1.678026
27 -0.144358 -2.049279
29 -0.028872 0.621676
30 0.028872 0.116360
31 0.086615 0.106048
40 0.606302 0.477300
42 0.721788 2.302624
44 0.837274 0.477300
49 1.125989 2.271686
57 1.587934 2.312937
58 1.645677 0.735114
59 1.703420 0.735114

In [349]:
# Find the closest cluster for outlier flights
from scipy.spatial import distance

min_dist_list=[]
nearest_cluster=[]
for j in range(df_outliers.shape[0]):
    outlier = df_outliers.iloc[j]
    dist = []
    for i in range(mean_points.shape[0]):
        mean = mean_points[['Start_Date_X', 'Price_X']].iloc[i]
        dist.append(distance.euclidean(outlier,mean))
    min_dist_list.append(min(dist))
    for k, d in enumerate(dist):
        if d == min(dist):
            nearest_cluster.append(k)

print nearest_cluster
min_dist_list


[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]
Out[349]:
[0.5789921539509403,
 0.741392587548575,
 0.7057486213718538,
 0.6442726871797768,
 1.0694295963071705,
 1.40144433439152,
 1.3426989124640343,
 0.8364237540348355,
 0.8147162069556212,
 1.1310563514824572,
 1.0176653949178447,
 1.0220355181034213,
 0.8571119544046722,
 0.9611965269296006,
 0.8019662659841272,
 0.8333651590576142]

In [350]:
outliers = df_outliers.copy()
outliers['closest_cluster'] = pd.Series(nearest_cluster).values  # Nearest cluster of each outlier
outliers['Price'] = df['Price'][df.dbscan_labels == -1]  # Orignal price of each outlier
outliers['Date_of_Flight'] = df['Date_of_Flight'][df.dbscan_labels == -1]  # Orignal date of each outlier
outliers['threshold'] = mean_points['Threshold'][outliers['closest_cluster']].values # Threshold of the closest cluster
outliers


Out[350]:
Start_Date_X Price_X closest_cluster Price Date_of_Flight threshould
9 -1.183732 -1.141773 0 879.0 2017-04-19 871.750000
15 -0.837274 -1.007710 1 892.0 2017-04-25 910.571429
22 -0.433073 0.023547 1 992.0 2017-05-02 910.571429
24 -0.317587 -0.347705 2 956.0 2017-05-04 872.125000
26 -0.202101 -1.678026 2 827.0 2017-05-06 872.125000
27 -0.144358 -2.049279 2 791.0 2017-05-07 872.125000
29 -0.028872 0.621676 2 1050.0 2017-05-09 872.125000
30 0.028872 0.116360 2 1001.0 2017-05-10 872.125000
31 0.086615 0.106048 2 1000.0 2017-05-11 872.125000
40 0.606302 0.477300 3 1036.0 2017-05-20 1077.500000
42 0.721788 2.302624 3 1213.0 2017-05-22 1077.500000
44 0.837274 0.477300 3 1036.0 2017-05-24 1077.500000
49 1.125989 2.271686 3 1210.0 2017-05-29 1077.500000
57 1.587934 2.312937 3 1214.0 2017-06-06 1077.500000
58 1.645677 0.735114 3 1061.0 2017-06-07 1077.500000
59 1.703420 0.735114 3 1061.0 2017-06-08 1077.500000

In [354]:
result = outliers[['Price','Date_of_Flight']][outliers.Price <= outliers.threshold]
result


Out[354]:
Price Date_of_Flight
15 892.0 2017-04-25
26 827.0 2017-05-06
27 791.0 2017-05-07
40 1036.0 2017-05-20
44 1036.0 2017-05-24
58 1061.0 2017-06-07
59 1061.0 2017-06-08

In [358]:
result = pd.DataFrame()
result.shape


Out[358]:
(0, 0)

In [5]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

# ========== Task 3 question 1 ==========
def task_3_dbscan(flight_data):
    '''
    flight_data: a pandas DataFrame object with 2 columns 'Price' & 'Date_of_Flight', one row for each day
    return: a pandas DataFrame object with 2 colomns 'Price' & 'Date_of_Flight',  one row for each outlier flight
    '''
    df = flight_data
    start_date = []
    for i in range(len(df)):
        start_date.append((df['Date_of_Flight'][i]- df['Date_of_Flight'][0]).days +1)
    
    # clustering
    df['Start_Date'] = pd.Series(start_date).values
    X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
    db = DBSCAN(eps = 0.3, min_samples = 3).fit(X)
    df['dbscan_labels'] = db.labels_
    
    # plot results of clustering
    labels = db.labels_
    clusters = len(set(labels))
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    matplotlib.style.use('ggplot')
    plt.subplots(figsize=(12,8))
    
    for k, c in zip(unique_labels, colors):
        class_member_mask = (labels == k)  # get all the points in class k
        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
                markeredgecolor='k', markersize=14)
 
    plt.title("Total Clusters: {}".format(clusters)+ " Eps=0.3 Min_spl=3", fontsize=14, y=1.01)
    plt.savefig('task_3_dbscan.png')
    
    # Mean of each cluster & other features of cluster
    mean_points = []
    for n in unique_labels:
        x = pd.DataFrame(X)[df.dbscan_labels == n]
        mp = x.mean(axis=0)
        # calculate threshold for each cluster
        d = df[df.dbscan_labels == n]
        m = np.mean(d.Price)
        std = np.std(d.Price)
        thrshd = m - max(2*std,50)
        mean_points.append((n, mp[0], mp[1], m, std, thrshd))
    
    mean_points = pd.DataFrame(mean_points)
    mean_points.columns=['Cluster','Start_Date_X','Price_X','Mean_Price','Std_Price','Threshold']
    mean_points = mean_points[mean_points.Cluster != -1]
    
    # Outliers with scaled features
    df_outliers = pd.DataFrame(X)[df.dbscan_labels == -1]
    df_outliers.columns=['Start_Date_X', 'Price_X']
    
    # Find the closest cluster for outlier flights
    #min_dist_list=[]
    nearest_cluster=[]
    for j in range(df_outliers.shape[0]):
        outlier = df_outliers.iloc[j]
        dist = []
        for i in range(mean_points.shape[0]):
            mean = mean_points[['Start_Date_X', 'Price_X']].iloc[i]
            dist.append(distance.euclidean(outlier,mean))
        #min_dist_list.append(min(dist))
        for k, d in enumerate(dist):
            if d == min(dist):
                nearest_cluster.append(k)
    
    outliers = df_outliers.copy()
    outliers['closest_cluster'] = pd.Series(nearest_cluster).values  # Nearest cluster of each outlier
    outliers['Price'] = df['Price'][df.dbscan_labels == -1]  # Original price of each outlier
    outliers['Date_of_Flight'] = df['Date_of_Flight'][df.dbscan_labels == -1]  # Original date of each outlier
    outliers['threshold'] = mean_points['Threshold'][outliers['closest_cluster']].values # Threshold of the closest cluster
    
    result = outliers[['Price','Date_of_Flight']][outliers.Price <= outliers.threshold]
    if result.shape[0] != 0:
        return result
    else:
        raise Exception('There is no outlier price in this period!')

In [7]:
# test 1
data_60 = scrape_data(datetime.datetime(2017, 4, 10), 'Beijing', 'Mexico', 'Mexico City')
task_3_dbscan(data_60)


Out[7]:
Price Date_of_Flight
26 827.0 2017-05-06
27 791.0 2017-05-07

In [376]:
# test 2
data_60 = scrape_data(datetime.datetime(2017, 5, 10), 'Beijing', 'United States', 'San Francisco')
task_3_dbscan(data_60)


---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-376-aa197a72d72a> in <module>()
      2 #data_60 = scrape_data(datetime.datetime(2017, 5, 10), 'Beijing', 'Mexico', 'Cancun')
      3 #data_60 = scrape_data(datetime.datetime(2017, 5, 10), 'Beijing', 'United States', 'San Francisco')
----> 4 task_3_dbscan(data_60)

<ipython-input-375-038ae8f8c45d> in task_3_dbscan(flight_data)
     85         return result
     86     else:
---> 87         raise Exception('Good Price not Found!')
     88 

Exception: Good Price not Found!

Choose the best epsilon and min_sample value


In [381]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

# ========== Task 3 question 1 ==========
def task_3_dbscan_tuning(flight_data, eps_val, min_spl):
    '''
    flight_data: a pandas DataFrame object with 2 columns 'Price' & 'Date_of_Flight', one row for each day
    return: a pandas DataFrame object with 2 colomns 'Price' & 'Date_of_Flight',  one row for each outlier flight
    '''
    df = flight_data
    start_date = []
    for i in range(len(df)):
        start_date.append((df['Date_of_Flight'][i]- df['Date_of_Flight'][0]).days +1)
    
    # clustering
    df['Start_Date'] = pd.Series(start_date).values
    X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
    db = DBSCAN(eps = eps_val, min_samples = min_spl).fit(X)
    df['dbscan_labels'] = db.labels_
    
    # plot
    labels = db.labels_
    clusters = len(set(labels))
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    matplotlib.style.use('ggplot')
    plt.subplots(figsize=(12,8))
    
    for k, c in zip(unique_labels, colors):
        class_member_mask = (labels == k)  # get all the points in class k
        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
                markeredgecolor='k', markersize=14)
 
    plt.title("Total Clusters: {}".format(clusters)+ " Eps={}".format(eps_val)+ " Min_spl={}".format(min_spl), fontsize=14, y=1.01)
    #plt.savefig('task_3_dbscan.png')
    print unique_labels

In [382]:
task_3_dbscan_tuning(data_60, 0.3, 3)


set([0, 1, 2, 3, 4, 5, -1])

In [ ]: