In [2]:

    
import time
import re
from unidecode import unidecode
import pandas as pd
import datetime
from dateutil.parser import parse
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# ======== Task 1 =========
def scrape_data(start_date, from_place, to_place, city_name):
    """
    start_date: a datetime object for the start date that you should use in your query to Google Flight explorer
    from_place: a string with the name of the origin of the flights
    to_place: a string with the name of the regional destination of the flights
    city_name: a string for the name of the city who's data that you should actually scrape
    
    return: a pandas DataFrame object with two columns "Date_of_Flight" and "Price." , and one row for each day
    
    """
    driver = webdriver.Chrome()
    driver.get('https://www.google.com/flights/explore/')
    time.sleep(2)  # wait for the driver to load data
    
    # from_place
    driver.find_elements_by_class_name('LJTSM3-p-a')[0].click()
    ActionChains(driver).send_keys(from_place).perform()
    ActionChains(driver).send_keys(Keys.ENTER).perform()
    time.sleep(1)
    
    # to_place
    driver.find_elements_by_class_name('LJTSM3-p-a')[1].click()
    ActionChains(driver).send_keys(to_place).perform()
    ActionChains(driver).send_keys(Keys.ENTER).perform()
    time.sleep(1)
    
    # start_date
    start_date_str = start_date.strftime("%Y-%m-%d")
    url_date = driver.current_url[:-10] + start_date_str
    driver.get(url_date)
    time.sleep(2)
    
    # use city_name to choose target city
    results_city = driver.find_elements_by_class_name('LJTSM3-v-c')
    city_num = None

    for i in range(len(results_city)):
        city = results_city[i]
        if re.findall(city_name.lower(), unidecode(city.text).lower()):
            city_num = i
    if city_num == None:
        driver.quit()
        raise Exception('City Name not Found!')    
    time.sleep(1)
    
    # get bars of target city
    results = driver.find_elements_by_class_name('LJTSM3-v-d')
    target = results[city_num]
    bars = target.find_elements_by_class_name('LJTSM3-w-x')
    data = []
    time.sleep(2)

    for bar in bars:
        ActionChains(driver).move_to_element(bar).perform()
        time.sleep(0.01)
        data.append((target.find_element_by_class_name('LJTSM3-w-k').find_elements_by_tag_name('div')[0].text,  # get price
               target.find_element_by_class_name('LJTSM3-w-k').find_elements_by_tag_name('div')[1].text))      # get date of flight

    data = [x for x in data if str(x[0]) != '']  # exclude null bars
    
    # convert into dataframe and return
    clean_data = [(float(d[0].replace('$', '').replace(',', '')), parse(d[1].split('-')[0].strip()))  for d in data]
    df = pd.DataFrame(clean_data, columns=['Price', 'Date_of_Flight'])
    driver.quit()
    return df

Task 3 question 2



In [31]:

    
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

flight_data = data_60
plt.boxplot(flight_data['Price'])
plt.title('Boxplot of Prices')
plt.savefig('task_3_iqr.png')



In [32]:

    
import numpy as np
q1 = np.percentile(flight_data['Price'], 25)
q3 = np.percentile(flight_data['Price'], 75)
IQR = q3 - q1
low_bound = q1-1.5*IQR
outliers = flight_data[flight_data['Price'] < low_bound]

print low_bound
outliers.shape









    



720.875






    Out[32]:





(0, 2)



In [35]:

    
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# ======== Task 3 question 2 =========
def task_3_IQR(flight_data):
    df = flight_data
    plt.boxplot(df['Price'])
    plt.title('Boxplot of Prices')
    plt.savefig('task_3_iqr.png')
    
    q1 = np.percentile(df['Price'], 25)
    q3 = np.percentile(df['Price'], 75)
    IQR = q3 - q1
    low_bound = q1-1.5*IQR
    outliers = df[df['Price'] < low_bound]
    if outliers.shape[0] != 0:
        return outliers
    else:
        raise Exception('There is no outlier price!')



In [36]:

    
outliers_IQR = task_3_IQR(data_60)
outliers_IQR









    



---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-36-3b98241d5634> in <module>()
----> 1 outliers_IQR = task_3_IQR(data_60)
      2 outliers_IQR

<ipython-input-35-b510f155a918> in task_3_IQR(flight_data)
     18         return outliers
     19     else:
---> 20         raise Exception('There is no outlier price!')

Exception: There is no outlier price!

Task 3 question 1



In [343]:

    
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np



In [374]:

    
# clustering
df = data_60
start_date = []
for i in range(len(df)):
    start_date.append((df['Date_of_Flight'][i]- df['Date_of_Flight'][0]).days +1)

df['Start_Date'] = pd.Series(start_date).values
X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
db = DBSCAN(eps=.3, min_samples=3).fit(X)
df['dbscan_labels'] = db.labels_

print X
df









    



[[-1.7034199  -0.37614705]
 [-1.64567685 -0.61527181]
 [-1.5879338  -0.61527181]
 [-1.53019075 -0.61527181]
 [-1.47244771 -1.15045581]
 [-1.41470466 -1.15045581]
 [-1.35696161 -1.15045581]
 [-1.29921857 -1.15045581]
 [-1.24147552 -1.15045581]
 [-1.18373247 -1.15045581]
 [-1.12598942 -1.07074755]
 [-1.06824638 -1.15045581]
 [-1.01050333 -1.15045581]
 [-0.95276028 -1.07074755]
 [-0.89501723 -1.07074755]
 [-0.83727419  0.09071558]
 [-0.77953114  0.45509618]
 [-0.72178809 -0.37614705]
 [-0.66404504 -1.07074755]
 [-0.606302   -1.15045581]
 [-0.54855895 -0.61527181]
 [-0.4908159  -0.61527181]
 [-0.43307286 -0.61527181]
 [-0.37532981 -0.61527181]
 [-0.31758676 -0.61527181]
 [-0.25984371 -0.61527181]
 [-0.20210067 -0.61527181]
 [-0.14435762 -0.05731403]
 [-0.08661457 -0.05731403]
 [-0.02887152 -0.05731403]
 [ 0.02887152 -0.05731403]
 [ 0.08661457 -0.05731403]
 [ 0.14435762 -0.05731403]
 [ 0.20210067  0.20458452]
 [ 0.25984371  0.20458452]
 [ 0.31758676  0.20458452]
 [ 0.37532981 -0.05731403]
 [ 0.43307286  0.4209355 ]
 [ 0.4908159   0.31845345]
 [ 0.54855895  0.92195881]
 [ 0.606302   -0.06870092]
 [ 0.66404504 -0.06870092]
 [ 0.72178809 -0.05731403]
 [ 0.77953114 -0.05731403]
 [ 0.83727419 -0.05731403]
 [ 0.89501723 -0.05731403]
 [ 0.95276028 -0.05731403]
 [ 1.01050333  1.52546416]
 [ 1.06824638  1.115536  ]
 [ 1.12598942  1.38882144]
 [ 1.18373247  2.18590399]
 [ 1.24147552  1.115536  ]
 [ 1.29921857 -0.12563539]
 [ 1.35696161  2.84634381]
 [ 1.41470466  0.9447326 ]
 [ 1.47244771  1.19524425]
 [ 1.53019075  0.9447326 ]
 [ 1.5879338   2.74386177]
 [ 1.64567685  1.19524425]
 [ 1.7034199   2.47057632]]






    Out[374]:






  
    
      
      Price
      Date_of_Flight
      Start_Date
      dbscan_labels
    
  
  
    
      0
      609.0
      2017-05-10
      1
      0
    
    
      1
      588.0
      2017-05-11
      2
      0
    
    
      2
      588.0
      2017-05-12
      3
      0
    
    
      3
      588.0
      2017-05-13
      4
      0
    
    
      4
      541.0
      2017-05-14
      5
      1
    
    
      5
      541.0
      2017-05-15
      6
      1
    
    
      6
      541.0
      2017-05-16
      7
      1
    
    
      7
      541.0
      2017-05-17
      8
      1
    
    
      8
      541.0
      2017-05-18
      9
      1
    
    
      9
      541.0
      2017-05-19
      10
      1
    
    
      10
      548.0
      2017-05-20
      11
      1
    
    
      11
      541.0
      2017-05-21
      12
      1
    
    
      12
      541.0
      2017-05-22
      13
      1
    
    
      13
      548.0
      2017-05-23
      14
      1
    
    
      14
      548.0
      2017-05-24
      15
      1
    
    
      15
      650.0
      2017-05-25
      16
      -1
    
    
      16
      682.0
      2017-05-26
      17
      -1
    
    
      17
      609.0
      2017-05-27
      18
      2
    
    
      18
      548.0
      2017-05-28
      19
      1
    
    
      19
      541.0
      2017-05-29
      20
      1
    
    
      20
      588.0
      2017-05-30
      21
      2
    
    
      21
      588.0
      2017-05-31
      22
      2
    
    
      22
      588.0
      2017-06-01
      23
      2
    
    
      23
      588.0
      2017-06-02
      24
      2
    
    
      24
      588.0
      2017-06-03
      25
      2
    
    
      25
      588.0
      2017-06-04
      26
      2
    
    
      26
      588.0
      2017-06-05
      27
      2
    
    
      27
      637.0
      2017-06-06
      28
      3
    
    
      28
      637.0
      2017-06-07
      29
      3
    
    
      29
      637.0
      2017-06-08
      30
      3
    
    
      30
      637.0
      2017-06-09
      31
      3
    
    
      31
      637.0
      2017-06-10
      32
      3
    
    
      32
      637.0
      2017-06-11
      33
      3
    
    
      33
      660.0
      2017-06-12
      34
      3
    
    
      34
      660.0
      2017-06-13
      35
      3
    
    
      35
      660.0
      2017-06-14
      36
      3
    
    
      36
      637.0
      2017-06-15
      37
      3
    
    
      37
      679.0
      2017-06-16
      38
      3
    
    
      38
      670.0
      2017-06-17
      39
      3
    
    
      39
      723.0
      2017-06-18
      40
      -1
    
    
      40
      636.0
      2017-06-19
      41
      3
    
    
      41
      636.0
      2017-06-20
      42
      3
    
    
      42
      637.0
      2017-06-21
      43
      3
    
    
      43
      637.0
      2017-06-22
      44
      3
    
    
      44
      637.0
      2017-06-23
      45
      3
    
    
      45
      637.0
      2017-06-24
      46
      3
    
    
      46
      637.0
      2017-06-25
      47
      3
    
    
      47
      776.0
      2017-06-26
      48
      4
    
    
      48
      740.0
      2017-06-27
      49
      4
    
    
      49
      764.0
      2017-06-28
      50
      4
    
    
      50
      834.0
      2017-06-29
      51
      -1
    
    
      51
      740.0
      2017-06-30
      52
      4
    
    
      52
      631.0
      2017-07-01
      53
      -1
    
    
      53
      892.0
      2017-07-02
      54
      5
    
    
      54
      725.0
      2017-07-03
      55
      4
    
    
      55
      747.0
      2017-07-04
      56
      4
    
    
      56
      725.0
      2017-07-05
      57
      4
    
    
      57
      883.0
      2017-07-06
      58
      5
    
    
      58
      747.0
      2017-07-07
      59
      4
    
    
      59
      859.0
      2017-07-08
      60
      5



In [345]:

    
# result of clustering
labels = db.labels_
clusters = len(set(labels))
unique_labels = set(labels)
print labels
print clusters
print unique_labels









    



[ 1  0  0  0  0  1  1  1  1 -1  1  1  1  1  1 -1  1  1  1  1  2  2 -1  2 -1
  2 -1 -1  2 -1 -1 -1  2  2  2  2  2  2  2  2 -1  2 -1  3 -1  2  3  2  3 -1
  3  3  3  3  3  3  3 -1 -1 -1]
5
set([0, 1, 2, 3, -1])



In [346]:

    
# plot results of clustering
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

matplotlib.style.use('ggplot')
plt.subplots(figsize=(12,8))

colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
 
for k, c in zip(unique_labels, colors):
    class_member_mask = (labels == k)  # get all the points in class k
    xy = X[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
            markeredgecolor='k', markersize=14)
 
plt.title("Total Clusters: {}".format(clusters), fontsize=14, y=1.01)
#plt.savefig('task_3_dbscan.png')









    Out[346]:





<matplotlib.text.Text at 0x11d428110>



In [347]:

    
# Mean of each cluster & other features of cluster
mean_points = []
for n in unique_labels:
    x = pd.DataFrame(X)[df.dbscan_labels == n]
    mp = x.mean(axis=0)
    #calculate threshold for each cluster
    d = df[df.dbscan_labels == n]
    m = np.mean(d.Price)
    std = np.std(d.Price)
    thrshd = m - max(2*std,50)
    mean_points.append((n, mp[0], mp[1], m, std, thrshd))

mean_points = pd.DataFrame(mean_points)
mean_points.columns=['Cluster','Start_Date_X','Price_X','Mean_Price','Std_Price','Threshold']
mean_points = mean_points[mean_points.Cluster != -1]
mean_points









    Out[347]:






  
    
      
      Cluster
      Start_Date_X
      Price_X
      Mean_Price
      Std_Price
      Threshould
    
  
  
    
      0
      0
      -1.559062
      -0.700911
      921.750000
      7.395100
      871.750000
    
    
      1
      1
      -1.059997
      -0.300562
      960.571429
      9.240550
      910.571429
    
    
      2
      2
      0.223754
      -0.697043
      922.125000
      23.642850
      872.125000
    
    
      3
      3
      1.229927
      1.420900
      1127.500000
      21.448776
      1077.500000



In [348]:

    
# Outliers after scaling
df_outliers = pd.DataFrame(X)[df.dbscan_labels == -1]
df_outliers.columns=['Start_Date_X', 'Price_X']
df_outliers









    Out[348]:






  
    
      
      Start_Date_X
      Price_X
    
  
  
    
      9
      -1.183732
      -1.141773
    
    
      15
      -0.837274
      -1.007710
    
    
      22
      -0.433073
      0.023547
    
    
      24
      -0.317587
      -0.347705
    
    
      26
      -0.202101
      -1.678026
    
    
      27
      -0.144358
      -2.049279
    
    
      29
      -0.028872
      0.621676
    
    
      30
      0.028872
      0.116360
    
    
      31
      0.086615
      0.106048
    
    
      40
      0.606302
      0.477300
    
    
      42
      0.721788
      2.302624
    
    
      44
      0.837274
      0.477300
    
    
      49
      1.125989
      2.271686
    
    
      57
      1.587934
      2.312937
    
    
      58
      1.645677
      0.735114
    
    
      59
      1.703420
      0.735114



In [349]:

    
# Find the closest cluster for outlier flights
from scipy.spatial import distance

min_dist_list=[]
nearest_cluster=[]
for j in range(df_outliers.shape[0]):
    outlier = df_outliers.iloc[j]
    dist = []
    for i in range(mean_points.shape[0]):
        mean = mean_points[['Start_Date_X', 'Price_X']].iloc[i]
        dist.append(distance.euclidean(outlier,mean))
    min_dist_list.append(min(dist))
    for k, d in enumerate(dist):
        if d == min(dist):
            nearest_cluster.append(k)

print nearest_cluster
min_dist_list









    



[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]






    Out[349]:





[0.5789921539509403,
 0.741392587548575,
 0.7057486213718538,
 0.6442726871797768,
 1.0694295963071705,
 1.40144433439152,
 1.3426989124640343,
 0.8364237540348355,
 0.8147162069556212,
 1.1310563514824572,
 1.0176653949178447,
 1.0220355181034213,
 0.8571119544046722,
 0.9611965269296006,
 0.8019662659841272,
 0.8333651590576142]



In [350]:

    
outliers = df_outliers.copy()
outliers['closest_cluster'] = pd.Series(nearest_cluster).values  # Nearest cluster of each outlier
outliers['Price'] = df['Price'][df.dbscan_labels == -1]  # Orignal price of each outlier
outliers['Date_of_Flight'] = df['Date_of_Flight'][df.dbscan_labels == -1]  # Orignal date of each outlier
outliers['threshold'] = mean_points['Threshold'][outliers['closest_cluster']].values # Threshold of the closest cluster
outliers









    Out[350]:






  
    
      
      Start_Date_X
      Price_X
      closest_cluster
      Price
      Date_of_Flight
      threshould
    
  
  
    
      9
      -1.183732
      -1.141773
      0
      879.0
      2017-04-19
      871.750000
    
    
      15
      -0.837274
      -1.007710
      1
      892.0
      2017-04-25
      910.571429
    
    
      22
      -0.433073
      0.023547
      1
      992.0
      2017-05-02
      910.571429
    
    
      24
      -0.317587
      -0.347705
      2
      956.0
      2017-05-04
      872.125000
    
    
      26
      -0.202101
      -1.678026
      2
      827.0
      2017-05-06
      872.125000
    
    
      27
      -0.144358
      -2.049279
      2
      791.0
      2017-05-07
      872.125000
    
    
      29
      -0.028872
      0.621676
      2
      1050.0
      2017-05-09
      872.125000
    
    
      30
      0.028872
      0.116360
      2
      1001.0
      2017-05-10
      872.125000
    
    
      31
      0.086615
      0.106048
      2
      1000.0
      2017-05-11
      872.125000
    
    
      40
      0.606302
      0.477300
      3
      1036.0
      2017-05-20
      1077.500000
    
    
      42
      0.721788
      2.302624
      3
      1213.0
      2017-05-22
      1077.500000
    
    
      44
      0.837274
      0.477300
      3
      1036.0
      2017-05-24
      1077.500000
    
    
      49
      1.125989
      2.271686
      3
      1210.0
      2017-05-29
      1077.500000
    
    
      57
      1.587934
      2.312937
      3
      1214.0
      2017-06-06
      1077.500000
    
    
      58
      1.645677
      0.735114
      3
      1061.0
      2017-06-07
      1077.500000
    
    
      59
      1.703420
      0.735114
      3
      1061.0
      2017-06-08
      1077.500000



In [354]:

    
result = outliers[['Price','Date_of_Flight']][outliers.Price <= outliers.threshold]
result









    Out[354]:






  
    
      
      Price
      Date_of_Flight
    
  
  
    
      15
      892.0
      2017-04-25
    
    
      26
      827.0
      2017-05-06
    
    
      27
      791.0
      2017-05-07
    
    
      40
      1036.0
      2017-05-20
    
    
      44
      1036.0
      2017-05-24
    
    
      58
      1061.0
      2017-06-07
    
    
      59
      1061.0
      2017-06-08



In [358]:

    
result = pd.DataFrame()
result.shape









    Out[358]:





(0, 0)



In [5]:

    
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

# ========== Task 3 question 1 ==========
def task_3_dbscan(flight_data):
    '''
    flight_data: a pandas DataFrame object with 2 columns 'Price' & 'Date_of_Flight', one row for each day
    return: a pandas DataFrame object with 2 colomns 'Price' & 'Date_of_Flight',  one row for each outlier flight
    '''
    df = flight_data
    start_date = []
    for i in range(len(df)):
        start_date.append((df['Date_of_Flight'][i]- df['Date_of_Flight'][0]).days +1)
    
    # clustering
    df['Start_Date'] = pd.Series(start_date).values
    X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
    db = DBSCAN(eps = 0.3, min_samples = 3).fit(X)
    df['dbscan_labels'] = db.labels_
    
    # plot results of clustering
    labels = db.labels_
    clusters = len(set(labels))
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    matplotlib.style.use('ggplot')
    plt.subplots(figsize=(12,8))
    
    for k, c in zip(unique_labels, colors):
        class_member_mask = (labels == k)  # get all the points in class k
        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
                markeredgecolor='k', markersize=14)
 
    plt.title("Total Clusters: {}".format(clusters)+ " Eps=0.3 Min_spl=3", fontsize=14, y=1.01)
    plt.savefig('task_3_dbscan.png')
    
    # Mean of each cluster & other features of cluster
    mean_points = []
    for n in unique_labels:
        x = pd.DataFrame(X)[df.dbscan_labels == n]
        mp = x.mean(axis=0)
        # calculate threshold for each cluster
        d = df[df.dbscan_labels == n]
        m = np.mean(d.Price)
        std = np.std(d.Price)
        thrshd = m - max(2*std,50)
        mean_points.append((n, mp[0], mp[1], m, std, thrshd))
    
    mean_points = pd.DataFrame(mean_points)
    mean_points.columns=['Cluster','Start_Date_X','Price_X','Mean_Price','Std_Price','Threshold']
    mean_points = mean_points[mean_points.Cluster != -1]
    
    # Outliers with scaled features
    df_outliers = pd.DataFrame(X)[df.dbscan_labels == -1]
    df_outliers.columns=['Start_Date_X', 'Price_X']
    
    # Find the closest cluster for outlier flights
    #min_dist_list=[]
    nearest_cluster=[]
    for j in range(df_outliers.shape[0]):
        outlier = df_outliers.iloc[j]
        dist = []
        for i in range(mean_points.shape[0]):
            mean = mean_points[['Start_Date_X', 'Price_X']].iloc[i]
            dist.append(distance.euclidean(outlier,mean))
        #min_dist_list.append(min(dist))
        for k, d in enumerate(dist):
            if d == min(dist):
                nearest_cluster.append(k)
    
    outliers = df_outliers.copy()
    outliers['closest_cluster'] = pd.Series(nearest_cluster).values  # Nearest cluster of each outlier
    outliers['Price'] = df['Price'][df.dbscan_labels == -1]  # Original price of each outlier
    outliers['Date_of_Flight'] = df['Date_of_Flight'][df.dbscan_labels == -1]  # Original date of each outlier
    outliers['threshold'] = mean_points['Threshold'][outliers['closest_cluster']].values # Threshold of the closest cluster
    
    result = outliers[['Price','Date_of_Flight']][outliers.Price <= outliers.threshold]
    if result.shape[0] != 0:
        return result
    else:
        raise Exception('There is no outlier price in this period!')



In [7]:

    
# test 1
data_60 = scrape_data(datetime.datetime(2017, 4, 10), 'Beijing', 'Mexico', 'Mexico City')
task_3_dbscan(data_60)









    Out[7]:






  
    
      
      Price
      Date_of_Flight
    
  
  
    
      26
      827.0
      2017-05-06
    
    
      27
      791.0
      2017-05-07



In [376]:

    
# test 2
data_60 = scrape_data(datetime.datetime(2017, 5, 10), 'Beijing', 'United States', 'San Francisco')
task_3_dbscan(data_60)









    



---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-376-aa197a72d72a> in <module>()
      2 #data_60 = scrape_data(datetime.datetime(2017, 5, 10), 'Beijing', 'Mexico', 'Cancun')
      3 #data_60 = scrape_data(datetime.datetime(2017, 5, 10), 'Beijing', 'United States', 'San Francisco')
----> 4 task_3_dbscan(data_60)

<ipython-input-375-038ae8f8c45d> in task_3_dbscan(flight_data)
     85         return result
     86     else:
---> 87         raise Exception('Good Price not Found!')
     88 

Exception: Good Price not Found!

Choose the best epsilon and min_sample value



In [381]:

    
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

# ========== Task 3 question 1 ==========
def task_3_dbscan_tuning(flight_data, eps_val, min_spl):
    '''
    flight_data: a pandas DataFrame object with 2 columns 'Price' & 'Date_of_Flight', one row for each day
    return: a pandas DataFrame object with 2 colomns 'Price' & 'Date_of_Flight',  one row for each outlier flight
    '''
    df = flight_data
    start_date = []
    for i in range(len(df)):
        start_date.append((df['Date_of_Flight'][i]- df['Date_of_Flight'][0]).days +1)
    
    # clustering
    df['Start_Date'] = pd.Series(start_date).values
    X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
    db = DBSCAN(eps = eps_val, min_samples = min_spl).fit(X)
    df['dbscan_labels'] = db.labels_
    
    # plot
    labels = db.labels_
    clusters = len(set(labels))
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    matplotlib.style.use('ggplot')
    plt.subplots(figsize=(12,8))
    
    for k, c in zip(unique_labels, colors):
        class_member_mask = (labels == k)  # get all the points in class k
        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
                markeredgecolor='k', markersize=14)
 
    plt.title("Total Clusters: {}".format(clusters)+ " Eps={}".format(eps_val)+ " Min_spl={}".format(min_spl), fontsize=14, y=1.01)
    #plt.savefig('task_3_dbscan.png')
    print unique_labels



In [382]:

    
task_3_dbscan_tuning(data_60, 0.3, 3)









    



set([0, 1, 2, 3, 4, 5, -1])



In [ ]:

	Price	Date_of_Flight	Start_Date	dbscan_labels
0	609.0	2017-05-10	1	0
1	588.0	2017-05-11	2	0
2	588.0	2017-05-12	3	0
3	588.0	2017-05-13	4	0
4	541.0	2017-05-14	5	1
5	541.0	2017-05-15	6	1
6	541.0	2017-05-16	7	1
7	541.0	2017-05-17	8	1
8	541.0	2017-05-18	9	1
9	541.0	2017-05-19	10	1
10	548.0	2017-05-20	11	1
11	541.0	2017-05-21	12	1
12	541.0	2017-05-22	13	1
13	548.0	2017-05-23	14	1
14	548.0	2017-05-24	15	1
15	650.0	2017-05-25	16	-1
16	682.0	2017-05-26	17	-1
17	609.0	2017-05-27	18	2
18	548.0	2017-05-28	19	1
19	541.0	2017-05-29	20	1
20	588.0	2017-05-30	21	2
21	588.0	2017-05-31	22	2
22	588.0	2017-06-01	23	2
23	588.0	2017-06-02	24	2
24	588.0	2017-06-03	25	2
25	588.0	2017-06-04	26	2
26	588.0	2017-06-05	27	2
27	637.0	2017-06-06	28	3
28	637.0	2017-06-07	29	3
29	637.0	2017-06-08	30	3
30	637.0	2017-06-09	31	3
31	637.0	2017-06-10	32	3
32	637.0	2017-06-11	33	3
33	660.0	2017-06-12	34	3
34	660.0	2017-06-13	35	3
35	660.0	2017-06-14	36	3
36	637.0	2017-06-15	37	3
37	679.0	2017-06-16	38	3
38	670.0	2017-06-17	39	3
39	723.0	2017-06-18	40	-1
40	636.0	2017-06-19	41	3
41	636.0	2017-06-20	42	3
42	637.0	2017-06-21	43	3
43	637.0	2017-06-22	44	3
44	637.0	2017-06-23	45	3
45	637.0	2017-06-24	46	3
46	637.0	2017-06-25	47	3
47	776.0	2017-06-26	48	4
48	740.0	2017-06-27	49	4
49	764.0	2017-06-28	50	4
50	834.0	2017-06-29	51	-1
51	740.0	2017-06-30	52	4
52	631.0	2017-07-01	53	-1
53	892.0	2017-07-02	54	5
54	725.0	2017-07-03	55	4
55	747.0	2017-07-04	56	4
56	725.0	2017-07-05	57	4
57	883.0	2017-07-06	58	5
58	747.0	2017-07-07	59	4
59	859.0	2017-07-08	60	5

	Cluster	Start_Date_X	Price_X	Mean_Price	Std_Price	Threshould
0	0	-1.559062	-0.700911	921.750000	7.395100	871.750000
1	1	-1.059997	-0.300562	960.571429	9.240550	910.571429
2	2	0.223754	-0.697043	922.125000	23.642850	872.125000
3	3	1.229927	1.420900	1127.500000	21.448776	1077.500000

	Start_Date_X	Price_X
9	-1.183732	-1.141773
15	-0.837274	-1.007710
22	-0.433073	0.023547
24	-0.317587	-0.347705
26	-0.202101	-1.678026
27	-0.144358	-2.049279
29	-0.028872	0.621676
30	0.028872	0.116360
31	0.086615	0.106048
40	0.606302	0.477300
42	0.721788	2.302624
44	0.837274	0.477300
49	1.125989	2.271686
57	1.587934	2.312937
58	1.645677	0.735114
59	1.703420	0.735114

	Start_Date_X	Price_X	closest_cluster	Price	Date_of_Flight	threshould
9	-1.183732	-1.141773	0	879.0	2017-04-19	871.750000
15	-0.837274	-1.007710	1	892.0	2017-04-25	910.571429
22	-0.433073	0.023547	1	992.0	2017-05-02	910.571429
24	-0.317587	-0.347705	2	956.0	2017-05-04	872.125000
26	-0.202101	-1.678026	2	827.0	2017-05-06	872.125000
27	-0.144358	-2.049279	2	791.0	2017-05-07	872.125000
29	-0.028872	0.621676	2	1050.0	2017-05-09	872.125000
30	0.028872	0.116360	2	1001.0	2017-05-10	872.125000
31	0.086615	0.106048	2	1000.0	2017-05-11	872.125000
40	0.606302	0.477300	3	1036.0	2017-05-20	1077.500000
42	0.721788	2.302624	3	1213.0	2017-05-22	1077.500000
44	0.837274	0.477300	3	1036.0	2017-05-24	1077.500000
49	1.125989	2.271686	3	1210.0	2017-05-29	1077.500000
57	1.587934	2.312937	3	1214.0	2017-06-06	1077.500000
58	1.645677	0.735114	3	1061.0	2017-06-07	1077.500000
59	1.703420	0.735114	3	1061.0	2017-06-08	1077.500000