In [11]:
import bs4
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time
from unidecode import unidecode
import pandas as pd
import datetime
from dateutil.parser import parse
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [18]:
def scrape_data(start_date, from_place, to_place, city_name):
    """Scrape 60 days of flight prices from Google Flights Explore.

    Args:
        start_date: departure date string 'YYYY-MM-DD'; it replaces the last
            10 characters of the Explore URL.
        from_place: origin typed into the "from" box (e.g. 'Chicago').
        to_place: destination region typed into the "to" box (e.g. 'America').
        city_name: destination city whose result card is scraped; matched
            after accent-stripping and title-casing.

    Returns:
        list of (price_text, date_range_text) tuples, one per price bar.
    """
    driver = webdriver.Chrome()
    driver.get('https://www.google.com/flights/explore/')
    time.sleep(1.5)

    # NOTE(review): these XPaths are tied to a specific page layout and will
    # break whenever Google changes the DOM.
    _type_into(driver, '//*[@id="root"]/div[3]/div[3]/div/div[4]/div/div', to_place)
    _type_into(driver, '//*[@id="root"]/div[3]/div[3]/div/div[2]/div/div', from_place)

    # The Explore URL ends with the departure date; swap in ours.
    driver.get(driver.current_url[:-10] + start_date)
    time.sleep(0.5)

    city_label = _normalize_city_name(city_name)
    data = []
    results = driver.find_elements_by_class_name('LJTSM3-v-d')
    for result in results:
        if city_label in result.text:
            bars = result.find_elements_by_class_name('LJTSM3-w-x')
            for bar in bars:
                # Hovering a bar refreshes the tooltip ('LJTSM3-w-k') with
                # that bar's price and date range.
                ActionChains(driver).move_to_element(bar).perform()
                time.sleep(0.0001)
                tooltip = result.find_element_by_class_name('LJTSM3-w-k')
                cells = tooltip.find_elements_by_tag_name('div')
                data.append((cells[0].text, cells[1].text))
        time.sleep(0.01)
    driver.quit()
    return data


def _type_into(driver, xpath, text):
    """Click the element at `xpath`, type `text`, press ENTER, wait 0.5 s."""
    box = driver.find_element_by_xpath(xpath)
    box.click()
    actions = ActionChains(driver)
    actions.send_keys(text)
    actions.send_keys(Keys.ENTER)
    actions.perform()
    time.sleep(0.5)


def _normalize_city_name(city_name):
    """Return `city_name` accent-stripped and title-cased ('new york' -> 'New York').

    The original called `unicode(city_name, 'utf-8')`, which crashes when
    handed a unicode object (and does not exist on Python 3); decoding only
    byte strings works under both. Using `.split()` instead of `.split(' ')`
    also avoids an IndexError on consecutive spaces.
    """
    if isinstance(city_name, bytes):
        city_name = city_name.decode('utf-8')
    words = unidecode(city_name).lower().split()
    return ' '.join(w[0].upper() + w[1:] for w in words)
In [20]:
# 60-day scrape: Chicago -> Boston ('America' selects the destination map,
# 'boston' picks the city card from the results).
flight_data=scrape_data('2017-04-30','Chicago','America','boston')
len(flight_data)
Out[20]:
In [2]:
def scrape_data_90(start_date, from_place, to_place, city_name):
    """Scrape 90 days of flight prices from Google Flights Explore.

    Same flow as `scrape_data`, but after reading the first 60-day page it
    clicks the "next" arrow and scrapes the second page, then keeps the
    first 60 rows plus the last 30 to get 90 consecutive days.

    Args:
        start_date: departure date string 'YYYY-MM-DD' appended to the URL.
        from_place: origin typed into the "from" box.
        to_place: destination region typed into the "to" box.
        city_name: destination city whose result card is scraped.

    Returns:
        list of 90 (price_text, date_range_text) tuples.
    """
    driver = webdriver.Chrome()
    driver.get('https://www.google.com/flights/explore/')
    time.sleep(1.5)

    # Fill the destination box. NOTE(review): XPaths depend on the page DOM.
    box = driver.find_element_by_xpath('//*[@id="root"]/div[3]/div[3]/div/div[4]/div/div')
    box.click()
    actions = ActionChains(driver)
    actions.send_keys(to_place)
    actions.send_keys(Keys.ENTER)
    actions.perform()
    time.sleep(0.5)

    # Fill the origin box.
    box = driver.find_element_by_xpath('//*[@id="root"]/div[3]/div[3]/div/div[2]/div/div')
    box.click()
    actions = ActionChains(driver)
    actions.send_keys(from_place)
    actions.send_keys(Keys.ENTER)
    actions.perform()
    time.sleep(0.5)

    # The Explore URL ends with the departure date; swap in ours.
    driver.get(driver.current_url[:-10] + start_date)
    time.sleep(0.5)

    # Normalize the city name. Decoding only byte strings (instead of the
    # original `unicode(city_name, 'utf-8')`) works on both py2 and py3,
    # and `.split()` avoids an IndexError on consecutive spaces.
    if isinstance(city_name, bytes):
        city_name = city_name.decode('utf-8')
    words = unidecode(city_name).lower().split()
    city_label = ' '.join(w[0].upper() + w[1:] for w in words)

    data = []

    def collect():
        # Hover every price bar of the matching city card and record the
        # tooltip's (price, date range) pair. Shared with both pages.
        for result in driver.find_elements_by_class_name('LJTSM3-v-d'):
            if city_label in result.text:
                for bar in result.find_elements_by_class_name('LJTSM3-w-x'):
                    ActionChains(driver).move_to_element(bar).perform()
                    time.sleep(0.0001)
                    tooltip = result.find_element_by_class_name('LJTSM3-w-k')
                    cells = tooltip.find_elements_by_tag_name('div')
                    data.append((cells[0].text, cells[1].text))

    collect()  # first 60-day page
    time.sleep(0.5)

    # Advance to the next price page and scrape it too.
    next_btn = driver.find_element_by_xpath('//*[@id="root"]/div[3]/div[4]/div/div[2]/div[1]/div/div[2]/div[2]/div/div[2]/div[5]/div')
    next_btn.click()
    time.sleep(0.5)
    collect()  # second page; overlaps the first

    # Keep exactly 90 days: the first 60 bars plus the last 30 scraped.
    data = data[:60] + data[-30:]
    driver.quit()
    return data
In [3]:
# 90-day scrape used by the analysis tasks below.
flight_data=scrape_data_90('2017-04-30','Chicago','America','boston')
flight_data
Out[3]:
In [12]:
def task_3_dbscan(flight_data):
    """Find abnormally cheap flights ("mistake fares") with DBSCAN.

    Clusters (Start_Date, Price) after standardization, assigns every noise
    point to its nearest cluster, and flags noise points priced more than
    2 std below that cluster's mean (with floors so tight clusters don't
    flag everything). Saves the cluster plot to 'task_3_dbscan.png'.

    Args:
        flight_data: list of (price_text, date_range_text) tuples as
            returned by scrape_data / scrape_data_90.

    Returns:
        DataFrame of the low-price outlier rows, or the message string
        'There is no low price outlier.' when none qualify.
    """
    from functools import reduce  # builtin on py2; explicit import keeps py3 working

    # (price, days since the first start date, trip length in days)
    clean_data = [(float(d[0].replace('$', '').replace(',', '')),
                   (parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
                   reduce(lambda x, y: y - x, [parse(x.strip()) for x in d[1].split('-')]).days)
                  for d in flight_data]
    df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])

    X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
    db = DBSCAN(eps=.5, min_samples=3).fit(X)
    labels = db.labels_
    clusters = len(set(labels))
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    plt.subplots(figsize=(12, 8))
    for k, c in zip(unique_labels, colors):
        class_member_mask = (labels == k)
        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
                 markeredgecolor='k', markersize=14)
    plt.title("Total Clusters: {}".format(clusters), fontsize=14, y=1.01)
    df['dbscan_labels'] = db.labels_
    plt.savefig('task_3_dbscan.png')

    outliers = df[df['dbscan_labels'] == -1].copy()
    clustered = df[df['dbscan_labels'] != -1].copy()
    # list(...) so the pairs survive repeated iteration: on py3 zip() is a
    # one-shot iterator and the original nested loop would see it empty
    # after the first outlier.
    outlier_points = list(zip(outliers.Start_Date, outliers.Price))
    cluster_points = list(zip(clustered.Start_Date, clustered.Price, clustered.dbscan_labels))

    # Assign each noise point to the nearest cluster point. Price is divided
    # by 100 so the date axis dominates the distance — a deliberate,
    # non-Euclidean weighting (dates matter more than price here).
    outliers_label = []
    for outlier in outlier_points:
        min_cluster_label = -1
        min_dist = 9999
        for cluster in cluster_points:
            dist = (float(outlier[0]) - float(cluster[0])) ** 2 + ((float(outlier[1]) - float(cluster[1])) / 100) ** 2
            if dist < min_dist:
                min_dist = dist
                min_cluster_label = cluster[2]
        outliers_label.append(min_cluster_label)
    outliers_2 = list(zip(outliers.Start_Date, outliers.Price, outliers_label))

    # Per-cluster stats; [['Start_Date', 'Price']] (a list, not the
    # deprecated tuple indexing) keeps newer pandas versions happy.
    agg = df[df['dbscan_labels'] != -1].groupby('dbscan_labels')[['Start_Date', 'Price']].agg(['std', 'mean', 'count']).copy()

    low_price_dates = []
    for outlier in outliers_2:
        mean = agg[agg.index == outlier[2]]['Price']['mean']
        # Floor std at 10 and the threshold at $50 so very tight clusters
        # don't flag normal prices.
        std = max(float(agg[agg.index == outlier[2]]['Price']['std']), 10)
        line = max(float(mean - 2 * std), 50)
        if outlier[1] < line:
            low_price_dates.append(outlier[0])
    if len(low_price_dates) == 0:
        return 'There is no low price outlier.'
    # Select by Start_Date *value*: the original `df.loc[low_price_dates]`
    # indexed by row label and only worked because Start_Date happened to
    # coincide with the 0..n-1 index.
    return df[df['Start_Date'].isin(low_price_dates)]
In [13]:
# Task 3 (DBSCAN variant): mistake-fare candidates in flight_data.
task_3_dbscan(flight_data)
Out[13]:
In [14]:
def task_3_IQR(flight_data):
    """Flag abnormally cheap flights with the 1.5*IQR box-plot rule.

    Saves the box plot to 'task_3_iqr.png' as a side effect.

    Args:
        flight_data: list of (price_text, date_range_text) tuples as
            returned by scrape_data / scrape_data_90.

    Returns:
        DataFrame of rows with Price < Q1 - 1.5*IQR, or the string
        'No outliers' when none qualify.
    """
    from functools import reduce  # builtin on py2; explicit import keeps py3 working

    # (price, days since the first start date, trip length in days)
    clean_data = [(float(d[0].replace('$', '').replace(',', '')),
                   (parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
                   reduce(lambda x, y: y - x, [parse(x.strip()) for x in d[1].split('-')]).days)
                  for d in flight_data]
    df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])

    plt.boxplot(df['Price'])
    plt.savefig('task_3_iqr.png')

    # quantile() yields the same values as describe()['25%'] / ['75%'].
    q1 = df.Price.quantile(0.25)
    q3 = df.Price.quantile(0.75)
    iqr = q3 - q1
    low_line = q1 - 1.5 * iqr
    result = df[df['Price'] < low_line]
    if len(result) == 0:
        return 'No outliers'
    return result
In [15]:
# Task 3 (IQR variant) on the same data.
task_3_IQR(flight_data)
Out[15]:
In [16]:
def task_4_dbscan(flight_data):
    """Find the cheapest 5-day window whose prices stay within $20 of each other.

    Clusters (Start_Date, Price) with the date axis weighted x20, slides a
    5-row window over each non-noise cluster, and returns the window with
    the lowest mean price among those with max-min <= $20.

    NOTE(review): windows are 5 *consecutive cluster members*, not
    necessarily 5 consecutive calendar days — confirm that is intended.

    Args:
        flight_data: list of (price_text, date_range_text) tuples as
            returned by scrape_data / scrape_data_90.

    Returns:
        DataFrame of the winning window's rows, or the string
        'No required value' when no window qualifies.
    """
    from functools import reduce  # builtin on py2; explicit import keeps py3 working

    # (price, days since the first start date, trip length in days)
    clean_data = [(float(d[0].replace('$', '').replace(',', '')),
                   (parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
                   reduce(lambda x, y: y - x, [parse(x.strip()) for x in d[1].split('-')]).days)
                  for d in flight_data]
    df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])

    # Scale dates x20 so eps (~28.28) spans roughly one day / $28 in the
    # scaled space.
    X = df[['Start_Date', 'Price']].values * np.array([20, 1])
    radius = np.sqrt(np.square(20.00) + np.square(20.00))
    db = DBSCAN(eps=radius, min_samples=3).fit(X)
    df['dbscan_labels'] = db.labels_

    # All 5-row sliding windows of Start_Date values per non-noise cluster
    # (filter each cluster once instead of three times per iteration).
    windows = []
    for cluster in df.dbscan_labels.unique():
        members = df[df['dbscan_labels'] == cluster]
        if cluster != -1 and len(members) > 4:
            for i in range(len(members) - 4):
                windows.append(members['Start_Date'].values[i:i + 5])

    mean_min = 9999
    best_window = []
    for window in windows:
        # Select by Start_Date value; the original `df.loc[window]` indexed
        # by row label and only worked because Start_Date happened to
        # coincide with the 0..n-1 index.
        df_5 = df[df['Start_Date'].isin(window)][['Start_Date', 'Price']]
        window_mean = df_5['Price'].mean()
        if df_5['Price'].max() - df_5['Price'].min() <= 20 and window_mean < mean_min:
            mean_min = window_mean
            best_window = window
    if len(best_window) == 0:
        return 'No required value'
    return df[df['Start_Date'].isin(best_window)]
In [17]:
# Task 4: cheapest stable 5-day window.
task_4_dbscan(flight_data)
Out[17]:
In [721]:
# --- Scratch: step-by-step development of task_4_dbscan. These cells rely
# on variables left over from earlier cells (hidden state) and were later
# folded into the function above. ---
clean_data = [(float(d[0].replace('$', '').replace(',', '')),
(parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
reduce(lambda x,y: y-x, [parse(x.strip()) for x in d[1].split('-')]).days) for d in flight_data]
df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])
# date axis weighted x20; eps = sqrt(20^2 + 20^2) ~ 28.28 in scaled space
X = df[['Start_Date', 'Price']].values*np.array([20,1])
radius = np.sqrt(np.square(20.00) + np.square(20.00))
db = DBSCAN(eps=radius, min_samples=3).fit(X)
df['dbscan_labels'] = db.labels_
In [725]:
clusters=df.dbscan_labels.unique()
clusters
Out[725]:
In [722]:
# keep only non-noise cluster labels with at least 5 members
clusters_5=[]
for cluster in clusters:
if cluster!=-1:
if len(df[df['dbscan_labels']==cluster])>4:
clusters_5.append(cluster)
else:
pass
In [727]:
clusters_5
Out[727]:
In [ ]:
# pick the window with the lowest mean price and max-min <= $20.
# NOTE(review): here clusters_5 holds cluster *labels* (unlike the final
# function, where it holds Start_Date windows), so df.loc[cluster_5]
# selects by row label — this draft and the final version disagree.
mean_min=9999
cluster_mean_min=[]
for cluster_5 in clusters_5:
df_5=df.loc[cluster_5][['Start_Date','Price']]
cluster_max=df_5['Price'].max()
cluster_min=df_5['Price'].min()
cluster_mean=df_5['Price'].mean()
if cluster_max-cluster_min<=20 and cluster_mean < mean_min:
mean_min=cluster_mean
cluster_mean_min=cluster_5
else:
pass
In [728]:
cluster_mean_min
Out[728]:
In [525]:
# --- Scratch: building clean_data = (price, day offset, trip length).
# Imports scattered mid-notebook; `flight_data` and (below) `np` must
# already exist from earlier cells. ---
import pandas as pd
import datetime
%matplotlib inline
from dateutil.parser import parse
clean_data = [(float(d[0].replace('$', '').replace(',', '')),
(parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
reduce(lambda x,y: y-x, [parse(x.strip()) for x in d[1].split('-')]).days) for d in flight_data]
clean_data
Out[525]:
In [553]:
# eps used for DBSCAN below: sqrt(20^2 + 20^2) ~ 28.28
radius = np.sqrt(np.square(20.00) + np.square(20.00))
radius
Out[553]:
In [703]:
df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
# Pandas has a ton of built-in visualizations
# Play and Learn
# http://pandas.pydata.org/pandas-docs/stable/visualization.html
df.plot.scatter(x='Start_Date', y='Price')
Out[703]:
In [571]:
# --- Scratch: inject artificial low prices into rows 12-19 to exercise the
# outlier detectors. NOTE(review): DataFrame.set_value is deprecated in
# modern pandas (use .at/.iat). ---
df = df.set_value(12, 'Price', 20)
df = df.set_value(13, 'Price', 25)
df = df.set_value(14, 'Price', 40)
df = df.set_value(15, 'Price', 20)
df = df.set_value(16, 'Price', 25)
df = df.set_value(17, 'Price', 40)
df = df.set_value(18, 'Price', 50)
df = df.set_value(19, 'Price', 60)
# Time for a Google Investigation
# "IQR Outlier"
In [704]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
# All of pandas' viz is built on top of matplotlib as you might have noticed
# You can get started learning matplotlib here: http://matplotlib.org/users/pyplot_tutorial.html
# df = df.set_value(49, 'Price', 255)
# X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
# cluster on (Start_Date*20, Price) and plot each label in its own color
X = df[['Start_Date', 'Price']].values*np.array([20,1])
radius = np.sqrt(np.square(20.00) + np.square(20.00))
db = DBSCAN(eps=radius, min_samples=3).fit(X)
labels = db.labels_
clusters = len(set(labels))
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
plt.subplots(figsize=(12,8))
for k, c in zip(unique_labels, colors):
class_member_mask = (labels == k)
xy = X[class_member_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
markeredgecolor='k', markersize=14)
plt.title("Total Clusters: {}".format(clusters), fontsize=14, y=1.01)
df['dbscan_labels'] = db.labels_
In [705]:
clusters=df.dbscan_labels.unique()
clusters
Out[705]:
In [706]:
# --- Scratch: 5-row sliding windows of Start_Date per non-noise cluster
# (this is the version that made it into task_4_dbscan). ---
clusters_5=[]
for cluster in clusters:
if cluster!=-1 and len(df[df['dbscan_labels']==cluster])>4:
for i in range(len(df[df['dbscan_labels']==cluster])-4):
clusters_5.append(df[df['dbscan_labels']==cluster]['Start_Date'].values[i:i+5])
else:
pass
clusters_5
Out[706]:
In [707]:
# cheapest window with max-min <= $20.
# NOTE(review): df.loc[cluster_5] indexes by row label; it works only while
# Start_Date coincides with the 0..n-1 index — confirm.
mean_min=9999
cluster_mean_min=''
for cluster_5 in clusters_5:
df_5=df.loc[cluster_5][['Start_Date','Price']]
cluster_max=df_5['Price'].max()
cluster_min=df_5['Price'].min()
cluster_mean=df_5['Price'].mean()
if cluster_max-cluster_min<=20 and cluster_mean < mean_min:
mean_min=cluster_mean
cluster_mean_min=cluster_5
else:
pass
In [711]:
df.loc[cluster_mean_min]
Out[711]:
In [709]:
cluster_mean_min
Out[709]:
In [512]:
# re-plot the clusters from the arrays computed in earlier cells
plt.subplots(figsize=(12,8))
for k, c in zip(unique_labels, colors):
class_member_mask = (labels == k)
xy = X[class_member_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
markeredgecolor='k', markersize=14)
In [504]:
# --- Scratch: alternative task-4 draft that additionally checks the 5 days
# are truly consecutive (days[i+4]-days[i]==4) and converts day offsets
# back to 'YYYY-MM-DD' strings. Not used by the final function. ---
start_date = parse(flight_data[0][1].split('-')[0].strip())
db = DBSCAN(eps=radius, min_samples=3).fit(X)
df['label'] = db.labels_
clusters = len(set(labels))
unique_labels = set(labels)
list_of_dfs = []
for label in unique_labels:
if label != -1:
one_cluster = df[df.label == label]
# find start/end date for every 5-day-consecutive period
consecutive_list = []
days = one_cluster.Start_Date.values
for i in range(len(days) - 4):
if days[i + 4] - days[i] == 4:
consecutive_list.append((days[i], days[i + 4]))
# a list of 5-day-consecutive period dataframes
for start, end in consecutive_list:
df_five_day = one_cluster.loc[start - 1:end - 1, ['Start_Date', 'Price']]
if df_five_day.Price.describe()['max'] - df_five_day.Price.describe()['min'] <= 20:
df_five_day.Start_Date = df_five_day.Start_Date.apply(
lambda x: (start_date + datetime.timedelta(
days=x - 1)).strftime('%Y-%m-%d'))
list_of_dfs.append(df_five_day)
means = [one_df.Price.mean() for one_df in list_of_dfs]
df_min_index = means.index(min(means))
In [507]:
list_of_dfs[df_min_index]
Out[507]:
In [319]:
# --- Scratch: development of the task-3 outlier-reassignment logic.
# NOTE(review): zip() is a one-shot iterator on Python 3; these cells only
# work under Python 2 (or with list(zip(...))). ---
outliers=df[df['dbscan_labels']==-1].copy()
outliers_1=zip(outliers.Start_Date,outliers.Price)
outliers_1
Out[319]:
In [320]:
clusters=df[df['dbscan_labels']!=-1].copy()
clusters_1=zip(clusters.Start_Date,clusters.Price,clusters.dbscan_labels)
clusters_1
Out[320]:
In [321]:
# nearest-cluster assignment; the /100 on the price term down-weights
# price relative to date in the distance
outliers_label=[]
for outlier in outliers_1:
min_cluster_label = -1
min_dist = 9999
for cluster in clusters_1:
dist=(float(outlier[0])-float(cluster[0]))**2+((float(outlier[1])-float(cluster[1]))/100)**2
if dist < min_dist:
min_dist = dist
min_cluster_label = cluster[2]
outliers_label.append(min_cluster_label)
outliers_label
outliers_2=zip(outliers.Start_Date,outliers.Price,outliers_label)
outliers_2
Out[321]:
In [322]:
agg = df[df['dbscan_labels']!=-1].groupby('dbscan_labels')['Start_Date','Price'].agg(['std','mean','count']).copy()
agg
Out[322]:
In [323]:
# threshold check for one outlier: cluster mean - 2*std, floored at $50
mean=agg[agg.index==outliers_2[0][2]]['Price']['mean']
std=agg[agg.index==outliers_2[0][2]]['Price']['std']
max(float(mean-2*std),50)
Out[323]:
In [324]:
outliers_3=[]
for outlier in outliers_2:
mean=agg[agg.index==outlier[2]]['Price']['mean']
std=agg[agg.index==outlier[2]]['Price']['std']
line=max(float(mean-2*std),50)
if outlier[1]<line:
outliers_3.append(outlier[0])
outliers_3
Out[324]:
In [325]:
df.loc[outliers_3]
Out[325]:
In [138]:
# --- Scratch: miscellaneous experiments (regex matching, title-casing,
# IQR rule). Python 2 print statements; `re` imported in an unseen cell. ---
if bool(re.search('multi', 'A mUltiCased string', re.IGNORECASE)):
print 'true'
else:
print 'false'
In [343]:
a='New York'
In [347]:
b=a.lower().split(' ')
In [357]:
# manual title-casing, later folded into scrape_data's city-name handling
c=''
for i in range(len(b)):
c=c+b[i][0].upper()+b[i][1:]+' '
In [358]:
c
Out[358]:
In [434]:
plt.boxplot(df['Price']);
plt.savefig('task_3_iqr.png')
In [435]:
# IQR rule: anything below Q1 - 1.5*IQR counts as a low-price outlier
plt.boxplot(df['Price']);
plt.savefig('task_3_iqr.png')
Q1 = df.Price.describe()['25%']
Q3 = df.Price.describe()['75%']
IQR = Q3 - Q1
low_line = Q1 - 1.5 * IQR
result = df[df.Price < low_line]
In [447]:
result
Out[447]:
In [453]:
len(result)
Out[453]:
In [450]:
# NOTE(review): top-level `return` is invalid outside a function; this cell
# was a draft of task_3_IQR's ending and cannot run as-is.
if len(result) == 0:
return 'No outliers'
else:
return result
In [462]:
df.Price.describe()
Out[462]:
In [456]:
if a == 0:
print 'No outliers'
else:
print result
In [ ]: