In [2]:
import time
import re
from unidecode import unidecode
import pandas as pd
import datetime
from dateutil.parser import parse
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
# ======== Task 1 =========
def scrape_data(start_date, from_place, to_place, city_name):
    """
    Scrape the daily prices shown for one city on Google Flights Explore.

    start_date: a datetime object for the start date that is used in the query to Google Flight explorer
    from_place: a string with the name of the origin of the flights
    to_place: a string with the name of the regional destination of the flights
    city_name: a string with the name of the city whose data is scraped; it is
        matched case-insensitively as a plain substring of each result's text
    return: a pandas DataFrame object with two columns, "Price" (float) and
        "Date_of_Flight" (datetime), one row for each bar that showed a price
    raises: Exception('City Name not Found!') when no result matches city_name

    The Chrome driver is always closed, including when scraping fails half way.
    """
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.google.com/flights/explore/')
        time.sleep(2)  # wait for the driver to load data

        # from_place: click the first input box and type the origin
        driver.find_elements_by_class_name('LJTSM3-p-a')[0].click()
        ActionChains(driver).send_keys(from_place).perform()
        ActionChains(driver).send_keys(Keys.ENTER).perform()
        time.sleep(1)

        # to_place: click the second input box and type the destination region
        driver.find_elements_by_class_name('LJTSM3-p-a')[1].click()
        ActionChains(driver).send_keys(to_place).perform()
        ActionChains(driver).send_keys(Keys.ENTER).perform()
        time.sleep(1)

        # start_date: swap the last 10 characters of the current URL for the
        # requested date (assumes the URL ends with a YYYY-MM-DD date -- TODO confirm)
        start_date_str = start_date.strftime("%Y-%m-%d")
        driver.get(driver.current_url[:-10] + start_date_str)
        time.sleep(2)

        # use city_name to choose target city
        results_city = driver.find_elements_by_class_name('LJTSM3-v-c')
        wanted = city_name.lower()
        city_num = None
        for i, city in enumerate(results_city):
            # Plain substring test: city names are not regular expressions.
            if wanted in unidecode(city.text).lower():
                city_num = i  # no break: the last matching result is kept
        if city_num is None:
            raise Exception('City Name not Found!')
        time.sleep(1)

        # get bars of target city
        target = driver.find_elements_by_class_name('LJTSM3-v-d')[city_num]
        bars = target.find_elements_by_class_name('LJTSM3-w-x')
        time.sleep(2)

        data = []
        for bar in bars:
            # Hovering a bar fills the tooltip with that day's price and date.
            ActionChains(driver).move_to_element(bar).perform()
            time.sleep(0.01)
            tooltip = target.find_element_by_class_name('LJTSM3-w-k').find_elements_by_tag_name('div')
            price_text, date_text = tooltip[0].text, tooltip[1].text
            if price_text:  # exclude null bars
                data.append((price_text, date_text))

        # convert into dataframe and return
        clean_data = [(float(price_text.replace('$', '').replace(',', '')),
                       parse(date_text.split('-')[0].strip()))
                      for price_text, date_text in data]
        return pd.DataFrame(clean_data, columns=['Price', 'Date_of_Flight'])
    finally:
        # Runs on success and on every error path, so no browser is left open.
        driver.quit()
In [31]:
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline
# Box plot of the scraped prices. `data_60` is the DataFrame returned by
# scrape_data in the "test 1" / "test 2" cells, so one of them must run first.
flight_data = data_60
plt.boxplot(flight_data['Price'])
plt.title('Boxplot of Prices')
# Write the current figure to disk (same file name that task_3_IQR uses).
plt.savefig('task_3_iqr.png')
In [32]:
import numpy as np
# Lower IQR fence: prices below Q1 - 1.5 * IQR are treated as outliers.
q1, q3 = np.percentile(flight_data['Price'], [25, 75])
IQR = q3 - q1
low_bound = q1 - 1.5 * IQR
is_below_fence = flight_data['Price'] < low_bound
outliers = flight_data[is_below_fence]
print(low_bound)
outliers.shape
Out[32]:
In [35]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# ======== Task 3 question 2 =========
def task_3_IQR(flight_data):
    '''
    Find unusually cheap flights with the interquartile-range rule.

    flight_data: a pandas DataFrame with a 'Price' column
    return: the rows of flight_data whose price is below Q1 - 1.5 * IQR
    raises: Exception('There is no outlier price!') when no row is below that bound
    side effect: draws a box plot of the prices and saves it as 'task_3_iqr.png'
    '''
    prices = flight_data['Price']

    # Box plot of all prices, written to disk.
    plt.boxplot(prices)
    plt.title('Boxplot of Prices')
    plt.savefig('task_3_iqr.png')

    # Only the lower fence is used: the function looks for cheap prices.
    first_quartile = np.percentile(prices, 25)
    third_quartile = np.percentile(prices, 75)
    lower_fence = first_quartile - 1.5 * (third_quartile - first_quartile)

    cheap_flights = flight_data[prices < lower_fence]
    if cheap_flights.shape[0] == 0:
        raise Exception('There is no outlier price!')
    return cheap_flights
In [36]:
# Run the IQR detector on the scraped prices and display the flagged rows.
# task_3_IQR raises an Exception when no price falls below the lower bound.
outliers_IQR = task_3_IQR(data_60)
outliers_IQR
In [343]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
In [374]:
# Clustering: DBSCAN on the standardised (day offset, price) pairs.
# `df` is the same object as `data_60`, so the two new columns are added to it.
df = data_60
# 1-based number of days between each flight and the first row's flight.
start_date = [(df['Date_of_Flight'][i] - df['Date_of_Flight'][0]).days + 1
              for i in range(len(df))]
df['Start_Date'] = pd.Series(start_date).values
X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
db = DBSCAN(eps=.3, min_samples=3).fit(X)
df['dbscan_labels'] = db.labels_
print(X)
df
Out[374]:
In [345]:
# Result of clustering: one label per row; `clusters` counts every distinct
# label value that DBSCAN produced.
labels = db.labels_
unique_labels = set(labels)
clusters = len(unique_labels)
print(labels)
print(clusters)
print(unique_labels)
In [346]:
# plot results of clustering
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline
matplotlib.style.use('ggplot')
fig, ax = plt.subplots(figsize=(12, 8))
# One colour per distinct label, taken evenly from the Spectral colour map.
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for label, color in zip(unique_labels, colors):
    members = X[labels == label]  # all the scaled points carrying this label
    ax.plot(members[:, 0], members[:, 1], 'o',
            markerfacecolor=color, markeredgecolor='k', markersize=14)
# Saving stays disabled in this cell; task_3_dbscan writes the file itself.
#plt.savefig('task_3_dbscan.png')
ax.set_title("Total Clusters: {}".format(clusters), fontsize=14, y=1.01)
Out[346]:
In [347]:
# Per-label summary: centroid in scaled space, mean and standard deviation of
# the original prices, and the price threshold (mean minus the larger of two
# standard deviations and 50).
scaled_points = pd.DataFrame(X)
cluster_rows = []
for n in unique_labels:
    in_cluster = (df.dbscan_labels == n)
    centroid = scaled_points[in_cluster].mean(axis=0)
    cluster_prices = df[in_cluster].Price
    mean_price = np.mean(cluster_prices)
    std_price = np.std(cluster_prices)
    threshold = mean_price - max(2*std_price, 50)
    cluster_rows.append((n, centroid[0], centroid[1], mean_price, std_price, threshold))
mean_points = pd.DataFrame(cluster_rows)
mean_points.columns = ['Cluster','Start_Date_X','Price_X','Mean_Price','Std_Price','Threshold']
# Drop the row for label -1 so that only real clusters remain.
mean_points = mean_points[mean_points.Cluster != -1]
mean_points
Out[347]:
In [348]:
# Outliers after scaling: the rows of the scaled matrix X whose DBSCAN label
# is -1 (the label DBSCAN gives to noise points), with named columns.
df_outliers = pd.DataFrame(X)[df.dbscan_labels == -1]
df_outliers.columns=['Start_Date_X', 'Price_X']
df_outliers
Out[348]:
In [349]:
# Find the closest cluster for outlier flights
from scipy.spatial import distance
# For every noise point, measure the Euclidean distance (in scaled space) to
# each cluster centroid and remember the position of the closest one.
min_dist_list = []
nearest_cluster = []
centroids = mean_points[['Start_Date_X', 'Price_X']]
for j in range(df_outliers.shape[0]):
    outlier = df_outliers.iloc[j]
    dist = [distance.euclidean(outlier, centroids.iloc[i])
            for i in range(centroids.shape[0])]
    closest = min(dist)
    min_dist_list.append(closest)
    # Exactly one entry per outlier: on a tie the first centroid wins, so
    # nearest_cluster always has the same length as df_outliers.
    nearest_cluster.append(dist.index(closest))
print(nearest_cluster)
min_dist_list
Out[349]:
In [350]:
outliers = df_outliers.copy()
outliers['closest_cluster'] = pd.Series(nearest_cluster).values # Nearest cluster of each outlier (row position in mean_points)
outliers['Price'] = df['Price'][df.dbscan_labels == -1] # Original price of each outlier
outliers['Date_of_Flight'] = df['Date_of_Flight'][df.dbscan_labels == -1] # Original date of each outlier
# Threshold of the closest cluster. closest_cluster holds row positions, and
# mean_points keeps its pre-filter index labels, so the lookup is positional.
outliers['threshold'] = mean_points['Threshold'].iloc[outliers['closest_cluster'].values].values
outliers
Out[350]:
In [354]:
# Keep only the noise points priced at or below their nearest cluster's threshold.
is_cheap = outliers.Price <= outliers.threshold
result = outliers.loc[is_cheap, ['Price', 'Date_of_Flight']]
result
Out[354]:
In [358]:
# Scratch check: replaces `result` with an empty DataFrame and shows its shape.
# NOTE(review): this overwrites the filtered `result` computed in the cell above.
result = pd.DataFrame()
result.shape
Out[358]:
In [5]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline
# ========== Task 3 question 1 ==========
def task_3_dbscan(flight_data):
    '''
    Flag unusually cheap flights with DBSCAN clustering.

    Each flight becomes a point (days since the first row's flight + 1, price).
    The points are standardised and clustered with DBSCAN (eps=0.3,
    min_samples=3). Every noise point (label -1) is compared with the cluster
    whose centroid is nearest in scaled space, and is reported when its price
    is at or below that cluster's threshold:
    mean price - max(2 * standard deviation, 50).

    flight_data: a pandas DataFrame object with 2 columns 'Price' & 'Date_of_Flight', one row for each day.
        It is not modified; all work is done on a copy.
    return: a pandas DataFrame object with 2 columns 'Price' & 'Date_of_Flight', one row for each outlier flight
    raises: Exception('There is no outlier price in this period!') when no flight qualifies
        (including when DBSCAN finds no noise point or no cluster at all)
    side effect: saves a scatter plot of the clustering as 'task_3_dbscan.png'
    '''
    # Copy with a fresh 0..n-1 index: the caller's frame stays untouched and
    # row positions in X line up with the rows of df.
    df = flight_data.reset_index(drop=True)
    flight_dates = df['Date_of_Flight']
    df['Start_Date'] = (flight_dates - flight_dates.iloc[0]).dt.days + 1

    # clustering
    X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
    labels = DBSCAN(eps = 0.3, min_samples = 3).fit(X).labels_

    # Fixed plotting order (clusters ascending, noise last) so the colours do
    # not depend on set iteration order.
    unique_labels = set(labels)
    clusters = len(unique_labels)  # counts the noise label too, as shown in the title
    cluster_ids = sorted(unique_labels - {-1})
    ordered_labels = cluster_ids + ([-1] if -1 in unique_labels else [])

    # plot results of clustering
    colors = plt.cm.Spectral(np.linspace(0, 1, len(ordered_labels)))
    matplotlib.style.use('ggplot')
    plt.subplots(figsize=(12,8))
    for label, color in zip(ordered_labels, colors):
        xy = X[labels == label]  # all the points carrying this label
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=color,
                 markeredgecolor='k', markersize=14)
    plt.title("Total Clusters: {}".format(clusters)+ " Eps=0.3 Min_spl=3", fontsize=14, y=1.01)
    plt.savefig('task_3_dbscan.png')

    no_outlier_message = 'There is no outlier price in this period!'
    noise_mask = (labels == -1)
    # Without noise points there is nothing to test; without clusters there is
    # nothing to compare against.
    if not noise_mask.any() or not cluster_ids:
        raise Exception(no_outlier_message)

    # Centroid (scaled space) and price threshold of each cluster
    prices = df['Price'].values
    centroids = []
    thresholds = []
    for cluster_id in cluster_ids:
        members = (labels == cluster_id)
        cluster_prices = prices[members]
        centroids.append(X[members].mean(axis=0))
        # ndarray.std() is the population standard deviation (ddof=0).
        thresholds.append(cluster_prices.mean() - max(2 * cluster_prices.std(), 50))
    centroids = np.array(centroids)
    thresholds = np.array(thresholds)

    # Find the closest cluster for outlier flights: Euclidean distance from
    # every noise point to every centroid; argmin yields exactly one cluster
    # per point even when two distances tie.
    offsets = X[noise_mask][:, np.newaxis, :] - centroids[np.newaxis, :, :]
    nearest = np.sqrt((offsets ** 2).sum(axis=2)).argmin(axis=1)

    noise = df.loc[noise_mask, ['Price', 'Date_of_Flight']]
    result = noise[noise['Price'].values <= thresholds[nearest]]
    if result.shape[0] == 0:
        raise Exception(no_outlier_message)
    return result
In [7]:
# test 1: scrape prices from 2017-04-10 for Beijing -> Mexico (region), keep
# the result matching "Mexico City", then run the DBSCAN outlier detection.
# This opens a Chrome browser through selenium.
data_60 = scrape_data(datetime.datetime(2017, 4, 10), 'Beijing', 'Mexico', 'Mexico City')
task_3_dbscan(data_60)
Out[7]:
In [376]:
# test 2: scrape prices from 2017-05-10 for Beijing -> United States (region),
# keep the result matching "San Francisco", then run the DBSCAN outlier
# detection. This rebinds `data_60`, replacing the frame from test 1.
data_60 = scrape_data(datetime.datetime(2017, 5, 10), 'Beijing', 'United States', 'San Francisco')
task_3_dbscan(data_60)
In [381]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline
# ========== Task 3 question 1: DBSCAN parameter tuning (plot only) ==========
def task_3_dbscan_tuning(flight_data, eps_val, min_spl):
    '''
    Plot the DBSCAN clustering of the flights for one (eps, min_samples) pair.

    Each flight becomes a point (days since the first row's flight + 1, price);
    the points are standardised before clustering.

    flight_data: a pandas DataFrame object with 2 columns 'Price' & 'Date_of_Flight', one row for each day.
        It is not modified.
    eps_val: the `eps` value passed to DBSCAN
    min_spl: the `min_samples` value passed to DBSCAN
    return: None. The function draws the scatter plot and prints the set of labels
        that DBSCAN produced (-1 is the noise label).
    '''
    # Build the two features in a separate frame so the caller's data is untouched.
    flight_dates = flight_data['Date_of_Flight'].reset_index(drop=True)
    features = pd.DataFrame({
        'Start_Date': (flight_dates - flight_dates.iloc[0]).dt.days + 1,
        'Price': flight_data['Price'].values,
    })

    # clustering
    X = StandardScaler().fit_transform(features[['Start_Date', 'Price']])
    labels = DBSCAN(eps = eps_val, min_samples = min_spl).fit(X).labels_

    # Fixed plotting order (clusters ascending, noise last) so the colours do
    # not depend on set iteration order.
    unique_labels = set(labels)
    clusters = len(unique_labels)  # counts the noise label too, as shown in the title
    ordered_labels = sorted(unique_labels - {-1})
    if -1 in unique_labels:
        ordered_labels.append(-1)

    # plot
    colors = plt.cm.Spectral(np.linspace(0, 1, len(ordered_labels)))
    matplotlib.style.use('ggplot')
    plt.subplots(figsize=(12,8))
    for label, color in zip(ordered_labels, colors):
        xy = X[labels == label]  # all the points carrying this label
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=color,
                 markeredgecolor='k', markersize=14)
    plt.title("Total Clusters: {}".format(clusters)+ " Eps={}".format(eps_val)+ " Min_spl={}".format(min_spl), fontsize=14, y=1.01)
    #plt.savefig('task_3_dbscan.png')
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(unique_labels)
In [382]:
# Preview the clustering with eps=0.3 and min_samples=3, the same values that
# task_3_dbscan hard-codes.
task_3_dbscan_tuning(data_60, 0.3, 3)
In [ ]: