In [11]:
import bs4
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time
from unidecode import unidecode
import pandas as pd
import datetime
from dateutil.parser import parse
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [18]:
def scrape_data(start_date, from_place, to_place, city_name):
    """Scrape 60 days of flight prices from Google Flights Explore.

    Args:
        start_date: departure date string 'YYYY-MM-DD'; it replaces the last
            10 characters of the Explore URL.
        from_place: origin typed into the "from" box (e.g. 'Chicago').
        to_place: destination region typed into the "to" box (e.g. 'America').
        city_name: destination city whose result card is scraped; matched
            after accent-stripping and title-casing.

    Returns:
        list of (price_text, date_range_text) tuples, one per price bar.
    """
    driver = webdriver.Chrome()
    driver.get('https://www.google.com/flights/explore/')
    time.sleep(1.5)

    # NOTE(review): these XPaths are tied to a specific page layout and will
    # break whenever Google changes the DOM.
    _type_into(driver, '//*[@id="root"]/div[3]/div[3]/div/div[4]/div/div', to_place)
    _type_into(driver, '//*[@id="root"]/div[3]/div[3]/div/div[2]/div/div', from_place)

    # The Explore URL ends with the departure date; swap in ours.
    driver.get(driver.current_url[:-10] + start_date)
    time.sleep(0.5)

    city_label = _normalize_city_name(city_name)
    data = []
    results = driver.find_elements_by_class_name('LJTSM3-v-d')
    for result in results:
        if city_label in result.text:
            bars = result.find_elements_by_class_name('LJTSM3-w-x')
            for bar in bars:
                # Hovering a bar refreshes the tooltip ('LJTSM3-w-k') with
                # that bar's price and date range.
                ActionChains(driver).move_to_element(bar).perform()
                time.sleep(0.0001)
                tooltip = result.find_element_by_class_name('LJTSM3-w-k')
                cells = tooltip.find_elements_by_tag_name('div')
                data.append((cells[0].text, cells[1].text))
        time.sleep(0.01)
    driver.quit()
    return data


def _type_into(driver, xpath, text):
    """Click the element at `xpath`, type `text`, press ENTER, wait 0.5 s."""
    box = driver.find_element_by_xpath(xpath)
    box.click()
    actions = ActionChains(driver)
    actions.send_keys(text)
    actions.send_keys(Keys.ENTER)
    actions.perform()
    time.sleep(0.5)


def _normalize_city_name(city_name):
    """Return `city_name` accent-stripped and title-cased ('new york' -> 'New York').

    The original called `unicode(city_name, 'utf-8')`, which crashes when
    handed a unicode object (and does not exist on Python 3); decoding only
    byte strings works under both. Using `.split()` instead of `.split(' ')`
    also avoids an IndexError on consecutive spaces.
    """
    if isinstance(city_name, bytes):
        city_name = city_name.decode('utf-8')
    words = unidecode(city_name).lower().split()
    return ' '.join(w[0].upper() + w[1:] for w in words)
In [20]:
# 60-day scrape: Chicago -> Boston ('America' selects the destination map,
# 'boston' picks the city card from the results).
flight_data=scrape_data('2017-04-30','Chicago','America','boston')
len(flight_data)
Out[20]:
In [2]:
def scrape_data_90(start_date, from_place, to_place, city_name):
    """Scrape 90 days of flight prices from Google Flights Explore.

    Same flow as `scrape_data`, but after reading the first 60-day page it
    clicks the "next" arrow and scrapes the second page, then keeps the
    first 60 rows plus the last 30 to get 90 consecutive days.

    Args:
        start_date: departure date string 'YYYY-MM-DD' appended to the URL.
        from_place: origin typed into the "from" box.
        to_place: destination region typed into the "to" box.
        city_name: destination city whose result card is scraped.

    Returns:
        list of 90 (price_text, date_range_text) tuples.
    """
    driver = webdriver.Chrome()
    driver.get('https://www.google.com/flights/explore/')
    time.sleep(1.5)

    # Fill the destination box. NOTE(review): XPaths depend on the page DOM.
    box = driver.find_element_by_xpath('//*[@id="root"]/div[3]/div[3]/div/div[4]/div/div')
    box.click()
    actions = ActionChains(driver)
    actions.send_keys(to_place)
    actions.send_keys(Keys.ENTER)
    actions.perform()
    time.sleep(0.5)

    # Fill the origin box.
    box = driver.find_element_by_xpath('//*[@id="root"]/div[3]/div[3]/div/div[2]/div/div')
    box.click()
    actions = ActionChains(driver)
    actions.send_keys(from_place)
    actions.send_keys(Keys.ENTER)
    actions.perform()
    time.sleep(0.5)

    # The Explore URL ends with the departure date; swap in ours.
    driver.get(driver.current_url[:-10] + start_date)
    time.sleep(0.5)

    # Normalize the city name. Decoding only byte strings (instead of the
    # original `unicode(city_name, 'utf-8')`) works on both py2 and py3,
    # and `.split()` avoids an IndexError on consecutive spaces.
    if isinstance(city_name, bytes):
        city_name = city_name.decode('utf-8')
    words = unidecode(city_name).lower().split()
    city_label = ' '.join(w[0].upper() + w[1:] for w in words)

    data = []

    def collect():
        # Hover every price bar of the matching city card and record the
        # tooltip's (price, date range) pair. Shared with both pages.
        for result in driver.find_elements_by_class_name('LJTSM3-v-d'):
            if city_label in result.text:
                for bar in result.find_elements_by_class_name('LJTSM3-w-x'):
                    ActionChains(driver).move_to_element(bar).perform()
                    time.sleep(0.0001)
                    tooltip = result.find_element_by_class_name('LJTSM3-w-k')
                    cells = tooltip.find_elements_by_tag_name('div')
                    data.append((cells[0].text, cells[1].text))

    collect()  # first 60-day page
    time.sleep(0.5)

    # Advance to the next price page and scrape it too.
    next_btn = driver.find_element_by_xpath('//*[@id="root"]/div[3]/div[4]/div/div[2]/div[1]/div/div[2]/div[2]/div/div[2]/div[5]/div')
    next_btn.click()
    time.sleep(0.5)
    collect()  # second page; overlaps the first

    # Keep exactly 90 days: the first 60 bars plus the last 30 scraped.
    data = data[:60] + data[-30:]
    driver.quit()
    return data
In [3]:
# 90-day scrape used by the analysis tasks below.
flight_data=scrape_data_90('2017-04-30','Chicago','America','boston')
flight_data
Out[3]:
In [12]:
def task_3_dbscan(flight_data):
    """Find abnormally cheap flights ("mistake fares") with DBSCAN.

    Clusters (Start_Date, Price) after standardization, assigns every noise
    point to its nearest cluster, and flags noise points priced more than
    2 std below that cluster's mean (with floors so tight clusters don't
    flag everything). Saves the cluster plot to 'task_3_dbscan.png'.

    Args:
        flight_data: list of (price_text, date_range_text) tuples as
            returned by scrape_data / scrape_data_90.

    Returns:
        DataFrame of the low-price outlier rows, or the message string
        'There is no low price outlier.' when none qualify.
    """
    from functools import reduce  # builtin on py2; explicit import keeps py3 working

    # (price, days since the first start date, trip length in days)
    clean_data = [(float(d[0].replace('$', '').replace(',', '')),
                   (parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
                   reduce(lambda x, y: y - x, [parse(x.strip()) for x in d[1].split('-')]).days)
                  for d in flight_data]
    df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])

    X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
    db = DBSCAN(eps=.5, min_samples=3).fit(X)
    labels = db.labels_
    clusters = len(set(labels))
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    plt.subplots(figsize=(12, 8))
    for k, c in zip(unique_labels, colors):
        class_member_mask = (labels == k)
        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
                 markeredgecolor='k', markersize=14)
    plt.title("Total Clusters: {}".format(clusters), fontsize=14, y=1.01)
    df['dbscan_labels'] = db.labels_
    plt.savefig('task_3_dbscan.png')

    outliers = df[df['dbscan_labels'] == -1].copy()
    clustered = df[df['dbscan_labels'] != -1].copy()
    # list(...) so the pairs survive repeated iteration: on py3 zip() is a
    # one-shot iterator and the original nested loop would see it empty
    # after the first outlier.
    outlier_points = list(zip(outliers.Start_Date, outliers.Price))
    cluster_points = list(zip(clustered.Start_Date, clustered.Price, clustered.dbscan_labels))

    # Assign each noise point to the nearest cluster point. Price is divided
    # by 100 so the date axis dominates the distance — a deliberate,
    # non-Euclidean weighting (dates matter more than price here).
    outliers_label = []
    for outlier in outlier_points:
        min_cluster_label = -1
        min_dist = 9999
        for cluster in cluster_points:
            dist = (float(outlier[0]) - float(cluster[0])) ** 2 + ((float(outlier[1]) - float(cluster[1])) / 100) ** 2
            if dist < min_dist:
                min_dist = dist
                min_cluster_label = cluster[2]
        outliers_label.append(min_cluster_label)
    outliers_2 = list(zip(outliers.Start_Date, outliers.Price, outliers_label))

    # Per-cluster stats; [['Start_Date', 'Price']] (a list, not the
    # deprecated tuple indexing) keeps newer pandas versions happy.
    agg = df[df['dbscan_labels'] != -1].groupby('dbscan_labels')[['Start_Date', 'Price']].agg(['std', 'mean', 'count']).copy()

    low_price_dates = []
    for outlier in outliers_2:
        mean = agg[agg.index == outlier[2]]['Price']['mean']
        # Floor std at 10 and the threshold at $50 so very tight clusters
        # don't flag normal prices.
        std = max(float(agg[agg.index == outlier[2]]['Price']['std']), 10)
        line = max(float(mean - 2 * std), 50)
        if outlier[1] < line:
            low_price_dates.append(outlier[0])
    if len(low_price_dates) == 0:
        return 'There is no low price outlier.'
    # Select by Start_Date *value*: the original `df.loc[low_price_dates]`
    # indexed by row label and only worked because Start_Date happened to
    # coincide with the 0..n-1 index.
    return df[df['Start_Date'].isin(low_price_dates)]
In [13]:
# Task 3 (DBSCAN variant): mistake-fare candidates in flight_data.
task_3_dbscan(flight_data)
Out[13]:
In [14]:
def task_3_IQR(flight_data):
    """Flag abnormally cheap flights with the 1.5*IQR box-plot rule.

    Saves the box plot to 'task_3_iqr.png' as a side effect.

    Args:
        flight_data: list of (price_text, date_range_text) tuples as
            returned by scrape_data / scrape_data_90.

    Returns:
        DataFrame of rows with Price < Q1 - 1.5*IQR, or the string
        'No outliers' when none qualify.
    """
    from functools import reduce  # builtin on py2; explicit import keeps py3 working

    # (price, days since the first start date, trip length in days)
    clean_data = [(float(d[0].replace('$', '').replace(',', '')),
                   (parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
                   reduce(lambda x, y: y - x, [parse(x.strip()) for x in d[1].split('-')]).days)
                  for d in flight_data]
    df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])

    plt.boxplot(df['Price'])
    plt.savefig('task_3_iqr.png')

    # quantile() yields the same values as describe()['25%'] / ['75%'].
    q1 = df.Price.quantile(0.25)
    q3 = df.Price.quantile(0.75)
    iqr = q3 - q1
    low_line = q1 - 1.5 * iqr
    result = df[df['Price'] < low_line]
    if len(result) == 0:
        return 'No outliers'
    return result
In [15]:
# Task 3 (IQR variant) on the same data.
task_3_IQR(flight_data)
Out[15]:
In [16]:
def task_4_dbscan(flight_data):
    """Find the cheapest 5-day window whose prices stay within $20 of each other.

    Clusters (Start_Date, Price) with the date axis weighted x20, slides a
    5-row window over each non-noise cluster, and returns the window with
    the lowest mean price among those with max-min <= $20.

    NOTE(review): windows are 5 *consecutive cluster members*, not
    necessarily 5 consecutive calendar days — confirm that is intended.

    Args:
        flight_data: list of (price_text, date_range_text) tuples as
            returned by scrape_data / scrape_data_90.

    Returns:
        DataFrame of the winning window's rows, or the string
        'No required value' when no window qualifies.
    """
    from functools import reduce  # builtin on py2; explicit import keeps py3 working

    # (price, days since the first start date, trip length in days)
    clean_data = [(float(d[0].replace('$', '').replace(',', '')),
                   (parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
                   reduce(lambda x, y: y - x, [parse(x.strip()) for x in d[1].split('-')]).days)
                  for d in flight_data]
    df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])

    # Scale dates x20 so eps (~28.28) spans roughly one day / $28 in the
    # scaled space.
    X = df[['Start_Date', 'Price']].values * np.array([20, 1])
    radius = np.sqrt(np.square(20.00) + np.square(20.00))
    db = DBSCAN(eps=radius, min_samples=3).fit(X)
    df['dbscan_labels'] = db.labels_

    # All 5-row sliding windows of Start_Date values per non-noise cluster
    # (filter each cluster once instead of three times per iteration).
    windows = []
    for cluster in df.dbscan_labels.unique():
        members = df[df['dbscan_labels'] == cluster]
        if cluster != -1 and len(members) > 4:
            for i in range(len(members) - 4):
                windows.append(members['Start_Date'].values[i:i + 5])

    mean_min = 9999
    best_window = []
    for window in windows:
        # Select by Start_Date value; the original `df.loc[window]` indexed
        # by row label and only worked because Start_Date happened to
        # coincide with the 0..n-1 index.
        df_5 = df[df['Start_Date'].isin(window)][['Start_Date', 'Price']]
        window_mean = df_5['Price'].mean()
        if df_5['Price'].max() - df_5['Price'].min() <= 20 and window_mean < mean_min:
            mean_min = window_mean
            best_window = window
    if len(best_window) == 0:
        return 'No required value'
    return df[df['Start_Date'].isin(best_window)]
In [17]:
# Task 4: cheapest stable 5-day window.
task_4_dbscan(flight_data)
Out[17]:
In [721]:
# --- Scratch: step-by-step development of task_4_dbscan. These cells rely
# on variables left over from earlier cells (hidden state) and were later
# folded into the function above. ---
clean_data = [(float(d[0].replace('$', '').replace(',', '')),
(parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
reduce(lambda x,y: y-x, [parse(x.strip()) for x in d[1].split('-')]).days) for d in flight_data]
df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])
# date axis weighted x20; eps = sqrt(20^2 + 20^2) ~ 28.28 in scaled space
X = df[['Start_Date', 'Price']].values*np.array([20,1])
radius = np.sqrt(np.square(20.00) + np.square(20.00))
db = DBSCAN(eps=radius, min_samples=3).fit(X)
df['dbscan_labels'] = db.labels_
In [725]:
clusters=df.dbscan_labels.unique()
clusters
Out[725]:
In [722]:
# keep only non-noise cluster labels with at least 5 members
clusters_5=[]
for cluster in clusters:
if cluster!=-1:
if len(df[df['dbscan_labels']==cluster])>4:
clusters_5.append(cluster)
else:
pass
In [727]:
clusters_5
Out[727]:
In [ ]:
# pick the window with the lowest mean price and max-min <= $20.
# NOTE(review): here clusters_5 holds cluster *labels* (unlike the final
# function, where it holds Start_Date windows), so df.loc[cluster_5]
# selects by row label — this draft and the final version disagree.
mean_min=9999
cluster_mean_min=[]
for cluster_5 in clusters_5:
df_5=df.loc[cluster_5][['Start_Date','Price']]
cluster_max=df_5['Price'].max()
cluster_min=df_5['Price'].min()
cluster_mean=df_5['Price'].mean()
if cluster_max-cluster_min<=20 and cluster_mean < mean_min:
mean_min=cluster_mean
cluster_mean_min=cluster_5
else:
pass
In [728]:
cluster_mean_min
Out[728]:
In [525]:
# --- Scratch: building clean_data = (price, day offset, trip length).
# Imports scattered mid-notebook; `flight_data` and (below) `np` must
# already exist from earlier cells. ---
import pandas as pd
import datetime
%matplotlib inline
from dateutil.parser import parse
clean_data = [(float(d[0].replace('$', '').replace(',', '')),
(parse(d[1].split('-')[0].strip()) - parse(flight_data[0][1].split('-')[0].strip())).days,
reduce(lambda x,y: y-x, [parse(x.strip()) for x in d[1].split('-')]).days) for d in flight_data]
clean_data
Out[525]:
In [553]:
# eps used for DBSCAN below: sqrt(20^2 + 20^2) ~ 28.28
radius = np.sqrt(np.square(20.00) + np.square(20.00))
radius
Out[553]:
In [703]:
df = pd.DataFrame(clean_data, columns=['Price', 'Start_Date', 'Trip_Length'])
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
# Pandas has a ton of built-in visualizations
# Play and Learn
# http://pandas.pydata.org/pandas-docs/stable/visualization.html
df.plot.scatter(x='Start_Date', y='Price')
Out[703]:
In [571]:
# --- Scratch: inject artificial low prices into rows 12-19 to exercise the
# outlier detectors. NOTE(review): DataFrame.set_value is deprecated in
# modern pandas (use .at/.iat). ---
df = df.set_value(12, 'Price', 20)
df = df.set_value(13, 'Price', 25)
df = df.set_value(14, 'Price', 40)
df = df.set_value(15, 'Price', 20)
df = df.set_value(16, 'Price', 25)
df = df.set_value(17, 'Price', 40)
df = df.set_value(18, 'Price', 50)
df = df.set_value(19, 'Price', 60)
# Time for a Google Investigation
# "IQR Outlier"
In [704]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
# All of pandas' viz is built on top of matplotlib as you might have noticed
# You can get started learning matplotlib here: http://matplotlib.org/users/pyplot_tutorial.html
# df = df.set_value(49, 'Price', 255)
# X = StandardScaler().fit_transform(df[['Start_Date', 'Price']])
# cluster on (Start_Date*20, Price) and plot each label in its own color
X = df[['Start_Date', 'Price']].values*np.array([20,1])
radius = np.sqrt(np.square(20.00) + np.square(20.00))
db = DBSCAN(eps=radius, min_samples=3).fit(X)
labels = db.labels_
clusters = len(set(labels))
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
plt.subplots(figsize=(12,8))
for k, c in zip(unique_labels, colors):
class_member_mask = (labels == k)
xy = X[class_member_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
markeredgecolor='k', markersize=14)
plt.title("Total Clusters: {}".format(clusters), fontsize=14, y=1.01)
df['dbscan_labels'] = db.labels_
In [705]:
clusters=df.dbscan_labels.unique()
clusters
Out[705]:
In [706]:
# --- Scratch: 5-row sliding windows of Start_Date per non-noise cluster
# (this is the version that made it into task_4_dbscan). ---
clusters_5=[]
for cluster in clusters:
if cluster!=-1 and len(df[df['dbscan_labels']==cluster])>4:
for i in range(len(df[df['dbscan_labels']==cluster])-4):
clusters_5.append(df[df['dbscan_labels']==cluster]['Start_Date'].values[i:i+5])
else:
pass
clusters_5
Out[706]:
In [707]:
# cheapest window with max-min <= $20.
# NOTE(review): df.loc[cluster_5] indexes by row label; it works only while
# Start_Date coincides with the 0..n-1 index — confirm.
mean_min=9999
cluster_mean_min=''
for cluster_5 in clusters_5:
df_5=df.loc[cluster_5][['Start_Date','Price']]
cluster_max=df_5['Price'].max()
cluster_min=df_5['Price'].min()
cluster_mean=df_5['Price'].mean()
if cluster_max-cluster_min<=20 and cluster_mean < mean_min:
mean_min=cluster_mean
cluster_mean_min=cluster_5
else:
pass
In [711]:
df.loc[cluster_mean_min]
Out[711]:
In [709]:
cluster_mean_min
Out[709]:
In [512]:
# re-plot the clusters from the arrays computed in earlier cells
plt.subplots(figsize=(12,8))
for k, c in zip(unique_labels, colors):
class_member_mask = (labels == k)
xy = X[class_member_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=c,
markeredgecolor='k', markersize=14)
In [504]:
# --- Scratch: alternative task-4 draft that additionally checks the 5 days
# are truly consecutive (days[i+4]-days[i]==4) and converts day offsets
# back to 'YYYY-MM-DD' strings. Not used by the final function. ---
start_date = parse(flight_data[0][1].split('-')[0].strip())
db = DBSCAN(eps=radius, min_samples=3).fit(X)
df['label'] = db.labels_
clusters = len(set(labels))
unique_labels = set(labels)
list_of_dfs = []
for label in unique_labels:
if label != -1:
one_cluster = df[df.label == label]
# find start/end date for every 5-day-consecutive period
consecutive_list = []
days = one_cluster.Start_Date.values
for i in range(len(days) - 4):
if days[i + 4] - days[i] == 4:
consecutive_list.append((days[i], days[i + 4]))
# a list of 5-day-consecutive period dataframes
for start, end in consecutive_list:
df_five_day = one_cluster.loc[start - 1:end - 1, ['Start_Date', 'Price']]
if df_five_day.Price.describe()['max'] - df_five_day.Price.describe()['min'] <= 20:
df_five_day.Start_Date = df_five_day.Start_Date.apply(
lambda x: (start_date + datetime.timedelta(
days=x - 1)).strftime('%Y-%m-%d'))
list_of_dfs.append(df_five_day)
means = [one_df.Price.mean() for one_df in list_of_dfs]
df_min_index = means.index(min(means))
In [507]:
list_of_dfs[df_min_index]
Out[507]:
In [319]:
# --- Scratch: development of the task-3 outlier-reassignment logic.
# NOTE(review): zip() is a one-shot iterator on Python 3; these cells only
# work under Python 2 (or with list(zip(...))). ---
outliers=df[df['dbscan_labels']==-1].copy()
outliers_1=zip(outliers.Start_Date,outliers.Price)
outliers_1
Out[319]:
In [320]:
clusters=df[df['dbscan_labels']!=-1].copy()
clusters_1=zip(clusters.Start_Date,clusters.Price,clusters.dbscan_labels)
clusters_1
Out[320]:
In [321]:
# nearest-cluster assignment; the /100 on the price term down-weights
# price relative to date in the distance
outliers_label=[]
for outlier in outliers_1:
min_cluster_label = -1
min_dist = 9999
for cluster in clusters_1:
dist=(float(outlier[0])-float(cluster[0]))**2+((float(outlier[1])-float(cluster[1]))/100)**2
if dist < min_dist:
min_dist = dist
min_cluster_label = cluster[2]
outliers_label.append(min_cluster_label)
outliers_label
outliers_2=zip(outliers.Start_Date,outliers.Price,outliers_label)
outliers_2
Out[321]:
In [322]:
agg = df[df['dbscan_labels']!=-1].groupby('dbscan_labels')['Start_Date','Price'].agg(['std','mean','count']).copy()
agg
Out[322]:
In [323]:
# threshold check for one outlier: cluster mean - 2*std, floored at $50
mean=agg[agg.index==outliers_2[0][2]]['Price']['mean']
std=agg[agg.index==outliers_2[0][2]]['Price']['std']
max(float(mean-2*std),50)
Out[323]:
In [324]:
outliers_3=[]
for outlier in outliers_2:
mean=agg[agg.index==outlier[2]]['Price']['mean']
std=agg[agg.index==outlier[2]]['Price']['std']
line=max(float(mean-2*std),50)
if outlier[1]<line:
outliers_3.append(outlier[0])
outliers_3
Out[324]:
In [325]:
df.loc[outliers_3]
Out[325]:
In [138]:
# --- Scratch: miscellaneous experiments (regex matching, title-casing,
# IQR rule). Python 2 print statements; `re` imported in an unseen cell. ---
if bool(re.search('multi', 'A mUltiCased string', re.IGNORECASE)):
print 'true'
else:
print 'false'
In [343]:
a='New York'
In [347]:
b=a.lower().split(' ')
In [357]:
# manual title-casing, later folded into scrape_data's city-name handling
c=''
for i in range(len(b)):
c=c+b[i][0].upper()+b[i][1:]+' '
In [358]:
c
Out[358]:
In [434]:
plt.boxplot(df['Price']);
plt.savefig('task_3_iqr.png')
In [435]:
# IQR rule: anything below Q1 - 1.5*IQR counts as a low-price outlier
plt.boxplot(df['Price']);
plt.savefig('task_3_iqr.png')
Q1 = df.Price.describe()['25%']
Q3 = df.Price.describe()['75%']
IQR = Q3 - Q1
low_line = Q1 - 1.5 * IQR
result = df[df.Price < low_line]
In [447]:
result
Out[447]:
In [453]:
len(result)
Out[453]:
In [450]:
# NOTE(review): top-level `return` is invalid outside a function; this cell
# was a draft of task_3_IQR's ending and cannot run as-is.
if len(result) == 0:
return 'No outliers'
else:
return result
In [462]:
df.Price.describe()
Out[462]:
In [456]:
if a == 0:
print 'No outliers'
else:
print result
In [ ]: