In [1]:
import pymongo
from pymongo import MongoClient
In [2]:
# Open a client connection to the MongoDB server at this URI.
MONGO_URI = 'mongodb://localhost:27017/'
client = MongoClient(MONGO_URI)
In [3]:
# Handle to the `airbnb` database (subscript access instead of attribute access).
db = client['airbnb']
In [36]:
# Collection handles. Note: `attractions` is rebound to a DataFrame in a later cell.
listings, attractions = db['listings_new'], db['attractions']
In [5]:
import pandas as pd
In [6]:
# Fetch every document of the `listings_new` collection and tabulate the result.
listing_docs = list(db.listings_new.find())
listings_df = pd.DataFrame(listing_docs)
In [7]:
# Preview the first five rows of the raw listings.
listings_df.head(n=5)
Out[7]:
In [8]:
# Show the column labels as an array.
column_names = listings_df.columns.values
column_names
Out[8]:
In [9]:
def _coerce_numeric_column(col):
    """Return `col` as a numeric column when at least one of its values parses.

    Columns whose dtype is not `object` are returned untouched. Object columns
    are parsed with `pd.to_numeric(errors='coerce')`, which turns unparseable
    values into NaN. If nothing parses (the result is entirely NaN) the
    original column is returned, so pure-text columns are not wiped out.
    """
    if col.dtype != object:
        return col
    try:
        numeric = pd.to_numeric(col, errors='coerce')
    except (TypeError, ValueError):
        # Values that to_numeric cannot inspect at all: leave the column alone.
        return col
    return col if numeric.isnull().all() else numeric


# `DataFrame.convert_objects(convert_numeric=True)` was removed in pandas 1.0.
# Apply the numeric coercion column by column instead.
listings_df = listings_df.apply(_coerce_numeric_column)
In [10]:
# Convert the price text to floats:
#   1. drop the first character of every value
#      (presumably a leading '$' -- confirm against the data),
#   2. remove ',' thousands separators,
#   3. map blank strings to '0' so the float cast cannot fail on them
#      (the other money columns in this notebook get the same treatment),
#   4. cast to float and fill missing values with 0.0.
# The column is assigned once at the end, so a failed cast leaves it untouched.
price_text = (
    listings_df['price']
    .str[1:]
    .replace(',', '', regex=True)
    .replace('', '0')
)
listings_df['price'] = price_text.astype(float).fillna(0.0)
In [11]:
# extra_people: drop the first character (presumably '$' -- confirm), remove
# commas, turn blanks into '0', then cast to float with missing values as 0.0.
extra_people_text = (
    listings_df['extra_people']
    .str[1:]
    .replace(',', '', regex=True)
    .replace('', '0', regex=True)
)
listings_df['extra_people'] = extra_people_text.astype(float).fillna(0.0)
In [12]:
# weekly_price: drop the first character (presumably '$' -- confirm), remove
# commas, turn blanks into '0', then cast to float with missing values as 0.0.
weekly_price_text = (
    listings_df['weekly_price']
    .str[1:]
    .replace(',', '', regex=True)
    .replace('', '0', regex=True)
)
listings_df['weekly_price'] = weekly_price_text.astype(float).fillna(0.0)
In [13]:
# monthly_price: drop the first character (presumably '$' -- confirm), remove
# commas, turn blanks into '0', then cast to float with missing values as 0.0.
monthly_price_text = (
    listings_df['monthly_price']
    .str[1:]
    .replace(',', '', regex=True)
    .replace('', '0', regex=True)
)
listings_df['monthly_price'] = monthly_price_text.astype(float).fillna(0.0)
In [14]:
# security_deposit: drop the first character (presumably '$' -- confirm), remove
# commas, turn blanks into '0', then cast to float with missing values as 0.0.
security_deposit_text = (
    listings_df['security_deposit']
    .str[1:]
    .replace(',', '', regex=True)
    .replace('', '0', regex=True)
)
listings_df['security_deposit'] = security_deposit_text.astype(float).fillna(0.0)
In [15]:
# cleaning_fee: drop the first character (presumably '$' -- confirm), remove
# commas, turn blanks into '0', then cast to float with missing values as 0.0.
cleaning_fee_text = (
    listings_df['cleaning_fee']
    .str[1:]
    .replace(',', '', regex=True)
    .replace('', '0', regex=True)
)
listings_df['cleaning_fee'] = cleaning_fee_text.astype(float).fillna(0.0)
In [17]:
def _split_amenities(raw):
    """Drop the first and last character of `raw` and split the rest on commas."""
    return raw[1:-1].split(',')


listings_df['amenities_split'] = listings_df["amenities"].apply(_split_amenities)
In [30]:
# Get unique amenities.
# Blank entries (what an empty "{}" amenities string splits into) are removed
# by value rather than by list position: the order of a list built from a set
# is not stable between runs, so a positional drop removes an arbitrary item.
# Sorting makes the resulting column order reproducible.
# NOTE(review): an earlier revision dropped the element at index 2;
# presumably that was the blank entry -- confirm.
unique_amenities = sorted(
    {amenity for amenities in listings_df["amenities_split"] for amenity in amenities} - {''}
)
unique_amenities
Out[30]:
In [32]:
# Build a 0/1 indicator table: one row per listing, one column per amenity.
num_col = len(unique_amenities)  # number of indicator columns
amenity_lists = [listings_df["amenities_split"][pos] for pos in range(len(listings_df))]
data_array = [
    [1 if unique_amenities[col] in amenities else 0 for col in range(num_col)]
    for amenities in amenity_lists
]
df = pd.DataFrame(data_array, columns=unique_amenities)
In [41]:
# Attach the amenity indicator columns to the listings, aligned on the row index.
listings_df2 = listings_df.join(df, how='left')
In [42]:
# Preview the first five rows after adding the amenity columns.
listings_df2.head(n=5)
Out[42]:
In [37]:
# Load every attraction document into a DataFrame (this rebinds `attractions`).
attraction_docs = list(db.attractions.find())
attractions = pd.DataFrame(attraction_docs)
In [38]:
# Preview the first five attractions.
attractions.head(n=5)
Out[38]:
In [39]:
def distance(lat1, long1, lat2, long2, radius=6373.0):
    """Return the great-circle distance between two lat/long points (haversine formula).

    Parameters
    ----------
    lat1, long1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, long2 : float
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6373.0, an approximate radius of the Earth in km.

    Returns
    -------
    float
        Distance along the sphere's surface (km with the default radius).
    """
    from math import sin, cos, sqrt, atan2, radians

    lat1, long1, lat2, long2 = map(radians, (lat1, long1, lat2, long2))
    dlat = lat2 - lat1
    dlong = long2 - long1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlong / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make the square roots below raise
    # ValueError. Plain comparisons are used so a NaN input still yields NaN.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return radius * c
In [43]:
# For every listing, find the closest attraction (using `distance`) and store
# its name, rating, coordinates and distance in five `nearest_attr*` columns
# of `listings_df2`.

# Read the attraction fields once as plain tuples (name, rating, lat, long).
# Iterating positionally avoids label-based `[i]` lookups, which only work
# when the frame still has its default 0..n-1 index.
attraction_rows = list(zip(
    attractions['attraction'],
    attractions['rating'],
    attractions['lat'],
    attractions['long'],
))
if len(listings_df2) and not attraction_rows:
    raise ValueError("attractions is empty; cannot determine a nearest attraction")

nearest_rows = []
for list_lat, list_long in zip(listings_df2['latitude'], listings_df2['longitude']):
    nearest, dist_nearest = None, None
    for candidate in attraction_rows:
        dist = distance(list_lat, list_long, candidate[2], candidate[3])
        # Strict `<` keeps the earliest attraction when distances tie.
        if nearest is None or dist < dist_nearest:
            nearest, dist_nearest = candidate, dist
    nearest_rows.append(nearest + (dist_nearest,))

# Assign each result column once instead of writing one cell at a time.
nearest_columns = [
    'nearest_attr',
    'nearest_attr_rating',
    'nearest_attr_lat',
    'nearest_attr_long',
    'nearest_attr_dist',
]
nearest_df = pd.DataFrame(nearest_rows, columns=nearest_columns)
for column in nearest_columns:
    listings_df2[column] = nearest_df[column].values
In [44]:
# Preview the first five rows, now including the nearest-attraction columns.
listings_df2.head(n=5)
Out[44]:
In [45]:
#listings_df2.to_csv("listings_31Mar.csv")
In [ ]: