In [185]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import itertools
from nltk.corpus import stopwords
!pip install wordcloud
from wordcloud import WordCloud
# Show every column when a DataFrame is displayed. The fully-qualified key is
# used because option names are matched as patterns: the short 'max_columns'
# is ambiguous on pandas versions that also define 'styler.render.max_columns'
# and raises OptionError there.
pd.set_option('display.max_columns', None)
%matplotlib inline
In [66]:
# Load the sample into `df`; the relative path means the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('booking_com-travel_sample.csv')
In [67]:
df.head()
Out[67]:
In [68]:
df.info()
In [70]:
# Number of distinct values in `city`; dropna=False keeps a missing city
# counted as its own value.
df['city'].nunique(dropna=False)
Out[70]:
In [71]:
# Five most frequent cities (value_counts sorts by descending frequency).
df['city'].value_counts().head()
Out[71]:
In [72]:
# Frequency of each distinct hotel_star_rating value as it currently stands in df.
df['hotel_star_rating'].value_counts()
Out[72]:
In [73]:
def star_cleaning(value):
    """Convert one ``hotel_star_rating`` entry to a float star count.

    The rating is read from the first non-blank character of the entry's text
    form, so a label that starts with its digit and an already-cleaned number
    such as ``3.0`` both give ``3.0``. Accepting numbers keeps the cleaning
    step idempotent when its cell is re-run.

    Parameters
    ----------
    value : object
        Raw cell value: a text label, a number, or a missing value
        (``None`` / ``NaN``).

    Returns
    -------
    float or None
        ``1.0`` to ``5.0`` when the entry starts with a digit from 1 to 5,
        otherwise ``None``. Unrecognised leading digits (0, 6-9) are treated
        as missing instead of being reported as five stars.
    """
    # str() makes NaN/None/numbers safe to inspect; slicing with [:1] avoids
    # an IndexError on an empty string.
    leading = str(value).strip()[:1]
    # A tuple is used (not the string '12345') because '' is contained in
    # every string and would wrongly pass a substring test.
    if leading in ('1', '2', '3', '4', '5'):
        return float(leading)
    return None
In [74]:
# Replace the column with the output of star_cleaning. This overwrites the
# raw values in place, so the original entries are no longer available in df
# afterwards.
df['hotel_star_rating'] = df['hotel_star_rating'].apply(star_cleaning)
In [79]:
df.head()
Out[79]:
In [80]:
# Summary statistics for room_count.
df['room_count'].describe()
Out[80]:
In [90]:
# Distribution of room counts. Missing values are removed into a separate
# Series (not in place on a column pulled out of df), and that NaN-free
# Series is the one that is actually plotted.
room_count = df['room_count'].dropna()
plt.figure(figsize=(15, 10))
sns.distplot(room_count, color='g')
Out[90]:
In [100]:
# Box plot of room counts on the x axis. The column is passed by keyword
# because the meaning of seaborn's first positional argument is not the same
# in every release (x in older ones, data in newer ones).
plt.figure(figsize=(15, 5))
sns.boxplot(x=df['room_count'])
Out[100]:
In [101]:
# Number of hotels per star rating. The NaN-free ratings get their own name
# (previously they were stored under `room_count`) and are passed by keyword
# so the call does not depend on seaborn's positional-argument order.
star_rating = df['hotel_star_rating'].dropna()
plt.figure(figsize=(15, 5))
sns.countplot(x=star_rating)
Out[101]:
In [106]:
# Split the facilities text of the row labelled 0 on the '•' character to
# inspect the resulting pieces.
df['hotel_facilities'][0].split('•')
Out[106]:
In [112]:
# Take the second '•'-separated piece and split it further on '|'.
df['hotel_facilities'][0].split('•')[1].split('|')
Out[112]:
In [131]:
# Flatten the facilities text of the row labelled 0 into one string: each
# '•'-separated piece has ':' and every other non-word, non-space character
# replaced by a space, the pieces are joined with single spaces, and newlines
# are removed.
' '.join(
    itertools.chain.from_iterable(
        re.sub(r'[^\w\s]', ' ', section.replace(':', ' ').strip()).split('|')
        for section in df['hotel_facilities'][0].split('•')
    )
).replace('\n', '')
Out[131]:
In [179]:
# Same flattening as for row 0, applied to the row labelled 10 and with the
# text lower-cased before punctuation is replaced by spaces.
' '.join(
    itertools.chain.from_iterable(
        re.sub(r'[^\w\s]', ' ', section.replace(':', ' ').lower().strip()).split('|')
        for section in df['hotel_facilities'][10].split('•')
    )
).replace('\n', '')
Out[179]:
In [182]:
# Build the flattened facilities text for every row.
# NOTE(review): `one_liner` is defined in the cell that FOLLOWS this one
# (In [181]); on a fresh kernel this cell raises NameError unless that cell
# is run first. Move the definition above this cell.
df['hotel_facilities_wc'] = df['hotel_facilities'].apply(one_liner)
In [181]:
def one_liner(val):
    """Flatten one ``hotel_facilities`` entry into a single lower-case string.

    The text is split on '•'. In each piece ':' becomes a space, the piece is
    lower-cased and stripped, and every remaining character that is neither a
    word character nor whitespace becomes a space. The pieces are then joined
    with single spaces and newline characters are removed.

    Parameters
    ----------
    val : object
        Facilities text, or a non-string placeholder for a missing entry
        (for example ``NaN`` or ``None``).

    Returns
    -------
    str or None
        The flattened text, or ``None`` when ``val`` is not a string.
    """
    # Explicit guard instead of a bare `except: pass`: missing entries still
    # give None, but unrelated errors are no longer silently swallowed.
    if not isinstance(val, str):
        return None
    sections = (
        section.replace(':', ' ').lower().strip() for section in val.split('•')
    )
    # '|' is itself non-word/non-space, so the substitution already turns it
    # into a space; a later split on '|' could never find one.
    cleaned = (re.sub(r'[^\w\s]', ' ', section) for section in sections)
    # NOTE(review): interior newlines are deleted, not replaced by a space, so
    # words separated only by a newline end up fused. Confirm this is intended.
    return ' '.join(cleaned).replace('\n', '')
In [ ]:
In [ ]: