In [ ]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
# Check if the counts in the input file and data frame are matching
!wc data/restaurants/tripadvisor_in-restaurant_sample.csv
In [ ]:
pd.read
In [ ]:
df = pd.read_csv("data/restaurants/restaurants-on-tripadvisor.zip",compression='zip',encoding='utf-8')
In [ ]:
len(df)
In [ ]:
len(df.columns)
In [ ]:
# Check the data types of the columns
df.dtypes
In [ ]:
df.info()
In [ ]:
(len(df)-df.isnull().sum())*100/len(df)
In [ ]:
df[df["Uniq Id"] == "bfc127bb137c9aa4628c0659e0dce2d5"]
In [ ]:
# select sub set of columns that can be checked for null and remove any such records
major_columns = ['Restaurant ID','Restaurant URL','Name','Uniq Id']
col_na = [col_name for col_name in list(df.columns) if col_name not in major_columns]
In [ ]:
print(col_na)
In [ ]:
len(df)
In [ ]:
# drop all of the rows where subset of columns are completely null
df1 = df.dropna(axis=0,subset=col_na,how='all')
In [ ]:
len(df1)
In [ ]:
len(df1.columns)
In [ ]:
# drop those columns that are completely null
df2 = df1.dropna(axis=1,how='all').copy()
In [ ]:
len(df2)
In [ ]:
len(df2.columns)
In [ ]:
df2.head()
In [ ]:
df2.info()
In [ ]:
df2['Total Review'].value_counts().iloc[0:21]
In [ ]:
df2['Total Review'].value_counts().iloc[0:21].sum()
In [ ]:
df2['Total Review'].value_counts().iloc[0:21]
In [ ]:
df2.columns
In [ ]:
# Top 20 reviews
df2['Rating'].value_counts().iloc[0:21].plot(kind='bar',figsize=(14,6))
In [ ]:
# incase if you want to consider the columns having data populated > 60%
#cols = [col for col in list(df.columns) if col not in list(df_col[df_col > 60].index)]
In [ ]:
cols
In [ ]:
df1 = df[cols]
In [ ]:
df1.head(2)
In [ ]:
df['Ranking'].head()
In [ ]:
df.Country.value_counts()
In [ ]:
df[df.Country=="United States"]["State"].value_counts()
In [ ]:
df_usa = df[df.Country=="United States"]
In [ ]:
# Select only those records with valid state codes, STATE code that has lenght of 2
df_usa_state = df_usa[df_usa.State.str.len() == 2]
In [ ]:
df_usa_state.State.value_counts()
In [ ]:
df_usa_state.State.value_counts()[0:10]
In [ ]:
pd.to_datetime(df2['Last Reviewed']).head()
In [ ]:
# Last Reviewed column contains data values in the format dd Mon Year
df2["Last Reviewed"].head(10)
In [ ]:
# This function extract the day, month, year part of the date column
def get_split_value(value,index):
value = str(value)
value = value.replace("- ","")
fields = value.split()
month = ""
try:
month = fields[index]
except TypeError:
month = ""
except IndexError:
month = ""
return month
In [ ]:
# Let's check if the date values in Last Reviewed column are having valid values
# check Day
days = df2["Last Reviewed"].apply(lambda x:get_split_value(x,1)).value_counts().sort_index(ascending=True)
print(sorted([ int(x) for x in list(days.index) if x != 'nan']))
In [ ]:
# check Month
months = df2["Last Reviewed"].apply(lambda x:get_split_value(x,1)).value_counts().sort_index(ascending=True)
print(sorted(list(months.index)))
In [ ]:
# check Year
years = df2["Last Reviewed"].apply(lambda x:get_split_value(x,2)).value_counts().sort_index(ascending=True)
print(list(years.index))
In [ ]:
df2["Last Reviewed"].apply(lambda x:get_split_value(x,2)).value_counts()
In [ ]:
df2['last_reviewed_dt'] = pd.to_datetime(df2["Last Reviewed"])
In [ ]:
df2[df2.Country=="United States"].last_reviewed_dt.apply(lambda x:x.year).value_counts()
In [ ]:
df2.last_reviewed_dt.head()
In [ ]:
df2.last_reviewed_dt.describe()
In [ ]: