Explore restaurants data

This notebook demonstrates some of the nuisances of real-world data processing, using a TripAdvisor restaurants dataset.


In [ ]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
# Count the lines in the raw input file so we can compare against the data frame size
!wc data/restaurants/tripadvisor_in-restaurant_sample.csv
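
In [ ]:
# The same check from Python, as a rough sketch: count data rows in the raw file
# (assumes one header row and no newlines embedded inside quoted fields, which
# wc also cannot account for) and compare against len(df) once it is loaded.
with open("data/restaurants/tripadvisor_in-restaurant_sample.csv", encoding="utf-8") as f:
    raw_rows = sum(1 for _ in f) - 1
raw_rows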

In [ ]:
# Explore the available pandas readers before picking one
[name for name in dir(pd) if name.startswith("read_")]

In [ ]:
df = pd.read_csv("data/restaurants/restaurants-on-tripadvisor.zip", compression='zip', encoding='utf-8')
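
In [ ]:
# Quick shape check right after loading (rows, columns). Note that pandas can
# also infer the compression from the .zip extension, so compression='zip'
# above is explicit rather than required.
df.shape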

In [ ]:
len(df)

In [ ]:
len(df.columns)

In [ ]:
# Check the data types of the columns
df.dtypes

In [ ]:
df.info()

In [ ]:
# Percentage of non-null (populated) values in each column
(len(df) - df.isnull().sum()) * 100 / len(df)
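
In [ ]:
# Same coverage numbers, sorted so the sparsest columns stand out
# (a quick sketch reusing the expression above)
coverage = (len(df) - df.isnull().sum()) * 100 / len(df)
coverage.sort_values().head(10)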

In [ ]:
df[df["Uniq Id"] == "bfc127bb137c9aa4628c0659e0dce2d5"]

In [ ]:
# Columns to check for nulls: everything except the key/identifier columns;
# records where all of these are null will be removed
major_columns = ['Restaurant ID', 'Restaurant URL', 'Name', 'Uniq Id']
col_na = [col_name for col_name in list(df.columns) if col_name not in major_columns]

In [ ]:
print(col_na)

In [ ]:
len(df)

In [ ]:
# Drop rows where every one of the non-key columns is null
df1 = df.dropna(axis=0, subset=col_na, how='all')
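
In [ ]:
# Sanity-check sketch: the number of rows where every non-key column is null
# should equal the number of rows dropped above, i.e. len(df) - len(df1)
df[col_na].isnull().all(axis=1).sum()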

In [ ]:
len(df1)

In [ ]:
len(df1.columns)

In [ ]:
# Drop columns that are completely null
df2 = df1.dropna(axis=1, how='all').copy()
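
In [ ]:
# Which columns were removed as entirely null (a quick sketch)
sorted(set(df1.columns) - set(df2.columns))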

In [ ]:
len(df2)

In [ ]:
len(df2.columns)

In [ ]:
df2.head()

In [ ]:
df2.info()

In [ ]:
df2['Total Review'].value_counts().iloc[0:21]

In [ ]:
df2['Total Review'].value_counts().iloc[0:21].sum()

In [ ]:
df2.columns

In [ ]:
# Bar chart of the 20 most common Rating values
df2['Rating'].value_counts().iloc[0:20].plot(kind='bar', figsize=(14, 6))
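
In [ ]:
# The same chart with axis labels and a title, as an optional sketch
ax = df2['Rating'].value_counts().iloc[0:20].plot(kind='bar', figsize=(14, 6))
ax.set_xlabel('Rating')
ax.set_ylabel('Number of restaurants')
ax.set_title('Most common Rating values')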

In [ ]:
# In case you want to keep only the columns that are more than 60% populated
df_col = (len(df) - df.isnull().sum()) * 100 / len(df)
cols = [col for col in list(df.columns) if col in df_col[df_col > 60].index]

In [ ]:
cols

In [ ]:
# Keep only the well-populated columns (note: this reassigns df1)
df1 = df[cols]

In [ ]:
df1.head(2)

In [ ]:
df['Ranking'].head()

In [ ]:
df.Country.value_counts()

In [ ]:
df[df.Country=="United States"]["State"].value_counts()

In [ ]:
df_usa = df[df.Country=="United States"]

In [ ]:
# Keep only records with valid two-letter state codes
df_usa_state = df_usa[df_usa.State.str.len() == 2]
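
In [ ]:
# A quick look at the State values that fail the two-character filter,
# to confirm the heuristic is not throwing away valid codes (sketch)
df_usa[df_usa.State.str.len() != 2].State.value_counts().head(10)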

In [ ]:
df_usa_state.State.value_counts()

In [ ]:
df_usa_state.State.value_counts()[0:10]

In [ ]:
pd.to_datetime(df2['Last Reviewed']).head()

In [ ]:
# The Last Reviewed column holds date strings in the format "dd Mon YYYY"
df2["Last Reviewed"].head(10)

In [ ]:
# Extract the day, month, or year part of a "dd Mon YYYY" date string by index
def get_split_value(value, index):
    value = str(value)
    value = value.replace("- ", "")   # some values carry a leading "- "
    fields = value.split()
    part = ""
    try:
        part = fields[index]
    except (TypeError, IndexError):
        part = ""
    return part
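
In [ ]:
# Quick sanity check of the helper on a made-up value in the documented
# "dd Mon YYYY" format (the sample string below is hypothetical)
sample = "- 01 Jan 2019"
get_split_value(sample, 0), get_split_value(sample, 1), get_split_value(sample, 2)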

In [ ]:
# Check whether the date parts in Last Reviewed are valid values
# Day part (index 0 in the "dd Mon YYYY" format)
days = df2["Last Reviewed"].apply(lambda x: get_split_value(x, 0)).value_counts().sort_index(ascending=True)
print(sorted([int(x) for x in list(days.index) if x not in ('nan', '')]))

In [ ]:
# check Month
months = df2["Last Reviewed"].apply(lambda x:get_split_value(x,1)).value_counts().sort_index(ascending=True)
print(sorted(list(months.index)))

In [ ]:
# check Year
years = df2["Last Reviewed"].apply(lambda x:get_split_value(x,2)).value_counts().sort_index(ascending=True)
print(list(years.index))

In [ ]:
df2["Last Reviewed"].apply(lambda x:get_split_value(x,2)).value_counts()

In [ ]:
# Parse Last Reviewed into a proper datetime column
df2['last_reviewed_dt'] = pd.to_datetime(df2["Last Reviewed"])
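
In [ ]:
# If some Last Reviewed strings failed to parse, pd.to_datetime(..., errors='coerce')
# would turn them into NaT instead of raising; either way, count how many records
# end up without a usable date (sketch)
df2['last_reviewed_dt'].isnull().sum()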

In [ ]:
df2[df2.Country=="United States"].last_reviewed_dt.apply(lambda x:x.year).value_counts()
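
In [ ]:
# Same breakdown as a chronological bar chart, using the .dt accessor (sketch)
df2[df2.Country == "United States"].last_reviewed_dt.dt.year.value_counts().sort_index().plot(kind='bar', figsize=(14, 6))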

In [ ]:
df2.last_reviewed_dt.head()

In [ ]:
df2.last_reviewed_dt.describe()
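
In [ ]:
# Explicit range of last-review dates (a sketch; same information as describe)
df2.last_reviewed_dt.min(), df2.last_reviewed_dt.max()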
