Explore restaurants data

This notebook shows on the nuisances of data processing



In [ ]:

    
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline



In [ ]:

    
# Check if the counts in the input file and data frame are matching
!wc data/restaurants/tripadvisor_in-restaurant_sample.csv



In [ ]:

    
pd.read



In [ ]:

    
df = pd.read_csv("data/restaurants/restaurants-on-tripadvisor.zip",compression='zip',encoding='utf-8')



In [ ]:

    
len(df)



In [ ]:

    
len(df.columns)



In [ ]:

    
# Check the data types of the columns
df.dtypes



In [ ]:

    
df.info()



In [ ]:

    
(len(df)-df.isnull().sum())*100/len(df)



In [ ]:

    
df[df["Uniq Id"] == "bfc127bb137c9aa4628c0659e0dce2d5"]



In [ ]:

    
# select sub set of columns that can be checked for null and remove any such records
major_columns = ['Restaurant ID','Restaurant URL','Name','Uniq Id']
col_na = [col_name for col_name in list(df.columns) if col_name not in major_columns]



In [ ]:

    
print(col_na)



In [ ]:

    
len(df)



In [ ]:

    
# drop all of the rows where subset of columns are completely null
df1 = df.dropna(axis=0,subset=col_na,how='all')



In [ ]:

    
len(df1)



In [ ]:

    
len(df1.columns)



In [ ]:

    
# drop those columns that are completely null
df2 = df1.dropna(axis=1,how='all').copy()



In [ ]:

    
len(df2)



In [ ]:

    
len(df2.columns)



In [ ]:

    
df2.head()



In [ ]:

    
df2.info()



In [ ]:

    
df2['Total Review'].value_counts().iloc[0:21]



In [ ]:

    
df2['Total Review'].value_counts().iloc[0:21].sum()



In [ ]:

    
df2['Total Review'].value_counts().iloc[0:21]



In [ ]:

    
df2.columns



In [ ]:

    
# Top 20 reviews
df2['Rating'].value_counts().iloc[0:21].plot(kind='bar',figsize=(14,6))



In [ ]:

    
# incase if you want to consider the columns having data populated > 60%
#cols = [col for col in list(df.columns) if col not in list(df_col[df_col > 60].index)]



In [ ]:

    
cols



In [ ]:

    
df1 = df[cols]



In [ ]:

    
df1.head(2)



In [ ]:

    
df['Ranking'].head()



In [ ]:

    
df.Country.value_counts()



In [ ]:

    
df[df.Country=="United States"]["State"].value_counts()



In [ ]:

    
df_usa = df[df.Country=="United States"]



In [ ]:

    
# Select only those records with valid state codes, STATE code that has lenght of 2
df_usa_state = df_usa[df_usa.State.str.len() == 2]



In [ ]:

    
df_usa_state.State.value_counts()



In [ ]:

    
df_usa_state.State.value_counts()[0:10]



In [ ]:

    
pd.to_datetime(df2['Last Reviewed']).head()



In [ ]:

    
# Last Reviewed column contains data values in the format dd Mon Year
df2["Last Reviewed"].head(10)



In [ ]:

    
# This function extract the day, month, year part of the date column
def get_split_value(value,index):
    value = str(value)
    value = value.replace("- ","")
    fields = value.split()
    month = ""
    try:
        month = fields[index]
    except TypeError:
        month = ""
    except IndexError:
        month = ""
    return month



In [ ]:

    
# Let's check if the date values in Last Reviewed column are having valid values
# check Day
days = df2["Last Reviewed"].apply(lambda x:get_split_value(x,1)).value_counts().sort_index(ascending=True)
print(sorted([ int(x) for x in list(days.index) if x != 'nan']))



In [ ]:

    
# check Month
months = df2["Last Reviewed"].apply(lambda x:get_split_value(x,1)).value_counts().sort_index(ascending=True)
print(sorted(list(months.index)))



In [ ]:

    
# check Year
years = df2["Last Reviewed"].apply(lambda x:get_split_value(x,2)).value_counts().sort_index(ascending=True)
print(list(years.index))



In [ ]:

    
df2["Last Reviewed"].apply(lambda x:get_split_value(x,2)).value_counts()



In [ ]:

    
df2['last_reviewed_dt'] = pd.to_datetime(df2["Last Reviewed"])



In [ ]:

    
df2[df2.Country=="United States"].last_reviewed_dt.apply(lambda x:x.year).value_counts()



In [ ]:

    
df2.last_reviewed_dt.head()



In [ ]:

    
df2.last_reviewed_dt.describe()



In [ ]: