In [2]:
import json
import sqlite3
import pandas as pd
import numpy as np
#from sqlalchemy import create_engine
from collections import defaultdict
import sys
reload(sys)
sys.setdefaultencoding('utf8')
In [8]:
df_c = business[['business_id','categories']]
df_c.head(3)
Out[8]:
In [9]:
len(df_c)
Out[9]:
In [10]:
# Build table of business and categories
def yes_no(df,col): # data, which col to expand(name)
"""
the function takes a dataframe and column name
df: dataframe
col: name of the column containing a list of words(i.e.categories)
returns a dictionary with
keys--all the words;
values as 0/1--0 if the business's categories do not include this word; 1 if it does
"""
dic = defaultdict(list) # dictionary whose values are lists.
for i in range(len(df)):
#lst = df.iloc[:,col][i].encode('ascii','ignore')[3:-2].split('\', u\'') # convert unicode list to list of strings
lst_u = df[col][i]
try:
lst = []
for j in range(len(lst_u)):
lst.append(str(lst_u[j]))
for k in dic.keys():
if k in lst:
dic[k].append(1)
elif k not in lst:
dic[k].append(0)
else:
pass
for c in lst:
if c not in dic.keys():
zeros = [0] * i
dic[c] = zeros + [1]
except:
# if a business does not have "categoriy" items, add 0 for all keys
for k in dic.keys():
dic[k].append(0)
continue
return dic
In [11]:
##!! Caution!! takes some time to run
dic_all = yes_no(df_c, 'categories')
In [12]:
len(df_c)
Out[12]:
In [13]:
df_cat = pd.DataFrame.from_dict(dic_all)
df_cat.index = df_c.business_id
df_cat.head()
Out[13]:
In [14]:
# Check
df_cat.loc['SWkmyhQBV1A7Qu4LRHoVXQ',:][df_cat.loc['SWkmyhQBV1A7Qu4LRHoVXQ',:]==1]
Out[14]:
In [ ]:
# Save as pickle file
df_cat.to_pickle('Yelp_Business_Catergory_Table.p')
In [9]:
import pandas as pd
import numpy as np
In [3]:
## load all cat from pickle
df = pd.read_pickle('Yelp_Business_Catergory_Table.p')
In [4]:
df.shape
Out[4]:
In [5]:
for col in df.columns[:5]:
print col
In [11]:
#columns_food
In [7]:
#columns_culture
In [8]:
#columns_unrelated
In [42]:
### adjustment ###
# columns_food.append('___')
# columns_unrelated.remove('____')
In [10]:
# Avoid repetitive words
columns_food= list(set(columns_food))
In [59]:
# Save to text file
thefile = open('cat_unrelated.txt', 'w')
for item in columns_unrelated:
thefile.write("%s\n" % item)
In [61]:
### might need to extract Cultural words... ###
In [6]:
df.head(3)
Out[6]:
In [7]:
df.shape
Out[7]:
In [34]:
# read saved txt file
list_food = open('cat_food.txt', 'rw').read().split('\n')
list_culture = open('cat_culture.txt', 'rw').read().split('\n')
In [35]:
list_all = list(set(list_food + list_culture))
In [36]:
len(list_food)
Out[36]:
In [37]:
len(list_culture)
Out[37]:
In [38]:
len(list_all)
Out[38]:
In [39]:
df_fc = df.loc[:, df.columns[df.columns.isin(list_all)]]
In [40]:
df_fc['cat_sum'] = df_fc.sum(axis=1)
df_fc['food_sum'] = df_fc.loc[:, df_fc.columns[df_fc.columns.isin(list_food)]].sum(axis=1)
df_fc['cul_sum'] = df_fc.loc[:, df_fc.columns[df_fc.columns.isin(list_culture)]].sum(axis=1)
In [15]:
df_fc = df_fc[df_fc['cat_sum'] > 0 ]
In [16]:
df_fc.shape
Out[16]:
In [17]:
df_fc.head()
Out[17]:
In [18]:
#### check those are not labeled as Restaurants
len(df_fc[df_fc['Restaurants']!=1])
Out[18]:
In [19]:
## check if there is non-res with cultral vacabularies ----- only "restuarants" are assigned a cultural vacabulary
df_fc[(df_fc['Restaurants']!=1) & (df_fc['cul_sum']!=0)]
Out[19]:
In [ ]:
In [20]:
##### Show all non-"res"
non_restaurants_cat = []
for biz in df_fc[df_fc['Restaurants']!=1].index:
non_restaurants_cat.append(df_fc.columns[df_fc.ix[biz, :-3] == 1])
In [21]:
pd.DataFrame(non_restaurants_cat).head(5)
Out[21]:
In [290]:
#df_fc.loc['XxjrqA5jrzLH0EdCKUG9Mw', :][df_fc.loc['XxjrqA5jrzLH0EdCKUG9Mw', :] == 1]
In [22]:
#################
#### check those are labeled as "Restaurants"
len(df_fc[df_fc['Restaurants']==1])
Out[22]:
In [23]:
df_r = df_fc[df_fc['Restaurants']==1]
In [24]:
df_r.head(5)
Out[24]:
In [68]:
##### Show all "res"
restaurants_cat = []
for biz in df_fc[df_fc['Restaurants']==1].index:
restaurants_cat.append(df_fc.columns[df_fc.ix[biz, :-3] == 1])
In [69]:
restaurants_cat[:10]
Out[69]:
In [25]:
##################################
df_fc['Restaurants'][df_fc['Swiss Food'] == 1]
Out[25]:
In [44]:
df_fc.loc['GzF4lVSh61sayvwVMp8RmA', :][df_fc.loc['GzF4lVSh61sayvwVMp8RmA', :] == 1]
Out[44]:
In [26]:
## double check
df_culonly = df_fc[df_fc['food_sum'] == 0 ]
df_culonly
Out[26]:
In [27]:
## Take a closer look at the cultural vacab
cultureonly = df_fc.loc[:, df_fc.columns[df_fc.columns.isin(list_culture)]]
biz_cultureonly = cultureonly[cultureonly.sum(axis=1)>0].index
df_cul = df_fc.loc[biz_cultureonly, :]
In [71]:
df_cul.head()
Out[71]:
In [72]:
(df_cul.food_sum != 0 ).sum() == len(df_cul)
Out[72]:
In [28]:
### Examine cultures only
biz_cultures = df_cul[df_cul.cul_sum > 0]
biz_cultures = biz_cultures.loc[:, list_culture + ['cat_sum', 'food_sum','cul_sum']]
biz_cultures.shape
Out[28]:
In [ ]:
##############################
In [30]:
biz_cultures.head(10)
Out[30]:
In [76]:
lst = biz_cultures.columns[biz_cultures.iloc[5,:-3] == 1].values
lst
Out[76]:
In [77]:
lsts_cul = map(lambda biz: biz_cultures.columns[biz_cultures.ix[biz,:-3] == 1].values, biz_cultures.index.values)
In [78]:
### Create culture table
columns = map(lambda c: 'culture_{}'.format(c), range(1, df_cul.cul_sum.max()+1))
df_CULTURE = pd.DataFrame(data=lsts_cul, index = df_cul.index, columns = columns)
In [79]:
df_CULTURE
Out[79]:
In [80]:
df_CULTURE.culture_7.unique()
Out[80]:
In [32]:
### Dictionary of combined cultures (keep building...)
dic_cuisine = {}
In [41]:
# List of Chinese food
list_chinese = ['Cantonese','Szechuan','Shanghainese','Hainan','Chinese','Taiwanese',\
'Asian Fusion',\
'Dim Sum', 'Hot Pot', 'Hong Kong Style Cafe']
dic_cuisine['Chinese'] = list_chinese
In [82]:
biz_cultures.head(3)
Out[82]:
In [ ]:
In [ ]: