In [1]:
%pylab inline
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.decomposition import SparsePCA, NMF
from scipy import sparse
import pickle
In [2]:
def sound_like(s):
    '''
    Given a string of words, return the string of their phonetic equivalents.
    '''
    import fuzzy
    sound = fuzzy.nysiis  # pick the phonetic algorithm (NYSIIS)
    word_list = s.split()
    sound_list = []
    for word in word_list:
        sound_list.append(sound(word))
    return ' '.join(sound_list)
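A quick spot check (the exact codes depend on the installed fuzzy version, but NYSIIS should collapse similar-sounding spellings to the same code):
In [ ]:
# misspellings that sound alike should map to the same phonetic code
print sound_like('chicken')
print sound_like('chiken')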
In [3]:
data = json.load(open('details.json'))
In [56]:
sections = []  # filled as a side effect of get_items
def get_items(detail):
    '''
    Given a restaurant record, return (id, name, url, list of menu items).
    '''
    menu_items = []
    for menu in detail['menus']:
        for section in menu['sections']:
            sections.append(section['section_name'].lower().strip())
            for subsection in section['subsections']:
                for item in subsection['contents']:
                    if item['type'] == 'ITEM':
                        menu_item = [item['name']]
                        if 'description' in item:
                            menu_item.append(item['description'])
                        menu_items.append(' '.join(menu_item))
    return detail['id'], detail['name'], detail['website_url'], menu_items
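For reference, get_items assumes each record in details.json is shaped roughly like this (field names come from the traversal above; the values are invented):
In [ ]:
# hypothetical record illustrating the structure get_items walks
sample_detail = {
    'id': 0,
    'name': 'Example Restaurant',
    'website_url': 'http://example.com',
    'menus': [
        {'menu_name': 'Dinner',
         'sections': [
             {'section_name': 'Appetizers',
              'subsections': [
                  {'subsection_name': 'Soups',
                   'contents': [
                       {'type': 'ITEM',
                        'name': 'Wonton Soup',
                        'description': 'pork dumplings in broth'}]}]}]}]}
get_items(sample_detail)  # note: also appends 'appetizers' to the global sections list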
In [57]:
# reduce each raw record to (id, name, url, menu items)
restaurants = []
for item in data:
    r_id, name, url, menu_items = get_items(item)
    restaurant = {'id': r_id, 'name': name, 'url': url, 'items': menu_items}
    restaurants.append(restaurant)
In [6]:
# form bags of words
all_items = []  # one member per menu item
all_menus = []  # one member per restaurant (its items concatenated)
for resto in restaurants:
    all_items += resto['items']
    all_menus.append(' '.join(resto['items']))
print len(all_menus), 'menus'
print len(all_items), 'items'
In [7]:
# keep only items whose phonetic transform is non-empty (one sound_like call per item)
fuzzy_items = [s for s in (sound_like(item) for item in all_items) if s]
In [8]:
vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
vectorizer.get_params()
Out[8]:
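The non-default token_pattern keeps single-character tokens that scikit-learn's default pattern (which requires two or more word characters) would drop; a small illustration with a made-up item:
In [ ]:
demo = ['l o mein']  # invented item whose letters were split apart
print TfidfVectorizer().fit(demo).get_feature_names()                                  # drops 'l' and 'o'
print TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b').fit(demo).get_feature_names()  # keeps them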
In [38]:
# feature matrix. shape = number of data samples by number of features
fm = vectorizer.fit_transform(fuzzy_items)
fm.shape
Out[38]:
In [39]:
# target vector: every item is labeled 'chinese'
cuisines = array(fm.shape[0]*['chinese'])
cuisines.shape
Out[39]:
In [46]:
clf = MultinomialNB()  # with one class, fitting just estimates smoothed token log-probabilities
In [47]:
clf.fit(fm, cuisines)
Out[47]:
In [48]:
plot(sorted(clf.feature_log_prob_[0]))
Out[48]:
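With a single class, feature_log_prob_[0] is the smoothed log P(token | chinese); a sketch for eyeballing the most probable phonetic tokens (nothing here is pickled or reused below):
In [ ]:
# map the largest log-probabilities back to their phonetic tokens
feature_names = np.array(vectorizer.get_feature_names())
top = np.argsort(clf.feature_log_prob_[0])[::-1][:20]
zip(feature_names[top], clf.feature_log_prob_[0][top])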
In [49]:
cpt = np.matrix(clf.feature_log_prob_[0,:])  # log P(token | chinese) as a 1 x n_features row
cpt.shape
Out[49]:
In [50]:
pickle.dump(cpt, open('cpt.pickle', 'wb'))
pickle.dump(vectorizer, open('vectorizer.pickle', 'wb'))
In [55]:
a, b, c = plt.hist(clf.feature_log_prob_[0], bins=10, log=True)
In [17]:
# binarize the feature matrix: any nonzero tf-idf weight becomes 1
for ij in zip(fm.nonzero()[0], fm.nonzero()[1]):
    fm[ij] = 1
fm = fm.transpose()
fm
Out[17]:
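Setting entries one at a time on a sparse matrix is slow; the same binarization can be done in one vectorized step, since .data holds exactly the stored nonzero values. A sketch, run on a fresh copy so the matrix above is untouched:
In [ ]:
fm2 = vectorizer.transform(fuzzy_items)
fm2.data[:] = 1          # every stored (nonzero) entry becomes 1
fm2 = fm2.transpose()
print abs(fm2 - fm).sum()  # 0.0 if this matches the loop above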
In [18]:
cpt = sparse.csr_matrix(clf.feature_log_prob_[0,:])  # same log-probability row, sparse for fast products
cpt.shape
Out[18]:
In [19]:
word_count = np.matrix([len(item.split()) for item in fuzzy_items])
word_count.shape
Out[19]:
In [20]:
# per-item score: summed log-probability of its tokens, averaged over word count
pos = (cpt*fm).toarray()/word_count
pickle.dump(pos, open('pos.pickle', 'wb'))
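The fitted vectorizer and the log-probability row cpt are enough to score unseen text the same way pos is computed above; a minimal sketch (score_item is a hypothetical helper and assumes the text yields at least one token):
In [ ]:
def score_item(text):
    fuzzed = sound_like(text)           # phonetic transform
    v = vectorizer.transform([fuzzed])  # 1 x n_features tf-idf row
    v.data[:] = 1                       # binarize, as above
    return (cpt * v.transpose())[0, 0] / len(fuzzed.split())

score_item('kung pao chicken')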
In [21]:
a = plot([len(item.split()) for item in fuzzy_items], pos.transpose(), '.')
In [22]:
# drop any non-finite scores before plotting
a, b, c = hist([item for item in pos.tolist()[0] if item < inf], bins=100)
plt.vlines((pos.mean()-pos.std(), pos.mean(), pos.mean()+pos.std()), 0, 1200, colors='r')
Out[22]:
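The vertical lines above mark the mean and one standard deviation to either side; a hedged sketch of using the lower line as a cutoff for unusually low-scoring items:
In [ ]:
# flag items scoring more than one standard deviation below the mean
threshold = pos.mean() - pos.std()
flagged = [item for item, score in zip(fuzzy_items, pos.tolist()[0]) if score < threshold]
print len(flagged), 'items below threshold'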
In [69]:
restaurants[1]['items']
Out[69]: