Imports and configuration.


In [1]:
%matplotlib inline

In [2]:
import collections
import lda
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

import common

import functions as f

In [3]:
matplotlib.style.use('ggplot')

Load data


In [7]:
## TODO: fix unicode here.
df = f.load_data()


Loaded 4506 records from epicurious.com
Loaded 12033 records from allrecipes.com
Loaded 16539 records in total

Examining and cleaning data


In [8]:
df = df[df['ingredient_txt'].str.len() > 20]
print df['ingredient_txt'].str.len().describe()
plt.figure()
df['ingredient_txt'].str.len().plot(kind='hist').set_title('Ingredients character count')
plt.savefig('character-counts.png')
df = df.reset_index()


count    16526.000000
mean       296.671366
std        127.103774
min         23.000000
25%        206.000000
50%        280.000000
75%        371.000000
max       1087.000000
Name: ingredient_txt, dtype: float64
<matplotlib.figure.Figure at 0x7f400a47b650>

In [9]:
## clean up quoting.
pattern = "[\"\']"
for k in ['title', 'ingredient_txt', 'url', 'image']:
    df[k] = df[k].str.replace(pattern, '')

## formatting ingredients.
df['ingredient_txt'] = df['ingredient_txt'].str.replace('\n',' ')

In [592]:


In [10]:
def rm_stopwords(stopwords, x):
    return ' '.join([w for w in x.split() if w.strip() not in stopwords])

In [11]:
## replace punctuation to improve tokenizing and stop word filtering.
df['ingredient_txt_no_stopwords'] = df['ingredient_txt'].str.replace('[\W]', ' ')
df['ingredient_txt_no_stopwords'] = map(lambda x: rm_stopwords(f.get_stop_words(), x), df['ingredient_txt_no_stopwords'])

In [12]:
# Extract features
# TODO: remove stopwords before vectorizing, because the stop words don't capture bigrams.

vectorizer = CountVectorizer(
stop_words='english'
, ngram_range=(1, 2)
, token_pattern='[A-Za-z]+'
, min_df = 10
, max_df = 0.25
)

features = vectorizer.fit_transform(df.ingredient_txt_no_stopwords)
## features is a document x term matrix.

wc = f.feature_counts(vectorizer, features)

In [13]:
wc.sort('count').tail(25).plot('word','count', kind='bar')
plt.savefig('word-counts.png')


<matplotlib.figure.Figure at 0x7f400a138510>

In [14]:
## this model, with 40 topics, is not bad.
m = lda.LDA(n_topics=40, random_state=0, n_iter=200)
m.fit(features)
print('Finished running model')


WARNING:lda:all zero row in document-term matrix found
Finished running model

In [38]:
## select best number of topics using loglikelihood.
## it might be that 40 topics is too much.
ll = {}
for k in range(5, 200, 5):
    print k
    mk  = lda.LDA(n_topics=k, random_state=0, n_iter=400)
    mk.fit(features)
    ll[k] = mk.loglikelihood()


WARNING:lda:all zero row in document-term matrix found
5
10
WARNING:lda:all zero row in document-term matrix found
15
WARNING:lda:all zero row in document-term matrix found
20
WARNING:lda:all zero row in document-term matrix found
25
WARNING:lda:all zero row in document-term matrix found
30
WARNING:lda:all zero row in document-term matrix found
35
WARNING:lda:all zero row in document-term matrix found
40
WARNING:lda:all zero row in document-term matrix found
45
WARNING:lda:all zero row in document-term matrix found
50
WARNING:lda:all zero row in document-term matrix found
55
WARNING:lda:all zero row in document-term matrix found
60
WARNING:lda:all zero row in document-term matrix found
65
WARNING:lda:all zero row in document-term matrix found
70
WARNING:lda:all zero row in document-term matrix found
75
WARNING:lda:all zero row in document-term matrix found
80
WARNING:lda:all zero row in document-term matrix found
85
WARNING:lda:all zero row in document-term matrix found
90
WARNING:lda:all zero row in document-term matrix found
95
WARNING:lda:all zero row in document-term matrix found


In [95]:
ks = sorted(ll.keys())
vs = [ll[k] for k in ks]
plt.plot(ks, vs)


Out[95]:
[<matplotlib.lines.Line2D at 0x7f40084eb8d0>]
<matplotlib.figure.Figure at 0x7f4008b10750>

Evaluating the model.

Convergence


In [15]:
p = plt.figure()
plt.plot(m.loglikelihoods_, '-')
plt.title('Loglikelihood')
p.savefig('loglikelihood.png')


<matplotlib.figure.Figure at 0x7f4008be4490>

In [303]:
## assessing stability using k-fold set overlap measures.
def k_fold_lda(**kwargs):
    results = []
    j = 0       
    for i in map(int, np.linspace(0, features.shape[0], 6)[1:]):
        results.append(lda.LDA(**kwargs).fit(features[j:i-1,:]))
        j = i
    return results 

r75 = k_fold_lda(n_topics=75, random_state=0, n_iter=300)
r40 = k_fold_lda(n_topics=40, random_state=0, n_iter=300)


# results = [lda.LDA(n_topics=best_k, random_state=0, n_iter=100).fit(features[i


WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero row in document-term matrix found
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero row in document-term matrix found

In [314]:
## extract set memberships and compare them.
## I AM HERE: extract set memberships for each run.
def set_memberships(r):
    map(lambda x: np.argmax(x.doc_topic_, axis=1, r)

for i = range(0, 5)
    for j = range(1, 4)
    diff = np.argmax(r4[i].doc_topic_, axis=1)


-540712.328698
[array([24, 26, 26, ..., 16, 20, 16]), array([34, 38,  4, ..., 15, 23,  4]), array([26, 25, 39, ..., 36, 35, 31]), array([24, 32, 36, ..., 36,  2, 33]), array([31,  0,  6, ..., 11,  0, 31])]
-537024.864479
[array([24, 26, 26, ..., 16, 20, 16]), array([34, 38,  4, ..., 15, 23,  4]), array([26, 25, 39, ..., 36, 35, 31]), array([24, 32, 36, ..., 36,  2, 33]), array([31,  0,  6, ..., 11,  0, 31])]
-535034.248194
[array([24, 26, 26, ..., 16, 20, 16]), array([34, 38,  4, ..., 15, 23,  4]), array([26, 25, 39, ..., 36, 35, 31]), array([24, 32, 36, ..., 36,  2, 33]), array([31,  0,  6, ..., 11,  0, 31])]
-572048.834245
[array([24, 26, 26, ..., 16, 20, 16]), array([34, 38,  4, ..., 15, 23,  4]), array([26, 25, 39, ..., 36, 35, 31]), array([24, 32, 36, ..., 36,  2, 33]), array([31,  0,  6, ..., 11,  0, 31])]
-611898.743893
[array([24, 26, 26, ..., 16, 20, 16]), array([34, 38,  4, ..., 15, 23,  4]), array([26, 25, 39, ..., 36, 35, 31]), array([24, 32, 36, ..., 36,  2, 33]), array([31,  0,  6, ..., 11,  0, 31])]

In [15]:
## assessing stability using two random subsets..
n = features.shape[0]
size = int(n * 0.8)

i0 = np.random.random_integers(0, n, size)
i1 = np.random.random_integers(0, n, size)

f0 = features[i0, :]
f1 = features[i1, :]

print('running model on sample 0...')
m0 = lda.LDA(n_topics=40, random_state=0, n_iter=100)
m0.fit(f0)

print('running model on sample 1...')
m1 = lda.LDA(n_topics=40, random_state=0, n_iter=100)
m1.fit(f1)

print('Finished running models')



Finished running models
WARNING:lda:all zero row in document-term matrix found
running model on sample 0...
running model on sample 1...
WARNING:lda:all zero column in document-term matrix found
WARNING:lda:all zero row in document-term matrix found

In [16]:
a0 = zip(i0, np.argmax(m0.doc_topic_, axis=1))
a1 = zip(i1, np.argmax(m0.doc_topic_, axis=1))

## make 

t0 = collections.defaultdict(set)
for doc, topic in a0:
    t0[topic].add(doc)

t1 = collections.defaultdict(set)
for doc, topic in a1:
    t1[topic].add(doc)

In [17]:
## filter out elements not in both sets.
canon = set(i0).union(set(i1))
for k,v in t0.iteritems():
    t0[k] = t0[k].intersection(canon)
for k,v in t1.iteritems():
    t1[k] = t1[k].intersection(canon)

## compare set assignments.

In [18]:
def diff(t0, t1):
    """Calculate set membership differences as a stability measure.
    """


    diff = 0
    for k0, x0 in t0.iteritems():
        # print 'topic:', k0
        d = [len(x0.difference(x1))/float(len(x0.union(x1))) for x1 in t1.values()]
        # print 'differences:', d
        # print 'min diff:', min(d)
        # print 'current diff', diff
        diff += min(d)
    return diff

In [19]:
print diff(t0, t1)


8.73957410405

Assessing topics


In [16]:
## Extracting topic data.
## most probable words by topic.
## TODO: check if these are properly sorted within each topic.
w = f.most_probable_words(m, vectorizer.get_feature_names(), 10)
w.columns = ['rank','topic','word','prob']

## most probable documents by topic.
# np.apply_along_axis(lambda i: df.iloc[i]['title'], 1, doc_ids)
doc_ids = np.argsort(m.doc_topic_, axis=0)[-4:-1,:].T
doc_probs = np.sort(m.doc_topic_, axis=0)[-4:-1,:].T

In [986]:
f.show_topics(m, df, doc_probs, doc_ids, w)


======================================================================
topic: 0
documents:
   Popeye Power Burger      0.484
     Rice Quiche Crust  0.4733333
 Grilled Garlic Rabbit  0.4636364
1 3 rabbit cleaned 1 free marinade Mrs Dash-----1 1 instant rice 1 butter optional 1 optional cooking spray-----1 egg 1 lean beef 1 quick cooking oats 1 onion 1 spinach 2 low fat Mexican blend cheese 4 grain buns
----------------------------------------------------------------------
 cooking  spray  cooking spray    fat  wheat    low  blend  low fat  vegetable
   0.101  0.077          0.071  0.030  0.027  0.024  0.021    0.021      0.019
======================================================================
topic: 1
documents:
   Carrots and Greens with Dilly Bean Vinaigrette      0.724
        Linguine and Clams with Almonds and Herbs      0.724
 Sliced Baguette with Radishes and Anchovy Butter  0.6714286
1 2 1 unsalted butter 2 3 anchovy fillets 2 chives Coarse kosher 16 1 2 diagonal baguette 10 radishes French Breakfast diagonal Additional chives-----1 4 dilly beans 1 pickling liquid 3 unsalted butter 1 1 1 2 carrots unpeeled 1 shallot 1 2 rings 3 bunches mustard greens 1 center ribs stems Kosher-----1 2 unsalted roasted almonds 2 chives 2 flat leaf parsley 1 1 4 Kosher 4 3 4 flakes 1 4 wine 2 littleneck clams scrubbed 12 linguine
----------------------------------------------------------------------
 parsley   flat   leaf  flat leaf  leaf parsley  unsalted  unsalted butter  stems  freshly
   0.040  0.037  0.036      0.034         0.033     0.032            0.028  0.023    0.017
======================================================================
topic: 2
documents:
  Spicy Steamed Shrimp    0.63125
 Best Unsteamed Shrimp  0.6166667
    Steamed Blue Crabs      0.604
36 live blue crabs 1 2 seafood seasoning Old Bay 1 2 3 beer 3 distilled vinegar 1 4 seafood seasoning Old Bay-----1 2 butter 2 shrimp deveined 1 4 Old Bay Seasoning TM 2 lemon juice-----1 quart 1 tiger prawns shell 3 Old Bay Seasoning 1 12 jar cocktail sauce
----------------------------------------------------------------------
 shrimp  deveined  uncooked  shrimp deveined  grain   long  seasoning  cooked  long grain
  0.061     0.050     0.048            0.037  0.031  0.030      0.029   0.029       0.023
======================================================================
topic: 3
documents:
    Taco in a Bag  0.8741935
 Mexican Meatloaf       0.85
       Enchiladas      0.805
2 lean beef 1 onion 1 8 12 8 flour tortillas 2 vegetable 8 Colby cheese 2 19 cans enchilada sauce-----2 lean beef 1 1 25 package taco seasoning mix 1 16 refried beans 4 8 flour tortillas 3 4 salsa 1 2 Cheddar cheese-----1 lean beef 1 1 package taco seasoning mix 4 2 5 packages corn chips 2 lettuce 1 tomato 1 Cheddar cheese 1 3 salsa 1 2 sour cream
----------------------------------------------------------------------
 sour cream  cream  cheddar  cheddar cheese   corn  tortillas  package  flour    mix
      0.048  0.044    0.034           0.031  0.029      0.027    0.023  0.022  0.021
======================================================================
topic: 4
documents:
             Mexican Orange Chicken  0.5552632
 Perfect Pan-Roasted Chicken Thighs       0.54
     Paprika Chicken with Mushrooms  0.5352941
4 skinless boneless chicken breasts 1 paprika 1 1 4 butter 1 onion rings 1 mushrooms-----6 skin bone chicken thighs 2 1 4 Kosher 1 vegetable-----8 chicken drumsticks 8 chicken thighs 1 1 2 cubed cooked ham 1 1 2 canned pineapple chunks 1 12 package bacon 2 1 2 raisins optional 8 squeezed orange juice 1 4 chicken bouillon granules 1 4 butter chunks
----------------------------------------------------------------------
 chunks  thighs  chicken thighs  breasts  boneless  skinless  chicken breasts  skinless chicken  bouillon
  0.049   0.042           0.036    0.036     0.035     0.032            0.031             0.028     0.025
======================================================================
topic: 5
documents:
 Campfire Pepperoni Pizza  0.75625
             Cheese Bread    0.705
                      BLT    0.675
4 bacon 2 lettuce 2 tomato 2 bread toasted 1 mayonnaise-----1 1 loaf French bread horizontally 4 butter 1 2 mayonnaise 1 Parmesan cheese 4 mozzarella cheese 4 Cheddar cheese 1 4-----1 refrigerated pizza dough 1 4 pizza sauce 1 2 mozzarella cheese 1 2 pepperoni
----------------------------------------------------------------------
 swiss  pizza  crust  swiss cheese  mayonnaise  cheddar  dough  cheddar cheese    pie
 0.025  0.025  0.024         0.024       0.019    0.017  0.016           0.016  0.016
======================================================================
topic: 6
documents:
                Red Ryder  0.8555556
 Bill Hecks Old Fashioned       0.85
                  Mai Tai      0.844
1 silver gold rum 3 4 lime juice 1 2 aged rum 1 2 orange Cura ao premium triple sec 1 2 orgeat almond syrup 1 4 simple syrup Float dark Jamaican rum optional Mint sprig-----2 sugar 4 dashes Angostura bitters 1 lemon wheel 1 lime wheel 1 orange wheel 1 maraschino cherry 1 2 grenadine 1 4 bourbon rye whiskey Club soda Ginger ale-----1 750 milliliter bottle Campari 1 750 milliliter bottle dark rum 3 chilled brewed rooibos tea 3 pomegranate juice 3 unsweetened cranberry juice 1 1 2 Angostura bitters
----------------------------------------------------------------------
 orange   lime  orange juice  lime juice   zest  sugar  honey   mint  pineapple
  0.088  0.054         0.038       0.037  0.025  0.016  0.016  0.015      0.014
======================================================================
topic: 7
documents:
                   Portobello Bellybuttons      0.805
                  Clam Sauce with Linguine  0.7434783
 Escargot and Pollock over Spinach Noodles  0.7318182
1 16 package spinach spaghetti pasta 1 2 butter 5 pollock fillets 1 onion 1 7 escargot 2 1 parsley 1 oregano 1 2 basil 1 4 Parmesan cheese topping-----1 16 package linguine pasta 1 2 butter 3 1 mushrooms 2 6 5 cans clams juice 1 2 parsley 1 1 2 1 4 Parmesan cheese-----1 16 package cheese tortellini 3 butter 1 2 portobello mushrooms 1 2 button mushrooms 1 4 wine 1 2 basil 1 2 Parmesan cheese
----------------------------------------------------------------------
 parmesan cheese  pasta  tomatoes  basil  package  mushrooms  parsley  oregano  optional
           0.053  0.052     0.046  0.043    0.041      0.037    0.026    0.022     0.021
======================================================================
topic: 8
documents:
             Thai Monkfish Curry      0.724
       Thai Shrimp Halibut Curry  0.7133333
 Sweet, Salty, and Sour Marinade  0.7131579
3 palm sugar 6 1 2 light brown sugar 1 2 lime juice 1 2 fish sauce nam pla nuoc nam 1 2 cilantro 2 ginger 4 Thai chiles 6 Fresno chiles 3 lemongrass 4 2 stalks-----3 limes 1 vegetable 1 shallots 1 bell 1 2 3 4 dice 1 1 2 ginger 2 1 2 Thai curry paste Thai Kitchen brand 1 13 1 2 14 unsweetened coconut milk 1 fish sauce nam pla nuoc nam 16 18 halibut fillets 1 1 2 chunks 8 deveined uncooked shrimp 8 10 1 3 cilantro 1 3 basil-----1 peanut 1 2 sweet onion 1 bell 3 Thai curry paste 1 14 coconut milk 12 monkfish 1 fish sauce 2 lime juice 2 cilantro
----------------------------------------------------------------------
  milk  coconut milk   fish  curry  fish sauce  sugar   lime  cilantro  paste
 0.042         0.032  0.031  0.031       0.027  0.024  0.024     0.020  0.019
======================================================================
topic: 9
documents:
 Poor Man\u0027s Chicken Fried Steak  0.7772727
            Fried Chicken with Gravy  0.7117647
              Southern Fried Chicken    0.69375
1 3 chicken 1 purpose flour 1 paprika 1 quart vegetable frying-----One 3 3 1 2 broiler fryer frying Note 2 1 4 3 4 unsifted purpose flour Vegetable frying lard Mrs McCollum used lard 1-----1 2 beef 1 purpose flour 4 eggs 3 vegetable needed 1 butter 1 purpose flour needed 1 milk
----------------------------------------------------------------------
 purpose  purpose flour   milk  vegetable  frying  butter purpose  needed    egg   eggs
   0.142          0.141  0.041      0.032   0.031           0.021   0.019  0.018  0.017
======================================================================
topic: 10
documents:
 Creamy Polenta with Arrabbiata Sausage Ragout  0.6609756
                  Rossi\u0027s Sausage Gnocchi  0.6483871
                     Spicy Breakfast Meatballs  0.6192308
1 20 package bulk spicy Italian turkey sausage 1 20 package spicy Italian turkey 3 eggs 4 2 fat Cheddar cheese 1 4 onion 1 4-----1 1 Italian sausage links 1 1 2 onion 1 1 1 2 Italian seasoning 1 14 tomatoes 2 1 4 1 sugar 1 Italian seasoning 1 1 package gnocchi flat leaf parsley Romano Parmesan cheese-----Polenta 2 1 2 1 2 milk 1 1 Italian polenta 2 unsalted butter 1 2 Parmigiano Reggiano cheese Arrabbiata Sauce Sausages 2 grapeseed 1 19 package Italian sausage links 2 bell peppers 1 2 rings 1 sweet onion 1 2 1 24 jar Classico Tomato Basil Sauce 1 4 flakes
----------------------------------------------------------------------
 sausage  seasoning  italian seasoning    jar  sweet  italian sausage  style  pasta  package
   0.054      0.054              0.033  0.028  0.026            0.022  0.019  0.017    0.017
======================================================================
topic: 11
documents:
       Chicken Noodle Casserole II  0.8357143
 Grandma\u0027s Hamburger Hot Dish  0.8314286
                Venison Stroganoff      0.825
1 venison 1 onion 2 10 75 cans condensed cream mushroom soup 1 16 package uncooked egg noodles 1 8 container sour cream-----1 8 package egg noodles 1 beef 1 onion 1 celery 1 10 75 condensed cream mushroom soup 1 10 75 condensed cream chicken soup 1 4 5 mushrooms 1 2 milk 1 2 salted cashews-----1 onion 1 4 butter 3 10 cans chicken chunks 1 pint sour cream 2 10 75 cans condensed cream chicken mushroom soup 3 Cheddar cheese 1 8 package uncooked egg noodles 4 buttery round crackers
----------------------------------------------------------------------
 cream  condensed  condensed cream  mushroom  mushroom soup  cream mushroom  package   cans  cream chicken
 0.072      0.060            0.046     0.035          0.033           0.030    0.024  0.022          0.020
======================================================================
topic: 12
documents:
 Cheeseburger Meatloaf   0.7214286
     Mini Cheeseburgers  0.7117647
    Eggplant Croquettes  0.7074074
2 eggplants cubed 1 sharp Cheddar cheese 1 Italian seasoned bread crumbs 2 eggs 2 parsley 2 onion 1 1 vegetable frying 1 1 2-----1 lean beef 1 2 ketchup 1 2 Cheddar cheese 1 egg 1 1 4 Cheddar cheese 1 4 ketchup-----2 beef 3 4 bread crumbs 1 2 onion 2 eggs 1 1 2 1 1 2 3 Cheddar cheese
----------------------------------------------------------------------
 crumbs  bread crumbs    egg  cheddar  cheddar cheese   eggs   milk  seasoned  sharp
  0.088         0.083  0.036    0.036           0.034  0.032  0.027     0.024  0.021
======================================================================
topic: 13
documents:
                   Black Pepper Goat Curry  0.8115385
 Country Captain with Cauliflower and Peas  0.8033333
                           Tandoori Turkey  0.7833333
2 1 2 coriander seeds 2 cumin seeds 1 1 teapsoon peppercorns 1 cardamom 2 chili 1 fenugreek 1 1 3 4 cinnamon broken 1 4 ajwain seeds-----1 1 2 coriander seeds 1 fennel seeds 1 cumin seeds 1 2 peppercorns 3 1 4 cardamom seeds 3 green cardamom pods 1 1 2 cinnamon 1 2 turmeric 1 4 scant cayenne-----1 vegetable 1 onion 1 2 curry 2 peppercorns 2 coriander 1 2 3 vegetable 2 onions 2 ginger 5 2 cayenne 1 1 turmeric 1 2 tomato paste 1 goat stew meat 1 1 2
----------------------------------------------------------------------
 seeds  paste  tomato  coriander  vegetable  ginger  turmeric  tomato paste  curry
 0.036  0.034   0.031      0.031      0.025   0.025     0.024         0.020  0.020
======================================================================
topic: 14
documents:
                   Guinness(R) Corned Beef     0.7125
 Stout Slow Cooker Corned Beef and Veggies  0.7068182
       Slow Cooker Corned Beef and Cabbage  0.7054054
1 2 baby carrots 1 2 yellow potatoes Dutch Yellow Pee Wee potatoes 1 sweet onion 4 4 3 4 corned beef brisket spice packet 3 onion 3 1 bottle Irish stout beer Guinness 1 head cabbage separated-----1 1 2 12 fluid cans bottles Irish stout beer Guinness 1 4 corned beef brisket 1 1 2 brown sugar 3 sweet potatoes chunks 1 head cabbage cored 2 sweet onions 6 carrots 3 potatoes chunks-----4 corned beef brisket 1 brown sugar 1 12 fluid bottle Irish stout beer e Guinness
----------------------------------------------------------------------
 potatoes  carrots  fluid  sweet  cubed  onions   beer  steak  vegetable
    0.075    0.036  0.029  0.023  0.020   0.020  0.020  0.018      0.017
======================================================================
topic: 15
documents:
 Sweet \u0026 Spicy St. Louis Ribs  0.7377358
            Cider-Mopped Spareribs  0.6868421
     Lemon Pepper Rubbed Back Ribs  0.6568966
Dry Rub 3 light brown sugar 2 sea 1 1 onion 2 cayenne 1 1 2 lemon peel 1 2 apple pie spice 2 racks Smithfield Extra Tender Pork Back Ribs membrane Vegetable Handful hickory apple wood chips smoking soaked optional 3 4 apple juice spray bottle optional-----3 chili 2 kosher 5 granulated 5 2 racks Smithfield Pork Spareribs membrane 1 2 apple cider vinegar 1 2 Handful hickory apple wood chips smoking soaked optional 3 4 barbecue sauce use favorite-----Dry Rub 1 2 light brown sugar 1 2 kosher 1 4 paprika 2 chili 2 1 onion 1 1 4 allspice 1 4 cinnamon 1 8 2 racks Smithfield Extra Tender St Louis Pork Spareribs membrane Handful hickory apple wood chips smoking soaked optional 2 favorite barbecue sauce
----------------------------------------------------------------------
 cayenne  smoked  kosher  steak  cumin   pork  flank  smoked paprika  oregano
   0.066   0.026   0.022  0.019  0.016  0.015  0.014           0.014    0.014
======================================================================
topic: 16
documents:
 Pea, Asparagus, and Fava Bean Salad  0.5931818
              Zucchini & Corn Gratin  0.5761905
           Meandus\u0027 Easy Dinner  0.5285714
1 beef 1 14 5 French green beans 12 frozen hash brown patties thawed 1 10 canned kernel corn-----1 2 onions 5 corn kernels 1 jalapeno 2 Tbsp flour 1 3 milk 1 zucchini 3 4 KRAFT Shredded Triple Cheddar Cheese TOUCH OF PHILADELPHIA-----3 extra virgin 1 Pecorino Parmesan 1 lemon juice Kosher 2 fava beans 2 pods frozen fava beans thawed 2 bunches asparagus stalks 1 shelled peas 1 pods frozen peas thawed 1 2 vegetable 1 shallot 4 bacon cooked
----------------------------------------------------------------------
  peas  onions  frozen   jack  jack cheese  green onions  monterey  beans  monterey jack
 0.053   0.053   0.050  0.044        0.040         0.039     0.036  0.035          0.035
======================================================================
topic: 17
documents:
        Carnitas with Pico De Gallo  0.6576923
 Taqueria Style Tacos - Carne Asada      0.642
                 Taqueria Guacamole    0.63125
2 ripe avocados pitted 2 lime juice Kosher 2 cilantro-----3 flank steak 1 3 vinegar 1 2 soy sauce 4 2 limes juiced 1 2 1 1 1 1 1 chili 1 oregano 1 cumin 1 paprika 1 onion 1 2 cilantro 1 lime juiced 2 tomatoes 2 jalapeno peppers 1 onion 4 4 New Mexico chile pods 1 1 32 package corn tortillas 2 cotija cheese optional 2 limes-----1 6 boneless pork shoulder 1 cumin 4 New Mexico chiles 1 2 1 onion 6 1 jalapeno 6 6 tomatoes 1 onion 2 tomatillos husked 2 jalapeno 1 3 lime juice 1 1 4
----------------------------------------------------------------------
 chile  peppers   lime  jalapeno  green  cumin  chile peppers  tomatoes  green chile
 0.057    0.050  0.035     0.033  0.031  0.030          0.028     0.023        0.022
======================================================================
topic: 18
documents:
               Festival Chicken  0.6130435
 Three Packet Slow Cooker Roast  0.5972973
                Sweet Meatballs      0.564
4 frozen meatballs 1 16 bottle barbeque sauce 1 16 bottle French salad dressing 1 16 bottle Italian style salad dressing-----1 1 7 package Italian style salad dressing mix 1 1 package ranch dressing mix 1 75 packet brown gravy mix 1 3 boneless beef chuck roast-----1 12 jar apricot preserves 1 8 bottle Russian style salad dressing 1 packet onion soup mix 2 4 chickens 8
----------------------------------------------------------------------
 dressing  broccoli  salad  florets  salad dressing  style   free  reduced    mix
    0.056     0.042  0.041    0.037           0.034  0.033  0.031    0.027  0.027
======================================================================
topic: 19
documents:
 Allie\u0027s BIL\u0027s Thanksgiving or Celebr...  0.7869565
                                  Black Friday Pie     0.7125
                            Thanksgiving in a Dish  0.7033333
1 1 2 hot 1 4 butter 1 6 package chicken flavored stuffing mix 3 butter 1 2 celery 1 2 onion 3 cooked turkey 1 2 celery seed 1 4 3 prepared mashed potatoes-----1 mashed potatoes 1 2 cubed cooked turkey 1 2 cooked green beans 1 2 turkey gravy 1 prepared stuffing 2 butter melted-----3 turkey 1 16 berry cranberry sauce 1 14 package herb seasoned stuffing mix Pepperidge Farm 4 eggs
----------------------------------------------------------------------
 turkey  cubed  melted  chicken  package   meat  cabbage   head  butter melted
  0.041  0.037   0.034    0.032    0.026  0.026    0.025  0.025          0.022
======================================================================
topic: 20
documents:
           The Shrubarb Cocktail       0.65
            Quick Pickled Onions       0.65
 Molasses-Glazed Pork Tenderloin  0.6130435
2 molasses 1 apple cider vinegar 2 honey 2 brown sugar 1 flakes 1 ginger 3 1 1 1 2 pork tenderloin-----2 rhubarb 1 4 6 1 7 ginger unpeeled 1 4 coins 1 4 3 4 apple cider vinegar 1 1 4 sugar-----1 2 apple cider vinegar 1 sugar 1 1 2 kosher 1 onion
----------------------------------------------------------------------
 brown  brown sugar  apple  vinegar  cider  cored  cider vinegar  cinnamon  apple cider
 0.083        0.077  0.055    0.040  0.038  0.032          0.031     0.029        0.024
======================================================================
topic: 21
documents:
 Caramel-Layered Dark Chocolate Brownies  0.8957447
        Chocolate-Peanut Butter Fun Cake  0.8885714
                   Homemade Marshmallows  0.8852941
Vegetable brushing pan About 1 confectioners sugar coating pan marshmallows 3 1 4 envelopes powdered unflavored gelatin 1 1 2 granulated sugar 1 light corn syrup 1 4 2 pure vanilla extract-----Nonstick vegetable spray 1 1 4 purpose flour 1 sugar 3 4 natural unsweetened cocoa 1 1 2 kosher 1 baking soda 1 4 vegetable 1 vanilla extract 1 2 semisweet bittersweet chocolate 3 1 2-----Nonstick baking spray 5 unsweetened chocolate 2 bittersweet chocolate 12 6 1 1 2 sticks unsalted butter 2 eggs 1 egg yolk 3 4 5 superfine sugar 3 4 4 1 2 firmly light brown sugar 1 pure vanilla extract 3 4 3 1 4 purpose flour 3 natural cocoa 1 4 kosher grained sea
----------------------------------------------------------------------
 unsalted  flour  unsalted butter  baking  purpose  purpose flour   eggs  vanilla  vanilla extract
    0.043  0.041            0.040   0.031    0.025          0.024  0.020    0.018            0.017
======================================================================
topic: 22
documents:
                      Sesame Seared Tuna  0.8419355
       Miso, Carrot, and Sesame Dressing     0.8375
 Sea Bean Salad with Daikon and Cucumber  0.8033333
2 cucumbers 1 1 2 1 daikon radish 4 scallions 1 2 sea beans ends 3 rice wine vinegar 2 canola 2 toasted sesame 2 tamari 2 sesame seeds 2 sesame seeds 1 sugar 2 shiso-----1 2 miso 6 vegetable 1 4 carrot 2 ginger 2 unseasoned rice vinegar 4 toasted sesame seeds 2 toasted sesame 2 honey-----1 4 soy sauce 1 mirin Japanese sweet wine 1 honey 2 sesame 1 rice wine vinegar 4 6 tuna steaks 1 2 sesame seeds wasabi paste 1
----------------------------------------------------------------------
 sesame  kosher  vinegar   rice  toasted  sesame seeds  sugar  toasted sesame    soy
  0.043   0.033    0.030  0.029    0.026         0.026  0.021           0.017  0.017
======================================================================
topic: 23
documents:
                    Good and Easy Sloppy Joes    0.75625
 My World Famous Pressure Cooker Chinese Ribs  0.7421053
                       Classic Barbecued Ribs  0.7277778
1 4 package pork spareribs 1 1 ketchup 1 2 vinegar 1 4 sugar 4 celery 1 1 2 chili 1 1 mustard 1 paprika 1 2-----3 paprika 1 1 2 1 2 6 1 2 pork ribs 2 vegetable 3 3 4 ketchup 1 4 brown sugar 1 4 Chinese vinegar 1 Worcestershire sauce-----1 beef 1 12 bottle chili sauce 2 mustard 2 brown sugar 1 vinegar
----------------------------------------------------------------------
 worcestershire    hot  brown  ketchup  hot sauce  sugar  brown sugar  vinegar  mustard
          0.069  0.061  0.051    0.049      0.045  0.045        0.039    0.028    0.026
======================================================================
topic: 24
documents:
       Foil Barbecued Trout with Wine  0.6368421
 Tuna with Mediterranean Lentil Salad  0.6238095
              Lemon Piccata Whitefish  0.6107143
1 2 purpose flour 1 lemon 1 trout fillets 2 vegetable needed 1 1 wine 1 1 2 lemon zest 1 4 lemon juice 2 capers 3 butter 1 parsley-----1 1 carrots 1 2 onion 2 1 lentils rinsed 3 3 1 1 2 lemon juice 1 4 parsley 1 1 2 lemon juice 4 7 tuna steaks 2-----2 trout cleaned head 1 4 wine 2 butter melted 1 lemon juice 2 parsley
----------------------------------------------------------------------
 juice  lemon juice   zest  lemon zest  parsley  juice lemon  kosher  zest lemon   wine
 0.159        0.141  0.032       0.027    0.019        0.015   0.010       0.009  0.009
======================================================================
topic: 25
documents:
 Lamb Shank Braised in White Wine with Rosemary      0.675
                               Forager Sandwich  0.6733333
                                 Parmesan Broth  0.6368421
1 head 1 onion saut onion 1 handful thyme Sprigs parsley 1 bay leaf 1 shake peppercorns 1 wine 1 Parmesan rinds 8-----1 2 shallot 1 sprig thyme 4 king trumpet mushrooms 2 1 4 scored 1 4 1 4 wine vinegar 1 1 2 kosher-----3 4 lamb shanks 5 1 onion 2 rosemary sprigs 1 1 wine
----------------------------------------------------------------------
 rosemary    bay  sprigs   wine   leaf  bay leaf  kosher  parsley  vinegar
    0.059  0.053   0.047  0.042  0.030     0.029   0.022    0.021    0.018
======================================================================
topic: 26
documents:
 Honey Glazed Stuffed Chicken  0.6529412
           Easy Chicken Curry  0.6529412
   A Good Easy Garlic Chicken    0.63125
3 butter 4 skinless boneless chicken breast halves 2 1 seasoning 1 onion-----4 skinless boneless chicken breast halves 4 Boursin cheese herbs 8 bacon toothpicks 1 2 honey-----6 skinless boneless chicken breast halves strips 1 4 2 onions 1 3 curry
----------------------------------------------------------------------
 breast  boneless  chicken breast  skinless  boneless chicken  skinless boneless  halves  breast halves  vegetable
  0.099     0.097           0.095     0.095             0.091              0.090   0.084          0.080      0.007
======================================================================
topic: 27
documents:
           Nutty Shamrock Shake  0.7958333
                 g\u0027Oatmeal  0.7909091
 Mediterranean Breakfast Quinoa  0.7833333
1 4 raw almonds 1 cinnamon 1 quinoa 2 milk 1 sea 1 vanilla extract 2 honey 2 pitted dates 5 apricots-----3 rolled oats 2 Irish steel oats 5 ripe bananas mashed 1 2 flax seeds 1 2 1 4 peanut butter 1 4 pitted dates optional 1 4 pecans optional 1 4 chia seeds 2 coconut 1 1 2 vanilla extract 1 2 1 3 honey-----1 1 2 pistachio ice cream 1 milk 1 2 almond extract 5 drops green food coloring Whipped cream optional 1 pistachios
----------------------------------------------------------------------
   egg  almonds  vanilla   milk  yolks  cinnamon  egg yolks  pitted    raw
 0.031    0.027    0.026  0.018  0.016     0.016      0.015   0.015  0.015
======================================================================
topic: 28
documents:
                Mom-Mom\u0027s Pepper Steak  0.6241379
 Stuffed Peppers with Turkey and Vegetables   0.596875
                            Cajun Chow Mein  0.5962963
1 lean beef 1 2 1 onion 1 green bell 1 14 5 tomatoes 1 15 dark kidney beans undrained 1 converted long grain rice 1 1 1 2 chili-----4 green bell peppers tops 1 turkey 2 1 2 onion 1 mushrooms 1 zucchini 1 2 bell 1 2 yellow bell 1 spinach 1 14 5 tomatoes 1 tomato paste Italian seasoning-----2 vegetable 2 green bell peppers strips 2 bell peppers strips 1 1 2 round steak strips 1 onion 1 tomatoes 1 2 3 1 2 sugar
----------------------------------------------------------------------
 green  green bell  tomatoes  beans  peppers  onion green  yellow  bell peppers  strips
 0.092       0.074     0.030  0.030    0.029        0.029   0.024         0.023   0.021
======================================================================
topic: 29
documents:
              Hot Vanilla Chai  0.7366667
 Fantastic Lemon Butter Fillet  0.7214286
    Garlic Lemon Butter Salmon  0.7214286
1 2 butter 1 lemon juiced 1 1 1 2 parsley 3 6 4 fillets cod 2 lemon-----1 2 1 lemon juiced 1 2 butter 16 2 2 1 salmon fillets 1 2 butter 8 1 1 lemon 8-----1 packet No Sugar Added Classic French Vanilla Flavor CARNATION BREAKFAST ESSENTIALS Complete Nutritional Drink 1 chai tea bag
----------------------------------------------------------------------
 lemon  salmon  juiced   dill  yogurt  greek  lemon juiced  salmon fillets  fillet
 0.053   0.053   0.042  0.035   0.029  0.027         0.024           0.019   0.016
======================================================================
topic: 30
documents:
                Grilled Ham  0.6166667
              Apple Pie Ham     0.5875
 Baked Ham with Maple Glaze  0.5695652
1 5 fully cooked bone ham 1 4 maple syrup 1 wine vinegar 2 Dijon mustard 1 mustard-----1 4 fully cooked bone ham 1 20 apple pie filling 1 prepared yellow mustard 2 barbeque sauce 2 honey-----1 brown sugar 1 4 yellow mustard 9 fully cooked bone ham
----------------------------------------------------------------------
 dijon  dijon mustard  prepared  yellow    ham  honey  yellow mustard  prepared mustard  cooked
 0.071          0.065     0.064   0.044  0.026  0.024           0.023             0.018   0.018
======================================================================
topic: 31
documents:
 Southern Barbeque Pulled Beef Sandwiches  0.7594595
          Anyone Can Make BBQ Pulled Pork   0.730303
    Loose Meat on a Bun, Restaurant Style  0.6894737
3 beef 1 4 onion 3 Worcestershire sauce 4 beef broth 1 1 2 butter 12 hamburger buns split-----1 5 pork shoulder roast 5 2 strong brewed coffee 1 onion 2 Worcestershire sauce 2 liquid smoke flavoring 1 flakes 1 16 bottle barbeque sauce 12 sandwich buns split toasted-----cooking spray 1 2 beef broth 1 liquid smoke flavoring 1 1 4 boneless beef chuck roast 3 barbeque seasoning Grill Mates Smokehouse Maple 2 Worcestershire sauce 2 barbeque sauce 8 Kaiser rolls split
----------------------------------------------------------------------
 split  roast   buns  barbeque  barbeque sauce  hamburger  chuck  hamburger buns  sirloin
 0.033  0.031  0.027     0.024           0.023      0.023  0.021           0.021    0.019
======================================================================
topic: 32
documents:
             Spinach and Feta Turkey Burgers  0.7117647
                               Spinach Puffs       0.67
 Polish Noodles (Cottage Cheese and Noodles)  0.6238095
1 2 butter 1 onion 1 16 package egg noodles 1 16 package cottage cheese 1 2 sour cream 1 2 sea 1 4-----1 10 package frozen spinach thawed 1 2 feta 1 4 onion 1 1 dill 1 Kosher 2 eggs 1 sheet frozen puff pastry 17 3 package thawed rolled 12 square kept chilled-----2 eggs 2 4 feta cheese 1 10 box frozen spinach thawed squeezed 2 turkey
----------------------------------------------------------------------
 frozen  thawed  package frozen    egg  spinach  cream cheese  cream  softened  frozen spinach
  0.087   0.059           0.046  0.029    0.029         0.027  0.027     0.021           0.020
======================================================================
topic: 33
documents:
    How to Make Coquilles Saint-Jacques   0.5552632
 Chef John\u0027s Creamy Mushroom Pasta   0.5366667
              Wild Salmon Buckwheat Skin    0.50625
35 grams isomalt 200 grams buckwheat 14 grams sea lettuce 3 grams vegetable ash 35 grams trisol 5 grams kosher 5 10 grams Pain Mie-----2 3 4 mushrooms 1 4 shiitake mushrooms 2 2 fluid sherry 1 chicken stock 1 heavy whipping cream 8 fettuccine pasta 1 1 2 thyme 1 1 2 chives 1 1 2 tarragon 9 Parmigiano Reggiano cheese-----2 unsalted butter 1 2 shallots 1 2 button mushrooms 1 wine 1 sea scallops 1 2 heavy whipping cream 1 egg yolk 1 cayenne 2 tarragon 1 lemon zest 4 oven safe scallop shells 1 4 Gruyere cheese 1 paprika 8 tarragon
----------------------------------------------------------------------
 heavy  heavy cream  mushrooms   wine  whipping  whipping cream  grams  heavy whipping  unsalted
 0.095        0.072      0.040  0.034     0.026           0.026  0.025           0.022     0.018
======================================================================
topic: 34
documents:
       Roasted Baby Eggplant, Tomato, and Zucchini   0.74
                                  Breakfast Pizzas  0.575
 Pancetta-Wrapped Dates Stuffed with Manchego C...  0.555
20 Medjool dates 20 mint 1 3 Manchego cheese 1 1 2 x 1 4 x 1 4 strips 4 3 packages pancetta Italian bacon-----1 2 batch Basic Brioche dough 1 2 120 cr fra che 12 applewood smoked bacon cooked oven barely crisp 1 240 Caramelized Onions 8 eggs 2 225 skim mozzarella cheese-----4 purple eggplants 1 1 2 1 5 Roma plum tomatoes 1 1 2 2 zucchini 1 1 2 1 yellow onion 1 1 2 5 2
----------------------------------------------------------------------
 tomatoes   plum   baby  plum tomatoes  spinach   salt  yellow  zucchini   roma
    0.029  0.029  0.024          0.022    0.022  0.021   0.021     0.019  0.018
======================================================================
topic: 35
documents:
            Slow Cooker Lasagna II  0.8818182
               Slow Cooker Lasagna  0.8741935
 Carol\u0027s Baked Ziti Casserole  0.8607143
1 16 package ziti pasta 1 egg 1 15 container ricotta cheese 1 4 Parmesan cheese 1 28 jar meatless spaghetti sauce 2 skim mozzarella cheese 1 4 Parmesan cheese-----1 lean beef 1 onion 2 1 29 tomato sauce 1 6 tomato paste 1 1 2 1 oregano 1 12 package lasagna noodles 12 cottage cheese 1 2 Parmesan cheese 16 mozzarella cheese-----1 16 package lasagna noodles 1 lean beef 1 1 2 26 jars spaghetti sauce 2 mozzarella cheese 1 2 Parmesan cheese 1 8 container ricotta cheese 2 eggs 2 mozzarella cheese
----------------------------------------------------------------------
 mozzarella cheese  parmesan  parmesan cheese  tomato  package  basil  spaghetti  pasta  oregano
             0.049     0.044            0.040   0.034    0.030  0.026      0.022  0.022    0.020
======================================================================
topic: 36
documents:
                   Cherry Tomato Vinaigrette  0.7833333
 Cherry Tomatoes Stuffed with Marinated Feta  0.7730769
                      Pan Bagnat with Fennel  0.7705882
1 2 fennel bulb 1 2 pitted olives 1 4 3 wine vinegar 2 capers 1 2 flakes Kosher-----1 7 8 package feta cheese 1 2 2 extra virgin additional drizzling 1 shallot 1 2 oregano 1 cherry tomatoes 12 pitted Kalamata olives-----1 pint cherry tomatoes 3 1 shallot 1 wine vinegar Kosher 2 chives
----------------------------------------------------------------------
 virgin  extra virgin  vinegar  balsamic  balsamic vinegar   wine  kosher  wine vinegar  tomatoes
  0.077         0.077    0.048     0.025             0.024  0.023   0.020         0.019     0.018
======================================================================
topic: 37
documents:
                   Teeny-Weeny Coxinha  0.5695652
          Romano Risotto with Radishes      0.564
 Sweet Potato Risotto from Reynolds(R)     0.5525
1 purpose flour 1 1 2 uncooked Arborio rice 1 8 sweet potato 1 2 onion 2 butter softened 2 snipped thyme 2 1 4 3 reduced sodium chicken broth 1 2 wine reduced sodium chicken broth 1 4 Parmesan cheese-----6 reduced sodium chicken broth 48 fl 2 hot 3 4 unsalted butter 1 onion 1 3 1 Arborio rice 2 1 2 2 3 wine 1 2 Pecorino Romano-----1 quart vegetable frying 3 1 2 low sodium chicken broth 1 onion 1 carrot 1 celery rib
----------------------------------------------------------------------
 broth  chicken broth  celery    low  sodium  stock  stalks  low sodium  chicken stock
 0.102          0.079   0.057  0.028   0.026  0.026   0.025       0.022          0.019
======================================================================
topic: 38
documents:
 Chinese Steamed White Fish Fillet with Tofu (C...  0.7864865
                            Brandi\u0027s Won Tons  0.7742857
                           Chinese Pork Tenderloin       0.77
2 1 1 2 pork tenderloins 2 light soy sauce 2 hoisin sauce 1 sherry 1 bean sauce 1 1 2 ginger root 1 1 2 brown sugar 1 1 2 sesame 1 Chinese spice-----2 napa cabbage 1 2 pork 2 soy sauce 1 sherry 2 cornstarch 1 1 2 ginger root 1 green onion 36 wonton wrappers 1 egg 1 4 soy sauce 1 4 seasoned rice vinegar 2 ginger-----1 Thai chile 2 1 1 2 ginger 1 bean sauce 2 dark soy sauce 2 soy sauce 1 vegetable 1 sugar 1 1 cornstarch 1 cold 3 4 fish fillets 1 16 package tofu cubed 1 green onion
----------------------------------------------------------------------
 soy sauce  ginger  green  vegetable  sugar  cornstarch  onions   root  sesame
     0.080   0.037  0.028      0.024  0.024       0.023   0.021  0.019   0.019
======================================================================
topic: 39
documents:
 Slow Cooker German-Style Pork Roast with Sauer...      0.655
                        Molasses Brined Pork Chops    0.63125
               Herbed Pork Chops with Homemade Rub  0.6130435
1 3 parsley 1 4 marjoram 1 4 thyme 3 rubbed sage 2 2 onion 1 1 cinnamon 4 boneless pork loin chops 2 vegetable 1 canola-----1 2 kosher 1 2 molasses 4 1 boiling 7 cold 4 2 1 4 center bone pork chops 1 2 vegetable-----6 potatoes 1 1 3 boneless pork loin roast 1 32 jar sauerkraut liquid 2 caraway seeds
----------------------------------------------------------------------
 chops  pork chops  boneless  boneless pork   loin  roast  pork loin  vegetable  tenderloin
 0.069       0.053     0.052          0.046  0.035  0.033      0.031      0.022       0.022
======================================================================

In [971]:
# Plotting word distributions for each topic.
wb = f.most_probable_words(m, vectorizer.get_feature_names(), 10)
wb.columns = ['rank','topic','word','prob']

## make figure of word distributions for each topic.
g = sns.FacetGrid(wb, col='topic', col_wrap=10)
p = g.map(sns.barplot, 'word', 'prob')
## save figure for easier viewing.
p.savefig('word_dist_by_topic.png')

## TODO: figure out way of examining probs of words in relation to topic coherence:
## high average prob?

## make figure of document distributions for each topic.
## for each topic, show distribution of documents.


<matplotlib.figure.Figure at 0x7fd84970bd10>

In [ ]:
## examine topics:
## 14: one very probable word.
## 32: many very probable words.
## 30: no very probable words.

In [18]:
## TODO: store one set of results for each run.

con = f.make_engine()

## massage document ids and probabilities into form suitable for database.
di = pd.DataFrame(doc_ids)
di['topic'] = di.index
di = pd.melt(di, id_vars='topic')
di.columns = ['topic','rank','recipe_key']

dp = pd.DataFrame(doc_probs)
dp['topic'] = dp.index
dp = pd.melt(dp, id_vars='topic')
dp.columns = ['topic','rank','prob']

dd = pd.merge(di, dp)
dd.to_sql('doc_prob', con, if_exists='replace')

# store recipes
df['key'] = df.index
## assign the most probable topic to each recipe.
df['topic'] = np.argmax(m.doc_topic_, axis=1)
df.to_sql('clean_recipes', con, if_exists='replace', index=False)

# store words
w.columns = ['rank','topic','word','prob']
w.to_sql('word_probs', con, if_exists='replace')

In [285]:
xx = pd.merge(df, dd, left_on='key', right_on='recipe_key', how='right')

## topics with low word probs.
## but, they seem pretty good.
print 'topics with low word probs.'
for n, g in xx[xx['topic'].isin([5,8,16,18,30])].groupby('topic'):
    print 'topic: %s' % n
    print g[['title','prob']].sort('prob').to_string()

##
print '='*80
print 'topics with one high word prob.'
print '='*80
for n, g in xx[xx['topic'].isin([1,4,9,14,21])].groupby('topic'):
    print 'topic: %s' % n
    print g[['title','prob']].sort('prob').to_string()


topics with low word probs.
topic: 5
                                          title      prob
31                       Fruit and Nut Pancakes  0.816667
27   Coconut Macaroon Sandwiches with Lime Curd  0.821212
116                        Joyce\u0027s Granola  0.823077
topic: 8
                                        title      prob
71      Spicy Coconut and Lime Grilled Shrimp  0.638636
98  Garlic, Cilantro, and Lime Sauteed Shrimp  0.669697
51                  Spicy Lime Grilled Shrimp  0.684000
topic: 16
                                                title      prob
25       Chocolate-Dipped Salted Caramel Marshmallows  0.765278
34  Maple-Gingerbread Layer Cake with Salted Maple...  0.811538
5                        Bourbon-Vanilla Marshmallows  0.816279
topic: 18
                                title      prob
78          Rajma (Kidney Bean Curry)  0.743299
43  Indian Chicken Curry (Murgh Kari)  0.753608
55           Indian Dahl with Spinach  0.762000
topic: 30
                             title      prob
26         Charred Tomatillo Salsa  0.771795
8   Thai Celery Salad with Peanuts  0.801818
11           Tomatillo Salsa Verde  0.813514
================================================================================
topics with one high word prob.
================================================================================
topic: 1
                                     title      prob
29                              Bún Bò Hue  0.452500
103  Classic Smoked Sausage \u0026 Peppers  0.474545
113                             Dinengdeng  0.526190
topic: 4
                                                title      prob
94                              Manicotti Pancakes II  0.700000
33  Witchs Finger Bread Sticks with Maple Mustard Dip  0.703333
21                           Fried Chicken with Gravy  0.724000
topic: 9
                     title      prob
99           Messy Burgers  0.432143
39  Cabbage Roll Casserole  0.460000
57    Best Ever Meatloaf I  0.467857
topic: 14
                  title      prob
22   Blue Crab Beignets  0.360714
16  Arugula Salsa Verde  0.360937
79      Filipino Lumpia  0.391667
topic: 21
                                               title      prob
23               Pea, Asparagus, and Fava Bean Salad  0.383951
28                           Scallops à La Provençal  0.427941
9   Fava Bean and Pea Salad with Poppy Seed Dressing  0.428000

In [469]:
## relationship between doc prob and length:
dpa = pd.DataFrame({'max_prob':np.max(m.doc_topic_, axis=1), 'topic':np.argmax(m.doc_topic_, axis=1)})
dpa = df.join(dpa)
dpa['ingredient_len'] = dpa['ingredient_txt'].str.len()

dpa.plot('ingredient_len', 'mdpa_prob', kind='scatter')

# g = sns.FacetGrid(dpa, col='topic', col_wrap=10)
# p = g.map(sns.barplot, 'word', 'prob')
# ## save figure for easier viewing.
# p.savefig('word_dist_by_topic.png')


Out[469]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd84f99d750>
<matplotlib.figure.Figure at 0x7fd84c3fbe90>

In [39]:
## generating recipes.
reload(f)
ww = f.all_word_probs(m, vectorizer.get_feature_names())
ww.to_sql('all_word_probs', con, if_exists='replace')

In [40]:
for n, g in ww.groupby('label'):
    print n
    print [', '.join(np.random.choice(g['word'], size=5, p=g['prob'])) for _ in range(0,1)]


[u'bell, bell, chile peppers, green, green']
0
[u'vegetable chicken, chicken, broth, broth, broth']
1
[u'lemon peel, parsley, butter lemon, parsley lemon, halibut']
2
[u'base, needed, smoked, cover, giblets']
3
[u'soup mix, processed cheese, onion soup, onion cans, lean']
4
[u'vegetable, vanilla extract, egg yolks, cacao, rose']
5
[u'fluid bottle, steak, skirt steak, bottles, beer']
6
[u'jack, jack cheese, butter flour, fat free, cheddar monterey']
7
[u'cream package, chives, cheese sour, light, button']
8
[u'basil vegetable, thyme, oregano, balsamic vinegar, oregano']
9
[u'condensed cheddar, potato nuggets, tomato soup, mushroom, meat']
10
[u'molasses, vegetable, seeds, fennel, cooking spray']
11
[u'orzo, butter cans, hair, asiago cheese, flakes']
12
[u'seedless, duck, chicken, juiced, juiced']
13
[u'mushrooms, baby bella, heavy, cream, beef stock']
14
[u'purpose, purpose flour, milk, purpose, milk']
15
[u'worcestershire, cayenne, blue cheese, ketchup mustard, beef hot']
16
[u'hash brown, crust pie, pie, melted butter, butter melted']
17
[u'chicken bouillon, long, optional vegetable, chicken, broth']
18
[u'english, dressing, dressing, pheasant, split']
19
[u'ginger root, green onion, japanese, carrot, vinegar']
20
[u'cheddar, blend cheese, corn, kernel, green onions']
21
[u'carrot, carrots, leek, celery stalks, onion carrots']
22
[u'pineapple, juice, garnish, grapefruit, juice']
23
[u'unsalted butter, nuts, flat, leaf parsley, flat leaf']
24
[u'hot flakes, wine, sea extra, extra virgin, olives']
25
[u'dijon, mustard, fris, vinegar, cider']
26
[u'packages, onion feta, package, thawed squeezed, vegetables']
27
[u'beef, beef tenderloin, beef bouillon, cornstarch, boneless round']
28
[u'bread, bread crumbs, crumbs, parsley, panko bread']
29
[u'chicken breast, halves, skinless, chicken breast, chicken']
30
[u'jalapeno, tomatillos, green beans, pinto beans, broth lemon']
31
[u'lasagna, italian seasoning, seasoning, italian, puree']
32
[u'cooked chicken, florets, milk package, milk, extra eggs']
33
[u'sea, canola, salt, kale, mixture']
34
[u'sriracha sauce, tamarind, peanut butter, wine vinegar, jalape']
35
[u'mustard seed, seed, cinnamon, coconut milk, chile peppers']
36
[u'debearded, wine, shelled deveined, shrimp, deveined']
37
[u'sugar, brown sugar, venison, sugar, brown sugar']
38
[u'cheddar cheese, cheese paprika, cheddar cheese, cracked, optional']
39