In [1]:
import pandas as pd

CHN


In [2]:
# Load the Chinese-restaurant business IDs and both topic-to-document tables,
# then re-key each topic table by business_id.
# NOTE(review): assumes the rows of each topic2doc file are in the same order
# as the business-ID file -- confirm against the LDA export step.
bid = pd.read_csv("../yelp-challenge/LDA/CHN15/01_ALL_Chinese_Business_ID.csv")

topic_CHN_5 = pd.read_csv("../yelp-challenge/LDA/CHN5/topic2doc.csv", index_col=0)
topic_CHN_15 = pd.read_csv("../yelp-challenge/LDA/CHN15/04_topic2doc.csv", index_col=0)

topic_CHN_5.index = bid["business_id"].values
topic_CHN_15.index = bid["business_id"].values

In [3]:
# Per-city clustering results for Chinese restaurants. Only the columns from
# 'clusters_sp' through 'clusters_gm' (label slice, both ends included) are kept.
TOR_CHN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/torronto_chi_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']

PHO_CHN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/Phoenix_chi_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']

LAS_CHN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/LasVegas_chi_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']

In [4]:
# Attach the 'max' column of each topic table to every city's cluster labels.
# DataFrame.join aligns on the index.
# NOTE(review): this presumably relies on the city frames sharing the
# business_id index of the topic tables -- confirm.
chn5_max = topic_CHN_5[['max']]
chn15_max = topic_CHN_15[['max']]

TOR_CHN_LDA5 = TOR_CHN.join(chn5_max, how='left')
TOR_CHN_LDA15 = TOR_CHN.join(chn15_max, how='left')

PHO_CHN_LDA5 = PHO_CHN.join(chn5_max, how='left')
PHO_CHN_LDA15 = PHO_CHN.join(chn15_max, how='left')

LAS_CHN_LDA5 = LAS_CHN.join(chn5_max, how='left')
LAS_CHN_LDA15 = LAS_CHN.join(chn15_max, how='left')

In [5]:
def ClsuterTopic(LDA, methods=('clusters_sp',)):
    """Print the most frequent topic of every cluster, per clustering method.

    Parameters
    ----------
    LDA : pandas.DataFrame
        Must contain one cluster-label column per entry of ``methods`` and a
        ``'max'`` column holding a topic label for each row.
    methods : iterable of str, optional
        Names of the cluster-label columns to summarise. Defaults to
        ``('clusters_sp',)``. Pass a tuple/list, not a bare string.

    Prints
    ------
    For each method: the method name, one ``c<i>: <topic>`` line for every
    ``i`` in ``range(number of distinct labels)``, then a blank separator.
    ``<topic>`` is the most frequent ``'max'`` value in the cluster plus 1.

    Raises
    ------
    ValueError
        If some ``i`` in that range selects no rows (empty cluster).
    """
    for method in methods:
        n_cluster = len(LDA.loc[:, method].unique())
        print(method)
        for i in range(n_cluster):
            members = LDA[LDA.loc[:, method] == i]
            topic_sizes = members.groupby(members['max']).size()
            # idxmax() returns the *label* of the largest group. On current
            # pandas Series.argmax() returns its integer position instead,
            # which is wrong whenever some topic label is absent.
            top_topic = topic_sizes.idxmax()
            print("c{}: {}".format(i, top_topic + 1))  # topics index start from 1
        print('\n')

1. SP + CHN15


In [6]:
# Summarise the three cities in order: TOR, PHO, LAS (15-topic model).
for city_lda15 in (TOR_CHN_LDA15, PHO_CHN_LDA15, LAS_CHN_LDA15):
    ClsuterTopic(city_lda15)


clusters_sp
c0: 13
c1: 15


clusters_sp
c0: 13
c1: 12


clusters_sp
c0: 13
c1: 4



In [7]:
# Top-30 words per topic for the 15-topic Chinese model; display only the
# four topic columns selected below.
CHN15_TOPIC = pd.read_csv("../yelp-challenge/LDA/CHN15/04_top30_word2topic.csv")
chn15_shown_topics = ['Topic 13', 'Topic 15', 'Topic 12', 'Topic 4']
CHN15_TOPIC.loc[:, chn15_shown_topics]


Out[7]:
Topic 13 Topic 15 Topic 12 Topic 4
0 happy bowl chicken tea
1 menu chicken chinese milk
2 hour sauce rice boba
3 server rice fried drink
4 ordered fresh ordered drinks
5 table location egg ice
6 bar teriyaki lunch bubble
7 night veggies soup sweet
8 drinks eat beef green
9 experience meat shrimp taste
10 wait staff sauce mango
11 beer sauces sour pretty
12 atmosphere meal hot ordered
13 waitress friendly delivery sugar
14 drink spicy eat places
15 minutes clean friendly taro
16 location burrito orange teas
17 dinner fast sweet dessert
18 house lunch fast taiwanese
19 waiter give rolls jelly
20 pretty panda menu tapioca
21 staff healthy chow menu
22 asked shabu bad friendly
23 friends customer delicious desserts
24 manager line fresh friends
25 seated pretty pretty location
26 meal noodles portions staff
27 selection quick give fresh
28 busy bowls years flavor
29 bad pei family lot




JPN


In [8]:
# Load the Japanese-restaurant business IDs and the 5-topic topic-to-document
# table, then re-key the topic table by business_id.
# NOTE(review): assumes topic2doc rows are in the same order as the
# business-ID file -- confirm against the LDA export step.
bid2 = pd.read_csv("../yelp-challenge/LDA/JPN5/ALL_Japanese_Business_ID.csv")

topic_JPN_5 = pd.read_csv("../yelp-challenge/LDA/JPN5/topic2doc.csv", index_col=0)
topic_JPN_5.index = bid2["business_id"].values

In [9]:
# Per-city clustering results for Japanese restaurants. Only the columns from
# 'clusters_sp' through 'clusters_gm' (label slice, both ends included) are kept.
TOR_JPN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/torronto_jap_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']

PHO_JPN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/Phoenix_jap_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']

LAS_JPN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/LasVegas_jap_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']

In [10]:
# Attach the 'max' column of the 5-topic table to every city's cluster labels.
# DataFrame.join aligns on the index.
# NOTE(review): this presumably relies on the city frames sharing the
# business_id index of the topic table -- confirm.
jpn5_max = topic_JPN_5[['max']]

TOR_JPN_LDA5 = TOR_JPN.join(jpn5_max, how='left')
PHO_JPN_LDA5 = PHO_JPN.join(jpn5_max, how='left')
LAS_JPN_LDA5 = LAS_JPN.join(jpn5_max, how='left')

In [11]:
def ClsuterTopic(LDA, methods=('clusters_gm',)):
    """Print the most frequent topic of every cluster, per clustering method.

    This definition replaces any earlier ``ClsuterTopic`` in the kernel; its
    default summarises the ``'clusters_gm'`` column.

    Parameters
    ----------
    LDA : pandas.DataFrame
        Must contain one cluster-label column per entry of ``methods`` and a
        ``'max'`` column holding a topic label for each row.
    methods : iterable of str, optional
        Names of the cluster-label columns to summarise. Defaults to
        ``('clusters_gm',)``. Pass a tuple/list, not a bare string.

    Prints
    ------
    For each method: the method name, one ``c<i>: <topic>`` line for every
    ``i`` in ``range(number of distinct labels)``, then a blank separator.
    ``<topic>`` is the most frequent ``'max'`` value in the cluster plus 1.

    Raises
    ------
    ValueError
        If some ``i`` in that range selects no rows (empty cluster).
    """
    for method in methods:
        n_cluster = len(LDA.loc[:, method].unique())
        print(method)
        for i in range(n_cluster):
            members = LDA[LDA.loc[:, method] == i]
            topic_sizes = members.groupby(members['max']).size()
            # idxmax() returns the *label* of the largest group. On current
            # pandas Series.argmax() returns its integer position instead,
            # which is wrong whenever some topic label is absent.
            top_topic = topic_sizes.idxmax()
            print("c{}: {}".format(i, top_topic + 1))  # topics index start from 1
        print('\n')

In [12]:
# Summarise the three cities in order: TOR, PHO, LAS (5-topic Japanese model).
for city_lda5 in (TOR_JPN_LDA5, PHO_JPN_LDA5, LAS_JPN_LDA5):
    ClsuterTopic(city_lda5)


clusters_gm
c0: 3
c1: 3
c2: 3
c3: 3


clusters_gm
c0: 2
c1: 3
c2: 3
c3: 3


clusters_gm
c0: 2
c1: 2
c2: 2
c3: 3



In [13]:
# Top-30 words per topic for the 5-topic Japanese model; display only the
# two topic columns selected below.
JPN5_TOPIC = pd.read_csv("../yelp-challenge/LDA/JPN5/top30_word2topic.csv")
jpn5_shown_topics = ['Topic 2', 'Topic 3']
JPN5_TOPIC.loc[:, jpn5_shown_topics]


Out[13]:
Topic 2 Topic 3
0 sushi ramen
1 roll pork
2 rolls noodles
3 fresh broth
4 fish bowl
5 eat chicken
6 quality japanese
7 ayce spicy
8 salmon ordered
9 menu soup
10 tuna rice
11 sashimi curry
12 lunch small
13 ordered delicious
14 rice wait
15 spicy pretty
16 tempura bit
17 friendly miso
18 pretty taste
19 price egg
20 places noodle
21 staff menu
22 dinner sauce
23 delicious fried
24 amazing friendly
25 wait tea
26 favorite places
27 2 meat
28 nigiri side
29 salad lunch




What if we do not cluster within a city?

CHN5


In [14]:
# Most frequent 'max' label over all rows (no per-cluster split).
# idxmax() returns the label; on current pandas Series.argmax() returns a
# position, which is always 0 after value_counts() sorts descending.
# NOTE(review): no +1 offset is applied here, unlike ClsuterTopic -- confirm
# which topic numbering is intended.
TOR_CHN_LDA5['max'].value_counts().idxmax()


Out[14]:
3

In [15]:
# Most frequent 'max' label over all rows (no per-cluster split).
# idxmax() returns the label; on current pandas Series.argmax() returns a
# position, which is always 0 after value_counts() sorts descending.
# NOTE(review): no +1 offset is applied here, unlike ClsuterTopic -- confirm.
PHO_CHN_LDA5['max'].value_counts().idxmax()


Out[15]:
3

In [16]:
# Most frequent 'max' label over all rows (no per-cluster split).
# idxmax() returns the label; on current pandas Series.argmax() returns a
# position, which is always 0 after value_counts() sorts descending.
# NOTE(review): no +1 offset is applied here, unlike ClsuterTopic -- confirm.
LAS_CHN_LDA5['max'].value_counts().idxmax()


Out[16]:
3

CHN15


In [17]:
# Most frequent 'max' label over all rows (no per-cluster split).
# idxmax() returns the label; on current pandas Series.argmax() returns a
# position, which is always 0 after value_counts() sorts descending.
# NOTE(review): no +1 offset is applied here, unlike ClsuterTopic -- confirm.
TOR_CHN_LDA15['max'].value_counts().idxmax()


Out[17]:
12

In [18]:
# Most frequent 'max' label over all rows (no per-cluster split).
# idxmax() returns the label; on current pandas Series.argmax() returns a
# position, which is always 0 after value_counts() sorts descending.
# NOTE(review): no +1 offset is applied here, unlike ClsuterTopic -- confirm.
PHO_CHN_LDA15['max'].value_counts().idxmax()


Out[18]:
12

In [19]:
# Most frequent 'max' label over all rows (no per-cluster split).
# idxmax() returns the label; on current pandas Series.argmax() returns a
# position, which is always 0 after value_counts() sorts descending.
# NOTE(review): no +1 offset is applied here, unlike ClsuterTopic -- confirm.
LAS_CHN_LDA15['max'].value_counts().idxmax()


Out[19]:
12

JPN5


In [20]:
# Most frequent 'max' label over all rows (no per-cluster split).
# idxmax() returns the label; on current pandas Series.argmax() returns a
# position, which is always 0 after value_counts() sorts descending.
# NOTE(review): no +1 offset is applied here, unlike ClsuterTopic -- confirm.
TOR_JPN_LDA5['max'].value_counts().idxmax()


Out[20]:
2

In [21]:
# Most frequent 'max' label over all rows (no per-cluster split).
# idxmax() returns the label; on current pandas Series.argmax() returns a
# position, which is always 0 after value_counts() sorts descending.
# NOTE(review): no +1 offset is applied here, unlike ClsuterTopic -- confirm.
PHO_JPN_LDA5['max'].value_counts().idxmax()


Out[21]:
2

In [22]:
# Most frequent 'max' label over all rows (no per-cluster split).
# idxmax() returns the label; on current pandas Series.argmax() returns a
# position, which is always 0 after value_counts() sorts descending.
# NOTE(review): no +1 offset is applied here, unlike ClsuterTopic -- confirm.
LAS_JPN_LDA5['max'].value_counts().idxmax()


Out[22]:
2

In [ ]:

OUTPUT: DataFrame -> LaTeX


In [29]:
# First (up to) 10 rows of the topic-word table, without its first column.
# .copy() gives a frame independent of JPN5_TOPIC before it is re-indexed.
jpn_out = JPN5_TOPIC.iloc[:10, 1:].copy()
# 1-based row labels for the LaTeX table. The length follows the slice, so a
# table with fewer than 10 rows no longer raises a length-mismatch error.
jpn_out.index = range(1, len(jpn_out) + 1)

In [30]:
# Parenthesised single-argument print: same output on Python 2, valid on Python 3.
print(jpn_out.to_latex())


\begin{tabular}{llllll}
\toprule
{} &  Topic 1 &  Topic 2 &   Topic 3 &     Topic 4 &   Topic 5 \\
\midrule
1  &  chicken &    sushi &     ramen &        menu &    korean \\
2  &    happy &     roll &      pork &    japanese &       bbq \\
3  &     hour &    rolls &   noodles &      dishes &     sushi \\
4  &  ordered &    fresh &     broth &        dish &   burrito \\
5  &     rice &     fish &      bowl &     amazing &      meat \\
6  &    table &      eat &   chicken &   delicious &   bulgogi \\
7  &    lunch &  quality &  japanese &     ordered &  montreal \\
8  &   shrimp &     ayce &     spicy &  experience &  bibimbap \\
9  &   pretty &   salmon &   ordered &       vegas &     meats \\
10 &    sauce &     menu &      soup &       night &     kalbi \\
\bottomrule
\end{tabular}


In [52]:
# First (up to) 10 rows of the topic-word table, without its first column.
# .copy() gives a frame independent of CHN15_TOPIC before it is re-indexed.
chn_out = CHN15_TOPIC.iloc[:10, 1:].copy()
# 1-based row labels for the LaTeX table. The length follows the slice, so a
# table with fewer than 10 rows no longer raises a length-mismatch error.
chn_out.index = range(1, len(chn_out) + 1)
# Keep only the four topic columns below, in this order.
chn_out = chn_out.loc[:, ['Topic 4', 'Topic 12', 'Topic 13', 'Topic 15']]

In [53]:
# Parenthesised single-argument print: same output on Python 2, valid on Python 3.
print(chn_out.to_latex())


\begin{tabular}{lllll}
\toprule
{} & Topic 4 & Topic 12 &    Topic 13 &  Topic 15 \\
\midrule
1  &     tea &  chicken &       happy &      bowl \\
2  &    milk &  chinese &        menu &   chicken \\
3  &    boba &     rice &        hour &     sauce \\
4  &   drink &    fried &      server &      rice \\
5  &  drinks &  ordered &     ordered &     fresh \\
6  &     ice &      egg &       table &  location \\
7  &  bubble &    lunch &         bar &  teriyaki \\
8  &   sweet &     soup &       night &   veggies \\
9  &   green &     beef &      drinks &       eat \\
10 &   taste &   shrimp &  experience &      meat \\
\bottomrule
\end{tabular}