In [1]:
import pandas as pd
In [2]:
# Business IDs of the Chinese restaurants. They are applied positionally below
# as the row labels of both document-topic tables.
# NOTE(review): assumes the ID file and both topic2doc files list businesses
# in the same row order -- confirm.
bid = pd.read_csv("../yelp-challenge/LDA/CHN15/01_ALL_Chinese_Business_ID.csv")

# Document-topic tables from the CHN5 and CHN15 LDA directories.
topic_CHN_5 = pd.read_csv("../yelp-challenge/LDA/CHN5/topic2doc.csv", index_col=0)
topic_CHN_15 = pd.read_csv("../yelp-challenge/LDA/CHN15/04_topic2doc.csv", index_col=0)

# Replace the index read from each file with the business IDs.
topic_CHN_5.index = bid["business_id"].values
topic_CHN_15.index = bid["business_id"].values
In [3]:
# Per-city clustering results for Chinese restaurants. Only the column span
# from 'clusters_sp' through 'clusters_gm' is kept (label slices in .loc are
# inclusive on both ends).
TOR_CHN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/torronto_chi_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']
PHO_CHN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/Phoenix_chi_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']
LAS_CHN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/LasVegas_chi_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']
In [4]:
# Attach each business's 'max' column from the LDA tables to the per-city
# cluster labels. DataFrame.join aligns on the index, and the topic tables are
# indexed by business_id.
# NOTE(review): the city frames are read without index_col, so this only
# matches rows if their index also holds business IDs -- confirm.

# CHN5 topics
TOR_CHN_LDA5 = TOR_CHN.join(topic_CHN_5[['max']], how='left')
PHO_CHN_LDA5 = PHO_CHN.join(topic_CHN_5[['max']], how='left')
LAS_CHN_LDA5 = LAS_CHN.join(topic_CHN_5[['max']], how='left')

# CHN15 topics
TOR_CHN_LDA15 = TOR_CHN.join(topic_CHN_15[['max']], how='left')
PHO_CHN_LDA15 = PHO_CHN.join(topic_CHN_15[['max']], how='left')
LAS_CHN_LDA15 = LAS_CHN.join(topic_CHN_15[['max']], how='left')
In [5]:
def ClsuterTopic(LDA, methods=('clusters_sp',)):
    """Print the most common 'max' topic of every cluster.

    For each clustering column named in ``methods`` the column name is
    printed, followed by one ``c<label>: <topic>`` line per cluster and a
    blank separator.

    Parameters
    ----------
    LDA : pandas.DataFrame
        Frame holding one cluster-label column per entry of ``methods`` and
        a ``'max'`` column. Presumably ``'max'`` is the 0-based index of
        each business's dominant LDA topic -- TODO confirm against the
        topic2doc files.
    methods : iterable of str, optional
        Cluster-label columns to report on. Defaults to
        ``('clusters_sp',)``, the single column this cell used to hard-code.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    for method in methods:
        print(method)
        labels = LDA.loc[:, method]
        # Walk the labels that actually occur instead of range(n_unique):
        # a gap in the labels (or a NaN) would otherwise select an empty
        # cluster and make the lookup below raise.
        for i in sorted(labels.dropna().unique()):
            tmp = LDA[labels == i]
            topic_counts = tmp.groupby(tmp['max']).size()
            if topic_counts.empty:
                # Every 'max' in this cluster is missing (groupby drops NaN keys).
                print("c{}: {}".format(i, "n/a"))
                continue
            # idxmax() yields the index label (the topic id). argmax() is
            # positional in pandas >= 1.0 and would report the wrong topic.
            print("c{}: {}".format(i, topic_counts.idxmax() + 1))  # topics index start from 1
        print('\n')
In [6]:
# Per-cluster topic report for the CHN15 tables, in the order
# Toronto, Phoenix, Las Vegas.
for _city_lda in (TOR_CHN_LDA15, PHO_CHN_LDA15, LAS_CHN_LDA15):
    ClsuterTopic(_city_lda)
In [7]:
# Word-to-topic table from the CHN15 LDA directory.
CHN15_TOPIC = pd.read_csv("../yelp-challenge/LDA/CHN15/04_top30_word2topic.csv")

# Show only the four topic columns of interest (kept as the last expression so
# the cell displays the frame).
CHN15_TOPIC[['Topic 13', 'Topic 15', 'Topic 12', 'Topic 4']]
Out[7]:
In [8]:
# Business IDs of the Japanese restaurants, applied positionally below as the
# row labels of the document-topic table.
# NOTE(review): assumes the ID file and topic2doc.csv share the same row
# order -- confirm.
bid2 = pd.read_csv("../yelp-challenge/LDA/JPN5/ALL_Japanese_Business_ID.csv")

# Document-topic table from the JPN5 LDA directory, re-indexed by business ID.
topic_JPN_5 = pd.read_csv("../yelp-challenge/LDA/JPN5/topic2doc.csv", index_col=0)
topic_JPN_5.index = bid2["business_id"].values
In [9]:
# Per-city clustering results for Japanese restaurants. Only the column span
# from 'clusters_sp' through 'clusters_gm' is kept (label slices in .loc are
# inclusive on both ends).
TOR_JPN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/torronto_jap_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']
PHO_JPN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/Phoenix_jap_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']
LAS_JPN = pd.read_csv(
    "../yelp-challenge/Anomaly/Result/LasVegas_jap_results.csv"
).loc[:, 'clusters_sp':'clusters_gm']
In [10]:
# Attach each business's 'max' column from the JPN5 table to the per-city
# cluster labels. DataFrame.join aligns on the index, and topic_JPN_5 is
# indexed by business_id.
# NOTE(review): the city frames are read without index_col, so this only
# matches rows if their index also holds business IDs -- confirm.
TOR_JPN_LDA5 = TOR_JPN.join(topic_JPN_5[['max']], how='left')
PHO_JPN_LDA5 = PHO_JPN.join(topic_JPN_5[['max']], how='left')
LAS_JPN_LDA5 = LAS_JPN.join(topic_JPN_5[['max']], how='left')
In [11]:
def ClsuterTopic(LDA, methods=('clusters_gm',)):
    """Print the most common 'max' topic of every cluster.

    For each clustering column named in ``methods`` the column name is
    printed, followed by one ``c<label>: <topic>`` line per cluster and a
    blank separator.

    Parameters
    ----------
    LDA : pandas.DataFrame
        Frame holding one cluster-label column per entry of ``methods`` and
        a ``'max'`` column. Presumably ``'max'`` is the 0-based index of
        each business's dominant LDA topic -- TODO confirm against the
        topic2doc files.
    methods : iterable of str, optional
        Cluster-label columns to report on. Defaults to
        ``('clusters_gm',)``, the single column this cell used to hard-code.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    for method in methods:
        print(method)
        labels = LDA.loc[:, method]
        # Walk the labels that actually occur instead of range(n_unique):
        # a gap in the labels (or a NaN) would otherwise select an empty
        # cluster and make the lookup below raise.
        for i in sorted(labels.dropna().unique()):
            tmp = LDA[labels == i]
            topic_counts = tmp.groupby(tmp['max']).size()
            if topic_counts.empty:
                # Every 'max' in this cluster is missing (groupby drops NaN keys).
                print("c{}: {}".format(i, "n/a"))
                continue
            # idxmax() yields the index label (the topic id). argmax() is
            # positional in pandas >= 1.0 and would report the wrong topic.
            print("c{}: {}".format(i, topic_counts.idxmax() + 1))  # topics index start from 1
        print('\n')
In [12]:
# Per-cluster topic report for the JPN5 tables, in the order
# Toronto, Phoenix, Las Vegas.
for _city_lda in (TOR_JPN_LDA5, PHO_JPN_LDA5, LAS_JPN_LDA5):
    ClsuterTopic(_city_lda)
In [13]:
# Word-to-topic table from the JPN5 LDA directory.
JPN5_TOPIC = pd.read_csv("../yelp-challenge/LDA/JPN5/top30_word2topic.csv")

# Show only the two topic columns of interest (kept as the last expression so
# the cell displays the frame).
JPN5_TOPIC[['Topic 2', 'Topic 3']]
Out[13]:
In [14]:
# Most frequent 'max' value for Toronto in the CHN5 join.
# idxmax() returns the value itself; argmax() is positional in pandas >= 1.0,
# which on a value_counts() result (sorted descending) would always be 0.
TOR_CHN_LDA5['max'].value_counts().idxmax()
Out[14]:
In [15]:
# Most frequent 'max' value for Phoenix in the CHN5 join.
# idxmax() returns the value itself; argmax() is positional in pandas >= 1.0,
# which on a value_counts() result (sorted descending) would always be 0.
PHO_CHN_LDA5['max'].value_counts().idxmax()
Out[15]:
In [16]:
# Most frequent 'max' value for Las Vegas in the CHN5 join.
# idxmax() returns the value itself; argmax() is positional in pandas >= 1.0,
# which on a value_counts() result (sorted descending) would always be 0.
LAS_CHN_LDA5['max'].value_counts().idxmax()
Out[16]:
In [17]:
# Most frequent 'max' value for Toronto in the CHN15 join.
# idxmax() returns the value itself; argmax() is positional in pandas >= 1.0,
# which on a value_counts() result (sorted descending) would always be 0.
TOR_CHN_LDA15['max'].value_counts().idxmax()
Out[17]:
In [18]:
# Most frequent 'max' value for Phoenix in the CHN15 join.
# idxmax() returns the value itself; argmax() is positional in pandas >= 1.0,
# which on a value_counts() result (sorted descending) would always be 0.
PHO_CHN_LDA15['max'].value_counts().idxmax()
Out[18]:
In [19]:
# Most frequent 'max' value for Las Vegas in the CHN15 join.
# idxmax() returns the value itself; argmax() is positional in pandas >= 1.0,
# which on a value_counts() result (sorted descending) would always be 0.
LAS_CHN_LDA15['max'].value_counts().idxmax()
Out[19]:
In [20]:
# Most frequent 'max' value for Toronto in the JPN5 join.
# idxmax() returns the value itself; argmax() is positional in pandas >= 1.0,
# which on a value_counts() result (sorted descending) would always be 0.
TOR_JPN_LDA5['max'].value_counts().idxmax()
Out[20]:
In [21]:
# Most frequent 'max' value for Phoenix in the JPN5 join.
# idxmax() returns the value itself; argmax() is positional in pandas >= 1.0,
# which on a value_counts() result (sorted descending) would always be 0.
PHO_JPN_LDA5['max'].value_counts().idxmax()
Out[21]:
In [22]:
# Most frequent 'max' value for Las Vegas in the JPN5 join.
# idxmax() returns the value itself; argmax() is positional in pandas >= 1.0,
# which on a value_counts() result (sorted descending) would always be 0.
LAS_JPN_LDA5['max'].value_counts().idxmax()
Out[22]:
In [ ]:
In [29]:
# First ten rows of the JPN5 word-to-topic table, without its first column.
jpn_out = JPN5_TOPIC.iloc[:10, 1:]
# Relabel the rows 1..10 for the exported table.
jpn_out.index = range(1, 11)
In [30]:
# Emit the table as LaTeX source. The single-argument call form of print
# behaves the same on Python 2 and Python 3.
print(jpn_out.to_latex())
In [52]:
# First ten rows of the CHN15 word-to-topic table, without its first column,
# restricted to the four topic columns being reported.
chn_out = CHN15_TOPIC.iloc[:10, 1:][['Topic 4', 'Topic 12', 'Topic 13', 'Topic 15']]
# Relabel the rows 1..10 for the exported table.
chn_out.index = range(1, 11)
In [53]:
# Emit the table as LaTeX source. The single-argument call form of print
# behaves the same on Python 2 and Python 3.
print(chn_out.to_latex())