In [1]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
from sklearn.feature_extraction.text import TfidfVectorizer
import re
answers = []
questions = []
flag = 0
######################################
question = []
answer = []
file = open('xiaomi_raw.txt', 'r')
for line in file.readlines():
    if line.startswith('用户问题'):
        questions.append(line)
        if answer:
            answers.append(answer)
        answer = []
        ##############
        answer.append(line)
        #############
    elif len(line) < 6:
        continue
    else:
        answer.append(line)
answers.append(answer)
file.close()
##########################################
######################################
question = []
answer = []
file1 = open('xiaomi_raw_2.txt', 'r')
for line in file1.readlines():
    if line.startswith('用户问题'):
        questions.append(line)
        if answer:
            answers.append(answer)
        answer = []
        ##############
        answer.append(line)
        #############
    elif len(line) < 6:
        continue
    else:
        answer.append(line)
answers.append(answer)
file1.close()
##########################################
######################################
question = []
answer = []
file2 = open('xiaomi_raw_3.txt', 'r')
for line in file2.readlines():
    if line.startswith('用户问题'):
        questions.append(line)
        if answer:
            answers.append(answer)
        answer = []
        ##############
        answer.append(line)
        #############
    elif len(line) < 6:
        continue
    else:
        answer.append(line)
answers.append(answer)
file2.close()
##########################################
answers = [''.join(ans) for ans in answers]
raw_questions = questions
raw_answers = answers


def get_accuracy(vectorizer, questions_vec, answers_vec):
    """
    输入fit过的vectorizer,向量化的questions和answers,帮你返回正确率
    """
    y = np.linspace(0, answers_vec.shape[0]-1, answers_vec.shape[0], dtype=np.int32)
    y_predict = np.array([predict_answer(question_vec)[0] for question_vec in questions_vec])
    return sum(y == y_predict) / answers_vec.shape[0], y, y_predict
def get_accuracy_with_threshold(vectorizer, questions_vec, answers_vec, threshold):
    """
    输入一定的threshold,只有问题答案分值超过threshold才进行回答
    
    Input:
        threshold
    
    Return:
        correct_number
        total_answer_number
        accuracy
    """
    correct = 0
    total = 0
    for i, question_vec in enumerate(questions_vec):
        answer_idx, answer_scores = predict_answer(question_vec)
        max_answer_score = np.max(answer_scores)
        if max_answer_score < threshold:
#             print(i, max_answer_score, 'max_answer_score < threshold', correct, total)
            pass
        else:
            if answer_idx == i:
                correct += 1    
            total += 1
            print(i, np.max(answer_scores))
    return correct, total, correct/total
def predict_answer(question_vec):
    """
    Get answer choice from a single question_vec
    
    Example:
        predict_answer(vectorizer.transform(jieba.cut('你好什么是黑洞啊')))
    
    Return:
        最高分的回答,全部回答评分
    
    """
    answer_scores = np.array([question_vec.multiply(answer_vec).sum() for answer_vec in answers_vec])
    return np.argmax(answer_scores), answer_scores
def get_answer(question):
    """
    提问题得到回答
    
    Input:
        中文问题
    
    Return:
        中文回答,全部回答评分
        
    Example:
        
    """
    question = filter_sentence(question)
    question_vec = vectorizer.transform([question])
    answer_score, answer_scores = predict_answer(question_vec)
#     print(answer_score)
    answer = answers[answer_score].replace(' ', '')
    # print(1)
    return answer, answer_scores
def check_accuracy_top_k(questions_vec, answers_vec, k, threshold=None):
    """
    Input:
        questions_vec: 
        Matrix representation of questions, with each question a row.
        
        answers_vec:
        Matrix representation of questions, with each answer a row.

        k:
        Top k answers are considered correct.
    
    Return:
        Accuracy of such metric.
    """
    correct = 0
#     total = questions_vec.shape[0]
    total = 0
    for i, question_vec in enumerate(questions_vec):
        answer_scores = np.array([question_vec.multiply(answer_vec).sum() for answer_vec in answers_vec])
        if threshold:
            if max(answer_scores) < threshold:
                continue
        predict_ones = answer_scores.argsort()[-k:][::-1]
        if i in predict_ones:
            correct += 1
        else:
            print(questions[i])
            for x in predict_ones:
                print(answers[x], '<< wrong', answer_scores[x])
            print(answers[i], '<< right', answer_scores[i])
        total += 1
        
    return correct/total
def filter_sentence(sentence):
    clean = r'小米6'
    new_sentence = re.sub(clean, '小米手机', sentence)
    clean = r'小米电视4'
    new_sentence = re.sub(clean, '小米电视', new_sentence)
    clean = r'。|,|?|!|:|\n|:|\?'
    new_sentence = re.sub(clean, '', new_sentence)
    return new_sentence

In [2]:
questions = [filter_sentence(question) for question in questions]
answers = [filter_sentence(answer) for answer in answers]

import jieba
jieba.load_userdict('userdict.txt')
# questions = [' '.join(jieba.cut(question)) for question in questions]
# answers = [' '.join(jieba.cut(answer)) for answer in answers]
chinese_stopwords = '按,按照,俺,俺们,阿,别,别人,别处,别是,别的,别管,别说,不,不仅,不但,不光,不单,不只,不外乎,不如,不妨,不尽,不尽然,不得,不怕,不惟,不成,不拘,不料,不是,不比,不然,不特,不独,不管,不至于,不若,不论,不过,不问,比方,比如,比及,比,本身,本着,本地,本人,本,巴巴,巴,并,并且,非彼,彼时,彼此,便于,把,边,鄙人,罢了,被,般的,此间,此次,此时,此外,此处,此地,此,才,才能,朝,朝着,从,从此,从而,除非,除此之外,除开,除外,除了,除,诚然,诚如,出来,出于,曾,趁着,趁,处在,乘,冲,等等,等到,等,第,当着,当然,当地,当,多,多么,多少,对,对于,对待,对方,对比,得,得了,打,打从,的,的确,的话,但,但凡,但是,大家,大,地,待,都,到,叮咚,而言,而是,而已,而外,而后,而况,而且,而,尔尔,尔后,尔,二来,非独,非特,非徒,非但,否则,反过来说,反过来,反而,反之,分别,凡是,凡,个,个别,固然,故,故此,故而,果然,果真,各,各个,各位,各种,各自,关于具体地说,归齐,归,根据,管,赶,跟,过,该,给,光是,或者,或曰,或是,或则,或,何,何以,何况,何处,何时,还要,还有,还是,还,后者,很,换言之,换句话说,好,后,和,即,即令,即使,即便,即如,即或,即若,继而,继后,继之,既然,既是,既往,既,尽管如此,尽管,尽,就要,就算,就是说,就是了,就是,就,据,据此,接着,经,经过,结果,及,及其,及至,加以,加之,例如,介于,几时,几,截至,极了,简言之,竟而,紧接着,距,较之,较,进而,鉴于,基于,具体说来,兼之,借傥然,今,叫,将,可,可以,可是,可见,开始,开外,况且,靠,看,来说,来自,来着,来,两者,临,类如,论,赖以,连,连同,离,莫若,莫如,莫不然,假使,假如,假若,某,某个,某些,某某,漫说,没奈何,每当,每,慢说,冒,哪个,哪些,哪儿,哪天,哪年,哪怕,哪样,哪边,哪里,那里,那边,那般,那样,那时,那儿,那会儿,那些,那么样,那么些,那么,那个,那,乃,乃至,乃至于,宁肯,宁愿,宁可,宁,能,能否,你,你们,您,拿,难道说,内,哪,凭借,凭,旁人,譬如,譬喻,且,且不说,且说,其,其一,其中,其二,其他,其余,其它,其次,前后,前此,前者,起见,起,全部,全体,恰恰相反,岂但,却,去,若非,若果,若是,若夫,若,另,另一方面,另外,另悉,如若,如此,如果,如是,如同,如其,如何,如下,如上所述,如上,如,然则,然后,然而,任,任何,任凭,仍,仍旧,人家,人们,人,让,甚至于,甚至,甚而,甚或,甚么,甚且,什么,什么样,上,上下,虽说,虽然,虽则,虽,孰知,孰料,始而,所,所以,所在,所幸,所有,是,是以,是的,设使,设或,设若,谁,谁人,谁料,谁知,随着,随时,随后,随,顺着,顺,受到,使得,使,似的,尚且,庶几,庶乎,时候,省得,说来,首先,倘,倘使,倘或,倘然,倘若,同,同时,他,他人,他们们,她们,她,它们,它,替代,替,通过,腾,这里,这边,这般,这次,这样,这时,这就是说,这儿,这会儿,这些,这么点儿,这么样,这么些,这么,这个,这一来,这,正是,正巧,正如,正值,万一,为,为了,为什么,为何,为止,为此,为着,无论,无宁,无,我们,我,往,望,惟其,唯有,下,向着,向使,向,先不先,相对而言,许多,像,小,些,一,一些,一何,一切,一则,一方面,一旦,一来,一样,一般,一转眼,,由此可见,由此,由是,由于,由,用来,因而,因着,因此,因了,因为,因,要是,要么,要不然,要不是,要不,要,与,与其,与其说,与否,与此同时,以,以上,以为,以便,以免,以及,以故,以期,以来,以至,以至于,以致,己,已,已矣,有,有些,有关,有及,有时,有的,沿,沿着,于,于是,于是乎,云云,云尔,依照,依据,依,余外,也罢,也好,也,又及,又,抑或,犹自,犹且,用,越是,只当,只怕,只是,只有,只消,只要,只限,再,再其次,再则,再有,再者,再者说,再说,自身,自打,自己,自家,自后,自各儿,自从,自个儿,自,怎样,怎奈,怎么样,怎么办,怎么,怎,至若,至今,至于,至,纵然,纵使,纵令,纵,之,之一,之所以,之类,着呢,着,眨眼,总而言之,总的说来,总的来说,总的来看,总之,在于,在下,在,诸,诸位,诸如,咱们,咱,作为,只,最,照着,照,直到,综上所述,贼死,逐步,遵照,遵循,针对,致,者,则甚,则,咳,哇,哈,哈哈,哉,哎,哎呀,哎哟,哗,哟,哦,哩,矣哉,矣乎,矣,焉,毋宁,欤,嘿嘿,嘿,嘻,嘛,嘘,嘎登,嘎,嗳,嗯,嗬,嗡嗡,嗡,喽,喔唷,喏,喂,啷当,啪达,啦,啥,啐,啊,唉,哼唷,哼,咧,咦,咚,咋,呼哧,呸,呵呵,呵,呢,呜呼,呜,呗,呕,呃,呀,吱,吧哒,吧,吗,吓,兮,儿,亦,了,乎'
chinese_stopwords = chinese_stopwords.split(',')
# vocab = set((' '.join([answer for answer in answers] + [question for question in questions])).split())
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words=chinese_stopwords, analyzer='char')
train_corpus = questions + answers
train_corpus = [i.replace('\n','') for i in train_corpus]
vectorizer.fit([' '.join(jieba.cut(sentence)) for sentence in train_corpus])
questions_vec = vectorizer.transform(questions)
answers_vec = vectorizer.transform(answers)


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kg/s1qv_rwj7g391rxlk1lbsx6r0000gn/T/jieba.cache
Loading model cost 0.863 seconds.
Prefix dict has been built succesfully.

In [60]:
import gensim
texts = [[word for word in jieba.cut(answer) if word not in chinese_stopwords]
               for answer in answers]
dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                      id2word=dictionary, 
                                      num_topics=10,
                                      alpha='auto',
                                     eta='auto')

In [61]:
lda.print_topics(10)


Out[61]:
[(0,
  '0.039*"问题" + 0.038*"小米" + 0.027*"手机" + 0.022*"小米手机" + 0.021*"手环" + 0.019*"作答" + 0.019*"用户" + 0.018*"终端" + 0.016*"会" + 0.014*"系统升级"'),
 (1,
  '0.023*"小米手机" + 0.019*"问题" + 0.013*"终端" + 0.013*"支持" + 0.012*"用户" + 0.011*"手机" + 0.010*"盖" + 0.010*"您好" + 0.009*"模式" + 0.008*"电池"'),
 (2,
  '0.018*"手环" + 0.017*"小米" + 0.017*"同步" + 0.016*"用户" + 0.014*"问题" + 0.011*"作答" + 0.009*"数据" + 0.008*"终端" + 0.008*"非常" + 0.007*"视频"'),
 (3,
  '0.017*"用户" + 0.016*"支持" + 0.016*"尺寸" + 0.015*"问题" + 0.015*"功能" + 0.014*"小米电视" + 0.012*"终端" + 0.011*"厚度" + 0.010*"“" + 0.009*"小米"'),
 (4,
  '0.035*"小米" + 0.032*"问题" + 0.027*"手环" + 0.022*"用户" + 0.020*"版本" + 0.016*"会" + 0.015*"系统" + 0.015*"手机" + 0.014*"升级" + 0.013*"作答"'),
 (5,
  '0.039*"小米" + 0.038*"手环" + 0.018*"问题" + 0.016*"微信" + 0.015*"“" + 0.015*"终端" + 0.014*"”" + 0.014*"用户" + 0.014*"使用" + 0.012*"功能"'),
 (6,
  '0.017*"数据" + 0.015*"问题" + 0.013*"用户" + 0.013*"版本" + 0.012*"一个" + 0.012*"运动" + 0.012*"年月日" + 0.011*"作答" + 0.011*"情况" + 0.010*"("'),
 (7,
  '0.027*"小米" + 0.024*"小米电视" + 0.022*"手环" + 0.019*"主机" + 0.019*"-" + 0.017*"问题" + 0.015*"使用" + 0.015*"用户" + 0.015*"终端" + 0.015*"65"'),
 (8,
  '0.028*"问题" + 0.022*"手环" + 0.021*"小米" + 0.019*"用户" + 0.016*"小米手机" + 0.015*"作答" + 0.014*"终端" + 0.012*"会" + 0.011*"中" + 0.008*"手机"'),
 (9,
  '0.032*"问题" + 0.026*"用户" + 0.021*"手环" + 0.021*"小米" + 0.017*"终端" + 0.017*"版本" + 0.012*"会" + 0.012*"作答" + 0.010*"请" + 0.010*"小米电视"')]

In [5]:
labels = np.argmax(lda.inference(corpus)[0], axis=1)
for i,a in enumerate(raw_answers):
    if labels[i] == 1:
        print(a)


用户问题:为什么详情页中没有年月日之前的数据?
终端作答:由于年月日以前的小米运动App版本,详情页数据以每小时为单位记
录(一个小时一个柱子)。年月日以后的版本做了优化,详情页数据以每分钟
为单位记录(每分钟一个柱子),数据清楚准确,这样可以方便您更加详细的掌握您全天
的运动情况。因此详情页只展示年月日以后的数据。

用户问题:为什么小米6配置的转接头用到其他小米Type-c接口的手机上耳机不能使用?
终端作答:您好,小米6配置的接头,使用需要电路板上线路和元器件的匹配,来支持音质更好的Type-C接口降噪耳机使用,目前只有小米6电路支持。其他手机虽然从也是Type-c接口,但已经有通用的耳机接口,直接插耳机就可以听音乐了。
涉及机型:小米6

用户问题:为什么小米6有nfc“使用sim卡钱包“选项却不能使用?终端作答:您好,使用这个功能需要运营商支持,同时需要跟运营商绑定,所以只有合约机
才有可能支持这个功能,并且如果您没有nfc-SIM卡,还需要换卡。其他渠道购买的机器不
支持,给您带来的不变敬请谅解其实小米6内置的小米钱包功能也很强大,您可以了解一下。
内部消息:除了特定合约机支持这个功能,其他渠道销售机器的软件会关闭这个接口,具体版本是开发版7.4.19和稳定版8.2.15.0版本以后。请知悉,谢谢!
涉及机型:小米6