This is the Doc2Vec attempt, which turned out to be unsuccessful.


In [1]:
import numpy as np

# Parse the raw file: a line starting with <h2> is a question, and the lines
# that follow it form the answer. A blank line means the next line is skipped.
file = open('十万个为什么raw.txt', 'r')
answers = []
answer = []
questions = []
flag = 0  # set to 1 to skip the line right after a blank line
for line in file.readlines()[1:]:
    if flag == 1:
        flag = 0
    elif line.startswith('<h2>'):
        questions.append(line)
        if answer:              # a new question starts: flush the previous answer
            answers.append(answer)
        answer = []
    elif len(line) == 1:        # a blank line (just '\n')
        flag = 1
    else:
        answer.append(line)
file.close()
answers.append(answer)          # flush the final answer
answers = [''.join(ans) for ans in answers]

# Segment the Chinese text with jieba so documents can later be split on spaces.
import jieba
questions = [' '.join(jieba.cut(question)) for question in questions]
answers = [' '.join(jieba.cut(answer)) for answer in answers]


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kg/s1qv_rwj7g391rxlk1lbsx6r0000gn/T/jieba.cache
Loading model cost 1.033 seconds.
Prefix dict has been built succesfully.
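
Before training, it is worth confirming that the parser produced one answer per question. A quick sanity check (a sketch, not part of the original run):

assert len(questions) == len(answers), (len(questions), len(answers))
print(len(questions), 'question/answer pairs')
print(questions[0])        # first tokenized question (it still carries the <h2> markup)
print(answers[0][:100])    # start of the matching tokenized answer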

In [2]:
import gensim
# Give every document a unique tag: answers take 0..len(answers)-1 and
# question tags are offset so the two corpora do not collide.
answers_corpus = [gensim.models.doc2vec.TaggedDocument(answer.split(), [i]) for i, answer in enumerate(answers)]
questions_corpus = [gensim.models.doc2vec.TaggedDocument(question.split(), [i + len(answers)]) for i, question in enumerate(questions)]
train_corpus = answers_corpus + questions_corpus

In [3]:
# model = gensim.models.Doc2Vec(documents=answers_corpus,
#                               vector_size=3000,
#                               max_vocab_size=None,
#                               epochs=100,
#                               )
model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=3, epochs=55)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)


Out[3]:
7670488
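
A common way to gauge how well the model fits its training set (a sketch following gensim's standard self-similarity assessment; it assumes the unique tags above and the gensim 4.x API):

import collections

# Re-infer each training document and find where its own tag ranks among
# the trained vectors; a well-fit model ranks most documents first.
ranks = []
for doc in train_corpus:
    inferred = model.infer_vector(doc.words)
    sims = model.dv.most_similar([inferred], topn=len(model.dv))
    ranks.append([tag for tag, sim in sims].index(doc.tags[0]))
print(collections.Counter(ranks)[0] / len(ranks))   # fraction of self-hits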

In [4]:
# Infer a vector for every question and answer with the trained model, so
# both sides of the retrieval go through the same infer_vector path.
questions_vec = np.asarray([model.infer_vector(question.split()) for question in questions])
answers_vec = np.asarray([model.infer_vector(answer.split()) for answer in answers])

In [5]:
from sklearn.neighbors import KNeighborsClassifier
X = answers_vec
y = np.arange(X.shape[0])  # each answer is labelled with its own index
# Use a separate name so the trained Doc2Vec model is not shadowed.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
Xtest = questions_vec
ytest = y
# np.argmax(np.matmul(Xtest[7], X.T))
knn.score(Xtest, ytest)


Out[5]:
0.0013175230566534915
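
A score of 0.0013, about one hit in 759 questions, is chance level. One possible confounder is that KNeighborsClassifier uses Euclidean distance by default, while document vectors are usually compared by cosine similarity. A minimal sketch of the same retrieval with cosine similarity (not in the original notebook):

from sklearn.metrics.pairwise import cosine_similarity

# For each question, pick the answer whose vector has the highest cosine similarity.
sims = cosine_similarity(Xtest, X)    # shape (n_questions, n_answers)
pred = np.argmax(sims, axis=1)
print(np.mean(pred == ytest))         # top-1 retrieval accuracy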

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.decomposition import PCA

In [7]:
pca = PCA(n_components=2)
pca.fit(np.vstack([X, Xtest]))  # fit on both sets stacked; X + Xtest would sum them element-wise
Xp = pca.transform(X)
Xtestp = pca.transform(Xtest)
plt.scatter(Xp[:, 0], Xp[:, 1], color='red', s=3, alpha=0.5)
plt.scatter(Xtestp[:, 0], Xtestp[:, 1], color='blue', s=3, alpha=0.5)
plt.show()



In [8]:
# Crude rescaling: the question vectors have a different overall magnitude
# than the answer vectors, so scale them by the ratio of the element sums.
multi = np.sum(X) / np.sum(Xtest)
Xtest = Xtest * multi
knn.score(Xtest, ytest)


Out[8]:
0.010540184453227932
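
Rescaling lifts the score roughly eightfold but is a blunt instrument, since it only fixes the overall magnitude. A more principled variant (a sketch, not from the original run) is to L2-normalize every vector, which makes Euclidean nearest neighbours equivalent to cosine nearest neighbours:

from sklearn.preprocessing import normalize

Xn = normalize(answers_vec)         # unit-length answer vectors
Xtestn = normalize(questions_vec)   # unit-length question vectors
knn_n = KNeighborsClassifier(n_neighbors=1).fit(Xn, y)
print(knn_n.score(Xtestn, ytest))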

In [9]:
pca = PCA(n_components=2)
pca.fit(np.vstack([X, Xtest]))  # again fit on the stacked sets, not their element-wise sum
Xp = pca.transform(X)
Xtestp = pca.transform(Xtest)
plt.scatter(Xp[:, 0], Xp[:, 1], color='red', s=3, alpha=0.5)
plt.scatter(Xtestp[:, 0], Xtestp[:, 1], color='blue', s=3, alpha=0.5)
plt.show()
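
If the PCA plots still show the question and answer vectors in separate clusters, one further variant worth trying (a sketch that assumes the unique-tag scheme above and the gensim 4.x API) is to use the vectors learned during training for the answers, which tend to be steadier than one-shot infer_vector calls, and to give the questions more inference epochs:

from sklearn.metrics.pairwise import cosine_similarity

# Answers were tagged 0..len(answers)-1, so their trained vectors live in model.dv.
X_trained = np.asarray([model.dv[i] for i in range(len(answers))])
# Extra inference passes usually stabilize the inferred question vectors.
Q = np.asarray([model.infer_vector(q.split(), epochs=200) for q in questions])
sims = cosine_similarity(Q, X_trained)
print(np.mean(np.argmax(sims, axis=1) == np.arange(len(questions))))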



In [ ]: