In [69]:
from gensim import models
# Read each class corpus in full. The context managers guarantee the file
# handles are closed as soon as the text has been read.
# NOTE(review): absolute, machine-specific paths — consider a configurable
# data directory so the notebook runs elsewhere.
with open('D:/tugas-akhir/fnc-id/corpus/crossval/fakta_stopstemset3.txt') as fakta_file:
    file_f = fakta_file.read()
with open('D:/tugas-akhir/fnc-id/corpus/crossval/hoax_stopstemset3.txt') as hoax_file:
    file_h = hoax_file.read()

# Whitespace-tokenise each corpus into one flat list of words.
list_f = file_f.split()
list_h = file_h.split()

# One tagged document per class: the whole "fakta" corpus and the whole
# "hoax" corpus each become a single Doc2Vec training document.
# TaggedDocument is the supported name for what LabeledSentence aliased.
sentence = models.doc2vec.TaggedDocument(words=list_f, tags=["SENT_fakta"])
sentence1 = models.doc2vec.TaggedDocument(words=list_h, tags=["SENT_hoax"])
sentences = [sentence, sentence1]

# NOTE(review): len() of a tagged document presumably counts its fields
# (words, tags) rather than its words, so this is likely 2 per document and
# not a true token total — confirm before relying on the value.
token_count = sum(len(doc) for doc in sentences)
class LabeledLineSentence(object):
    """Iterable that turns each line of a text file into a tagged document.

    Every iteration re-opens ``filename``, so the object can be consumed
    more than once.

    Parameters
    ----------
    filename : str
        Path of the text file to read, one document per line.

    Yields
    ------
    models.doc2vec.TaggedDocument
        ``words`` is the whitespace-split line; ``tags`` is the single-item
        list ``['SENT_<n>']`` where ``<n>`` is the zero-based line index.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # Use the path stored on the instance (not a module-level name) and
        # close the handle once the file has been fully iterated.
        with open(self.filename) as handle:
            for uid, line in enumerate(handle):
                yield models.doc2vec.TaggedDocument(
                    words=line.split(), tags=['SENT_%s' % uid])
# dm=0 selects the PV-DBOW training algorithm; min_count=1 keeps every word.
# alpha and min_alpha start equal, so the learning rate is constant within
# each train() call and is only lowered by the manual step in the loop below.
model = models.Doc2Vec(dm=0, alpha=.025, min_alpha=.025, min_count=1)
model.build_vocab(sentences)

# NOTE(review): calling train() repeatedly while hand-editing alpha is an
# older tutorial pattern; a single train() call with alpha/min_alpha set on
# the model is presumably the intended modern equivalent — confirm against
# the installed gensim version before changing the schedule.
for epoch in range(10):
    # total_examples is the number of documents in the corpus, which
    # build_vocab() recorded as model.corpus_count.
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

# Persist the trained model and read it back from disk.
model.save("my_model.doc2vec")
model_loaded = models.Doc2Vec.load('my_model.doc2vec')
#print(model.docvecs.most_similar(["SENT_hoax"])[0][1])
#print(model.docvecs.most_similar(["SENT_fakta"]))
#print(model_loaded.docvecs.most_similar(["SENT_hoax"]))
# Show the learned vector for the hoax document.
print(model.docvecs["SENT_hoax"])
#word_vec = model['pesan']
#model.docvecs.most_similar([word_vec])
#print("fakta")
#print(model.docvecs["SENT_fakta"])
In [70]:
# List the document tags most similar to the hoax document vector,
# together with their similarity scores.
print(model.docvecs.most_similar(positive=["SENT_hoax"]))
In [71]:
import numpy as np
# Stack the two document vectors as rows: row 0 is hoax, row 1 is fakta.
new_mat = np.vstack([model.docvecs[tag] for tag in ("SENT_hoax", "SENT_fakta")])
# Display the matrix dimensions as the cell output.
new_mat.shape
Out[71]:
In [72]:
from sklearn.preprocessing import StandardScaler
# Standardise every column of new_mat to zero mean and unit variance.
# NOTE(review): new_mat has only two rows, so each column presumably
# collapses to +1/-1 (or 0 when both rows agree) — confirm that this is
# the intended input for the PCA step.
x_new = StandardScaler().fit(new_mat).transform(new_mat)
In [73]:
from sklearn.decomposition import PCA
# Fit a two-component PCA on the standardised matrix; fit() returns the
# estimator itself, so `pca` is the fitted object.
pca = PCA(n_components=2).fit(x_new)
# Display the fitted estimator as the cell output.
pca
Out[73]:
In [74]:
# Project the standardised vectors onto the two principal components.
new_pca = pca.transform(x_new)
print("original shape: ", new_mat.shape)
print("transformed shape:", new_pca.shape)

# Collect the plotting coordinates of the first two rows:
# x holds their first component, y their second component.
x = np.array([new_pca[0, 0], new_pca[1, 0]])
y = np.array([new_pca[0, 1], new_pca[1, 1]])

# Display the projected matrix as the cell output.
new_pca
Out[74]:
In [75]:
import matplotlib.pyplot as plt
# NOTE(review): N is not used anywhere in this cell; kept in case a later
# cell reads it.
N = 5
area = np.pi*15  # marker size passed to scatter()

# Plot the projected points held in x and y. A named colour is used
# instead of a bare (0, 0, 0) tuple, which scatter() can mistake for a
# sequence of values to colour-map rather than a single RGB colour.
fig, ax = plt.subplots()
ax.scatter(x, y, s=area, c="black", alpha=0.5)
ax.set_title('Set 3 : Stopword Removal + Stemming')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()
In [ ]: