In [1]:
import pandas as pd
Sebelumnya, corpus yang sudah terkumpul dipisahkan secara manual ke dua file CSV terpisah, `hoax.csv` dan `facts.csv`. Kali ini dicoba untuk membaca semua corpus secara terpadu dari satu file CSV yang memuat dua kelompok dokumen, yaitu hoax dan facts.
In [4]:
# Load the whole corpus from a single CSV instead of reading hoax.csv on its
# own (the earlier approach). The rows are split by their Label value later.
corpus = pd.read_csv('corpus.csv')
corpus.head()
Out[4]:
In [96]:
# Row filtering: indexing with a boolean mask keeps only the rows for which
# the condition holds (here: the Label column equals the wanted class).
# Column slicing: .iloc[rows, cols] is positional, so [:, :2] keeps every
# remaining row and only the first two columns.
corpus_hoax = corpus.loc[corpus["Label"] == "hoax"].iloc[:, :2]
corpus_facts = corpus.loc[corpus["Label"] == "fact"].iloc[:, :2]
corpus_hoax.head()
Out[96]:
In [97]:
# (row count, column count) of the subset labelled 'hoax'.
corpus_hoax.shape
Out[97]:
In [98]:
# (row count, column count) of the subset labelled 'fact'.
corpus_facts.shape
Out[98]:
In [8]:
# Print the index label of every row in the corpus.
# Iterating corpus.index yields the same labels that iterrows() returns as the
# first tuple element, without building a throwaway Series for each row.
for row_label in corpus.index:
    print(row_label)
In [29]:
# Read only the "corpus" column from faktahalf.csv; skipinitialspace=True
# makes the parser ignore spaces that directly follow a delimiter.
headers = ["corpus"]
corp_h = pd.read_csv('faktahalf.csv', skipinitialspace=True, usecols = headers)
corp_h.head()
Out[29]:
In [31]:
# Number of entries in the "corpus" column.
# The parenthesised form is valid both as a Python 2 print statement and as a
# Python 3 print() call; the bare `print len(...)` form is a SyntaxError on 3.
print(len(corp_h.corpus))
In [32]:
# Write every entry of the "corpus" column to halffakta.txt, each followed by
# a newline.
# The context manager closes the file even when a write raises, which the
# manual open()/close() pair did not guarantee.
# Iterating the series directly visits the entries in row order and does not
# rely on the index being the default 0..n-1 labels.
with open("halffakta.txt", 'w') as halfcorpus:
    for document in corp_h.corpus:
        halfcorpus.write(document)
        halfcorpus.write("\n")
In [ ]:
In [ ]:
In [ ]: