In [1]:
import pandas as pd
Sebelumnya, corpus yang sudah terkumpul dipisahkan secara manual ke dalam dua file CSV terpisah, yaitu hoax.csv dan facts.csv. Kali ini seluruh corpus dibaca secara terpadu dari satu file CSV (corpus.csv) yang memuat kedua kelompok dokumen, hoax dan facts.
In [4]:
# Load the combined corpus from a single CSV file (this replaces the earlier
# approach of reading 'hoax.csv' on its own) and preview the first rows.
corpus = pd.read_csv("corpus.csv")
corpus.head(5)
Out[4]:
In [96]:
# Split the combined corpus into two frames by the value of the Label column.
#   - A boolean mask (column == value) keeps only the rows that match.
#   - .iloc[:, :2] then keeps every matching row but only the first two columns.
corpus_hoax = corpus.loc[corpus['Label'] == 'hoax'].iloc[:, :2]
corpus_facts = corpus.loc[corpus['Label'] == 'fact'].iloc[:, :2]

# Preview the hoax subset.
corpus_hoax.head(5)
Out[96]:
In [97]:
# Show the (rows, columns) dimensions of the hoax subset.
corpus_hoax.shape
Out[97]:
In [98]:
# Show the (rows, columns) dimensions of the facts subset.
corpus_facts.shape
Out[98]:
In [8]:
# Print the index label of every row in the corpus.
# Iterating corpus.index yields the same labels that iterrows() returns as the
# first element of each (label, row) tuple, without building a throwaway
# Series for every row.
for row_label in corpus.index:
    print(row_label)
In [29]:
# Read only the "corpus" column from faktahalf.csv.
# skipinitialspace=True makes the parser ignore spaces that follow a delimiter.
headers = ["corpus"]
corp_h = pd.read_csv(
    'faktahalf.csv',
    usecols=headers,
    skipinitialspace=True,
)

# Preview the first rows of the loaded column.
corp_h.head(5)
Out[29]:
In [31]:
# Print the number of entries in the "corpus" column.
# The parenthesised call form prints the same value on Python 2 and is the
# only valid form on Python 3 (the bare print statement is a SyntaxError there).
print(len(corp_h.corpus))
In [32]:
# Dump every entry of the "corpus" column to halffakta.txt, one entry per line.
# The with-statement guarantees the file is closed even if a write raises
# (for example when an entry is not a string, such as a missing value).
with open("halffakta.txt", 'w') as halfcorpus:
    # Iterate over the values directly instead of looking them up by integer
    # label; the manual `i += 1` had no effect inside the range() loop.
    for document in corp_h.corpus:
        halfcorpus.write(document)
        halfcorpus.write("\n")
In [ ]:
In [ ]:
In [ ]: