In [5]:
import glob
import codecs
In [6]:
# Collect every file under ../data/training-hindi whose name ends in "utf8",
# then put the list into sorted order.
hindi_filenames = glob.glob("../data/training-hindi/*utf8")
hindi_filenames.sort()
In [9]:
# Open the combined output file for writing (an existing file is truncated).
# NOTE(review): no encoding is passed, so text is written through the plain
# file object without an explicit codec -- confirm this matches the ".utf8"
# name before changing it.
output_file = codecs.open(filename='../data/training_hindi_NER.utf8', mode='w')
In [10]:
def _write_ner_rows(lines, out):
    """Convert the lines of one annotated file into ``token<TAB>tag`` rows.

    Everything up to and including the first blank line is skipped. After
    that, each line is handled according to its shape:

    * lines wrapped in ``<...>``, lines ending in ``))`` and lines starting
      with ``0`` are ignored;
    * a four-field line sets the tag for the next token: the text after the
      first ``=`` in its last field, with the final character dropped;
    * a two-field line is a token: its second field is written, followed by
      the pending tag, or ``O`` when no tag is pending;
    * any other line (including a blank one) is written as an empty line.

    NOTE(review): the original indentation of this loop was lost, so the
    nesting is a reconstruction -- in particular that the header skip
    discards every line before the first blank one. Confirm against the data.

    Args:
        lines: iterable of raw lines, each normally ending in a newline.
        out: object with a ``write`` method that receives the rows.
    """
    pending_tag = None
    in_header = True
    for raw in lines:
        # Strip only the line terminator so the shape checks below cannot
        # index past the end of a short or unterminated final line, and so
        # CRLF-terminated lines are classified like LF-terminated ones.
        body = raw.rstrip("\r\n")

        if in_header:
            if body == "":
                in_header = False
            continue

        if body.startswith("<") and body.endswith(">"):
            continue
        if body.endswith("))") or body.startswith("0"):
            continue

        fields = body.split()
        if len(fields) == 2:
            tag = "O" if pending_tag is None else pending_tag
            out.write(fields[1] + "\t" + tag + "\n")
            # A tag applies to a single token only.
            pending_tag = None
        elif len(fields) == 4:
            try:
                pending_tag = fields[-1].split("=")[1][:-1]
            except IndexError:
                # No "=" in the last field: report the line and leave the
                # next token untagged instead of reusing a stale tag.
                pending_tag = None
                print(" ".join(fields))
        else:
            out.write("\n")


for file_name in hindi_filenames:
    # The with-block closes each input file as soon as it is converted.
    with codecs.open(file_name, 'r') as source:
        _write_ner_rows(source, output_file)