In [5]:
import glob
import codecs

In [6]:
hindi_filenames = sorted(glob.glob("../data/training-hindi/*utf8"))

In [9]:
output_file = codecs.open('../data/training_hindi_NER.utf8','w')

In [10]:
for file_name in hindi_filenames:
    file = codecs.open(file_name,'r')
    status = False
    flag = 0
    for line in file:
        if flag==0:
            if line == "\n":
                flag=1
            continue
        if line[0]==u'<' and line[-2]==u'>':
            pass
        elif len(line)>2 and line[-2]==u')' and line[-3]==u')':
            pass
        elif line[0]==u'0':
            pass
        else:
            line = line.strip().split()
            if len(line) == 2:
                output_file.write(line[1])
                if status:
                    output_file.write("\t" + entity)
                    status = False
                else:
                    output_file.write("\tO")
                output_file.write("\n")
            elif len(line) == 4:
                status = True
                try:
                    entity = line[-1].split("=")[1][:-1]
                except:
                    print " ".join(line)
            else:
                output_file.write("\n")


/home/divesh_pandey/anaconda2/envs/keras_tensorflow/lib/python2.7/site-packages/ipykernel/__main__.py:12: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
10 शनैः - शनैः