In [1]:
import pandas as pd
In [95]:
import sys,json
# Make the sibling graphparser directory importable from this notebook.
sys.path.append('../graphparser')
import graphparser as gp
# Re-import the module so edits made to graphparser on disk are picked up
# without restarting the kernel (`reload` is a builtin on Python 2 only).
reload(gp)
from networkx.readwrite import json_graph
# Settings files handed to graphparser, one per output form.
urdu_data_file = '../graphparser/settings/urdu.yaml'
devanagari_data_file = '../graphparser/settings/devanagari.yaml'
diacritics_data_file = '../graphparser/settings/diacritics.yaml'
meter_data_file = '../graphparser/settings/urdu-meter.yaml'

# One GraphParser per settings file; each is later applied to every token.
urdu_parser = gp.GraphParser(urdu_data_file)
diacritics_parser = gp.GraphParser(diacritics_data_file)
devanagari_parser = gp.GraphParser(devanagari_data_file)
meter_parser = gp.GraphParser(meter_data_file)

# Load the verse table. The file is read without a header row and without
# promoting any column to the index. `DataFrame.from_csv` was deprecated in
# pandas 0.21 and removed in 1.0, so `read_csv` is used with the same options.
divan = pd.read_csv(
    '../ghalib-concordance/input/verses.csv',
    header=None,
    index_col=False,
)
divan.columns = ['number', 'verse', 'meter']

# Split every verse into its space-separated tokens (a Series of lists).
# NOTE(review): splitting on a single ' ' yields empty-string tokens when a
# verse contains consecutive spaces - confirm whether that is intended.
tokens = divan.verse.map(lambda x: x.strip().split(' '))
from collections import Counter
def flatten(lst):
    """Yield the leaf elements of an arbitrarily nested iterable, in order.

    Lists and tuples (including their subclasses) found inside ``lst`` are
    descended into recursively; every other element is yielded unchanged.
    Strings are treated as leaves and are never split into characters.

    Args:
        lst: An iterable whose elements may themselves be lists or tuples.

    Yields:
        Each non-list, non-tuple element, depth-first and left to right.
    """
    for elem in lst:
        # isinstance (rather than an exact type() match) also descends into
        # list/tuple subclasses instead of yielding them as opaque leaves.
        if isinstance(elem, (tuple, list)):
            for i in flatten(elem):
                yield i
        else:
            yield elem
# Count how often each distinct token occurs across all verses. Counter
# consumes the generator directly, so no intermediate list is needed.
c = Counter(flatten(tokens.values))

# One row per distinct token. The frame is built from explicit
# (token, count) pairs with named columns, which does not rely on
# `dict.items()` returning a list and also works when there are no tokens.
tokens = pd.DataFrame(list(c.items()), columns=['plain_roman', 'count'])

# Add one output column per parser, in the same order as before.
for out_column, token_parser in [
    ('diacritics', diacritics_parser),
    ('urdu', urdu_parser),
    ('devanagari', devanagari_parser),
    ('meter', meter_parser),
]:
    tokens[out_column] = tokens.plain_roman.map(
        lambda x: token_parser.parse(x).output
    )

# NOTE(review): the CSV is written *before* the columns are reordered, so
# tokens.csv keeps the creation order (plain_roman, count, diacritics, urdu,
# devanagari, meter) while the in-memory frame below uses the display order.
# Confirm whether the file was meant to use the display order as well.
tokens.to_csv('tokens.csv', encoding='utf-8', index=False)
tokens = tokens[['plain_roman', 'diacritics', 'devanagari', 'urdu', 'count', 'meter']]
In [96]:
# Bare expression: shows the reordered token table as this cell's output.
tokens
Out[96]:
In [ ]: