In [1]:
import sys
sys.path.append('./graphparser/')
import graphparser
urdup = graphparser.GraphParser('./graphparser/settings/urdu.yaml')
nagarip = graphparser.GraphParser('./graphparser/settings/devanagari.yaml')
In [2]:
import csv
def unicode_csv_reader(unicode_csv_data, **kwargs):
# csv.py doesn't do Unicode; encode temporarily as UTF-8:
csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
**kwargs)
for row in csv_reader:
# decode UTF-8 back to Unicode, cell by cell:
yield [unicode(cell, 'utf-8') for cell in row]
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')
import csv, codecs, cStringIO
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds )
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
In [3]:
def write_urdu_statistics(inputfile,outputfile,nagari=False,headers=True):
with open(outputfile,'w') as output_stream:
csvwriter = UnicodeWriter(output_stream)
if headers==True:
fieldnames = ['urdu', 'transliteration','count']
if nagari==True:
fieldnames=['urdu', 'nagari', 'transliteration','count']
csvwriter.writerow(fieldnames) # add headers
with open(inputfile,'r') as input_stream:
csvreader = unicode_csv_reader(input_stream) # this is likely not utf-8
for row in csvreader:
(token,freq)=row
if nagari==True:
row = urdup.parse(token).output,nagarip.parse(token).output,token,freq
else:
row = urdup.parse(token).output, token, freq
csvwriter.writerow(row)
In [4]:
def write_all_urdu_statistics():
# write_urdu_statistics('output/statistics/izafat-freq.csv','output/statistics') # will contain transliteration for now
write_urdu_statistics('output/statistics/lemmas-beta-freq.csv','output/statistics/lemmas-beta-freq-ur.csv')
write_urdu_statistics('output/statistics/uniquetokens-freq.csv','output/statistics/uniquetokens-freq-ur.csv')
write_urdu_statistics('output/statistics/izafatastokens-freq.csv','output/statistics/izafatastokens-freq-ur.csv')
write_urdu_statistics('output/statistics/izafatastokens-freq.csv','output/statistics/izafatastokens-freq-hiur.csv',nagari=True)
write_urdu_statistics('output/statistics/lemma-counts.csv', 'output/statistics/lemma-counts-ur.csv')
In [6]:
write_all_urdu_statistics()
In [ ]: