Ghalib Concordance Generator

Description

This notebook contains code to generate a concordance for the muravvaj dīvān of Ghalib.

Verses are taken from "input/verses.csv"

The current task is to identify the proper lemma for each token, e.g. the singular instead of the plural, the verb infinitive instead of the verb root, etc. This can partially be done computationally.

Lemmas that remain to be checked are in "output/tocheck.csv". If the first column of an entry is marked 'x', that entry is okay. Checked lemmas can then be entered into "input/okay.csv" using the functions in the "Update Files" section below.
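
Each row of both files has the form status,token,lemma1|lemma2|... (lemmas are pipe-separated). Two illustrative entries (hypothetical rows, shown only to indicate the format):

,naqsh,naqsh
x,rang-o-buu,rang-o-buu|rang|buu

A blank status means the entry has not yet been checked; 'x' means it has been accepted.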


In [1]:
import re
from collections import *
import csv

Variables


In [2]:
verses = {}                      # dictionary of verses, e.g. 001.01.0='naqsh faryaadii..'
tokens = {}                      # dictionary of tokens where key is verses+.xx, e.g. 001.01.0.01 = 'naqsh'
unique_tokens = Counter()        # Counter of tokens where value is their count
lemmas = defaultdict(list)       # dictionary of tokens where value is a list of their lemmas
unique_lemmas = []               # list of unique lemmas
okay_lemmas = defaultdict(list)  # dictionary of checked tokens, each with a list of accepted lemmas

Functions


In [3]:
# note: load_verses has also been moved to util.py

def load_verses(inputfile='input/verses.csv'):
    '''
    Loads verses from CSV data file
    inputfile: name of csv file
    returns: verses where verses['ggg.vv.l']=line text, with ggg=ghazal number, vv=verse number, l=line number
    '''


    verses = {}
    with open(inputfile,'r') as csvfile:
        versereader = csv.reader(csvfile)
        for row in versereader:
            if len(row)<3: print row
            (verse_id, input_string, real_scan) = row
            if not 'x' in verse_id: # only muravvaj divan for now
                verses[verse_id] = input_string.strip() 
    return verses

def get_okay_lemmas(inputfile='input/okay.csv'):
    '''
    Loads checked lemmas from CSV data file
    inputfile: name of csv file
    returns: checked_lemmas where checked_lemmas['token'] = [lemmas]
    '''

    import csv
    okay_lemmas = {}
    with open(inputfile,'r') as csvfile:
        versereader = csv.reader(csvfile)
        for row in versereader:
            if len(row)!=3:
                print row
            (status, unique_token, lemmas) = row
#            if not status in ['','x']:
#                print 'error in row,row'
        
            assert status in ['','x']
            if status=='x':
                okay_lemmas[unique_token]=lemmas.split('|')
    return okay_lemmas


def get_tokens(verses):
    '''
    Identifies tokens in verses
    verses: verses
    returns: tokens, where tokens['ggg.vv.l.tt']=token (tt = token number on the line, starting at zero)
    '''
    tokens = {}
    token_instances=defaultdict(list)
    token_instance_count = Counter()
    for k in verses.keys():
        v_tokens = verses[k].split(' ')
        for id,t in enumerate(v_tokens):

            token_id = k+'.'+str(id).zfill(2)
            tokens[token_id] = t
            token_instances[t].append(token_id)
            token_instance_count[t]+=1
    return tokens,token_instances,token_instance_count

def locate_token(token):
    '''
    Finds locations of token
    token: token string to look up
    returns: a list of locations, e.g. ['001.01.0.01']
    '''
    assert tokens
    return [k  for k,v in tokens.iteritems() if v==token]

def match_tokens(match_string):
    '''
    Finds tokens matching a pattern (from start)
    match_string: regular expression string, matched from the start of the token (e.g. 'naq')
    returns: a list of matching tokens, e.g. ['naqsh']
    '''
    assert unique_tokens
    return [k  for k in unique_tokens.keys() if re.match(match_string,k)]

def search_tokens(match_string):
    '''
    Searches for tokens matching a pattern (anywhere in it)
    match_string: regular expression string (e.g. 'naqsh')
    returns: a list of matching tokens, e.g. ['naqsh']
    '''
    assert unique_tokens
    return [k  for k in unique_tokens.keys() if re.search(match_string,k)]

def get_unique_tokens(tokens):
    '''
    Finds unique tokens
    tokens: a dictionary of tokens at locations, e.g. tokens['001.01.0.00']='naqsh'
    returns: a Counter of unique tokens and their counts, e.g. unique_tokens['token']=count
    '''
    unique = Counter()
#    print type(tokens)
    for k,t in tokens.iteritems():
        unique[t]+=1
    return unique


def get_lemmas(unique_tokens):
    '''
    Generate lemmas of tokens
    unique_tokens: dictionary of unique tokens
    returns: lemmas[original_token]=['lemma1','lemma2']
    '''
    lemmas = {}

    
    for t in unique_tokens.keys():
        lemma = t
        if re.search("-e$",t):
            lemma = t[:-2] # remove izaafat ending '-e'
        if re.search("[-']haa$",t): 
            lemma = t[:-4] # remove Persian plural ['-]haa ending
#            print lemma
        t_lemmas = [lemma]
        if re.search('-o-',lemma):
            nouns = lemma.split('-o-')
            t_lemmas = t_lemmas + nouns
            
        lemmas[t]=t_lemmas
    return lemmas
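
# Illustration of the heuristics above on hypothetical tokens:
#   'naqsh-e'    -> ['naqsh']                       izaafat ending '-e' stripped
#   "gul'haa"    -> ['gul']                         Persian plural "'haa" stripped
#   'rang-o-buu' -> ['rang-o-buu', 'rang', 'buu']   '-o-' compound kept and also split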

def get_unique_lemmas(lemmas):
    '''
    Generates unique lemma forms
    lemmas: dictionary keyed by tokens containing lists of lemma, e.g. lemmas['rang-o-buu']=['rang','buu','rang-o-buu']
    returns: a set of unique lemma strings
    '''
    unique_lemmas = set()
    for t,t_lemmas in lemmas.iteritems():
        for lemma in t_lemmas:
            unique_lemmas.add(lemma)
    return unique_lemmas


def to_check():
    '''
    Generates list of unique tokens that still need to be checked.
    '''
    return [t for t in sorted(unique_tokens.keys()) if not t in okay_lemmas]

def print_stats():
    print "Currently there are ",len(okay_lemmas)," out of ",len(lemmas)

Set Variables


In [4]:
lemmas


Out[4]:
defaultdict(<type 'list'>, {})

In [5]:
verses = load_verses()
tokens,token_instances,token_instance_count = get_tokens(verses)
unique_tokens = get_unique_tokens(tokens)

lemmas = get_lemmas(unique_tokens)
unique_lemmas = get_unique_lemmas(lemmas)
okay_lemmas = get_okay_lemmas()

okay_tokens_not_in_lemmas = [ok for ok in okay_lemmas if not ok in lemmas]

if len(okay_tokens_not_in_lemmas) > 0:
    print 'the following tokens are marked as okay but are not any longer'
    print okay_tokens_not_in_lemmas

Update Files


In [6]:
def update_to_check():
    '''
    Writes unique tokens not contained in okay_lemmas to output/tocheck.csv
    '''
    with open('output/tocheck.csv','w') as f:
        for t in sorted(unique_tokens.keys()):
            if not t in okay_lemmas: # only add unchecked ones
                line  = "," # good or bad
                line += t+"," #token
                line += '|'.join(lemmas[t]) # possible lemma of token
                line += "\n" 
                f.write(line)

def update_okay(inputfile='output/tocheck.csv'):
    '''
    Loads lemmas noted as correct from inputfile into okay_lemmas
    '''
    lemmas_to_add = get_okay_lemmas(inputfile=inputfile)
    for k,v in lemmas_to_add.iteritems():
        if k in okay_lemmas:
            print "WARNING: ",k," found in okay_lemmas. Will override."
        okay_lemmas[k] = v
    
def write_okay(outputfile='input/okay.csv'):
    '''
    Writes okay_lemmas to outputfile, as status,token,lemma1|lemma2|lemma3
    '''
    with open(outputfile,'w') as f:
        for t in sorted(okay_lemmas.keys()):
            line  = "x," # good or bad
            line += t+"," #token
            line += '|'.join(okay_lemmas[t])
            line += "\n" 
            f.write(line)

def update_files():
    '''
    Loads lemmas noted as correct from tocheck.csv, 
    Writes okay_lemmas as input/okay.csv
    Regenerates output/tocheck.csv
    '''
    update_okay() 
    write_okay()
    update_to_check()
    print_stats()

In [7]:
update_files()


Currently there are  4485  out of  4486

Concordance Details

Generates "output/conc_details.csv" which lists lemmas and their sources.


In [8]:
lemmas_out = defaultdict(set)


for k,v in okay_lemmas.iteritems(): # k = word; v = lemmas
    for l in v:
        lemmas_out[l].add(k)

with open('output/conc_details.csv','w') as f:
    for k,v in sorted(lemmas_out.iteritems()):
        f.write(k+','+'|'.join(v)+'\n')
        
#okay_lemmas.keys()[0:100]
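
Each row of conc_details.csv lists a lemma followed by the pipe-separated surface tokens it was derived from; an illustrative row (hypothetical entry):

rang,rang|rang-o-buu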

Lemma Instances

Generates "output/statistics/lemma-counts.csv", a list of lemmas sorted by their number of instances (descending).


In [9]:
def instances_of_lemma(lemma):
    i=0
    for x in lemmas_out[lemma]:
        i+= token_instance_count[x]
    return i

lemma_instance_count = {lemma: instances_of_lemma(lemma) for lemma in lemmas_out.keys()}
#instances_of_lemma for
#zz=sorted(lemmas_out.keys(),key=instances_of_lemma)#sort_by_instances)#size_of_lemma_by_instances)
#for z in zz: print z, instances_of_lemma[zz])
with open("output/statistics/lemma-counts.csv","w") as f:
    for x in sorted(lemma_instance_count, key=lemma_instance_count.get,reverse=True):
        f.write(x+','+str(lemma_instance_count[x])+'\n')
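
A lemma's instance count is the sum of the occurrence counts of all surface tokens that map to it. With hypothetical counts, if 'rang' occurs 40 times and 'rang-o-buu' occurs 3 times, and both map to the lemma 'rang', then instances_of_lemma('rang') returns 43.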

Izafats

I am not sure yet how we will wind up using these. Probably they will be handled based on a token location range, similarly to compound verbs, etc. There may be some combinations I am not grabbing properly. These will need to be lemmatized later (e.g. to handle nasalization).


In [10]:
izafat_verse_ids = [v_id for v_id in sorted(verses.keys()) if re.search('-e ',verses[v_id])]
izafat_verses = [verses[v_id] for v_id in izafat_verse_ids]

In [11]:
izafat_re = re.compile('(?:[^ ]+-e )+(?:z )?[^ ]+')
izafats=Counter()
for s in izafat_verses:
    x = izafat_re.findall(s)#re.findall(m,s)
    for y in x:
        izafats[y]+=1
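
The pattern grabs one or more consecutive words ending in '-e ', optionally followed by a standalone 'z', plus the final word, so a chained izafat is captured as a single phrase. A quick sanity check on a made-up line (the input below is purely illustrative):

import re
izafat_re = re.compile('(?:[^ ]+-e )+(?:z )?[^ ]+')
print(izafat_re.findall('jauhar-e aa))iinah kii baat'))  # -> ['jauhar-e aa))iinah']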

In [12]:
with open('output/izafats.csv','w') as f:
    f.write('\n'.join(sorted(izafats.keys())))

Here is also a version of the tokens in which each izafat phrase is treated as a single token.


In [13]:
iast=Counter() # izafats as tokens, along with tokens
iast_re = re.compile('(?:[^ ]+-e )+(?:z )?[^ ]+|[^ ]+')
for i,s in verses.iteritems():
    words = iast_re.findall(s)
    for t in words:
        iast[t]+=1

Statistics

Word frequencies, etc.


In [14]:
def make_csv_of_token_freq(d, filename):
    '''
    Writes a Counter of tokens to a CSV file, sorted by count in descending order
    d: Counter of tokens and their counts (token: count)
    filename: output file name
    '''
    with open(filename,'w') as f:
        for k,v in d.most_common():
            f.write(k+','+str(v)+'\n')

In [15]:
make_csv_of_token_freq(izafats, 'output/statistics/izafat-freq.csv')
make_csv_of_token_freq(unique_tokens, 'output/statistics/uniquetokens-freq.csv')
make_csv_of_token_freq(iast, 'output/statistics/izafatastokens-freq.csv')

In [16]:
type(izafats)


Out[16]:
collections.Counter

In [17]:
lemma_counts_beta=Counter()

for token, count in unique_tokens.iteritems():
    if token in okay_lemmas:
        lemma = okay_lemmas[token][0]
    else:
        lemma = token
    lemma_counts_beta[lemma]+=count
lemma_counts_beta
make_csv_of_token_freq(lemma_counts_beta,'output/statistics/lemmas-beta-freq.csv')

In [18]:
# the following will generate the urdu versions of the statistics (a little slow)

In [19]:
import generate_urdu

In [20]:
#redo here
reload(generate_urdu)#generate_urdu.write_all_urdu_statistics()


Out[20]:
<module 'generate_urdu' from 'generate_urdu.pyc'>

Quick and Dirty Output

This generates some quick output for proofing as .md; this is a bit sloppy.


In [69]:
with open('output/lemmas-by-size.txt','w') as f:
    for x in sorted(lemma_instance_count, key=lemma_instance_count.get,reverse=True):
        words=lemmas_out[x]
        words = sorted(words,key=token_instance_count.get, reverse=True)
        f.write(x+' '+str(lemma_instance_count[x])+'\n')
        for w in words:
            f.write("  - "+w+' '+str(token_instance_count[w])+'\n')

In [101]:
import codecs
import sys
sys.path.append('./graphparser/')
import graphparser
urdup = graphparser.GraphParser('./graphparser/settings/urdu.yaml')
nagarip = graphparser.GraphParser('./graphparser/settings/devanagari.yaml')
urdudiacriticsp = graphparser.GraphParser('./graphparser/settings/urdu-diacritics.yaml')
for x in [urdup, urdudiacriticsp,nagarip]: print x.parse(' jur))at ma))aal').output


 جرأت مآل
 جُرْأت مَآل
 जुरअत मआल

In [102]:
def gen_hiur_lemmas_by_size():
    import codecs
    import sys
    sys.path.append('./graphparser/')
    import graphparser
    urdup = graphparser.GraphParser('./graphparser/settings/urdu.yaml')
    nagarip = graphparser.GraphParser('./graphparser/settings/devanagari.yaml')
    def out_hiur(w):
        return urdup.parse(w).output+' '+nagarip.parse(w).output+' '+w
    with codecs.open('output/lemmas-by-size-hiur.md','w','utf-8') as f:
        for x in sorted(lemma_instance_count, key=lemma_instance_count.get,reverse=True):
            words=lemmas_out[x]
            words = sorted(words,key=token_instance_count.get, reverse=True)

            f.write(out_hiur(x)+' '+str(lemma_instance_count[x])+'\n')
            for w in words:
                f.write("  - "+out_hiur(w)+' '+str(token_instance_count[w])+'\n')
                
def out_hiur(w):
    return urdup.parse(w).output+' '+nagarip.parse(w).output+' '+w

def out_hiur_csv(w):
    return urdup.parse(w).output+','+nagarip.parse(w).output+','+w
def html_out(w):
    return td(urdup.parse(w).output)+td(nagarip.parse(w).output)+td(w)

def td(x):
    return '<td>'+x+'</td>'

def li(x):
    return ('<li>'+x+'</li>')
def md_link(s,urdu=True):
    out =  " ["+s+"]"
    out += "("+'http://www.columbia.edu/itc/mealac/pritchett/00ghalib/'
    out += s[0:3]+'/'+s[0:3]+"_"+s[4:6]+".html"
    if urdu==True:
        out+="?urdu"
    out += ") "#
    return out

def get_url(s,urdu=False):
    url='http://www.columbia.edu/itc/mealac/pritchett/00ghalib/'+s[0:3]+'/'+str(int(s[0:3]))+"_"+s[4:6]+".html"
    if urdu:
        url+='?urdu'
    return url

def a_link(s,urdu=False):
    url=get_url(s,urdu)
    out = '<a href="'+url+'">'+s+'</a>'
    return out

def gen_hiur_lemmas_by_size_hiur(file_name, with_verses=False, truncate=True,truncate_limit=50):

    with codecs.open(file_name,'w','utf-8') as f:
        for x in sorted(lemma_instance_count, key=lemma_instance_count.get,reverse=True):
            words=lemmas_out[x]
            words = sorted(words,key=token_instance_count.get, reverse=True)
            
            f.write(' '+out_hiur(x)+' '+str(lemma_instance_count[x])+'\n')

            for w in words:
                f.write("  - ")
                f.write("  - "+out_hiur(w)+' '+str(token_instance_count[w])+'\n')
                vi = set(x[:-5] for x in token_instances[w]) # eg001.01 from 001.01.01.0

                if with_verses:
                    if not truncate or len(vi) < truncate_limit:
    #                print list(vi)[0]
                        f.write("    - ")# nested indent
                        f.write(', '.join([md_link(v) for v in vi]))
                        f.write('\n')

def gen_hiur_lemmas_by_size_ul(file_name='output/hiur-lemmas-by-size-ul.html'):
    with codecs.open(file_name,'w','utf-8') as f:
        f.write('<!DOCTYPE html>\n')
        f.write('<html lang="en-US">\n')
        f.write('<head><meta charset="utf-8"></head>\n')
        f.write('<body>\n')


        f.write('<table>\n')
        for x in sorted(lemma_instance_count, key=lemma_instance_count.get,reverse=True):
            words=lemmas_out[x]
            words = sorted(words,key=token_instance_count.get, reverse=True)
            f.write('<p><b>'+out_hiur(x)+' '+str(lemma_instance_count[x])+'</b></p>\n')
            f.write('<ul>\n')
            for w in words:
                f.write('<li>'+out_hiur(w)+' '+str(token_instance_count[w])+'</li>\n')
            f.write("</ul>")

        f.write("</body></html>")

def gen_hiur_lemmas(filename='output/hiur-lemmas.html'):
    with codecs.open(filename,'w','utf-8') as f:
        f.write('<!DOCTYPE html>\n')
        f.write('<html lang="en-US">\n')
        f.write('<head><meta charset="utf-8"></head>\n')
        f.write("<body><table>")
        for l,tkns in sorted(lemmas_out.iteritems()):
            locs=[]
            for t in tkns:
                locs += [v[0:6] for v,t_x in tokens.iteritems() if t_x ==t]
            locs=sorted(list(set(sorted(locs))))
            hyperlocs = [a_link(loc,urdu=False) for loc in locs]
            f.write('<tr>'+td(l)+td(urdup.parse(l).output)+td(nagarip.parse(l).output)+td(', '.join(hyperlocs))+'</tr>\n')
#                    print l,urdup.parse(l).output,locs
        f.write("</table></body></html>")
a_link('101.01')


Out[102]:
'<a href="http://www.columbia.edu/itc/mealac/pritchett/00ghalib/101/101_01.html">101.01</a>'

In [103]:
gen_hiur_lemmas()    
#gen_hiur_lemmas_by_size()
#gen_hiur_lemmas_by_size_with_verses()
#gen_hiur_lemmas_by_size_hiur('output/lemmas-by-size-w-verses-all-hiur.md', with_verses=True, truncate=False)#True,truncate_limit=50):
#gen_hiur_lemmas_by_size_hiur('output/lemmas-by-size-countsonly.md', with_verses=False)#True,truncate_limit=50):
gen_hiur_lemmas_by_size_ul()

In [104]:
def gen_hiur_lemmas_by_size_md(file_name='output/hiur-lemmas-by-size.md'):
    with codecs.open(file_name,'w','utf-8') as f:
        f.write('# Lemmas and Tokens (Sorted by Number of Occurrences)\n\n')
        for x in sorted(lemma_instance_count, key=lemma_instance_count.get,reverse=True):
            words=lemmas_out[x]
            words = sorted(words,key=token_instance_count.get, reverse=True)
            f.write('\n'+out_hiur(x)+' '+str(lemma_instance_count[x])+'\n')
            for w in words:
                f.write('* '+out_hiur(w)+' '+str(token_instance_count[w])+'\n')
                
gen_hiur_lemmas_by_size_md()

Experimenting with a lemma version of the text (for topic modeling)


In [106]:
def gen_documents(file_name = 'output/lemma_documents.txt'):
    with codecs.open(file_name,'w','utf-8') as f:
        it = iter(sorted(verses))
        for x in it:
            v_id0,v_id1 = x,next(it)
            #print v_id0,v_id1
            lemmastring = ''
            for v in [v_id0,v_id1]:
            #print v
                vtkns = [t for t in tokens if t.startswith(v)]
                for t in vtkns:
                    l = okay_lemmas[tokens[t]]
                    if len(l)>1:
                        while '-o-' in l[0]:
                            l=l[1:]
                    lemmas_out = ' '.join(l)
                    lemmastring+=' '+lemmas_out

            f.write(lemmastring+'\n')
gen_documents()


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-106-b0bd6ebd87a1> in <module>()
     18 
     19             f.write(lemmastring+'\n')
---> 20 gen_documents()

<ipython-input-106-b0bd6ebd87a1> in gen_documents(file_name)
     10                 vtkns = [t for t in tokens if t.startswith(v)]
     11                 for t in vtkns:
---> 12                     l = okay_lemmas[tokens[t]]
     13                     if len(l)>1:
     14                         while '-o-' in l[0]:

KeyError: 'nayastaa;n'
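
The KeyError above arises because the token 'nayastaa;n' has not yet been entered into okay_lemmas (which is a plain dict at this point, not a defaultdict). A minimal fix sketch, assuming an unchecked token should simply pass through as its own lemma, would replace the lookup with:

l = okay_lemmas.get(tokens[t], [tokens[t]])  # fall back to the raw token if it has no checked lemma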

In [76]:
import PyICU
locale=PyICU.Locale('ur')
urducol = PyICU.Collator.createInstance(locale)

In [77]:
import graphparser
reload(graphparser)

urdup = graphparser.GraphParser('./graphparser/settings/urdu-diacritics.yaml')
urdudiacriticsp = graphparser.GraphParser('./graphparser/settings/urdu-diacritics.yaml')
#nagarip = graphparser.GraphParser('./graphparser/settings/devanagari.yaml')
lemmas_diacritics = {urdudiacriticsp.parse(x).output:x for x in lemmas_out }

urdu_lemmas = [urdudiacriticsp.parse(x).output for x in lemmas_out]
urdu_lemmas_sorted = sorted(urdu_lemmas,urducol.compare)

In [78]:
urducol.compare('ب','ا')


Out[78]:
1

In [79]:
#for x in sorted(set([urdup.parse(t).output for t in tokens.values() if t.endswith('-e')] ), col.compare): print x

In [80]:
import codecs
def gen_concordance(filename='output/concordance-urdu.html'):

    with codecs.open(filename,'w','utf-8') as f:
        f.write('<!DOCTYPE html>\n')
        f.write('<html lang="ur-PK">\n')
        f.write('<head><meta charset="utf-8"></head>\n')
        f.write("<body><table>")
        for l_d in urdu_lemmas_sorted:
            tkns = lemmas_out[lemmas_diacritics[l_d]]
            
#        for l,tkns in sorted(lemmas_out.iteritems(),urducol.compare):
            locs=[]
            for t in tkns:
                locs += [v[0:6] for v,t_x in tokens.iteritems() if t_x ==t]
            locs=sorted(list(set(sorted(locs))))
            hyperlocs = [a_link(loc,urdu=True) for loc in locs]
            f.write('<tr>'+td(l_d)+td(', '.join(hyperlocs))+'</tr>\n')
#                    print l,urdup.parse(l).output,locs
        f.write("</table></body></html>")
gen_concordance()

In [81]:
lemmas_diacritics = {urdudiacriticsp.parse(x).output:x for x in lemmas_out }

In [82]:
#sorted(lemmas_diacritics.keys(),urducol.compare)

In [83]:
def tex_link(s,urdu=False):
    url=get_url(s,urdu)
    out = '\href{'+url+'}{'+s+'}'
    return out


out=''
xetex_header=r'''
%!TEX TS-program = xelatex
%!TEX encoding = UTF-8 Unicode

\documentclass[12pt]{article}
\usepackage{geometry}                % See geometry.pdf to learn the layout options. There are lots.

\geometry{letterpaper}                   % ... or a4paper or a5paper or ... 

%\geometry{landscape}                % Activate for rotated page geometry
%\usepackage[parfill]{parskip}    % Activate to begin paragraphs with an empty line rather than an indent
%\usepackage{graphicx}
%\usepackage{amssymb}
\usepackage{hyperref}
\usepackage{longtable}
\usepackage{fontspec}
\newfontfamily\ur[Script=Arabic,Scale=1.4]{Jameel Noori Nastaleeq}
\setromanfont[Mapping=tex-text]{Hoefler Text}
\setsansfont[Scale=MatchLowercase,Mapping=tex-text]{Gill Sans}
\setmonofont[Scale=MatchLowercase]{Andale Mono}
\usepackage{tabularx}
%\title{Brief Article}
%\author{The Author}
%\date{}                                           % Activate to display a given date or no date
\usepackage{bidi}
%\usepackage{bidipoem}

\usepackage{ltablex}

\begin{document}

\begin{tabularx}{\linewidth}{@{}cX@{}}   
'''
out=xetex_header
for l_d in urdu_lemmas_sorted:
    tkns = lemmas_out[lemmas_diacritics[l_d]]
    locs=[]
    for t in tkns:
        locs += [v[0:6] for v,t_x in tokens.iteritems() if t_x ==t]
    locs=sorted(list(set(sorted(locs))))
    hyperlocs_s = '\n'.join([tex_link(loc,urdu=True) for loc in locs])
    
    out+='{\\ur %s} & %s \\\\\n'%(l_d,hyperlocs_s)
    
out+=r'''

\end{tabularx}   


\end{document}

'''

In [84]:
with codecs.open('output/tex/conc-urdu.tex','w','utf8') as f:
    f.write(out)

In [85]:
def tex_link(s,urdu=False):
    url=get_url(s,urdu)
    out = '\href{'+url+'}{'+s+'}'
    return out


out=''
xetex_header=r'''
%!TEX TS-program = xelatex
%!TEX encoding = UTF-8 Unicode

\documentclass[12pt]{article}
\usepackage[margin=.5in]{geometry}                % See geometry.pdf to learn the layout options. There are lots.



\geometry{letterpaper}                   % ... or a4paper or a5paper or ... 

\geometry{landscape}                % Activate for rotated page geometry
%\usepackage[parfill]{parskip}    % Activate to begin paragraphs with an empty line rather than an indent
%\usepackage{graphicx}
%\usepackage{amssymb}
\usepackage{hyperref}
\usepackage{longtable}
\usepackage{fontspec}
\newfontfamily\ur[Script=Arabic,Scale=1.4]{Jameel Noori Nastaleeq}
\setromanfont[Mapping=tex-text]{Hoefler Text}
\setsansfont[Scale=MatchLowercase,Mapping=tex-text]{Gill Sans}
\setmonofont[Scale=MatchLowercase]{Andale Mono}
\usepackage{tabularx}
%\title{Brief Article}
%\author{The Author}
%\date{}                                           % Activate to display a given date or no date
\usepackage{multicol}
\usepackage{bidi}

%\usepackage{bidipoem}

\usepackage{ltablex}

\begin{document}

\begin{RTL}

\raggedleft
 
\begin{multicols}{5}
'''

out=xetex_header
for l_d in urdu_lemmas_sorted:
    tkns = lemmas_out[lemmas_diacritics[l_d]]
    locs=[]
    for t in tkns:
        locs += [v[0:6] for v,t_x in tokens.iteritems() if t_x ==t]
    instances = len(locs)
    locs=sorted(list(set(sorted(locs))))
    hyperlocs_s = '\n'.join([tex_link(loc,urdu=True) for loc in locs])
    
    out+='{\\ur %s}   '%l_d# %s \\\\\n'%(l_d,hyperlocs_s)
    out+='\\textsuperscript{'+str(instances+1)+'} '
    out+=lemmas_diacritics[l_d]+'\n\n'
out+=r'''

 

\end{multicols}
\end{RTL}
\end{document}

'''

with codecs.open('output/tex/lemmas.tex','w','utf8') as f:
    f.write(out)


In [58]:
parser = urdudiacriticsp.parse
t_i = token_instances['parvaanah']
verse_indexes = sorted(set([ x[:7] for x in t_i]))
def highlight(s):
    return '<b>'+s+'</b>'
out=''
for v_i in verse_indexes:

    out +="<p>"+a_link(v_i[:6])+"</p>"
    first_tkns = sorted([x for x in tokens.keys() if (x.startswith(v_i) and x[-4]=='0')])# and x.endswith('.0')]
    second_tkns = sorted([x for x in tokens.keys() if x.startswith(v_i) and x[-4]=='1'])
    out += '<p>'
    for t in first_tkns:
        s = tokens[t]
        if parser:
            s = parser(s).output
        if t in t_i:
            s=highlight(s)
        out+=s+' '
    out+='<br/>'
    for t in second_tkns:
        s = tokens[t]
        if parser:
            s = parser(s).output


        if t in t_i:
            s=highlight(s)
        out+=s+' '
        
    
#    print out

from IPython.core.display import HTML

HTML(out)


Out[58]:

045.05

جاں دَر ہَوائے یَک نِگَۂ گَرْم ہے اَسَد
پَرْوانَہ ہے وَکِیل تِرے داد خْواہ کا

075.04

غَم اُس کو حَسْرَتِ پَرْوانَہ کا ہے اَی شُعْلے
تِرے لَرَزْنے سے ظاہِر ہے نا تَوانیِ شَمْعَ

081.03

با وُجُودِ یَک جَہاں ہَنْگامَہ پِیدائی نَہِیں
ہِیں چِراغانِ شَبِسْتانِ دِلِ پَرْوانَہ ہَم

166.03

پَرِ پَرْوانَہ شایَد بادْبانِ کِشْتیِ مے تھا
ہُوئی مَجْلِس کی گَرْمی سے رَوانی دورِ ساغَر کی


In [ ]: