This code uses a dump of the Persian Wikipedia to compute statistics about the distribution and frequency of Persian letters. It was written by www.Jadi.net and is published under the GPLv3 licence.
First, this part reads the Wikipedia dump and outputs only the text of the articles. It uses a random number generator to keep only about 10% of the articles.
In [5]:
%matplotlib inline
In [1]:
import xml.etree.ElementTree as etree
import re
import random
# Stream the Persian Wikipedia XML dump and print the wiki text of roughly
# 10% of the articles, chosen at random.
inFile = '/home/jadi/w/wikipedia/fawiki-20150807-pages-articles.xml'
random.seed()
counter = 0  # NOTE(review): never read in this cell; kept in case other code uses it.

# Only 'end' events are requested: ElementTree leaves an element's ``text``
# undefined at its 'start' event, and 'start-ns'/'end-ns' events do not carry
# an Element at all (which is why clear() used to need a try/except).
for event, elem in etree.iterparse(inFile, events=('end',)):
    if elem.tag.endswith('/}text'):
        thisTxt = elem.text
        # Draw once per article text, so the sample really is ~10% of the
        # articles instead of ~10% of the parser events.
        if thisTxt and random.random() >= 0.9:
            # NOTE(review): the slice starts at index 1, so the first
            # character of every article is dropped, and at most 9999
            # characters are emitted -- confirm this is intended.
            print(thisTxt[1:10000])
    # The element is complete and has been consumed; drop its content so the
    # in-memory tree does not keep the text of the whole dump.
    elem.clear()
In [2]:
import re
# Strip wiki/HTML markup from the extracted article text and normalise
# Arabic code points to their Persian equivalents, then print a preview.
import io

inputText = '/home/jadi/w/wikipedia/wiki_fa.txt'
# Decode to unicode while reading so that every substitution below works on
# characters rather than on UTF-8 bytes, and close the file when done.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with io.open(inputText, 'r', encoding='utf-8') as f:
    alltext = f.read()

text = alltext
# Join everything into one line so '.' in the patterns below can span what
# used to be line breaks.
text = re.sub(r"\n", " ", text)
# Collapse runs of brackets/braces ("[[", "{{{", ...) to a single character
# so the non-greedy patterns below see one opener and one closer.
text = re.sub(r"\[+", "[", text)
text = re.sub(r"\]+", "]", text)
text = re.sub(r"\{+", "{", text)
text = re.sub(r"\}+", "}", text)
# Replace {...}, <...> and [...] spans with a space.
text = re.sub(r"{.*?}", " ", text)
text = re.sub(r"<.*?>", " ", text)
text = re.sub(r"\[.*?\]", " ", text)
# Squeeze whitespace runs into a single space.
text = re.sub(r"\s+", " ", text)
# changing some arabic chars to correct persian ones
text = text.replace(u"ي", u"ی")
text = text.replace(u"ك", u"ک")
# Preview only. NOTE(review): the slice starts at index 1, so the very first
# character is not shown -- confirm this is intended.
print(text[1:10000])
In [3]:
# Load the cleaned, Persian-only text as unicode into ``alltext``.
import io

# io.open decodes UTF-8 while reading (raising on invalid bytes, as the
# explicit .decode("utf-8") did) and the ``with`` block closes the file.
with io.open('/home/jadi/w/wikipedia/wiki_only_farsi_chars.txt', 'r',
             encoding='utf-8') as f:
    alltext = f.read()
In [14]:
# Count every character of ``alltext`` and plot, for each Persian letter,
# its share (in percent) of all non-space characters.
from collections import Counter

import numpy as np
from matplotlib import pyplot as plt

# Counter does the per-character tally in one pass and yields 0 for
# characters that never occur instead of raising KeyError.
allchars = Counter(alltext)
# The number of spaces is used as the word count.
allwordsnum = allchars[' ']
# Denominator: every character except spaces (newlines are still included).
totalcharsnum = len(alltext) - allchars[' ']
# Drop whitespace entries; pop() tolerates text without a space or newline.
allchars.pop(' ', None)
allchars.pop('\n', None)

letters = u'آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی'
lettervals = []
letterlist = []  # NOTE(review): never filled or read in this cell.
for letter in letters:
    print(u'%s %s' % (letter, allchars[letter]))
    # *1.0 forces float division under Python 2.
    lettervals.append(allchars[letter]*1.0/totalcharsnum*100)

width = 1/1.5
plt.figure(figsize=(20,10))
plt.bar(range(len(lettervals)), lettervals, width)
# Shift tick positions by 0.3 so each label sits under its bar.
plt.xticks([x+0.3 for x in range(len(lettervals))], list(letters), fontsize=18)
plt.title(u'Percentage of persian letters\n10% of wikipedia articles are tested', fontsize=34)
plt.ylabel('Percent', fontsize=20)
plt.xlabel('Letter', fontsize=20)
plt.show()
In [15]:
# For each Persian letter, plot the percentage of words in ``alltext`` that
# start with it.
from collections import Counter

import numpy as np
from matplotlib import pyplot as plt

allwords = alltext.split()
# split() never yields empty strings, so word[0] is always valid.
# Counter yields 0 for letters that never start a word.
initialchars = Counter(word[0] for word in allwords)
# Use the real number of words as the denominator rather than a space count
# computed elsewhere.
totalwords = len(allwords)

letters = u'آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی'
lettervals = []
letterlist = []  # NOTE(review): never filled or read in this cell.
print('Initial letters')
for letter in letters:
    # Print the initial-letter count that the header announces.
    print(u'%s %s' % (letter, initialchars[letter]))
    # *1.0 forces float division under Python 2.
    lettervals.append(initialchars[letter]*1.0/totalwords*100)

width = 1/1.5
plt.figure(figsize=(20,10))
plt.bar(range(len(lettervals)), lettervals, width)
# Shift tick positions by 0.3 so each label sits under its bar.
plt.xticks([x+0.3 for x in range(len(lettervals))], list(letters), fontsize=18)
plt.title(u'Percentage of persian initial letters\n10% of wikipedia articles are tested', fontsize=34)
plt.ylabel('Percent', fontsize=20)
plt.xlabel('Initial Letter', fontsize=20)
plt.show()
In [16]:
# For each Persian letter, plot the percentage of words in ``alltext`` that
# end with it.
from collections import Counter

import numpy as np
from matplotlib import pyplot as plt

allwords = alltext.split()
# split() never yields empty strings, so word[-1] is always valid.
# Counter yields 0 for letters that never end a word.
finalchars = Counter(word[-1] for word in allwords)
# Use the real number of words as the denominator rather than a space count
# computed elsewhere.
totalwords = len(allwords)

letters = u'آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی'
lettervals = []
letterlist = []  # NOTE(review): never filled or read in this cell.
print('Final letters')
for letter in letters:
    # Print the final-letter count that the header announces.
    print(u'%s %s' % (letter, finalchars[letter]))
    # *1.0 forces float division under Python 2.
    lettervals.append(finalchars[letter]*1.0/totalwords*100)

width = 1/1.5
plt.figure(figsize=(20,10))
plt.bar(range(len(lettervals)), lettervals, width)
# Shift tick positions by 0.3 so each label sits under its bar.
plt.xticks([x+0.3 for x in range(len(lettervals))], list(letters), fontsize=18)
plt.title(u'Percentage of persian final letters\n10% of wikipedia articles are tested', fontsize=34)
plt.ylabel('Percent', fontsize=20)
plt.xlabel('Final Letter', fontsize=20)
plt.show()