In [ ]:
from collections import Counter
import re
In [ ]:
nullifications = [
    # Regular expressions whose matches are simply deleted during normalization,
    # so "scribner's sons" becomes "scribners sons".
    r"\.",
    ",",
    "'",
    r"\[",
    r"\]",
    r"\?",
    " and co(mpany)?"
]
spacifications = [
    # Regular expressions that are replaced with a space.
    "and co ",
    "and company"
]
regex_replacements = [
    # (pattern, replacement) pairs, applied in order.
    (" +", " "),
    (" ?& ?", " and ")
]
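As a quick sanity check (illustrative only, not part of the pipeline itself), here is what those three tables do when applied by hand to the example from the comment above:
In [ ]:
# Walk the example string through the three tables by hand.
example = "[Scribner's Sons and company]".lower()
example = re.sub("|".join(nullifications), "", example)
example = re.sub("|".join(spacifications), " ", example)
for (a, b) in regex_replacements:
    example = re.sub(a, b, example)
print example.strip(" ")  # scribners sons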
I'm doing the actual normalization in plain native Python. The PublisherNormalizer is a subclass of a broader Normalizer class, which could just as well work on placenames, titles, and so forth.
I've bundled in some unit tests below that demonstrate the sort of changes the normalization code is supposed to make.
In [ ]:
class Normalizer(object):
    """
    Normalizes a string while maintaining a lookup dictionary,
    so each distinct raw string is only normalized once.
    """
    def __init__(self):
        self.cache = dict()

    def norm(self, string):
        try:
            return self.cache[string]
        except KeyError:
            # Store the result so repeat lookups skip the regex work.
            result = self.normalize(string)
            self.cache[string] = result
            return result

class PublisherNormalizer(Normalizer):
    def normalize(self, string):
        string = string.lower()
        # Delete punctuation and phrases that carry no information.
        string = re.sub("|".join(nullifications), "", string)
        # Replace connective phrases with a bare space.
        string = re.sub("|".join(spacifications), " ", string)
        for (a, b) in regex_replacements:
            string = re.sub(a, b, string)
        return string.strip(" ")
import unittest

class TestNormalization(unittest.TestCase):
    def testOne(self):
        normer = PublisherNormalizer()
        self.assertEqual(normer.norm("foo.bar,?"), "foobar")
        self.assertEqual(normer.norm("FOOBAR"), "foobar")
        self.assertEqual(normer.norm("[Scribner's Sons and company ]?"), "scribners sons")

suite = unittest.TestLoader().loadTestsFromTestCase(TestNormalization)
unittest.TextTestRunner(verbosity=0).run(suite)
Out[ ]:
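One thing the tests above don't exercise: because norm() stores each result in self.cache, a second call with the same raw string is a plain dictionary lookup rather than another pass through the regexes. A small illustration (the publisher string here is just an example):
In [ ]:
normer = PublisherNormalizer()
normer.norm("Charles Scribner's Sons")  # runs the regexes, fills the cache
normer.norm("Charles Scribner's Sons")  # answered straight from the cache
print normer.cache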
In [ ]:
counts = Counter()
terms = []
with open("publishers.txt") as f:
    for i, line in enumerate(f):
        (filename, publisher) = line.rstrip("\n").split("\t")
        terms.append(publisher)
        # Stop after roughly three million lines.
        if i >= 3000000:
            break
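One caveat with the loop above: any line that doesn't contain exactly one tab makes the two-variable unpacking raise a ValueError and abort the whole read. A minimal defensive variant, assuming the same filename-tab-publisher layout, might skip malformed lines instead:
In [ ]:
# Sketch: same read loop, but tolerate malformed lines rather than crashing.
terms = []
with open("publishers.txt") as f:
    for i, line in enumerate(f):
        fields = line.rstrip("\n").split("\t")
        if len(fields) != 2:
            continue  # not a filename<TAB>publisher line; skip it
        terms.append(fields[1])
        if i >= 3000000:
            break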
In [ ]:
normer = PublisherNormalizer()
for term in terms:
    counts[normer.norm(term)] += 1
In [ ]:
for (k, v) in counts.most_common(55):
    print "{}\t{}".format(v, k)