Cleaning up names of publishers to match standard forms.


In [ ]:
from collections import Counter
import re

In [ ]:
nullifications = [
    # Regular expression that are simply deleted before normalizing.
    # So "scribner's sons" becomes "scribners sons"
    r"\.",
    ",",
    "'",
    r"\[",
    r"\]",
    r"\?",
    " and co(mpany)?"
]
spacifications = [
    # Regular expressions that are replaced with a space
    "and co ",
    "and company"
]
regex_replacements = [
    ("  +"," "),
    (" ?& ?"," and ")
]

I'm doing some real native python for the actual normalization. The PublisherNormalizer is a subclass of the broader "Normalizer" object, which could work on placenames, titles, and so forth as well.

I've bundled some unit tests below that demonstrate what sort of changes the normalization code is supposed to make.


In [ ]:
class Normalizer(object):
    """
    Normalizes a string while maintaining a lookup dictionary
    """
    def __init__(self):
        self.cache = dict()
    def norm(self,string):
        try:
            return self.cache[string]
        except KeyError:
            return self.normalize(string)
        
class PublisherNormalizer(Normalizer):
    def normalize(self,string):        
        string = string.lower()
        string = re.sub(r"|".join(nullifications),"",string)
        string = re.sub(r"|".join(spacifications)," ",string)
        for (a,b) in regex_replacements:
            string = re.sub(a,b,string)
        string = string.strip(" ").rstrip(" ")
        return string
    
    
import unittest
class TestNormalization(unittest.TestCase):
    def testOne(self):
        normer = PublisherNormalizer()
        self.failUnless(normer.norm("foo.bar,?")=="foobar")
        self.failUnless(normer.norm("FOOBAR")=="foobar")
        self.failUnless(normer.norm("[Scribner's Sons and company  ]?")=="scribners sons")

suite = unittest.TestLoader().loadTestsFromTestCase(TestNormalization)
unittest.TextTestRunner(verbosity=0).run(suite)


======================================================================
FAIL: testOne (__main__.TestNormalization)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-91-f7ebcfd2ab4e>", line 31, in testOne
    self.failUnless(normer.norm("[Scribner's Sons and compay  ]?")=="scribners sons")
AssertionError: False is not true

----------------------------------------------------------------------
Ran 1 test in 0.000s

FAILED (failures=1)
Out[ ]:
<unittest.runner.TextTestResult run=1 errors=0 failures=1>

In [ ]:


In [ ]:
counts = Counter()

terms = []
i = 0
for line in open("publishers.txt"):
    i += 1
    (filename,publisher) = line.rstrip("\n").split("\t")
    terms.append(publisher)
    if i > 3000000:
        break

In [ ]:
normer = PublisherNormalizer()
for term in terms:
    counts[normer.norm(term)] += 1


311545	
47460	us gpo
43195	sn
38529	us gpo :
38098	us govt print off
23325	govt print off
20210	macmillan
17607	gpo
14785	longmans green
13503	harper and brothers
12891	c scribners sons
12587	houghton mifflin
12243	d appleton
11215	the macmillan company
9675	the society
9069	j murray
8708	us dept of the interior bureau of mines
8350	u s govt print off
7777	little brown
7471	us dept of commerce bureau of the census :
6506	division of the federal register the national archives
6414	houghton mifflin company
5902	the bureau :
5771	clarendon press
5424	the service
5314	the office
5308	g p putnams sons
4892	h holt
4828	gp putnams sons
4693	dodd mead
4620	university press
4532	us dept of agriculture
4053	harper
4038	the century co
3909	bg teubner

In [ ]:
for (k,v) in counts.most_common(55):
    print "{}\t{}".format(v,k)


311545	
47460	us gpo
43195	sn
38529	us gpo :
38098	us govt print off
23325	govt print off
20210	macmillan
17607	gpo
14785	longmans green
13503	harper and brothers
12891	c scribners sons
12587	houghton mifflin
12243	d appleton
11215	the macmillan company
9675	the society
9069	j murray
8708	us dept of the interior bureau of mines
8350	u s govt print off
7777	little brown
7471	us dept of commerce bureau of the census :
6506	division of the federal register the national archives
6414	houghton mifflin company
5902	the bureau :
5771	clarendon press
5424	the service
5314	the office
5308	g p putnams sons
4892	h holt
4828	gp putnams sons
4693	dodd mead
4620	university press
4532	us dept of agriculture
4053	harper
4038	the century co
3909	bg teubner
3773	a and c black
3763	g fischer
3703	chapman and hall
3632	r bentley
3612	g reimer
3612	us dept of agriculture :
3609	ja barth
3524	j springer
3442	mcgraw-hill
3440	the university
3422	the association
3381	the commission
3363	b g teubner
3216	f alcan
3215	w engelmann
3176	ginn
3105	scribner
3065	the survey
2994	the bureau
2990	the board