CamAuthorsCorpus Generator

For each author in the CamAuthors collection, take the titles of that author's publications and generate a representative corpus for that author.


In [4]:
import pymongo
from pymongo import MongoClient
from nltk.corpus import stopwords
import string
# Translation table for str.translate(): each listed punctuation code point
# is mapped to a single space so words on either side stay separated.
punct_filter = dict.fromkeys(map(ord, '"#$%&\'()*+,./-:;<=>?@[\\]^_`{|}'), u' ')

# English stop-word list from the NLTK corpus.
stop = stopwords.words('english')

# Connection settings for the local MongoDB instance.
mongo_url = 'mongodb://localhost:27017/'
db = 'CamSim'
coll = 'CamAuthors'

# `ca` is the handle to the CamAuthors collection used by the cells below.
client = MongoClient(mongo_url)
ca = client[db][coll]

In [10]:
def _build_corpus(pubs, stop_words, table):
    """Return the cleaned titles of *pubs* concatenated into one string.

    Each title is lower-cased, has its punctuation mapped through *table*
    (a ``str.translate`` table) and loses every token found in *stop_words*.
    The surviving tokens of a title are joined with single spaces and
    followed by one trailing space, so consecutive titles stay separated.
    """
    pieces = []
    for pub in pubs:
        # A publication with no (or an empty) title contributes no tokens
        # instead of aborting the whole run.
        title = (pub.get('title') or u'').lower().translate(table)
        kept = [tok for tok in title.split() if tok not in stop_words]
        pieces.append(u' '.join(kept) + u' ')
    # Join once rather than growing the string with repeated `+=`.
    return u''.join(pieces)


# Set membership is O(1); scanning the stop-word list for every token is not.
stop_words = frozenset(stop)

# Only `pubs` (plus the implicit `_id`) is needed to rebuild a corpus, so
# avoid transferring the rest of each document.
cursor = ca.find({}, {'pubs': 1})
ind = 0
for rec in cursor:
    # Authors without a `pubs` array get an empty corpus.
    corp = _build_corpus(rec.get('pubs') or [], stop_words, punct_filter)
    # Progress line: `ind` records have been written before this one.
    if ind % 100 == 0:
        print('updated ' + str(ind) + ' records')
    ind += 1
    # Match on the unique `_id`: filtering on `name` would send every
    # same-named author's corpus to whichever document matches first.
    ca.update_one({'_id': rec['_id']},
                  {'$set': {'corpus': corp}})
# Total number of author records processed.
print(ind)


updated 0 records
updated 100 records
updated 200 records
updated 300 records
updated 400 records
updated 500 records
updated 600 records
updated 700 records
updated 800 records
updated 900 records
updated 1000 records
updated 1100 records
updated 1200 records
updated 1300 records
updated 1400 records
updated 1500 records
updated 1600 records
updated 1700 records
updated 1800 records
updated 1900 records
updated 2000 records
updated 2100 records
updated 2200 records
updated 2300 records
updated 2400 records
updated 2500 records
updated 2600 records
updated 2700 records
updated 2800 records
updated 2900 records
updated 3000 records
updated 3100 records
updated 3200 records
updated 3300 records
updated 3400 records
updated 3500 records
updated 3600 records
updated 3700 records
updated 3800 records
updated 3900 records
updated 4000 records
updated 4100 records
updated 4200 records
updated 4300 records
updated 4400 records
updated 4500 records
updated 4600 records
updated 4700 records
updated 4800 records
updated 4900 records
updated 5000 records
updated 5100 records
updated 5200 records
updated 5300 records
updated 5400 records
updated 5500 records
updated 5600 records
updated 5700 records
updated 5800 records
updated 5900 records
updated 6000 records
updated 6100 records
updated 6200 records
updated 6300 records
updated 6400 records
updated 6500 records
updated 6600 records
updated 6700 records
updated 6800 records
updated 6900 records
updated 7000 records
updated 7100 records
updated 7200 records
updated 7300 records
updated 7400 records
updated 7500 records
updated 7600 records
updated 7700 records
updated 7800 records
updated 7900 records
updated 8000 records
updated 8100 records
updated 8200 records
updated 8300 records
updated 8400 records
updated 8500 records
updated 8600 records
updated 8700 records
updated 8800 records
updated 8900 records
updated 9000 records
updated 9100 records
updated 9200 records
updated 9300 records
updated 9400 records
updated 9500 records
updated 9600 records
updated 9700 records
updated 9800 records
updated 9900 records
updated 10000 records
updated 10100 records
updated 10200 records
updated 10300 records
updated 10400 records
updated 10500 records
updated 10600 records
updated 10700 records
updated 10800 records
updated 10900 records
updated 11000 records
updated 11100 records
updated 11200 records
updated 11300 records
updated 11400 records
updated 11500 records
updated 11600 records
updated 11700 records
updated 11800 records
updated 11900 records
updated 12000 records
updated 12100 records
updated 12200 records
updated 12300 records
updated 12400 records
updated 12500 records
updated 12600 records
updated 12700 records
updated 12800 records
updated 12900 records
updated 13000 records
updated 13100 records
updated 13200 records
updated 13300 records
updated 13400 records
updated 13500 records
updated 13600 records
updated 13700 records
updated 13800 records
updated 13900 records
updated 14000 records
updated 14100 records
updated 14200 records
updated 14300 records
updated 14400 records
updated 14500 records
updated 14600 records
updated 14700 records
updated 14800 records
updated 14900 records
updated 15000 records
updated 15100 records
updated 15200 records
updated 15300 records
updated 15400 records
updated 15500 records
updated 15600 records
updated 15700 records
updated 15800 records
updated 15900 records
updated 16000 records
updated 16100 records
updated 16200 records
updated 16300 records
updated 16400 records
updated 16500 records
updated 16600 records
updated 16700 records
updated 16800 records
updated 16900 records
updated 17000 records
updated 17100 records
updated 17200 records
updated 17300 records
updated 17400 records
updated 17500 records
updated 17600 records
updated 17700 records
updated 17800 records
updated 17900 records
updated 18000 records
updated 18100 records
updated 18200 records
updated 18300 records
updated 18400 records
updated 18500 records
updated 18600 records
updated 18700 records
updated 18800 records
updated 18900 records
updated 19000 records
updated 19100 records
updated 19200 records
updated 19300 records
updated 19400 records
updated 19500 records
updated 19600 records
updated 19700 records
updated 19800 records
updated 19900 records
updated 20000 records
updated 20100 records
updated 20200 records
updated 20300 records
updated 20400 records
updated 20500 records
updated 20600 records
updated 20700 records
updated 20800 records
updated 20900 records
updated 21000 records
updated 21100 records
updated 21200 records
updated 21300 records
updated 21400 records
updated 21500 records
21552

In [ ]: