Corpus Generator

Generates a training corpus from the titles of the records in the CamHarvest collection.


In [5]:
import pymongo
from pymongo import MongoClient
from nltk.corpus import stopwords
import string
import codecs
# --- MongoDB source and output file settings ---
mongo_url = 'mongodb://localhost:27017/'
db = 'CamSim'
coll_in = 'CamHarvest'
corpusfile = 'corpus2.txt'

# Translation table (code point -> single space) applied to each title so
# that punctuation becomes a word separator instead of being glued to tokens.
# NOTE(review): '!' and '~' are not in this list, so they pass through
# unfiltered -- confirm that is intended.
punct_filter = {
    ord(char): u' '
    for char in '"#$%&\'()*+,./-:;<=>?@[\\]^_`{|}'
}

# English stop words from the NLTK corpus.
stop = stopwords.words('english')

# Handle on the collection the titles are read from.
client = MongoClient(mongo_url)
ch = client[db][coll_in]

In [6]:
# Build the corpus: for every record in the collection, lower-case its title,
# replace punctuation with spaces, drop English stop words and append the
# remaining words as one line of the corpus file.

# Only the title is read below, so ask MongoDB to return just that field
# (the _id field is still included by default).
cursor = ch.find({}, {'title': 1})

# Set membership is O(1); testing against the original list would rescan it
# for every single token.
stop_words = frozenset(stop)

ind = 0          # number of records seen so far (drives the progress output)
word_count = 0   # total number of words written to the corpus

# NOTE(review): mode 'a' appends, so re-running this cell adds the same
# titles to the file again -- confirm that accumulating is intended.
with codecs.open(corpusfile, 'a', encoding='utf8') as outfile:
    for rec in cursor:
        if ind % 100 == 0:
            print('writing record : ' + str(ind))
        ind += 1

        title = rec.get('title')
        if title is None:
            # A record without a title contributes nothing to the corpus;
            # skip it instead of aborting the whole export.
            continue

        cleaned = title.lower().translate(punct_filter)
        kept_words = [word for word in cleaned.split() if word not in stop_words]
        word_count += len(kept_words)
        outfile.write(u' '.join(kept_words) + u'\n')

print('Corpus Generated with ' + str(word_count) + ' words')


writing record : 0
writing record : 100
writing record : 200
writing record : 300
writing record : 400
writing record : 500
writing record : 600
writing record : 700
writing record : 800
writing record : 900
writing record : 1000
writing record : 1100
writing record : 1200
writing record : 1300
writing record : 1400
writing record : 1500
writing record : 1600
writing record : 1700
writing record : 1800
writing record : 1900
writing record : 2000
writing record : 2100
writing record : 2200
writing record : 2300
writing record : 2400
writing record : 2500
writing record : 2600
writing record : 2700
writing record : 2800
writing record : 2900
writing record : 3000
writing record : 3100
writing record : 3200
writing record : 3300
writing record : 3400
writing record : 3500
writing record : 3600
writing record : 3700
writing record : 3800
writing record : 3900
writing record : 4000
writing record : 4100
writing record : 4200
writing record : 4300
writing record : 4400
writing record : 4500
writing record : 4600
writing record : 4700
writing record : 4800
writing record : 4900
writing record : 5000
writing record : 5100
writing record : 5200
writing record : 5300
writing record : 5400
writing record : 5500
writing record : 5600
writing record : 5700
writing record : 5800
writing record : 5900
writing record : 6000
writing record : 6100
writing record : 6200
writing record : 6300
writing record : 6400
writing record : 6500
writing record : 6600
writing record : 6700
writing record : 6800
writing record : 6900
writing record : 7000
writing record : 7100
writing record : 7200
writing record : 7300
writing record : 7400
writing record : 7500
writing record : 7600
writing record : 7700
writing record : 7800
writing record : 7900
writing record : 8000
writing record : 8100
writing record : 8200
writing record : 8300
writing record : 8400
writing record : 8500
writing record : 8600
writing record : 8700
writing record : 8800
writing record : 8900
writing record : 9000
writing record : 9100
writing record : 9200
writing record : 9300
writing record : 9400
writing record : 9500
writing record : 9600
writing record : 9700
writing record : 9800
writing record : 9900
writing record : 10000
writing record : 10100
writing record : 10200
writing record : 10300
writing record : 10400
writing record : 10500
writing record : 10600
writing record : 10700
writing record : 10800
writing record : 10900
writing record : 11000
writing record : 11100
writing record : 11200
writing record : 11300
writing record : 11400
writing record : 11500
writing record : 11600
writing record : 11700
writing record : 11800
writing record : 11900
writing record : 12000
writing record : 12100
writing record : 12200
writing record : 12300
writing record : 12400
writing record : 12500
writing record : 12600
writing record : 12700
writing record : 12800
writing record : 12900
writing record : 13000
writing record : 13100
writing record : 13200
writing record : 13300
writing record : 13400
writing record : 13500
writing record : 13600
writing record : 13700
writing record : 13800
writing record : 13900
writing record : 14000
writing record : 14100
writing record : 14200
writing record : 14300
writing record : 14400
writing record : 14500
writing record : 14600
writing record : 14700
writing record : 14800
writing record : 14900
writing record : 15000
writing record : 15100
writing record : 15200
writing record : 15300
writing record : 15400
writing record : 15500
writing record : 15600
writing record : 15700
writing record : 15800
writing record : 15900
writing record : 16000
writing record : 16100
writing record : 16200
writing record : 16300
writing record : 16400
writing record : 16500
writing record : 16600
writing record : 16700
writing record : 16800
Corpus Generated with 163548 words

In [ ]: