In [4]:
# Read data.
import os
# Load the raw text of every NIPS paper into the list `docs`, one string per
# file found under the per-volume sub-folders of `data_dir`.

# Folder containing all NIPS papers.
data_dir = '/Users/lipingzhang/Downloads/nipstxt/'

# Folders containing individual NIPS papers: 'nips00', 'nips01', ..., 'nips12'.
yrs = ['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
dirs = ['nips' + yr for yr in yrs]

# Read all texts into a list.
docs = []
for yr_dir in dirs:
    # os.listdir returns entries in arbitrary order; sort them so the order
    # of `docs` is the same on every machine and every run.
    files = sorted(os.listdir(os.path.join(data_dir, yr_dir)))
    for filen in files:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default.
        # Note: ignoring characters that cause encoding errors.
        with open(os.path.join(data_dir, yr_dir, filen),
                  encoding='utf-8', errors='ignore') as fid:
            txt = fid.read()
        docs.append(txt)
In [ ]: