In [3]:
import os
import sys
import string 
import time
import timeit
import re
import nltk
import os

from nltk.stem import *
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter

In [4]:
# Directory whose entries are listed and read by the extraction loop below.
# NOTE(review): this is a hard-coded absolute path, so the notebook only runs
# as-is on a machine where this exact directory exists.
dirname = '/Users/mueric35/Box Sync/nytimes_articles/Org_article'

In [39]:
# For every file in `dirname`, pull one "code" line per article and write the
# codes for that file to 'code_<filename>' in the output directory.
# `code` ends up holding one list of codes per input file.
separator = '************\n'
output_dir = '/Users/mueric35/Box Sync/nytimes_articles/code_article/'

code = []
# os.listdir() returns entries in arbitrary order; sorting makes the order of
# `code` (and of the progress log) reproducible across runs.
for filename in sorted(os.listdir(dirname)):
    print("Loading: %s" % filename,file=sys.stderr)
    # Context manager so the input handle is closed after reading
    # (previously one handle per file was left open).
    with open(os.path.join(dirname, filename)) as in_file:
        lines = in_file.readlines()

    # Positions of all separator lines except the last one.
    # NOTE(review): the last separator is dropped presumably because it
    # terminates the final article rather than introducing a new one; confirm
    # against the input file format.
    separator_rows = [row for row, text in enumerate(lines) if text == separator][:-1]
    # An article starts at line 0 and on the line after each kept separator.
    article_starts = [0] + [row + 1 for row in separator_rows]
    # The line following each article's first line is taken as its code.
    # The bounds check only matters for empty / one-line files, which would
    # otherwise raise IndexError and abort the whole batch.
    code_article = [
        lines[start + 1].rstrip('\n')
        for start in article_starts
        if start + 1 < len(lines)
    ]
    code.append(code_article)

    print("Saving: %s" % filename,file=sys.stderr)
    # Context manager guarantees the output is flushed and closed even if a
    # write fails part-way through.
    with open(os.path.join(output_dir, 'code_' + filename), 'w') as out_file:
        out_file.writelines(entry + '\n' for entry in code_article)


Loading: article_201607.txt
Saving: article_201607.txt
Loading: article_201608.txt
Saving: article_201608.txt
Loading: article_201609.txt
Saving: article_201609.txt
Loading: article_201610.txt
Saving: article_201610.txt
Loading: article_201611.txt
Saving: article_201611.txt
Loading: article_201612.txt
Saving: article_201612.txt
Loading: article_201701.txt
Saving: article_201701.txt
Loading: article_201702.txt
Saving: article_201702.txt
Loading: article_201703.txt
Saving: article_201703.txt
Loading: article_201704.txt
Saving: article_201704.txt
Loading: article_201705.txt
Saving: article_201705.txt
Loading: article_201706.txt
Saving: article_201706.txt

In [40]:
# Total number of codes across all per-file lists in `code`.
# Uses the built-in sum() instead of rebinding the name `sum` to an int, which
# would make every later sum(...) call in this kernel fail with
# "TypeError: 'int' object is not callable".
# If an older version of this cell already ran, restart the kernel (or run
# `del sum`) before re-running.
total_codes = sum(len(file_codes) for file_codes in code)
total_codes


Out[40]:
56966