In [1]:
    
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
    
In [3]:
    
names = range(1, 14)  # CSV files are named 1_100.csv through 13_100.csv
df_list = []
for name in names:
    csvfile = '/Users/emilyhalket/Desktop/NLP_NYT/datafiles/{0}_100.csv'.format(name)
    df = pd.read_csv(csvfile)
    df_list.append(df)
article_df = pd.concat(df_list, ignore_index=True)  # reset the index so rows are numbered uniquely
    
In [3]:
    
article_df = article_df[pd.notnull(article_df['full_text'])]
    
In [4]:
    
article_df.shape
    
    Out[4]:
This dataset contains 11,648 op-eds from The New York Times. We also have metadata for each article (title, author, number of comments, etc.), but for now we will focus only on the text.
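As a quick sanity check, the available columns can be listed directly. This is a minimal sketch, assuming the metadata fields mentioned above live in article_df alongside 'full_text'; their exact names are not shown in this section.

In [ ]:
    
# inspect the available columns; only 'full_text' is used below
print(article_df.columns.tolist())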
In [5]:
    
def preprocess_article_content(text_df):
    """Tokenize, remove stop words, and stem the full text of each article."""
    print('preprocessing article text...')
    # text_df is the article data frame; column 'full_text' holds the text of each article
    article_list = []
    tokenizer = RegexpTokenizer(r'\w+')  # keep alphanumeric tokens, drop punctuation
    stop_words = set(stopwords.words('english'))  # more stop words can be added to this set

    stemmer = SnowballStemmer('english')
    kept_rows = []  # track which rows were successfully processed
    for row, article in enumerate(text_df['full_text']):
        cleaned_tokens = []
        tokens = tokenizer.tokenize(article.lower())
        for token in tokens:
            if token not in stop_words:
                if 0 < len(token) < 20:  # drop implausibly long tokens (likely non-words)
                    if not token[0].isdigit() and not token[-1].isdigit():  # drop numbers
                        stemmed_token = stemmer.stem(token)
                        cleaned_tokens.append(stemmed_token)
        # print('success for row %d' % row)  # uncomment for per-row progress
        article_list.append(' '.join(cleaned_tokens))
        kept_rows.append(row)
    print('preprocessed content for %d articles' % len(article_list))
    return article_list, kept_rows
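    
To make the cleaning steps concrete, here is a minimal sketch that applies the same tokenize / stop-word / length / digit / stem sequence to a single made-up sentence; the sentence and expected output are illustrative only, not from the dataset.

In [ ]:
    
sample = 'The 3 senators wrote 2 op-eds about the economy in 2016.'
demo_tokens = RegexpTokenizer(r'\w+').tokenize(sample.lower())
demo_stops = set(stopwords.words('english'))
demo_stemmer = SnowballStemmer('english')
cleaned = [demo_stemmer.stem(t) for t in demo_tokens
           if t not in demo_stops
           and 0 < len(t) < 20
           and not t[0].isdigit() and not t[-1].isdigit()]
print(' '.join(cleaned))
# expected output along the lines of: 'senat wrote op ed economi'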
    
In [10]:
    
article_list, kept_rows = preprocess_article_content(article_df)
    
    
In [12]:
    
len(article_list)
    
    Out[12]:
In [14]:
    
article_list[2000]
    
    Out[14]:
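The CountVectorizer imported at the top is not used in this section. As a hedged sketch of the natural next step, the cleaned strings in article_list can be turned into a document-term matrix; the min_df and max_df values below are illustrative assumptions, not choices from the original analysis.

In [ ]:
    
# build a sparse articles-by-vocabulary count matrix from the cleaned text
# min_df/max_df are assumed values for illustration, not from the original analysis
vectorizer = CountVectorizer(min_df=5, max_df=0.95)
term_matrix = vectorizer.fit_transform(article_list)
print(term_matrix.shape)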