In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
In [3]:
# read the 13 CSV files of scraped articles and concatenate them into one data frame
names = range(1, 14)
df_list = []
for name in names:
    csvfile = '/Users/emilyhalket/Desktop/NLP_NYT/datafiles/{0}_100.csv'.format(name)
    df = pd.read_csv(csvfile)
    df_list.append(df)
article_df = pd.concat(df_list)
In [3]:
article_df = article_df[pd.notnull(article_df['full_text'])]
In [4]:
article_df.shape
Out[4]:
This dataset has 11,648 op-eds from the New York Times. We have additional information for each article (title, author, number of comments, etc.), but for now we will focus only on the text data.
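Since we will only use the text for now, a quick look at the first few entries of the full_text column is a useful sanity check (an illustrative cell, not part of the original run):
In [ ]:
article_df['full_text'].head()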
In [5]:
def preprocess_article_content(text_df):
    print 'preprocessing article text...'
    # text_df is the article data frame; column 'full_text' holds the text of each article
    article_list = []
    tokenizer = RegexpTokenizer(r'\w+')  # split on word characters, dropping punctuation
    stop_words = set(stopwords.words('english'))  # more stop words can be added to this set
    stemmer = SnowballStemmer('english')
    kept_rows = []  # track which rows were successfully processed
    for row, article in enumerate(text_df['full_text']):
        cleaned_tokens = []
        tokens = tokenizer.tokenize(article.decode('utf-8').lower())
        for token in tokens:
            if token not in stop_words:
                if len(token) < 20:  # drop implausibly long tokens, which are likely non-words
                    if not token[0].isdigit() and not token[-1].isdigit():  # drop numbers
                        stemmed_token = stemmer.stem(token)
                        cleaned_tokens.append(stemmed_token)
        article_list.append(' '.join(cleaned_tokens))
        kept_rows.append(row)
    print 'preprocessed content for %d articles' % len(article_list)
    return article_list, kept_rows
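Note that WordNetLemmatizer is imported above but the function uses Snowball stemming. If you wanted dictionary forms rather than truncated stems, you could swap in the lemmatizer; a minimal sketch of the difference (this cell is an illustration, not part of the pipeline, and requires the WordNet corpus via nltk.download('wordnet')):
In [ ]:
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
print stemmer.stem('studies')       # 'studi' -- stems are not always real words
print lemmatizer.lemmatize('studies')  # 'study' -- lemmas are dictionary forms
print stemmer.stem('running'), lemmatizer.lemmatize('running', pos='v')  # both give 'run'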
In [10]:
article_list, kept_rows = preprocess_article_content(article_df)
In [12]:
len(article_list)
Out[12]:
11648
In [14]:
article_list[2000]
Out[14]:
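CountVectorizer was imported at the top but not yet used; the natural next step is to turn the cleaned articles into a document-term matrix. A minimal sketch (the max_features cap of 5,000 is an arbitrary choice for illustration, not from the original analysis):
In [ ]:
vectorizer = CountVectorizer(max_features=5000)  # cap vocabulary at 5,000 terms (arbitrary choice)
term_matrix = vectorizer.fit_transform(article_list)  # sparse matrix: articles x terms
print term_matrix.shape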