PREPROCESSING

Clean article collection


In [ ]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

In [ ]:
import newspaper
from datetime import datetime
import pickle
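
The pub_text_<handle>.pkl files loaded below hold newspaper Article objects scraped earlier. A minimal sketch of how one such pickle could be produced (the homepage URL, article cap, and output filename here are illustrative, not the project's actual values):


In [ ]:
# Sketch: scrape one publication with newspaper and pickle the parsed articles
paper = newspaper.build('https://www.example-news-site.com', memoize_articles=False)
articleList = []
for article in paper.articles[:100]:      # cap the number of downloads (illustrative)
    try:
        article.download()
        article.parse()                   # populates .text, .title, .url
        articleList.append(article)
    except Exception:
        continue                          # skip articles that fail to download or parse
pickle.dump(articleList, open('pub_text_examplehandle.pkl', 'wb'))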

In [ ]:
import pandas as pd
import numpy as np

In [ ]:
with open ("bubble_popper_postgres.txt","r") as myfile:
    lines = [line.replace("\n","") for line in myfile.readlines()] 
db, us, pw = 'bubble_popper', lines[0], lines[1]                     
engine = create_engine('postgresql://%s:%s@localhost:5432/%s'%(us,pw,db))
connstr = "dbname='%s' user='%s' host='localhost' password='%s'"%(db,us,pw)
conn = None; conn = psycopg2.connect(connstr)
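
If the bubble_popper database has not been created yet, the sqlalchemy_utils helpers imported above can create it before connecting. A minimal sketch, run once:


In [ ]:
# Create the database if it does not already exist, using the engine URL defined above
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))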

Save article information in a table


In [ ]:
query = """SELECT * FROM pub_scores"""
pub_scores = pd.read_sql(query,conn)
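
The loop below assumes pub_scores has a twitter handle column plus the survey columns Source, source, heard, trust, and distrust. A quick check of what was actually read back (illustrative):


In [ ]:
# Sanity check: the columns referenced in the loop below should all be present
print(pub_scores.columns.tolist())
pub_scores.head()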

In [ ]:
columns = ['publication','source','heard','trust','distrust','content','title','url']
articles = pd.DataFrame(columns=columns)

for handle in pub_scores['twitter']:

    print(str(datetime.now()), handle)

    # Load the newspaper Article objects scraped for this publication
    articleList = pickle.load(open('pub_text_' + handle + '.pkl', 'rb'))
    content = [article.text for article in articleList]
    title = [article.title for article in articleList]
    url = [article.url for article in articleList]

    # Repeat the publication-level survey scores once per article
    publication = np.repeat(pub_scores['Source'][pub_scores['twitter'] == handle], len(content))
    source = np.repeat(pub_scores['source'][pub_scores['twitter'] == handle], len(content))
    heard = np.repeat(pub_scores['heard'][pub_scores['twitter'] == handle], len(content))
    trust = np.repeat(pub_scores['trust'][pub_scores['twitter'] == handle], len(content))
    distrust = np.repeat(pub_scores['distrust'][pub_scores['twitter'] == handle], len(content))

    temp = pd.DataFrame({'publication': publication,
                         'source': source,
                         'heard': heard,
                         'trust': trust,
                         'distrust': distrust,
                         'content': content,
                         'title': title,
                         'url': url})

    # DataFrame.append was removed in pandas 2.0; concatenate instead
    articles = pd.concat([articles, temp], ignore_index=True)

pickle.dump(articles,open('pub_articles.pkl','wb'))
articles.to_sql('pub_articles',engine,if_exists='replace')
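
As a quick sanity check, the saved table can be read back to confirm the row count matches the in-memory frame (illustrative):


In [ ]:
# Verify that the pub_articles table was written with the expected number of rows
check = pd.read_sql("""SELECT COUNT(*) FROM pub_articles""", conn)
print(len(articles), check.iloc[0, 0])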

Remove articles with little or no text (usually advertisements), Guardian articles (a British outlet), 'Stack of Stuff' posts (lists of links), and duplicates


In [ ]:
# Reload the full article table saved above
pub_articles = pickle.load(open('pub_articles.pkl', 'rb'))

# Drop articles with no extracted text (usually advertisements)
short_text = []
for i, article in enumerate(pub_articles['content'].values.tolist()):
    if len(article) <= 0:
        short_text.append(i)

# Drop Guardian articles (a British outlet)
guardian_text = []
for i, publication in enumerate(pub_articles['publication'].values.tolist()):
    if publication == 'Guardian':
        guardian_text.append(i)

# Drop 'Stack of Stuff' posts, which are just lists of links
stack_text = [i for i in range(0, len(pub_articles)) if 'Stack of Stuff' in pub_articles['title'].iloc[i]]

drop_text = short_text + guardian_text + stack_text
drop_text = list(set(drop_text))

articles = pub_articles.drop(pub_articles.index[drop_text])
articles = articles.drop_duplicates('content')

pickle.dump(articles,open('pub_articles_trimmed.pkl','wb'))
articles.to_sql('pub_articles_clean',engine,if_exists='replace')
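
A quick look at how many articles each filter flagged and how many remain after deduplication (illustrative):


In [ ]:
# Report the size of each filter and the final article count
print('empty:', len(short_text), 'Guardian:', len(guardian_text), 'Stack of Stuff:', len(stack_text))
print('remaining articles:', len(articles))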

Clean article content


In [ ]:
from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import gensim
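
WordNetLemmatizer needs the WordNet corpus; if it has not been downloaded before, fetch it once with nltk:


In [ ]:
import nltk
nltk.download('wordnet')   # required by WordNetLemmatizer; no-op if already present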

Remove special characters, tokenize and lemmatize the articles, and remove stop and miscellaneous words


In [ ]:
doc_set = articles['content'].values.tolist()

doc_set = [doc.replace("\n"," ") for doc in doc_set]
doc_set = [doc.replace("\'","") for doc in doc_set]

doc_set = [gensim.utils.simple_preprocess(doc) for doc in doc_set]

wordnet_lemmatizer = WordNetLemmatizer()
doc_set = [[wordnet_lemmatizer.lemmatize(word) for word in doc] for doc in doc_set]
doc_set = [[wordnet_lemmatizer.lemmatize(word,pos='v') for word in doc] for doc in doc_set]

en_stop = get_stop_words('en')
letters = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
other = ["wa","ha","one","two","id","re","http","com","mr","image","photo","caption","don","sen","pic","co",
         "source","watch","play","duration","video","momentjs","getty","images","newsletter"]
doc_set = [[word for word in doc if not word in (en_stop+letters+other)] for doc in doc_set]

pickle.dump(doc_set,open('pub_articles_cleaned_super.pkl','wb'))
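
The cleaned token lists feed directly into gensim. A minimal sketch of the next step, building a dictionary and bag-of-words corpus for topic modeling (the filtering thresholds here are illustrative):


In [ ]:
# Sketch: map tokens to ids and build the bag-of-words corpus
dictionary = corpora.Dictionary(doc_set)
dictionary.filter_extremes(no_below=5, no_above=0.5)   # illustrative thresholds
corpus = [dictionary.doc2bow(doc) for doc in doc_set]
print(dictionary)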