In [ ]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
In [ ]:
import newspaper
from datetime import datetime
import pickle
In [ ]:
import pandas as pd
import numpy as np
In [ ]:
# Read Postgres credentials from a local file (username on line 1, password on line 2)
with open("bubble_popper_postgres.txt","r") as myfile:
    lines = [line.replace("\n","") for line in myfile.readlines()]
db, us, pw = 'bubble_popper', lines[0], lines[1]
engine = create_engine('postgresql://%s:%s@localhost:5432/%s'%(us,pw,db))
connstr = "dbname='%s' user='%s' host='localhost' password='%s'"%(db,us,pw)
conn = psycopg2.connect(connstr)
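The sqlalchemy_utils helpers imported above (database_exists, create_database) aren't exercised in this cell; a minimal sketch of how they could guard against a missing database, using the engine defined above:
In [ ]:
# Create the bubble_popper database if it doesn't exist yet
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))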
Save article information in a table
In [ ]:
query = """SELECT * FROM pub_scores"""
pub_scores = pd.read_sql(query,conn)
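Before the assembly loop, a quick sanity check that pub_scores exposes the columns the loop below relies on (these names are taken from the code that follows):
In [ ]:
# Confirm the columns the article-assembly loop expects are present
expected = {'Source','source','heard','trust','distrust','twitter'}
print(pub_scores.shape)
print('missing columns:', expected - set(pub_scores.columns) or 'none')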
In [ ]:
columns = ['publication','source','heard','trust','distrust','content','title','url']
articles = pd.DataFrame(columns=columns)
for handle in pub_scores['twitter']:
    print(str(datetime.now()), handle)
    # Load the articles previously scraped for this publication's Twitter handle
    articleList = pickle.load(open('pub_text_'+handle+'.pkl','rb'))
    content = [article.text for article in articleList]
    title = [article.title for article in articleList]
    url = [article.url for article in articleList]
    # Repeat the publication-level fields once per article
    publication = np.repeat(pub_scores['Source'][pub_scores['twitter']==handle], len(content))
    source = np.repeat(pub_scores['source'][pub_scores['twitter']==handle], len(content))
    heard = np.repeat(pub_scores['heard'][pub_scores['twitter']==handle], len(content))
    trust = np.repeat(pub_scores['trust'][pub_scores['twitter']==handle], len(content))
    distrust = np.repeat(pub_scores['distrust'][pub_scores['twitter']==handle], len(content))
    temp = pd.DataFrame({'publication':publication,
                         'source':source,
                         'heard':heard,
                         'trust':trust,
                         'distrust':distrust,
                         'content':content,
                         'title':title,
                         'url':url})
    articles = pd.concat([articles, temp], ignore_index=True)
pickle.dump(articles, open('pub_articles.pkl','wb'))
articles.to_sql('pub_articles', engine, if_exists='replace')
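One way to confirm the write landed is to read the table back through the same engine; a sanity-check sketch:
In [ ]:
# Compare the in-memory frame against the row count now in Postgres
check = pd.read_sql("SELECT COUNT(*) AS n FROM pub_articles", engine)
print(len(articles), check['n'].iloc[0])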
Remove short (usually advertisement) articles, Guardian articles (British news), 'Stack of Stuff' articles (lists of links), and duplicates
In [ ]:
# Reload the assembled article table from the previous step
pub_articles = pickle.load(open('pub_articles.pkl','rb'))
# Very short (here: empty) articles, usually advertisements
short_text = [i for i, article in enumerate(pub_articles['content'].values.tolist())
              if len(article) <= 0]
# Guardian articles (British news)
guardian_text = [i for i, publication in enumerate(pub_articles['publication'].values.tolist())
                 if publication == 'Guardian']
# 'Stack of Stuff' articles (lists of links)
stack_text = [i for i in range(len(pub_articles)) if 'Stack of Stuff' in pub_articles['title'].iloc[i]]
drop_text = list(set(short_text + guardian_text + stack_text))
articles = pub_articles.drop(pub_articles.index[drop_text])
articles = articles.drop_duplicates('content')
pickle.dump(articles, open('pub_articles_trimmed.pkl','wb'))
articles.to_sql('pub_articles_clean', engine, if_exists='replace')
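It can help to see how aggressive the filtering was; a sketch comparing row counts before and after:
In [ ]:
# How many rows did the short/Guardian/Stack-of-Stuff/duplicate filters remove?
print('dropped:', len(pub_articles) - len(articles))
print(articles['publication'].value_counts())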
In [ ]:
from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import gensim
Remove special characters, tokenize and lemmatize the articles, and remove stop and miscellaneous words
In [ ]:
doc_set = articles['content'].values.tolist()
doc_set = [doc.replace("\n"," ") for doc in doc_set]
doc_set = [doc.replace("\'","") for doc in doc_set]
# Lowercase, strip punctuation/special characters, and tokenize
doc_set = [gensim.utils.simple_preprocess(doc) for doc in doc_set]
wordnet_lemmatizer = WordNetLemmatizer()
# Lemmatize twice: first as nouns (the default), then as verbs
doc_set = [[wordnet_lemmatizer.lemmatize(word) for word in doc] for doc in doc_set]
doc_set = [[wordnet_lemmatizer.lemmatize(word, pos='v') for word in doc] for doc in doc_set]
# Drop stop words, stray single letters, and miscellaneous scraping artifacts
en_stop = get_stop_words('en')
letters = list("abcdefghijklmnopqrstuvwxyz")
other = ["wa","ha","one","two","id","re","http","com","mr","image","photo","caption","don","sen","pic","co",
         "source","watch","play","duration","video","momentjs","getty","images","newsletter"]
stop_words = set(en_stop + letters + other)
doc_set = [[word for word in doc if word not in stop_words] for doc in doc_set]
pickle.dump(doc_set, open('pub_articles_cleaned_super.pkl','wb'))
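The corpora and models imports above point at topic modeling as the next step; a minimal sketch of building a gensim dictionary, bag-of-words corpus, and LDA model from doc_set (num_topics=10 is an arbitrary placeholder, not a value from this notebook):
In [ ]:
# Map tokens to ids, convert each document to bag-of-words, fit LDA
dictionary = corpora.Dictionary(doc_set)
corpus = [dictionary.doc2bow(doc) for doc in doc_set]
lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=5)
for topic in lda.print_topics(num_topics=5, num_words=8):
    print(topic)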