In [1]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
In [ ]:
import urllib.request
import bs4
In [2]:
import pandas as pd
import numpy as np
In [4]:
with open("bubble_popper_postgres.txt","r") as myfile:
    lines = [line.replace("\n","") for line in myfile.readlines()]
db, us, pw = 'bubble_popper', lines[0], lines[1]
engine = create_engine('postgresql://%s:%s@localhost:5432/%s'%(us,pw,db))
connstr = "dbname='%s' user='%s' host='localhost' password='%s'"%(db,us,pw)
conn = psycopg2.connect(connstr)
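The database_exists and create_database helpers imported above are not used elsewhere in this notebook; a minimal sketch of how they could guard against a missing bubble_popper database (reusing the engine defined above) would be:
In [ ]:
# Optional guard (sketch): create the database if it does not already exist
if not database_exists(engine.url):
    create_database(engine.url)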
In [ ]:
url = 'http://www.journalism.org/interactives/media-polarization/table/consume/'
source = urllib.request.urlopen(url).read().decode('utf-8')
soup = bs4.BeautifulSoup(source,'html.parser')
In [ ]:
table_html = soup.findAll("table")
table_label = ['heard','source','trust','distrust']
for i, table in enumerate(table_html[0:4]):
    table_df = pd.read_html(str(table))
    table_df[0].to_sql('pub_'+table_label[i],engine,if_exists='replace')
Calculate an audience ideology score for each publication on each factor (heard, source, trust, distrust): number the five columns 1 through 5 (mostly liberal, mixed liberal, mixed, mixed conservative, mostly conservative) and average the two column numbers with the highest percentages.
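As a toy illustration of the scoring rule (hypothetical percentages, same argsort logic as the scoring cell further down): if the five columns held 10%, 15%, 40%, 25%, and 10%, the two largest shares fall in columns 3 and 4, giving a score of 3.5.
In [ ]:
# Toy example of the scoring rule (hypothetical numbers, not from the Pew table)
toy = np.array([10, 15, 40, 25, 10])        # columns 1-5, most liberal to most conservative
top_two = np.argsort(toy)[::-1][:2] + 1     # column numbers of the two largest shares -> [3, 4]
print(np.mean(top_two))                     # 3.5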
In [ ]:
connstr = "dbname='%s' user='%s' host='localhost' password='%s'"%(db,us,pw)
conn = psycopg2.connect(connstr)
In [ ]:
pub_tables = []
table_label = ['heard','source','trust','distrust']
for table in table_label:
    query = """SELECT * FROM pub_%s"""%(table)
    pub_tables.append(pd.read_sql(query,conn))
In [ ]:
# Remove publications with insufficient reader/viewership (asterisks),
# publications/shows that have been discontinued, and TV-only outlets
for i in range(len(pub_tables)):
    pub_tables[i] = pub_tables[i][~pub_tables[i]['Source'].isin(
        ['Daily Show','Colbert Report','Al Jazeera America',
         'Mother Jones*','Ed Schultz Show*','Daily Kos*','ThinkProgress*'])]
In [ ]:
# Convert percentage strings to numbers
for i in range(len(pub_tables)):
    pub_tables[i] = pub_tables[i].drop(['index','Overall'], axis=1)
    for col in pub_tables[i].columns[1:]:
        pub_tables[i][col] = pd.to_numeric(pub_tables[i][col].str.replace('%',''))
In [ ]:
# Score = mean of the two column numbers (1-5) with the largest audience shares
for tab in range(len(pub_tables)):
    scores = []
    for pub in range(len(pub_tables[tab])):
        scores.append(np.mean((np.argsort(pub_tables[tab].iloc[pub].values[1:])[::-1]+1)[0:2]))
    pub_tables[tab][table_label[tab]] = scores
    pub_tables[tab] = pub_tables[tab].drop(pub_tables[tab].columns[1:-1], axis=1)
In [ ]:
pub_scores = pub_tables[1].merge(pub_tables[0],on='Source')
pub_scores = pub_scores.merge(pub_tables[2],on='Source')
pub_scores = pub_scores.merge(pub_tables[3],on='Source')
In [ ]:
# Add publication twitter handles
pub_twitter = pd.read_csv('pub_twitter.txt',header=None)
pub_twitter.columns = ['pub_handle','scrape_url','share_url']
pub_scores['twitter'] = pub_twitter['pub_handle'].values
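The assignment above relies on pub_twitter.txt listing handles in the same row order as pub_scores; a quick sanity check of that assumption:
In [ ]:
# Sanity check (assumes one handle per publication, in matching row order)
assert len(pub_twitter) == len(pub_scores)
print(pub_scores[['Source','twitter']].head())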
In [ ]:
pub_scores.to_sql('pub_scores',engine,if_exists='replace')
Scatterplot of publication (audience) ideology scores on the source and trust factors
In [5]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()
In [6]:
query = """SELECT * FROM pub_scores"""
pub_scores = pd.read_sql(query,conn)
In [11]:
publications = list(pub_scores.Source)
getnewsfrom = list(pub_scores.source)
trustsource = list(pub_scores.trust + 0.075)  # small offset so overlapping markers stay visible
In [ ]:
trace0 = go.Scatter(
    x=getnewsfrom[::-1],
    y=publications[::-1],
    mode='markers',
    name='Get news from',
    marker=dict(
        color='rgba(156, 165, 196, 1.0)',
        line=dict(color='rgba(156, 165, 196, 1.0)',width=1),
        symbol='circle',
        size=16))
trace1 = go.Scatter(
    x=trustsource[::-1],
    y=publications[::-1],
    mode='markers',
    name='Trust news source',
    marker=dict(
        color='rgba(204, 204, 204, 0.95)',
        line=dict(color='rgba(204, 204, 204, 0.95)',width=1),
        symbol='circle',
        size=16))
data = [trace0,trace1]
layout = go.Layout(
    title="Publication audience (1 = liberal, 5 = conservative)",
    xaxis=dict(
        showgrid=False,
        showline=True,
        linecolor='rgb(102, 102, 102)',
        titlefont=dict(color='rgb(204, 204, 204)'),
        tickfont=dict(color='rgb(102, 102, 102)'),
        autotick=True,
        dtick=10,
        ticks='outside',
        tickcolor='rgb(102, 102, 102)'),
    margin=dict(l=140,r=40,b=50,t=80),
    legend=dict(
        font=dict(size=10),
        yanchor='middle',
        xanchor='right'),
    width=700,
    height=700,
    paper_bgcolor='rgb(240, 240, 240)',
    plot_bgcolor='rgb(240, 240, 240)',
    hovermode='closest')
fig = go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)
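Optionally, the same figure can be written to a standalone HTML file with plotly.offline.plot (the filename here is just an example):
In [ ]:
# Optional: save a standalone copy of the chart (example filename)
plotly.offline.plot(fig, filename='pub_scores_scatter.html', auto_open=False)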
Get the 3,200 most recent tweets, including shared article URLs, from each publication's Twitter feed
In [ ]:
import tweepy
import newspaper
In [ ]:
from time import sleep
from datetime import datetime
import pickle
In [ ]:
with open("bubble_popper_twitter.txt","r") as myfile:
    lines = [line.replace("\n","") for line in myfile.readlines()]
consumer_key, consumer_secret = lines[0], lines[1]
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
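Before a long pull it can help to confirm that app-only auth works and to see how much user_timeline quota remains; a minimal check (the nested keys follow Twitter's standard rate-limit response) would be:
In [ ]:
# Quick check of the app-auth connection and remaining user_timeline quota
limits = api.rate_limit_status()
print(limits['resources']['statuses']['/statuses/user_timeline'])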
In [ ]:
pub_twitter = pd.read_csv('pub_twitter.txt',header=None)
pub_twitter.columns = ['pub_handle','scrape_url','share_url']
In [ ]:
def pub_tweets(pub_handle):
    tweets = []
    # 16 pages of 200 tweets each = the 3,200 most recent tweets
    for tweet in tweepy.Cursor(api.user_timeline,screen_name=pub_handle,count=200,include_rts=True).pages(16):
        tweets += tweet
    tweets = [[tweet.text,[url['expanded_url'] for url in tweet.entities['urls']]] for tweet in tweets]
    tweets = pd.DataFrame(tweets)
    tweets.columns = ['tweet_text','tweet_url']
    tweets['tweet_url'] = tweets['tweet_url'].apply(lambda x: ', '.join(x))
    return tweets
In [ ]:
for pub_handle in pub_twitter['pub_handle']:
    print(str(datetime.now()),pub_handle)
    pub_data = pub_tweets(pub_handle)
    pickle.dump(pub_data,open('pub_data_'+pub_handle+'.pkl','wb'))
In [ ]:
pub_data = {}
for pub_handle in pub_twitter['pub_handle']:
    print(str(datetime.now()),pub_handle)
    pub_data[pub_handle] = pickle.load(open('pub_data_'+pub_handle+'.pkl','rb'))
pickle.dump(pub_data,open('pub_data.pkl','wb'))
Scrape the text of up to 1,000 articles from the URLs shared by each publication.
In [ ]:
def pub_articles(pub_tweets,scrape_url):
    # Keep only tweets that link to the publication's own domain; take the first URL in each tweet
    links = pub_tweets['tweet_url'][pub_tweets['tweet_url'].str.contains(scrape_url)]
    links = [link.split(',')[0] for link in links]
    articles = []
    for i,link in enumerate(links):
        if i < 1000:
            if (i+1)%100 == 0:
                print(str(datetime.now()),str(i+1),'/',str(len(links)))
            try:
                article = newspaper.Article(link)
                article.download()
                article.parse()
                articles.append(article)
                sleep(3)
            except:
                print('Bad link:',link)
                sleep(3)
                continue
        else:
            break
    return articles
In [ ]:
pub_data = pickle.load(open('pub_data.pkl','rb'))
for pub_handle in pub_twitter['pub_handle']:
    if pub_handle not in ['seanhannity','rushlimbaugh']:
        try:
            print(str(datetime.now()),pub_handle)
            pub_tweets = pub_data[pub_handle]
            scrape_url = pub_twitter.scrape_url[pub_twitter.pub_handle == pub_handle].values[0]
            pub_text = pub_articles(pub_tweets,scrape_url)
            pickle.dump(pub_text,open('pub_text_'+pub_handle+'.pkl','wb'))
        except:
            print('Bad handle:',pub_handle)
            continue
Scrape articles from Sean Hannity and Rush Limbaugh separately because they did not share URLs on Twitter
In [ ]:
print(str(datetime.now()),'seanhannity')
links = []
urls = ['http://www.hannity.com/articles/election-493995/',
'http://www.hannity.com/articles/immigration-487258/',
'http://www.hannity.com/articles/obamacare-487185/',
'http://www.hannity.com/articles/war-on-terror-487284/',
'http://www.hannity.com/articles/economy-487306/']
for url in urls:
    source = urllib.request.urlopen(url).read().decode('utf-8')
    soup = bs4.BeautifulSoup(source,'html.parser')
    articles = soup.find_all('h2')
    articles = ['http://www.hannity.com'+article.a.get('href') for article in articles]
    links.extend(articles)
    print(url)
    sleep(3)
pickle.dump(links,open('pub_links_'+'seanhannity'+'.pkl','wb'))
In [ ]:
print(str(datetime.now()),'rushlimbaugh')
links = []
for i in range(1,100+1):
    url = 'https://www.rushlimbaugh.com/archives/page/'+str(i)+'/'
    source = urllib.request.urlopen(url).read().decode('utf-8')
    soup = bs4.BeautifulSoup(source,'html.parser')
    articles = soup.find_all('h2')
    articles = [article.a.get('href') for article in articles]
    links.extend(articles)
    print(url)
    sleep(3)
pickle.dump(links,open('pub_links_'+'rushlimbaugh'+'.pkl','wb'))
In [ ]:
# Additional Fox News URLs (i.e., not shortened)
pub_tweets = pub_data['FoxNews']
scrape_url = 'foxnews.com'
links = pub_tweets['tweet_url'][pub_tweets['tweet_url'].str.contains(scrape_url)]
links = [link.split(',')[0] for link in links]
pickle.dump(links,open('pub_links_'+'FoxNews'+'.pkl','wb'))
In [ ]:
def pub_articles_extra(links):
    articles = []
    for i,link in enumerate(links):
        if i < 1000:
            if (i+1)%100 == 0:
                print(str(datetime.now()),str(i+1),'/',str(len(links)))
            try:
                article = newspaper.Article(link)
                article.download()
                article.parse()
                articles.append(article)
                sleep(3)
            except:
                print('Bad link:',link)
                sleep(3)
                continue
        else:
            break
    return articles
In [ ]:
pub_handles = ['rushlimbaugh','seanhannity','FoxNews']
for pub_handle in pub_handles:
    try:
        print(str(datetime.now()),pub_handle)
        links = pickle.load(open('pub_links_'+pub_handle+'.pkl','rb'))
        pub_text = pub_articles_extra(links)
        pickle.dump(pub_text,open('pub_text_'+pub_handle+'_extra.pkl','wb'))
    except:
        print('Bad handle:',pub_handle)
        continue
In [ ]:
# Combine the two Fox News scrape runs (rename the pickles to _1 and _2 first)
pub_text1 = pickle.load(open('pub_text_FoxNews_1.pkl','rb'))
pub_text2 = pickle.load(open('pub_text_FoxNews_2.pkl','rb'))
pub_text = pub_text1 + pub_text2
pickle.dump(pub_text,open('pub_text_FoxNews.pkl','wb'))