PROJECT DATA

Publication Scores

Scrape tables from the Pew Research Center report on the polarization of media sources, showing audience profiles of major news outlets


In [1]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

In [ ]:
import urllib.request
import bs4

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Read Postgres credentials from a local file and open SQLAlchemy/psycopg2 connections
with open("bubble_popper_postgres.txt","r") as myfile:
    lines = [line.replace("\n","") for line in myfile.readlines()]
db, us, pw = 'bubble_popper', lines[0], lines[1]
engine = create_engine('postgresql://%s:%s@localhost:5432/%s'%(us,pw,db))
connstr = "dbname='%s' user='%s' host='localhost' password='%s'"%(db,us,pw)
conn = psycopg2.connect(connstr)

In [ ]:
url = 'http://www.journalism.org/interactives/media-polarization/table/consume/'
source = urllib.request.urlopen(url).read().decode('utf-8')
soup = bs4.BeautifulSoup(source,'html.parser')

In [ ]:
# Write the first four tables (heard of, get news from, trust, distrust) to Postgres
table_html = soup.find_all("table")
table_label = ['heard','source','trust','distrust']
for i, table in enumerate(table_html[0:4]):
    table_df = pd.read_html(str(table))
    table_df[0].to_sql('pub_'+table_label[i],engine,if_exists='replace')

Calculate an audience ideology score for each publication on each factor (heard, source, trust, distrust): number the five columns 1 through 5 (mostly liberal, mixed liberal, mixed, mixed conservative, and mostly conservative) and average the two column numbers with the highest percentages
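
A hypothetical example (not a row from the Pew tables): if a publication's audience shares across the five columns were 30%, 25%, 20%, 15%, and 10%, the two highest shares fall in columns 1 and 2, giving a score of (1 + 2) / 2 = 1.5.


In [ ]:
# Hypothetical row, for illustration only: audience shares for the five ideology columns
row = np.array([30, 25, 20, 15, 10])
top_two = np.argsort(row)[::-1][:2] + 1   # column numbers with the two highest shares -> [1, 2]
score = top_two.mean()                    # (1 + 2) / 2 = 1.5, a strongly liberal audience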


In [ ]:
# Re-open the database connection
connstr = "dbname='%s' user='%s' host='localhost' password='%s'"%(db,us,pw)
conn = psycopg2.connect(connstr)

In [ ]:
pub_tables = []
table_label = ['heard','source','trust','distrust']
for table in table_label:
    query = """SELECT * FROM pub_%s"""%(table)
    pub_tables.append(pd.read_sql(query,conn))

In [ ]:
# Remove publications with insufficient reader/viewership (asterisks),
# publications/shows that have been discontinued, and TV-only outlets
excluded = ['Daily Show','Colbert Report','Al Jazeera America',
            'Mother Jones*','Ed Schultz Show*','Daily Kos*','ThinkProgress*']
for i in range(len(pub_tables)):
    pub_tables[i] = pub_tables[i][~pub_tables[i]['Source'].isin(excluded)]

In [ ]:
# Drop the SQL index and Overall columns, then convert percentage strings to numbers
for i in range(len(pub_tables)):
    pub_tables[i] = pub_tables[i].drop(columns=['index','Overall'])
    for col in pub_tables[i].columns[1:]:
        pub_tables[i][col] = pd.to_numeric(pub_tables[i][col].str.replace('%',''))

In [ ]:
# For each factor, score each publication by averaging the numbers (1-5) of the
# two ideology columns with the highest audience percentages
for tab in range(len(pub_tables)):
    scores = []
    for pub in range(len(pub_tables[tab])):
        scores.append(np.mean((np.argsort(pub_tables[tab].iloc[pub].values[1:])[::-1]+1)[0:2]))
    pub_tables[tab][table_label[tab]] = scores
    pub_tables[tab] = pub_tables[tab].drop(columns=pub_tables[tab].columns[1:-1])

In [ ]:
# Merge the four factor scores into one table keyed on Source
pub_scores = pub_tables[1].merge(pub_tables[0],on='Source')
pub_scores = pub_scores.merge(pub_tables[2],on='Source')
pub_scores = pub_scores.merge(pub_tables[3],on='Source')

In [ ]:
# Add publication twitter handles
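# pub_twitter.txt has no header row; it is assumed to list one outlet per line as
# pub_handle,scrape_url,share_url, in the same row order as the publications in pub_scores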
pub_twitter = pd.read_csv('pub_twitter.txt',header=None)
pub_twitter.columns = ['pub_handle','scrape_url','share_url']
pub_scores['twitter'] = pub_twitter['pub_handle'].values

In [ ]:
pub_scores.to_sql('pub_scores',engine,if_exists='replace')

Scatterplot of publication (audience) ideology scores on the source and trust factors


In [5]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()



In [6]:
query = """SELECT * FROM pub_scores"""
pub_scores = pd.read_sql(query,conn)

In [11]:
publications = list(pub_scores.Source)
getnewsfrom = list(pub_scores.source)
trustsource = list(pub_scores.trust + 0.075)   # small offset so markers with identical scores do not fully overlap

In [ ]:
trace0 = go.Scatter(
    x=getnewsfrom[::-1],
    y=publications[::-1],
    mode='markers',
    name='Get news from',
    marker=dict(
        color='rgba(156, 165, 196, 1.0)',
        line=dict(color='rgba(156, 165, 196, 1.0)',width=1),
        symbol='circle',
        size=16))

trace1 = go.Scatter(
    x=trustsource[::-1],
    y=publications[::-1],
    mode='markers',
    name='Trust news source',
    marker=dict(
        color='rgba(204, 204, 204, 0.95)',
        line=dict(color='rgba(204, 204, 204, 0.95)',width=1),
        symbol='circle',
        size=16))

data = [trace0,trace1]

layout = go.Layout(
    title="Publication audience (1 = liberal, 5 = conservative)",
    xaxis=dict(
        showgrid=False,
        showline=True,
        linecolor='rgb(102, 102, 102)',
        titlefont=dict(color='rgb(204, 204, 204)'),
        tickfont=dict(color='rgb(102, 102, 102)'),
        autotick=True,
        dtick=10,
        ticks='outside',
        tickcolor='rgb(102, 102, 102)'),
    margin=dict(l=140,r=40,b=50,t=80),
    legend=dict(
        font=dict(size=10),
        yanchor='middle',
        xanchor='right'),
    width=700,
    height=700,
    paper_bgcolor='rgb(240, 240, 240)',
    plot_bgcolor='rgb(240, 240, 240)',
    hovermode='closest')

fig = go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)

Article Texts

Get the 3,200 most recent tweets (the Twitter API's per-timeline limit), including shared article URLs, from each publication's Twitter feed


In [ ]:
import tweepy
import newspaper

In [ ]:
from time import sleep
from datetime import datetime
import pickle

In [ ]:
with open ("bubble_popper_twitter.txt","r") as myfile:
    lines = [line.replace("\n","") for line in myfile.readlines()]
consumer_key, consumer_secret = lines[0], lines[1]
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [ ]:
pub_twitter = pd.read_csv('pub_twitter.txt',header=None)
pub_twitter.columns = ['pub_handle','scrape_url','share_url']

In [ ]:
def pub_tweets(pub_handle):

    # Page through the timeline: 16 pages of 200 tweets = 3,200 tweets
    tweets = []
    for page in tweepy.Cursor(api.user_timeline,screen_name=pub_handle,count=200,include_rts=True).pages(16):
        tweets += page

    # Keep each tweet's text and any expanded URLs it contains
    tweets = [[tweet.text,[url['expanded_url'] for url in tweet.entities['urls']]] for tweet in tweets]
    tweets = pd.DataFrame(tweets)
    tweets.columns = ['tweet_text','tweet_url']
    tweets['tweet_url'] = tweets['tweet_url'].apply(lambda x: ', '.join(x))

    return tweets

In [ ]:
for pub_handle in pub_twitter['pub_handle']:
    print(str(datetime.now()),pub_handle)
    pub_data = pub_tweets(pub_handle)
    pickle.dump(pub_data,open('pub_data_'+pub_handle+'.pkl','wb'))

In [ ]:
pub_data = {}
for pub_handle in pub_twitter['pub_handle']:
    print(str(datetime.now()),pub_handle)
    pub_data[pub_handle] = pickle.load(open('pub_data_'+pub_handle+'.pkl','rb'))
pickle.dump(pub_data,open('pub_data.pkl','wb'))
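
A quick sanity check (an optional sketch, assuming the pickles above loaded cleanly): count the tweets collected per handle.


In [ ]:
# Optional: number of tweets collected for each publication handle
for pub_handle, tweets in pub_data.items():
    print(pub_handle, len(tweets))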

Scrape text of 1,000 articles from the URLs shared by each publication


In [ ]:
def pub_articles(pub_tweets,scrape_url):

    # Keep tweets that link to the publication's own site; take the first URL in each tweet
    links = pub_tweets['tweet_url'][pub_tweets['tweet_url'].str.contains(scrape_url)]
    links = [link.split(',')[0] for link in links]
    
    articles = []
    for i,link in enumerate(links):
        if i < 1000:
            if (i+1)%100 == 0:
                print(str(datetime.now()),str(i+1),'/',str(len(links)))
            try:
                article = newspaper.Article(link)
                article.download()
                article.parse()
                articles.append(article)
                sleep(3)
            except Exception:
                print('Bad link:',link)
                sleep(3)
                continue
        else:
            break
        
    return articles

In [ ]:
pub_data = pickle.load(open('pub_data.pkl','rb'))
for pub_handle in pub_twitter['pub_handle']:
    if pub_handle not in ['seanhannity','rushlimbaugh']:
        try:
            print(str(datetime.now()),pub_handle)
            pub_tweets = pub_data[pub_handle]
            scrape_url = pub_twitter.scrape_url[pub_twitter.pub_handle == pub_handle].values[0]
            pub_text = pub_articles(pub_tweets,scrape_url)
            pickle.dump(pub_text,open('pub_text_'+pub_handle+'.pkl','wb'))
        except Exception:
            print('Bad handle:',pub_handle)
            continue
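
Each pickled file holds a list of parsed newspaper Article objects. A minimal spot check of the extracted text (a sketch, assuming the FoxNews scrape above completed):


In [ ]:
# Spot-check one publication: newspaper's Article objects expose .title and .text after parse()
pub_text = pickle.load(open('pub_text_FoxNews.pkl','rb'))
print(len(pub_text),'articles')
print(pub_text[0].title)
print(pub_text[0].text[:300])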

Scrape articles from Sean Hannity and Rush Limbaugh separately because they did not share URLs on Twitter


In [ ]:
print(str(datetime.now()),'seanhannity')

links = []

urls = ['http://www.hannity.com/articles/election-493995/',
        'http://www.hannity.com/articles/immigration-487258/',
        'http://www.hannity.com/articles/obamacare-487185/',
        'http://www.hannity.com/articles/war-on-terror-487284/',
        'http://www.hannity.com/articles/economy-487306/']

for url in urls:
    source = urllib.request.urlopen(url).read().decode('utf-8')
    soup = bs4.BeautifulSoup(source,'html.parser')
    articles = soup.find_all('h2')
    articles = ['http://www.hannity.com'+article.a.get('href') for article in articles]
    links.extend(articles)
    print(url)
    sleep(3)
    
pickle.dump(links,open('pub_links_'+'seanhannity'+'.pkl','wb'))

In [ ]:
print(str(datetime.now()),'rushlimbaugh')

links = []

for i in range(1,100+1):
    url = 'https://www.rushlimbaugh.com/archives/page/'+str(i)+'/'
    source = urllib.request.urlopen(url).read().decode('utf-8')
    soup = bs4.BeautifulSoup(source,'html.parser')
    articles = soup.find_all('h2')
    articles = [article.a.get('href') for article in articles]
    links.extend(articles)
    print(url)
    sleep(3)

pickle.dump(links,open('pub_links_'+'rushlimbaugh'+'.pkl','wb'))

In [ ]:
# Additional Fox News URLs (i.e., not shortened)

pub_tweets = pub_data['FoxNews']
scrape_url = 'foxnews.com'
links = pub_tweets['tweet_url'][pub_tweets['tweet_url'].str.contains(scrape_url)]
links = [link.split(',')[0] for link in links]

pickle.dump(links,open('pub_links_'+'FoxNews'+'.pkl','wb'))

In [ ]:
def pub_articles_extra(links):

    # Same download/parse loop as pub_articles, but starting from a prepared list of links
    articles = []
    for i,link in enumerate(links):
        if i < 1000:
            if (i+1)%100 == 0:
                print(str(datetime.now()),str(i+1),'/',str(len(links)))
            try:
                article = newspaper.Article(link)
                article.download()
                article.parse()
                articles.append(article)
                sleep(3)
            except Exception:
                print('Bad link:',link)
                sleep(3)
                continue
        else:
            break
        
    return articles

In [ ]:
pub_handles = ['rushlimbaugh','seanhannity','FoxNews']
for pub_handle in pub_handles:
    try:
        print(str(datetime.now()),pub_handle)
        links = pickle.load(open('pub_links_'+pub_handle+'.pkl','rb'))
        pub_text = pub_articles_extra(links)
        pickle.dump(pub_text,open('pub_text_'+pub_handle+'_extra.pkl','wb'))
    except Exception:
        print('Bad handle:',pub_handle)
        continue

In [ ]:
# Combine the two sets of Fox News articles (timeline scrape and extra URLs);
# rename pub_text_FoxNews.pkl and pub_text_FoxNews_extra.pkl to _1 and _2 first
pub_text1 = pickle.load(open('pub_text_FoxNews_1.pkl','rb'))
pub_text2 = pickle.load(open('pub_text_FoxNews_2.pkl','rb'))
pub_text = pub_text1 + pub_text2
pickle.dump(pub_text,open('pub_text_FoxNews.pkl','wb'))