Firstly, I write for Python 3.x. Python 2 may work, but I don't consciously try to correct for any Python 2 differences. Apart from installing the necessary packages (pandas, numpy, matplotlib and so on), two things have to be done manually:
Set up PhantomJS with Selenium. (This has become easier to install — I'm not sure if PhantomJS now comes with Selenium by default or whether you still have to download it separately. It used to be a pain.)
Get a client_id / client_secret set up with PRAW / Reddit. In this code it is assumed that there is a file called: praw.json which contains client_id, client_secret, password, user_agent, and username.
Tweaks will need to be made before the match events are fully automatic.
Notebooks are run in the main directory of the repository (and just archived in the notebook folder), so paths will have to be modified if you run the notebook from within the notebook folder.
In [1]:
import praw
import datetime
import pandas as pd
import nltk.sentiment.vader
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from selenium import webdriver
import numpy as np
import os
In [2]:
#url = 'http://www.telegraph.co.uk/football/2017/08/12/watford-vs-liverpool-premier-league-live-score-updates-team/'
# Reddit match-thread id (the part after /comments/ in the thread URL)
thread_id = '6tw65y'
# Opponent name — used for axis labels and output file names
opposition = 'Hoffenheim'
#analysis_name = 'League_1_' + opposition
# Prefix encodes the competition/matchday; used to name data and figure files
analysis_name = 'CL_01_' + opposition
In [3]:
# Define some objects to be used later
# set up driver for scraping
# NOTE(review): PhantomJS support was deprecated in newer Selenium releases;
# headless Chrome/Firefox is the usual replacement — confirm installed versions.
driver = webdriver.PhantomJS()
# Define NLTK object
# VADER sentiment analyser (needs the 'vader_lexicon' NLTK resource downloaded)
vader = nltk.sentiment.vader.SentimentIntensityAnalyzer()
# set matplotlib style
plt.style.use('ggplot')
# Change this to 0 if you have downloaded the data and want to redownload
use_saved_data = 1
In [4]:
def get_match_report(url):
    """
    Scrape a Telegraph live-blog match report.

    Uses the module-level selenium ``driver`` to render the page, then
    extracts every live update.

    Parameters
    ----------
    url : str
        URL of the Telegraph live match report.

    Returns
    -------
    titles : list of str
        Lower-cased title of each live update (e.g. 'goal!').
    tele_times_dt : list of datetime.time
        Wall-clock time of each update as a 24-hour datetime.time.
    """
    # Open page and make a soup object
    driver.get(url)
    r = driver.page_source
    soup = BeautifulSoup(r, 'lxml')
    # Each live update is its own div; the <a> inside holds the timestamp,
    # the h3 holds the update title.
    updates = soup.findAll('div', class_='live-post js-live-post component')
    titles = soup.findAll('h3', class_='live-post__title')
    titles = [t.text.lower() for t in titles]
    tele_times = [u.find('a').text for u in updates]
    # Timestamps look like '7:45PM' — convert to a 24-hour clock.
    tele_times_dt = []
    for raw in tele_times:
        hour_s, rest = raw.split(':')
        hour = int(hour_s)
        minute = int(rest[:-2])
        meridiem = rest[-2:]
        # Normalise: 12AM -> 0, 12PM stays 12, other PM hours get +12.
        # (The original code left 12AM as hour 12.)
        if meridiem == 'AM':
            if hour == 12:
                hour = 0
        elif hour != 12:
            hour += 12
        tele_times_dt.append(datetime.time(hour, minute))
    return titles, tele_times_dt
def get_comments(thread_id, praw_info):
    """
    Fetch a reddit submission with its full comment tree.

    ``praw_info`` is the credentials table read from praw.json; column
    first-entries provide the client id/secret and account login.
    """
    credentials = {field: praw_info[field][0]
                   for field in ('client_id', 'client_secret', 'password',
                                 'user_agent', 'username')}
    reddit = praw.Reddit(**credentials)
    submission = reddit.submission(id=thread_id)
    # Expand every "load more comments" stub so the tree is complete
    submission.comments.replace_more(limit=None, threshold=0)
    return submission
def comment_time_and_sentiment(submission):
    """
    Score every top-level comment with VADER sentiment.

    Returns a DataFrame indexed by comment time (shifted back one hour —
    presumably a local-time/UTC adjustment; confirm against the match
    timezone) with columns 'sentiment' (VADER compound score) and
    'score' (reddit upvotes).
    """
    stamps = []
    sentiments = []
    upvotes = []
    one_hour = datetime.timedelta(hours=1)
    # Only top-level comments are scored; replies are ignored.
    for comment in submission.comments:
        stamps.append(datetime.datetime.fromtimestamp(comment.created_utc) - one_hour)
        sentiments.append(vader.polarity_scores(comment.body)['compound'])
        upvotes.append(comment.score)
    return pd.DataFrame(data={'sentiment': sentiments, 'score': upvotes},
                        index=pd.to_datetime(stamps))
def posneg_sentiment_difference(df, bins='1min'):
    """
    Bin comments in time and return (#positive - #negative) per bin.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a DatetimeIndex and a 'sentiment' column.
    bins : str
        Pandas offset alias for the bin width (default '1min').

    Returns
    -------
    pd.Series
        Difference in counts per time bin; bins present in only one of
        the positive/negative groupings become NaN and are dropped.
    """
    # Find comments with positive > 0 and negative < 0 sentiment
    pdf = df[df['sentiment'] > 0]
    ndf = df[df['sentiment'] < 0]
    # Bin. pd.Grouper replaces pd.TimeGrouper, which was removed from pandas.
    pgdf = pdf.groupby(pd.Grouper(freq=bins)).count()
    ngdf = ndf.groupby(pd.Grouper(freq=bins)).count()
    diff_df = (pgdf['sentiment'] - ngdf['sentiment']).dropna()
    return diff_df
def weighted_posneg_sentiment_difference(df, bins='1min'):
    """
    Like posneg_sentiment_difference, but restricted to comments with a
    positive reddit score, and with each bin's count multiplied by the
    'score' column count.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a DatetimeIndex and 'sentiment' / 'score' columns.
    bins : str
        Pandas offset alias for the bin width (default '1min').

    Returns
    -------
    pd.Series
        Per-bin difference; bins missing from either side are dropped.
    """
    # Keep only upvoted comments, then split by sentiment sign
    df = pd.DataFrame(df[df['score'] > 0])
    pdf = df[df['sentiment'] > 0]
    ndf = df[df['sentiment'] < 0]
    # Bin. pd.Grouper replaces pd.TimeGrouper, which was removed from pandas.
    pgdf = pdf.groupby(pd.Grouper(freq=bins)).count()
    ngdf = ndf.groupby(pd.Grouper(freq=bins)).count()
    # Take the difference.
    # NOTE(review): after .count() both columns hold the same comment count,
    # so this is count**2 - count**2 per bin rather than a true upvote
    # weighting — confirm whether .sum() of 'score' was intended. Preserved
    # as-is to keep the published figures reproducible.
    diff_df = (pgdf['sentiment']*pgdf['score'] - ngdf['sentiment']*ngdf['score']).dropna()
    return diff_df
def plot_sentiment_figure(df, match_events, opposition):
    """
    Plot the binned sentiment difference over match time, with a strip of
    match events (goals, disallowed goals, cards) above the main axis.

    Parameters
    ----------
    df : pd.Series
        Output of (weighted_)posneg_sentiment_difference, time-indexed.
    match_events : dict
        datetime.time values for 'kickoff', '1sthalfend', '2ndhalfbegin',
        '2ndhalfend', plus lists of event times (may be empty). The
        'opponenet_yellow' key keeps the typo used where the dict is built.
    opposition : str
        Opponent name, used as a y-tick label on the event strip.

    Returns
    -------
    fig, ax, ax_me
        The figure, the main sentiment axis, and the match-event axis.
    """
    def _shift(t, minutes):
        # Add minutes to a datetime.time safely. The original did
        # datetime.time(t.hour, t.minute + 3), which raises ValueError
        # whenever minute >= 57.
        return (datetime.datetime.combine(datetime.date.today(), t)
                + datetime.timedelta(minutes=minutes)).time()
    fig = plt.figure(figsize=(6, 8))
    ax = plt.subplot2grid((7, 1), (1, 0), rowspan=6)
    ax_me = plt.subplot2grid((7, 1), (0, 0), sharex=ax)
    # Main line
    ax.plot(df.index.time, df, linewidth=2, color='firebrick')
    # Scale y axis (make even -/+ directions)
    ax.set_ylim([-np.max(np.abs(ax.get_ylim())), np.max(np.abs(ax.get_ylim()))])
    # x axis runs from 10 min before kickoff to 10 min after full time
    start_xaxis = datetime.datetime.combine(datetime.date.today(), match_events['kickoff'])-datetime.timedelta(minutes=10)
    end_xaxis = datetime.datetime.combine(datetime.date.today(), match_events['2ndhalfend'])+datetime.timedelta(minutes=10)
    # Ticks every 30 minutes (axis units here are seconds past the left edge)
    ax.set_xticks([ax.get_xlim()[0]+m*60 for m in range(0, 180, 30)])
    ax_me.set_xticks([ax.get_xlim()[0]+m*60 for m in range(0, 180, 30)])
    ax.set_xlim([start_xaxis.time(), end_xaxis.time()])
    ax.set_xlabel('Time (GMT/BST)')
    # Get y axis lims to place events
    scatter_y_min, scatter_y_max = ax.get_ylim()
    # Shade the first and second half
    ax.fill_between([match_events['kickoff'], match_events['1sthalfend']], scatter_y_min, scatter_y_max+np.abs(scatter_y_max*0.05), facecolor='dimgray', alpha=0.25, zorder=0)
    ax.fill_between([match_events['2ndhalfbegin'], match_events['2ndhalfend']], scatter_y_min, scatter_y_max+np.abs(scatter_y_max*0.05), facecolor='dimgray', alpha=0.25, zorder=0)
    ax.text(_shift(match_events['kickoff'], 3), scatter_y_min+np.abs(scatter_y_min*0.05), 'First Half')
    ax.text(_shift(match_events['2ndhalfbegin'], 3), scatter_y_min+np.abs(scatter_y_min*0.05), 'Second Half')
    # MATCH EVENTS (BELOW HERE) MIGHT HAVE TO CHANGE
    # Event strip rows: y=2 is Liverpool, y=1 is the opposition.
    if match_events['liverpool_goal']:
        ax_me.scatter(match_events['liverpool_goal'], np.tile(2, len(match_events['liverpool_goal'])), color='black', s=50)
    if match_events['opponent_goal']:
        ax_me.scatter(match_events['opponent_goal'], np.tile(1, len(match_events['opponent_goal'])), color='black', s=50)
    # Disallowed goals: black dot with a red cross on top
    if match_events['liverpool_dis_goal']:
        ax_me.scatter(match_events['liverpool_dis_goal'], np.tile(2, len(match_events['liverpool_dis_goal'])), color='black', s=50)
        ax_me.scatter(match_events['liverpool_dis_goal'], np.tile(2, len(match_events['liverpool_dis_goal'])), marker='x', color='red', s=40)
    if match_events['opponent_dis_goal']:
        ax_me.scatter(match_events['opponent_dis_goal'], np.tile(1, len(match_events['opponent_dis_goal'])), color='black', s=50)
        ax_me.scatter(match_events['opponent_dis_goal'], np.tile(1, len(match_events['opponent_dis_goal'])), marker='x', color='red', s=40)
    if match_events['liverpool_yellow']:
        ax_me.scatter(match_events['liverpool_yellow'], np.tile(2, len(match_events['liverpool_yellow'])), marker='s', color='y', s=40)
    if match_events['opponenet_yellow']:
        ax_me.scatter(match_events['opponenet_yellow'], np.tile(1, len(match_events['opponenet_yellow'])), marker='s', color='y', s=40)
    # Red cards drawn in red (the original used color='y' here, a copy-paste
    # from the yellow-card branches)
    if match_events['liverpool_red']:
        ax_me.scatter(match_events['liverpool_red'], np.tile(2, len(match_events['liverpool_red'])), marker='s', color='r', s=40)
    if match_events['opponent_red']:
        ax_me.scatter(match_events['opponent_red'], np.tile(1, len(match_events['opponent_red'])), marker='s', color='r', s=40)
    ax_me.set_xticklabels([])
    ax_me.set_ylim(0.5, 2.5)
    ax_me.set_yticks([1, 2])
    ax_me.set_yticklabels([opposition, 'Liverpool'])
    ax_me.set_xlabel('')
    return fig, ax, ax_me
In [5]:
# If saved data exists (and use_saved_data is set), load it; otherwise
# download the thread from reddit, score it, and cache the result as CSV.
if use_saved_data == 1 and os.path.exists('./data/' + analysis_name + '.csv'):
    df = pd.read_csv('./data/' + analysis_name + '.csv', index_col=0, parse_dates=[0])
else:
    # read in reddit api info
    praw_info = pd.read_json('praw.json')
    # do the sentiment analysis
    submission = get_comments(thread_id, praw_info)
    df = comment_time_and_sentiment(submission)
    # Make sure the cache directory exists before writing (the original
    # to_csv call failed when ./data was missing)
    os.makedirs('./data', exist_ok=True)
    df.to_csv('./data/' + analysis_name + '.csv')
# Delete reddit api info so credentials don't linger in the notebook state
praw_info = {}
In [6]:
# titles,matchevents = get_match_report(url)
In [7]:
#goal = [matchevents[i] for i,t in enumerate(titles) if t == 'goal!']
#penalty = [matchevents[i] for i,t in enumerate(titles) if t[:7] == 'penalty']
#halftime = [matchevents[i] for i,t in enumerate(titles) if t[:2] == 'ht']
#fulltime = [matchevents[i] for i,t in enumerate(titles) if t[:2] == 'ft']
#kickoff = [matchevents[i] for i,t in enumerate(titles) if t[:9] == 'we\'re off']
match_events = {}
match_events['kickoff'] = datetime.time(19,45)
match_events['1sthalfend'] = datetime.time(20,32)
match_events['2ndhalfbegin'] = datetime.time(20,49)
match_events['2ndhalfend'] = datetime.time(21,38)
match_events['liverpool_goal'] = [datetime.time(20,22),datetime.time(21,20)]
match_events['opponent_goal'] = [datetime.time(21,31)]
match_events['liverpool_dis_goal'] = []
match_events['opponent_dis_goal'] = [datetime.time(19,56)]
match_events['liverpool_yellow'] = [datetime.time(21,2),datetime.time(20,59)]
match_events['opponenet_yellow'] = [datetime.time(20,20),datetime.time(21,22)]
match_events['liverpool_red'] = []
match_events['opponent_red'] = []
# I couldn't be bothered to finish this
match_events['liverpool_chance'] = [datetime.time(20,00),datetime.time(20,32)]
match_events['opponent_chance'] = [datetime.time(20,29)]
In [8]:
# Bin the comments: 2-minute bins for the raw counts, 1-minute bins for
# the upvote-weighted version
posneg_df = posneg_sentiment_difference(df,bins='2min')
weighted_posneg_df = weighted_posneg_sentiment_difference(df,bins='1min')
In [9]:
# Plot the unweighted figure (the original comment said "weighted", but
# this cell uses posneg_df, the unweighted counts)
fig,ax,axtop = plot_sentiment_figure(posneg_df,match_events,opposition)
ax.set_ylabel('# Pos - Neg Comments')
fig.tight_layout()
# Save — assumes ./figures/ already exists; TODO confirm or create it
fig.savefig('./figures/' + analysis_name + '.png',dpi=300)
fig.savefig('./figures/' + analysis_name + '.pdf',dpi=300)
In [10]:
# Plot weighted figure (uses weighted_posneg_df)
fig,ax,axtop = plot_sentiment_figure(weighted_posneg_df,match_events,opposition)
ax.set_ylabel('# Pos - Neg Comments (weighted by upvotes)')
fig.tight_layout()
# Save — assumes ./figures/ already exists; TODO confirm or create it
fig.savefig('./figures/weighted_' + analysis_name + '.png',dpi=300)
fig.savefig('./figures/weighted_' + analysis_name + '.pdf',dpi=300)