Sentiment analysis of match thread

To run the code

Firstly, I write for Python 3.x. Python 2 may work, but I don't consciously try to correct for any Python 2 differences. Apart from installing the necessary packages (pandas, numpy, matplotlib and so on), one additional thing is needed:

  1. Get a client_id / client_secret set up with PRAW / Reddit. In this code it is assumed that there is a file called: praw.json which contains client_id, client_secret, password, user_agent, and username.

Tweaks will need to be made before the match events are fully automatic.

Notebooks are run in the main directory of the repository (and just archived in the notebook folder), so paths will have to be modified if you run the notebook from the notebook folder.

Notes for this match.

Premier League games will get their match events automatically, but for European and cup games the events will probably have to be entered manually.

Import packages


In [1]:
import praw
import datetime
import pandas as pd
import nltk.sentiment.vader
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
# from selenium import webdriver
import numpy as np
import os
from urllib.request import urlopen, urlretrieve

Some parameters

These need to be changed every match


In [2]:
# Per-match settings: edit every value in this cell before analysing a new thread.
url = ''                  # match-report page passed to get_match_report (empty: events entered by hand)
thread_id = '6zwm38'      # Reddit submission id of the match thread
opposition = 'Sevilla'
competition = 'CL'
hometeam = 'Liverpool'

# Base name shared by the cached CSV and the saved figures.
# League-match variant kept for reference:
#analysis_name = 'League_1_' + opposition
analysis_name = '{}_03_{}'.format(competition, opposition)

# Wall-clock times of the match phases.
# Hopefully this can be fixed somehow without having to specify it
kickoff = datetime.time(hour=19, minute=45)
firsthalfend = datetime.time(hour=20, minute=33)
secondhalfbegin = datetime.time(hour=20, minute=48)
secondhalfend = datetime.time(hour=21, minute=38)

More parameters

These are parameters and definitions that don't need to change for each game.


In [3]:
# Objects and switches shared by the cells below; these do not change per match.
# Selenium-based scraping is currently disabled (the page is fetched with urlopen instead):
# driver = webdriver.PhantomJS()
# VADER sentiment analyser from NLTK, used by comment_time_and_sentiment
vader = nltk.sentiment.vader.SentimentIntensityAnalyzer()
# Use the 'ggplot' matplotlib style for every figure
plt.style.use('ggplot')
# 1: load ./data/<analysis_name>.csv when that file exists instead of querying Reddit.
# Set to 0 to force a fresh download even though a saved copy is present.
use_saved_data = 1

Function definitions

Functions that do most of the work

Functions for match events


In [4]:
def get_match_report(url,kickoff,secondhalfbegin):
    """
    Fetch a Telegraph match report and extract the timed events of both teams.

    Parameters
    ----------
    url : str
        Address of the match-report page.
    kickoff : datetime.time
        Wall-clock time at which the first half started.
    secondhalfbegin : datetime.time
        Wall-clock time at which the second half started.

    Returns
    -------
    (home_events, away_events) : tuple of lists
        Each list holds ``[wall_clock_time, event_type]`` pairs as produced
        by ``parse_match_events_html``.
    """
    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(url) as response:
        page = response.read()
    soup = BeautifulSoup(page, 'lxml')
    # Home and away events live in <div> elements with different class strings.
    home_events_html = soup.find_all('div', class_='event home')
    away_events_html = soup.find_all('div', class_='event away')
    home_events = parse_match_events_html(home_events_html,kickoff,secondhalfbegin)
    away_events = parse_match_events_html(away_events_html,kickoff,secondhalfbegin)
    return home_events,away_events

def parse_match_events_html(events_html,kickoff,secondhalfbegin):
    """
    Convert scraped event elements into ``[wall_clock_time, event_type]`` pairs.

    Parameters
    ----------
    events_html : iterable
        Parsed HTML elements; each must contain a ``<time>`` tag whose text is
        the match minute (e.g. ``"37'"`` or ``"45+2'"``) and whose next
        sibling is the event-type text.
    kickoff : datetime.time
        Wall-clock time at which the first half started.
    secondhalfbegin : datetime.time
        Wall-clock time at which the second half started.

    Returns
    -------
    list
        One ``[datetime.time, str]`` entry per element. The event type is
        lower-cased with spaces and newlines removed.
    """
    event_list = []
    for e in events_html:
        time_tag = e.find('time')
        evtype = time_tag.nextSibling.lower().replace(' ','').replace('\n','')
        # "45+2'" -> [45, 2]: base minute followed by optional stoppage time
        minute_parts = [int(part) for part in time_tag.text.replace(' ','').replace("'",'').split('+')]
        base_minute = minute_parts[0]
        elapsed = sum(minute_parts)

        # The half is decided by the base minute, so both a plain "45'" and
        # "45+2'" count as first half; anything from "46'" on is second half.
        if base_minute <= 45:
            time_real = add_times(kickoff,elapsed)
        else:
            time_real = add_times(secondhalfbegin,elapsed-45)

        event_list.append([time_real,evtype])
    return event_list


def parse_match_events(liverpool_events,opposition_events):
    """
    Group both teams' events into the dictionary used for plotting.

    Parameters
    ----------
    liverpool_events, opposition_events : iterable
        Sequences of ``[time, event_type]`` pairs (as returned by
        ``parse_match_events_html``).

    Returns
    -------
    dict
        Maps ``'liverpool_<type>'`` / ``'opponenet_<type>'`` to the list of
        event times. Keys for goal, yellowcard, redcard and substitution are
        always present, as are the (empty) disallowed-goal keys
        ``'liverpool_dis_goal'`` and ``'opponent_dis_goal'``.

    Notes
    -----
    The ``'opponenet_'`` spelling is kept on purpose: the rest of the file
    looks the events up under exactly these keys.
    """
    match_events = {}
    # Preallocate every key so downstream lookups never fail.
    for kind in ('goal', 'yellowcard', 'redcard', 'substitution'):
        match_events['liverpool_' + kind] = []
        match_events['opponenet_' + kind] = []
    # Disallowed goals cannot be parsed automatically, so they start empty.
    match_events['liverpool_dis_goal'] = []
    match_events['opponent_dis_goal'] = []

    # setdefault keeps unexpected event types under their own key instead of
    # raising KeyError.
    for e in liverpool_events:
        match_events.setdefault('liverpool_' + e[1], []).append(e[0])
    for e in opposition_events:
        match_events.setdefault('opponenet_' + e[1], []).append(e[0])

    return match_events

Functions to get comment data and sentiment


In [5]:
def get_comments(thread_id,praw_info):
    """
    Log in to Reddit through PRAW and return the submission for a thread.

    Parameters
    ----------
    thread_id : str
        Reddit submission id.
    praw_info : mapping
        For each of client_id, client_secret, password, user_agent and
        username, ``praw_info[name][0]`` must be the credential value.

    Returns
    -------
    The PRAW submission object, after ``replace_more(limit=None, threshold=0)``
    has been called on its comment forest.
    """
    credential_names = ('client_id', 'client_secret', 'password', 'user_agent', 'username')
    # Element 0 of every entry holds the actual credential value.
    credentials = {name: praw_info[name][0] for name in credential_names}
    reddit = praw.Reddit(**credentials)
    thread = reddit.submission(id=thread_id)
    thread.comments.replace_more(limit=None, threshold=0)
    return thread

def comment_time_and_sentiment(submission):
    """
    Build a time-indexed table of sentiment and score for a thread's comments.

    Iterates over ``submission.comments`` and, for every comment, records the
    VADER compound polarity of its body and its score.

    Returns
    -------
    pandas.DataFrame
        Columns ``'sentiment'`` and ``'score'``, indexed by posting time.

    NOTE(review): the posting time is ``fromtimestamp(created_utc)`` (local
    time of the machine running the code) minus one hour, so the result
    depends on the machine's timezone — confirm this matches the intended
    match-local clock.
    """
    one_hour = datetime.timedelta(hours=1)
    posted_at, compound, votes = [], [], []
    for comment in submission.comments:
        local_stamp = datetime.datetime.fromtimestamp(comment.created_utc)
        posted_at.append(local_stamp - one_hour)
        compound.append(vader.polarity_scores(comment.body)['compound'])
        votes.append(comment.score)
    # A DatetimeIndex lets the frame be binned by time later on.
    return pd.DataFrame(data={'sentiment': compound,'score':votes}, index = pd.to_datetime(posted_at))

def posneg_sentiment_difference(df,bins='1min'):
    """
    Count positive minus negative comments per time bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a DatetimeIndex and a ``'sentiment'`` column.
    bins : str
        Pandas frequency string giving the bin width (default ``'1min'``).

    Returns
    -------
    pandas.Series
        Per-bin difference (number of comments with sentiment > 0) minus
        (number with sentiment < 0). Comments with sentiment exactly 0 are
        ignored.
    """
    positive = df[df['sentiment'] > 0]
    negative = df[df['sentiment'] < 0]

    # pd.Grouper replaces pd.TimeGrouper, which no longer exists in pandas.
    pos_counts = positive.groupby(pd.Grouper(freq=bins))['sentiment'].count()
    neg_counts = negative.groupby(pd.Grouper(freq=bins))['sentiment'].count()

    # A bin present on only one side means zero comments of the other
    # polarity; count it as 0 rather than discarding the bin.
    return pos_counts.sub(neg_counts, fill_value=0)


def weighted_posneg_sentiment_difference(df,bins='1min'):
    """
    Upvote-weighted positive minus negative comments per time bin.

    Only comments with ``score > 0`` are considered. Within each bin the
    scores of comments with sentiment > 0 are summed, and the summed scores
    of comments with sentiment < 0 are subtracted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a DatetimeIndex and ``'sentiment'`` and ``'score'`` columns.
    bins : str
        Pandas frequency string giving the bin width (default ``'1min'``).

    Returns
    -------
    pandas.Series
        Per-bin weighted difference, indexed by bin start time.
    """
    upvoted = df[df['score'] > 0]
    pos_scores = upvoted.loc[upvoted['sentiment'] > 0, 'score']
    neg_scores = upvoted.loc[upvoted['sentiment'] < 0, 'score']

    # Sum the scores per bin. Counting rows in both columns (as was done
    # before) only yields the comment count squared, not an upvote weighting.
    # pd.Grouper replaces pd.TimeGrouper, which no longer exists in pandas.
    pos_total = pos_scores.groupby(pd.Grouper(freq=bins)).sum()
    neg_total = neg_scores.groupby(pd.Grouper(freq=bins)).sum()

    # A bin present on only one side contributes 0 for the other polarity.
    return pos_total.sub(neg_total, fill_value=0)

Plotting and misc functions


In [6]:
def add_times(base_time, match_time):
    """
    Return the clock time ``match_time`` minutes after ``base_time``.

    ``base_time`` is a ``datetime.time``; ``match_time`` is anything accepted
    by ``int()``. The sum wraps around midnight because only the time of day
    is returned.
    """
    # datetime.time has no arithmetic, so anchor it to today's date first.
    anchor = datetime.datetime.combine(datetime.date.today(), base_time)
    shifted = anchor + datetime.timedelta(minutes=int(match_time))
    return shifted.time()

def _scatter_events(ax_me, times, row, **style):
    """Scatter ``times`` on row ``row`` of the event axis; return the handle, or None when there is nothing to draw."""
    if not times:
        return None
    return ax_me.scatter(times, np.tile(row, len(times)), **style)


def plot_sentiment_figure(df,match_events,opposition):
    """
    Plot a sentiment time series with a strip of match events above it.

    Parameters
    ----------
    df : pandas.Series
        Values to plot; its index must expose ``.time`` (a DatetimeIndex).
    match_events : dict
        Must contain ``'kickoff'``, ``'firsthalfend'``, ``'secondhalfbegin'``
        and ``'secondhalfend'`` as ``datetime.time``. Event lists are read
        from ``'liverpool_<type>'`` / ``'opponenet_<type>'`` for goal,
        yellowcard, redcard and substitution, and from
        ``'liverpool_dis_goal'`` / ``'opponent_dis_goal'``; a missing or
        empty list is simply not drawn.
    opposition : str
        Label for the opponent's row on the event strip.

    Returns
    -------
    (fig, ax, ax_me)
        The figure, the main sentiment axis and the match-event axis.
    """
    fig = plt.figure(figsize=(6,8))
    ax = plt.subplot2grid((7, 1), (1, 0), rowspan=6)
    ax_me = plt.subplot2grid((7, 1), (0, 0),sharex=ax)
    # Main line
    ax.plot(df.index.time,df,linewidth=2,color='firebrick')
    # Make the y axis symmetric around zero
    y_extent = np.max(np.abs(ax.get_ylim()))
    ax.set_ylim([-y_extent,y_extent])
    # x axis runs from 10 minutes before kickoff to 10 minutes after full time
    today = datetime.date.today()
    padding = datetime.timedelta(minutes=10)
    start_xaxis = datetime.datetime.combine(today,match_events['kickoff'])-padding
    end_xaxis = datetime.datetime.combine(today,match_events['secondhalfend'])+padding
    ax.set_xticks([match_events['kickoff'].hour*3600+m*60 for m in range(0,180,30)])
    ax.set_xlim([start_xaxis.time(),end_xaxis.time()])
    ax.set_xlabel('Time (GMT/BST)')
    # Shade and label both halves
    scatter_y_min, scatter_y_max = ax.get_ylim()
    shade_top = scatter_y_max+np.abs(scatter_y_max*0.05)
    label_y = scatter_y_min+np.abs(scatter_y_min*0.05)
    halves = (('kickoff','firsthalfend','First Half'),
              ('secondhalfbegin','secondhalfend','Second Half'))
    for begin_key,end_key,_ in halves:
        ax.fill_between([match_events[begin_key],match_events[end_key]],scatter_y_min,shade_top,facecolor='dimgray',alpha=0.25,zorder=0)
    for begin_key,_,half_label in halves:
        # add_times handles minute overflow (e.g. a half starting at xx:58),
        # which datetime.time(hour, minute+3) would reject with ValueError.
        ax.text(add_times(match_events[begin_key],3),label_y,half_label)

    # MATCH EVENTS (BELOW HERE) MIGHT HAVE TO CHANGE
    # Liverpool is drawn on row 2, the opponent on row 1.
    teams = (('liverpool',2),('opponenet',1))
    axlabs = []
    for team,row in teams:
        axlabs.append(_scatter_events(ax_me,match_events.get(team+'_goal'),row,color='black',s=50,label='goal'))
    # Disallowed goals: a goal marker crossed out in red, no legend entry.
    # Note that the opponent key here is spelled 'opponent_', not 'opponenet_'.
    for key,row in (('liverpool_dis_goal',2),('opponent_dis_goal',1)):
        dis_goals = match_events.get(key)
        _scatter_events(ax_me,dis_goals,row,color='black',s=50)
        _scatter_events(ax_me,dis_goals,row,marker='x',color='red',s=40)
    marker_styles = (
        ('yellowcard',dict(marker='s',color='y',s=40,label='yellow')),
        ('redcard',dict(marker='s',color='r',s=40,label='red')),
        ('substitution',dict(marker='s',color='g',s=10,label='sub')),
    )
    for kind,style in marker_styles:
        for team,row in teams:
            axlabs.append(_scatter_events(ax_me,match_events.get(team+'_'+kind),row,**style))

    # One legend entry per distinct label, in first-seen order
    labels = []
    handles = []
    for handle in axlabs:
        if handle is None:
            continue
        label = handle.get_label()
        if label not in labels:
            labels.append(label)
            handles.append(handle)
    # Skip the legend entirely when no event was drawn (ncol would be 0).
    if handles:
        fig.legend(handles,labels,ncol=len(handles),loc=9,fontsize='small')

    ax_me.set_ylim(0.5,2.5)
    ax_me.set_yticks([1,2])
    ax_me.set_yticklabels([opposition,'Liverpool'])
    ax_me.set_xlabel('')
    plt.setp(ax_me.get_xticklabels(), visible=False)

    return fig,ax,ax_me

Get comments and sentiment score

If the data already exists, it loads that (remember to run in main directory, not notebook directory)


In [7]:
# Reuse the cached CSV only when use_saved_data is 1 and the file is present;
# in every other case fetch the thread from Reddit and (re)write the cache.
if not (use_saved_data == 1 and os.path.exists('./data/' + analysis_name + '.csv')):
    # Reddit API credentials
    praw_info = pd.read_json('praw.json')
    # Download the thread and score every comment
    submission = get_comments(thread_id,praw_info)
    df = comment_time_and_sentiment(submission)
    df.to_csv('./data/' + analysis_name +  '.csv')
    # Drop the credentials once they are no longer needed
    praw_info = {}
else:
    # First CSV column holds the comment timestamps and becomes the index
    df = pd.read_csv('./data/' + analysis_name + '.csv', index_col=0, parse_dates=[0])

Get match report


In [8]:
#if hometeam.lower() == 'liverpool':
#    liverpool_events,opposition_events = get_match_report(url,kickoff,secondhalfbegin)
#else:
#    opposition_events,liverpool_events = get_match_report(url,kickoff,secondhalfbegin)

Parse match report


In [9]:
# Automatic parsing is disabled for this match:
# match_events = parse_match_events(liverpool_events,opposition_events)

# Events entered by hand. Taken from www.telegraph.co.uk/football/2017/08/23/liverpool-vs-hoffenheim-champions-league-qualifier-live-score/#update-20170823-1956 or matchthread
# NOTE(review): the URL above points at a Liverpool vs Hoffenheim report — confirm the source used for these times.
match_events = {
    'liverpool_goal': [datetime.time(20,6),datetime.time(20,21)],
    'opponenet_goal': [datetime.time(19,51),datetime.time(21,15)],
    'liverpool_dis_goal': [datetime.time(20,27)],
    'opponent_dis_goal': [],
    'liverpool_yellowcard': [datetime.time(20,24),datetime.time(21,7)],
    'opponenet_yellowcard': [datetime.time(20,19),datetime.time(20,21),datetime.time(20,27)],
    'liverpool_redcard': [datetime.time(21,37)],
    'opponenet_redcard': [datetime.time(21,2)],
    'liverpool_substitution': [datetime.time(21,19),datetime.time(21,27),datetime.time(21,32)],
    'opponenet_substitution': [datetime.time(20,30),datetime.time(21,26)],
}
# The phase boundaries travel in the same dictionary as the events.
match_events.update(
    kickoff=kickoff,
    firsthalfend=firsthalfend,
    secondhalfbegin=secondhalfbegin,
    secondhalfend=secondhalfend,
)

Get positive/negative difference

Sort into number of positive and number of negative comments


In [10]:
# Per-bin sentiment balance of the thread, unweighted and weighted.
# NOTE(review): the unweighted series uses 2-minute bins while the weighted
# one uses 1-minute bins — confirm the difference is intentional.
posneg_df = posneg_sentiment_difference(df,bins='2min')
weighted_posneg_df = weighted_posneg_sentiment_difference(df,bins='1min')

Plot figure (unweighted)


In [11]:
# Plot unweighted figure
fig,ax,axtop = plot_sentiment_figure(posneg_df,match_events,opposition)
ax.set_ylabel('# Pos - Neg Comments')
fig.tight_layout()
# Save as both PNG and PDF under ./figures/ (the directory must already exist)
fig.savefig('./figures/' + analysis_name +  '.png',dpi=300)
fig.savefig('./figures/' + analysis_name +  '.pdf',dpi=300)


Plot figure (weighted)


In [12]:
# Plot weighted figure
fig,ax,axtop = plot_sentiment_figure(weighted_posneg_df,match_events,opposition)
ax.set_ylabel('# Pos - Neg Comments (weighted by upvotes)')
fig.tight_layout()
# Save as both PNG and PDF under ./figures/, prefixed with 'weighted_'
fig.savefig('./figures/weighted_' + analysis_name +  '.png',dpi=300)
fig.savefig('./figures/weighted_' + analysis_name +  '.pdf',dpi=300)