In [1]:
# Dependencies
install.packages(c('readxl', 'qgraph', 'reshape2', 'RColorBrewer'))
In [2]:
# We'll need this for XLSX files:
library(readxl)
rm(list=ls())
Load and additionally preprocess the data:
In [3]:
# Open file
eurovision = data.frame(read_xlsx('Eurovision-Processed.xlsx', ))
# Load everything as factors
eurovision$Year = factor(eurovision$Year, ordered=T)
eurovision$Round = factor(eurovision$Round, ordered=T, levels=c('sf','sf1','sf2','f'))
eurovision$Voters = factor(eurovision$Voters)
#For simplicity replace S&M with Serbia (sorry Montenegro)
eurovision$From_country[eurovision$From_country=='Serbia & Montenegro'] = 'Serbia'
eurovision$To_country[eurovision$To_country=='Serbia & Montenegro'] = 'Serbia'
# Shorten long names so that they fit on diagrams
eurovision$From_country[eurovision$From_country=='Bosnia & Herzegovina'] = 'Bosnia&H.'
eurovision$To_country[eurovision$To_country=='Bosnia & Herzegovina'] = 'Bosnia&H.'
eurovision$From_country[eurovision$From_country=='F.Y.R. Macedonia'] = 'Macedonia'
eurovision$To_country[eurovision$To_country=='F.Y.R. Macedonia'] = 'Macedonia'
eurovision$From_country[eurovision$From_country=='Czech Republic'] = 'Czech R.'
eurovision$To_country[eurovision$To_country=='Czech Republic'] = 'Czech R.'
eurovision$From_country[eurovision$From_country=='United Kingdom'] = 'UK'
eurovision$To_country[eurovision$To_country=='United Kingdom'] = 'UK'
# Factor country names
eurovision$From_country = factor(eurovision$From_country)
eurovision$To_country = factor(eurovision$To_country)
# Discard votes from countries for themselves, they're always 0
eurovision = eurovision[eurovision$From_country != eurovision$To_country, ]
# Discard 2004 semifinal for countries that didn't vote in that:
eurovision = eurovision[eurovision$Year != '2004' | eurovision$Round != 'sf' | !(eurovision$From_country %in% c("France","Poland","Russia")),]
# Look what we got
head(eurovision)
Do some sanity checks
In [4]:
# Sanity check 1: sets of countries in both fields are identical
all(levels(eurovision$From_country) == levels(eurovision$To_country))
In [5]:
# Sanity check 2: every country gave away 11 distinct scores
table(aggregate(Points~From_country+Year+Round, data=eurovision, FUN=function(x){length(unique(x))})$Points)
In [6]:
# Sanity check 2.5: countries that didn't give away 11 distinct scores
ver_points = aggregate(Points~From_country+Year+Round+Voters, data=eurovision, FUN=function(c){length(unique(c))})
ver_points[ver_points$Points!=11,]
In [7]:
# Discard countries that participated in 3 contests or less:
# List of participations:
participations = aggregate(Year ~ From_country, eurovision, FUN=function(x){length(unique(x))})
participations = participations[order(participations$Year),]
participations
# Excluded list:
exclude = participations$From_country[participations$Year <= 3]
eurovision = eurovision[!(eurovision$From_country %in% exclude | eurovision$To_country %in% exclude),]
In [8]:
# Re-factor the list after exclusion
eurovision$From_country = factor(eurovision$From_country)
eurovision$To_country = factor(eurovision$To_country)
In [9]:
# Calculate average score for each country in each round:
avg_scores = aggregate(Points ~ To_country+Year+Round+Voters, data=eurovision, mean)
In [10]:
# Calculate biases by substracting average score from votes
biases = eurovision
# This function (signed square of difference) is used to calculate bias:
sigsq <- function(x,y){(x-y)^2*sign(x-y)}
# Calculate bias:
for(i in 1:nrow(biases)) {
biases$Points[i] = sigsq(biases$Points[i], avg_scores$Points[avg_scores$Year == biases[i, 'Year'] & avg_scores$Round == biases[i, 'Round'] & avg_scores$To_country == biases[i, 'To_country']][1])
}
In [11]:
# Aggregate bias scores over all years and tours:
aggr.bias = aggregate(Points~From_country+To_country, data=biases, FUN=mean)
aggr.bias = aggr.bias[order(aggr.bias$From_country),]
In [12]:
# Look at the most and least biased country pairs
aggr.bias[order(aggr.bias$Points, decreasing = T),]
In [13]:
# Load this for visualisation
library(qgraph)
In [14]:
# Plot biases on a weighted directional graph: (uncolored for now)
qgraph(aggr.bias, minimum=25, cut=40)
In [15]:
# This one is needed to convert between long and wide matrices forms
library('reshape2')
In [16]:
# Cast long form to wide form, turn country names into rownames, replace NA's with 0
aggr.distance = dcast(aggr.bias, From_country~To_country)
aggr.distance.matrix = aggr.distance[,-1]
aggr.distance.matrix = as.matrix(aggr.distance.matrix)
rownames(aggr.distance.matrix) = aggr.distance[,1]
aggr.distance.matrix[is.na(aggr.distance.matrix)] = 0
In [17]:
# To make matrix symmetric, we take the larger (by absolute value) number out of (m,n) and (n,m)
indeces = as.numeric(abs(aggr.distance.matrix)>=abs(t(aggr.distance.matrix)))
aggr.distance.matrix.sym = aggr.distance.matrix*indeces + t(aggr.distance.matrix) * (1-indeces)
In [18]:
# Rescale this matrix so that Largest bias is smallest distance
aggr.distance.matrix.sym = max(aggr.distance.matrix.sym)-aggr.distance.matrix.sym
In [19]:
# Convert to distance matrix
aggr.distance.matrix.sym = as.dist(aggr.distance.matrix.sym)
In [20]:
# Run hierarchical clustering:
clust = hclust(aggr.distance.matrix.sym)
In [21]:
# Show the dendrogram
plot(clust)
In [22]:
# Split into 25 clusters
cl_groups=cutree(clust, k=25)
cl_groups=cl_groups[clust$order]
cl_groups
# Get grouplist - list of character vectors; each vector represent 1 cluster
grlist = list()
i=1
for(j in unique(cl_groups)) {
countries = names(cl_groups)[cl_groups==j]
countries = factor(countries, levels = levels(eurovision$From_country))
grlist[[i]] = countries
i=i+1
}
# Remove groups with 1 item
grlist2 = grlist[lapply(grlist,length) > 1]
grlist2
In [23]:
# Remove countries which are alone in a cluster
loners = unlist(grlist[lapply(grlist,length) == 1])
aggr.bias.clust = aggr.bias[!(aggr.bias$From_country %in% loners | aggr.bias$To_country %in% loners),]
aggr.bias.clust$From_country = factor(aggr.bias.clust$From_country)
aggr.bias.clust$To_country = factor(aggr.bias.clust$To_country)
# Re-factorize the list
grlist2 = lapply(grlist2, function(c) {c = factor(c,levels = levels(aggr.bias.clust$From_country))})
Now, visualizations!
In [24]:
# Need this for the color palette
library(RColorBrewer)
In [25]:
# A palette of background colors for graph circles
my_pal=c(brewer.pal(9,"Pastel1"), brewer.pal(8,"Pastel2"), brewer.pal(12, "Paired")[c(1,3,5,7,9,11)], brewer.pal(6,"Set2")[c(5,6)])
In [26]:
# A pallette of foreground colors for the dendrogram
my_pal2=c(brewer.pal(12,"Paired")[c(2,4,6,8,10,12)], brewer.pal(8,"Dark2"), brewer.pal(5,"Set1"), brewer.pal(8,"Accent")[5:8],brewer.pal(10,"RdGy")[c(2,10)])
In [27]:
# Re-draw our unclustered graph with colors
qgraph(aggr.bias, minimum=25, cut=40, color=my_pal[cl_groups][order(names(cl_groups))])
postscript("unclustered.eps", width = 6, height = 6, horizontal = FALSE, onefile = FALSE)
qgraph(aggr.bias, minimum=25, cut=40, color=my_pal[cl_groups][order(names(cl_groups))])
dev.off()
In [28]:
# Unclustered graph with colors for negative items only
qgraph(aggr.bias[aggr.bias$Points<0,], minimum=15, color=my_pal[cl_groups][order(names(cl_groups))])
postscript("unclusteredNeg.eps", width = 6, height = 6, horizontal = FALSE, onefile = FALSE)
qgraph(aggr.bias[aggr.bias$Points<0,], minimum=15, color=my_pal[cl_groups][order(names(cl_groups))])
dev.off()
In [29]:
# Draw a clustered graphs (customly rearranged palette)
qgraph(aggr.bias.clust, layout='groups', groups=grlist2, minimum=35, cut=50, color=my_pal[c(5,17,7,11,2,13,12,18,16,8,1,19,3,6,20,9)])
postscript("clustered.eps", width = 6, height = 6, horizontal = FALSE, onefile = FALSE)
qgraph(aggr.bias.clust, layout='groups', groups=grlist2, minimum=35, cut=50, color=my_pal[c(5,17,7,11,2,13,12,18,16,8,1,19,3,6,20,9)])
dev.off()
In [30]:
# Export clusters into CSV file for map visualisation
cl_groups2 = cl_groups[!names(cl_groups) %in% loners]
ccc = sapply(cl_groups2, function(x){which.max(unique(cl_groups2)==x)})
write.csv(cbind(names(ccc),unname(ccc)), 'cl.csv')
In [31]:
# Count number of wins
a1 = aggregate(Points~To_country+Year+Voters, eurovision, subset = Round=="f", sum)
a2 = aggregate(Points~To_country+Year, a1, mean)
a3 = aggregate(Points~Year, a1, which.max)
winners = c()
for (y in a3$Year) {
aa4 = a2[a2$Year == y,]
winners = c(winners, as.character(aa4$To_country[which.max(aa4$Points)]))
}
winners = table(winners)
winners
In [32]:
# Extend wins with 0
winners[levels(eurovision$To_country)[!levels(eurovision$To_country) %in% names(winners)]]=0
winners=winners[order(names(winners))]
In [33]:
# For dendrogram fan layout
suppressPackageStartupMessages(library(ape))
In [34]:
# For circle
library(plotrix)
In [35]:
# Plot dendrogram with circular layout, draw the level circle
plot(as.phylo(clust), type = "radial", tip.color = my_pal2[cl_groups][order(names(cl_groups))], label.offset=0.04, cex=0.6)
draw.circle(0,0,0.94, border = 'gray', lty='dashed')
postscript("printsPerfectly.eps", width = 6, height = 6, horizontal = FALSE, onefile = FALSE)
plot(as.phylo(clust), type = "radial", tip.color = my_pal2[cl_groups][order(names(cl_groups))], label.offset=0.04, cex=0.6)
draw.circle(0,0,0.94, border = 'gray', lty='dashed')
dev.off()
In [36]:
# Re-sort bias to analyse how mutual the arrows are
aggr.bias.2 = aggr.bias[order(aggr.bias$To_country),]
head(aggr.bias.2,20)
In [37]:
# Here, two mutual bias scores between the countries are geometrically averaged:
ab3 = aggr.bias
ab3$Points = sign(aggr.bias$Points * aggr.bias.2$Points) * sqrt(abs(aggr.bias$Points * aggr.bias.2$Points))
ab3 = ab3[order(ab3$Points, decreasing = T),]
head(ab3)
tail(ab3)
In [38]:
# Plot graph of the bias symmetricity:
qgraph(ab3, minimum=15)
# Something didn't work as expected...
In [39]:
# Who is Estonia biased towards:
estb = aggr.bias[aggr.bias$From_country=="Estonia",]
estb[order(estb$Points, decreasing = T),]
In [40]:
# By years: 1975-1984
aggr.bias.84 = aggregate(Points~From_country+To_country, data=biases, FUN=mean, subset = (Year >= 1975 & Year < 1985))
aggr.bias.84 = aggr.bias.84[order(aggr.bias.84$From_country),]
In [41]:
# By years: 1985-1994
aggr.bias.94 = aggregate(Points~From_country+To_country, data=biases, FUN=mean, subset = (Year >= 1985 & Year < 1995))
aggr.bias.94 = aggr.bias.94[order(aggr.bias.94$From_country),]
In [42]:
# By years: 1995-2004
aggr.bias.04 = aggregate(Points~From_country+To_country, data=biases, FUN=mean, subset = (Year >= 1995 & Year < 2005))
aggr.bias.04 = aggr.bias.04[order(aggr.bias.04$From_country),]
In [43]:
# By years: 2005-2017
aggr.bias.17 = aggregate(Points~From_country+To_country, data=biases, FUN=mean, subset = (Year >= 2005))
aggr.bias.17 = aggr.bias.17[order(aggr.bias.17$From_country),]
In [44]:
# Plot all of these:
In [45]:
qgraph(aggr.bias.84, minimum=10, cut=20)
postscript("84.eps", width = 6, height = 6, horizontal = FALSE, onefile = FALSE)
qgraph(aggr.bias.84, minimum=10, cut=20)
dev.off()
In [46]:
qgraph(aggr.bias.94, minimum=20, cut=40)
postscript("94.eps", width = 6, height = 6, horizontal = FALSE, onefile = FALSE)
qgraph(aggr.bias.94, minimum=20, cut=40)
dev.off()
In [47]:
qgraph(aggr.bias.04, minimum=30, cut=40)
postscript("04.eps", width = 6, height = 6, horizontal = FALSE, onefile = FALSE)
qgraph(aggr.bias.04, minimum=30, cut=40)
dev.off()
In [48]:
qgraph(aggr.bias.17, minimum=30, cut=40)
postscript("17.eps", width = 6, height = 6, horizontal = FALSE, onefile = FALSE)
qgraph(aggr.bias.17, minimum=30, cut=40)
dev.off()