In [1]:
# Importing the Python libraries we will use below #
import scipy as scipy
import pandas as pd
import ggplot as gg
import seaborn as sns
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from bokeh.plotting import *
In [2]:
# Setting the chose graphical styles #
%matplotlib inline
output_notebook()
sns.set_style("darkgrid", {"grid.linewidth": .9, "axes.facecolor": ".98"})
sns.set_context("notebook") # paper, notebook, talk, poster
# colour_map = dict(unsafe="red", celebs="blue", general="yellow", awareness="grey", myths="purple", stigma="green", safe="#2ecc71", advocates="#34495e", race="#e74c3c", jokes="#3498db",needle="steelblue", questions="indianred")
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)
In [3]:
# Open the Curitiba ground-truth workbook; individual sheets are parsed from it below
curitiba_workbook_path = "../data/groundtruth/curitiba.xls"
curitibaData = pd.ExcelFile(curitiba_workbook_path)
In [4]:
# List the sheets contained in the workbook
curitibaData.sheet_names
Out[4]:
In [5]:
# Parse the sheet at position 0, using its first row as the header.
# NOTE(review): `sheetname` is the keyword spelling of older pandas releases — confirm it matches the installed version.
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True)
In [6]:
# Show the sheet as parsed
curitibaPublic
Out[6]:
In [7]:
# Keep only the columns from '2014-06' through '2014-09' (a .loc label slice includes both ends).
# NOTE(review): this cell overwrites curitibaPublic, so re-running it after the later
# transpose/rename cells will not give the same result — run the notebook top to bottom.
curitibaPublic = curitibaPublic.loc[:,'2014-06':'2014-09']
curitibaPublic
Out[7]:
In [8]:
# Swap rows and columns, so the former column labels (the months) become the index
curitibaPublic = curitibaPublic.T
curitibaPublic
Out[8]:
In [9]:
# Replace the column labels with short, explanatory names. The names are assigned by position.
# NOTE(review): the two "Primary Care" names are listed Females-then-Males while the two
# "HIV Testing" names are listed Males-then-Females — confirm this matches the order in the sheet.
curitibaPublic.columns = ['Primary Care, Females', 'Primary Care, Males', 'Pregnant Women', 'HIV Testing, Males',
'HIV Testing, Females'] # Giving the columns fairly short and explanatory names
curitibaPublic
Out[9]:
In [10]:
# Inspect which dtype each column was read as
curitibaPublic.dtypes
Out[10]:
In [11]:
# Convert the index labels to datetimes so that the months are handled as dates
curitibaPublic.index = pd.to_datetime(curitibaPublic.index)
In [12]:
# Check that the data still looks the same after converting the index
curitibaPublic
Out[12]:
In [13]:
# Add a column with the monthly total across all measures.
# The sum runs over every column except 'Curitiba Total' itself, so re-running this cell
# does not fold a previously computed total back into the new one.
measure_columns = [name for name in curitibaPublic.columns if name != 'Curitiba Total']
curitibaPublic['Curitiba Total'] = curitibaPublic[measure_columns].sum(axis=1)
curitibaPublic
Out[13]:
Now we'll take a graphical look at the data. First, we'll create a time-series graph for each topic and for the monthly totals (that is, one time-series graph per column).
In [14]:
# Draw one time-series chart per column of curitibaPublic (every measure plus the monthly total).
first_date = curitibaPublic.index.min()
last_date = curitibaPublic.index.max()

for col in curitibaPublic.columns:
    series = curitibaPublic[col]

    # Chart-wide settings; no `tools` argument is passed, so the default tools are used
    chart_settings = dict(
        plot_width=1000,
        plot_height=600,
        title=series.name,                 # the column name is the plot title
        y_axis_label='Tests',
        x_axis_label='Date',
        title_text_font='Oswald',
        title_text_color='#363636',
        background_fill='#FAFAFA',         # plot area
        outline_line_color='#FAFAFA',      # line surrounding the plot
        border_fill='#FAFAFA',             # area surrounding the plot
        x_axis_type='datetime',
        x_range=(first_date, last_date),   # from the first to the last date of the dataset
        y_range=(0, (series.max() * 1.1)), # from 0 to the highest value plus 10% headroom
    )
    fig = figure(**chart_settings)

    # The data line: dates (the index) on x, the column's values on y
    fig.line(
        curitibaPublic.index,
        series,
        line_color='#404040',
        line_width=10,
        line_alpha=0.7,
    )

    # Legend text and border
    legend().label_text_font = 'Open Sans'
    legend().label_text_color = '#363636'
    legend().border_line_color = '#f6f6f6'

    # Labels and ticks shared by both axes
    axis().axis_label_text_font = "Open Sans"
    axis().axis_label_text_font_size = "12pt"
    axis().axis_label_text_color = "#363636"
    axis().major_label_text_font = "Open Sans"
    axis().major_label_text_font_size = "10pt"
    axis().minor_tick_line_color = "#d4d4d4"

    # x-axis keeps a light axis line and ticks; the y-axis line and major ticks are hidden
    xaxis().axis_line_color = '#d4d4d4'
    xaxis().major_tick_line_color = "#d4d4d4"
    yaxis().major_tick_line_color = None
    yaxis().axis_line_color = None

    # Only thin horizontal grid lines
    xgrid().grid_line_color = None
    ygrid().grid_line_color = "#d4d4d4"
    ygrid().grid_line_width = 0.5

    show(fig)
Below we'll plot the lines for all topics in a single chart to make them easier to compare.
In [15]:
# All five measures in one chart so that they can be compared directly.
fig = figure(
    plot_width=1000,
    plot_height=600,
    title='All Topics',
    y_axis_label='Tests',
    x_axis_label='Date',
    title_text_font='Oswald',
    title_text_color='#363636',
    background_fill='#FAFAFA',
    outline_line_color='#FAFAFA',
    border_fill='#FAFAFA',
    x_axis_type='datetime',
    x_range=(curitibaPublic.index.min(), curitibaPublic.index.max()),
    y_range=(0, 3500),  # fixed upper bound for the y-axis
)

# (column, line colour) pairs; the lines are drawn and added to the legend in this order
topic_colours = [
    ("Primary Care, Females", '#00aeef'),
    ("Primary Care, Males", '#cf5c42'),
    ("Pregnant Women", '#5d6263'),
    ("HIV Testing, Females", '#00447c'),
    ("HIV Testing, Males", '#e1d8ad'),
]
for topic, colour in topic_colours:
    fig.line(curitibaPublic.index, curitibaPublic[topic],
             line_color=colour, line_width=5, line_alpha=0.7, legend=topic)

# Legend text and border
legend().label_text_font = 'Open Sans'
legend().label_text_color = '#363636'
legend().border_line_color = '#f6f6f6'

# Labels and ticks shared by both axes
axis().axis_label_text_font = "Open Sans"
axis().axis_label_text_font_size = "12pt"
axis().axis_label_text_color = "#363636"
axis().major_label_text_font = "Open Sans"
axis().major_label_text_font_size = "10pt"
axis().minor_tick_line_color = "#d4d4d4"

# x-axis keeps a light axis line and ticks; the y-axis line and major ticks are hidden
xaxis().axis_line_color = '#d4d4d4'
xaxis().major_tick_line_color = "#d4d4d4"
yaxis().major_tick_line_color = None
yaxis().axis_line_color = None

# Only thin horizontal grid lines
xgrid().grid_line_color = None
ygrid().grid_line_color = "#d4d4d4"
ygrid().grid_line_width = 0.5

show(fig)
We can see above that there is some covariance between the sub-groupings, although July seems to be a bit of a dividing month. We'll therefore dig a bit further into potential correlations to see whether changes within the sub-groupings are indeed similar. If they are, it would imply that there is a large degree of robustness in the data. In other words, if one group gets tested more in a given month, so will the others, meaning that external factors, such as campaigns or increased risk behaviour across sub-groupings, could be at play.
First we'll make a simple correlation matrix using Pandas' built-in DataFrame correlations function, .corr. It correlates all columns pairwise using either pearson, kendall, or spearman.
As we're currently looking at timeseries correlations, we'll just use the default: Pearson.
Pearson assumes that the data is normally distributed. We can't really test that with only four data points, but the code below has been readied for future use.
In [16]:
# Histogram of the "Primary Care, Females" column
histogram_aes = gg.aes(x="Primary Care, Females")
gg.ggplot(histogram_aes, data=curitibaPublic) + gg.geom_histogram()
Out[16]:
In [17]:
# Histogram of the "Primary Care, Males" column
histogram_aes = gg.aes(x="Primary Care, Males")
gg.ggplot(histogram_aes, data=curitibaPublic) + gg.geom_histogram()
Out[17]:
In [18]:
# Histogram of the "Pregnant Women" column
histogram_aes = gg.aes(x="Pregnant Women")
gg.ggplot(histogram_aes, data=curitibaPublic) + gg.geom_histogram()
Out[18]:
In [19]:
# Histogram of the "HIV Testing, Females" column
histogram_aes = gg.aes(x="HIV Testing, Females")
gg.ggplot(histogram_aes, data=curitibaPublic) + gg.geom_histogram()
Out[19]:
In [20]:
# Histogram of the "HIV Testing, Males" column
histogram_aes = gg.aes(x="HIV Testing, Males")
gg.ggplot(histogram_aes, data=curitibaPublic) + gg.geom_histogram()
Out[20]:
In [22]:
# Histogram of the "Curitiba Total" column
histogram_aes = gg.aes(x="Curitiba Total")
gg.ggplot(histogram_aes, data=curitibaPublic) + gg.geom_histogram()
Out[22]:
In [23]:
# Pairwise correlations between all columns; Pearson is also what .corr() uses by default
curitibaPublicCorr = curitibaPublic.corr(method='pearson')
curitibaPublicCorr
Out[23]:
In [24]:
# Correlation plot of the Curitiba columns.
# corrplot expects the raw observations (variables in the columns) and computes the
# correlation matrix itself. Passing the already-computed curitibaPublicCorr would plot
# the correlations of the correlation matrix rather than the correlations of the data,
# so the raw frame is passed here.
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(curitibaPublic, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [25]:
# Regression joint plot of "Primary Care, Females" (x) against "HIV Testing, Females" (y)
sns.jointplot(x="Primary Care, Females", y="HIV Testing, Females", data=curitibaPublic,
              kind="reg", color="#404040");
In [26]:
# Regression joint plot of "HIV Testing, Males" (x) against "HIV Testing, Females" (y)
sns.jointplot(x="HIV Testing, Males", y="HIV Testing, Females", data=curitibaPublic,
              kind="reg", color="#404040");
In [46]:
# Checking that the data file looks right #
!cat ../data/all.tsv | head
In [43]:
# Load the tab-separated Twitter data file #
twitter_read_options = dict(
    encoding='utf-8',
    na_values=['NaN', ''],  # both 'NaN' and empty strings are read as missing values
    parse_dates=[1],        # parse the column at position 1 as dates ...
    index_col=[1],          # ... and use that same column as the index
)
twitterData = pd.read_table('../data/all.tsv', **twitter_read_options)
In [44]:
# Show the first rows of the loaded Twitter data
twitterData.head()
Out[44]:
In [31]:
# Keep only the columns from 'Campaign' through 'Testing' (a .loc label slice includes both ends)
twitterDataSmall = twitterData.loc[:,'Campaign':'Testing']
In [32]:
# Aggregate the topic columns per month
monthly_sums = twitterDataSmall.resample('MS', how='sum')  # sum each topic over each month
monthly_sums['Twitter Total'] = monthly_sums.sum(axis=1)   # monthly total across all topics
# Keep the first four rows only: there is no October data in the ground truth
twitterDataSmallAgg = monthly_sums.iloc[:4]
twitterDataSmallAgg
Out[32]:
In [33]:
# Draw one time-series chart per column of twitterDataSmallAgg (every topic plus the monthly total).
for col in twitterDataSmallAgg:
    fig = figure(                                   # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name,      # Plot title (the column name)
        # The plotted values are monthly sums of the Twitter topic columns, not test counts,
        # so the axis is labelled 'Tweets' rather than the 'Tests' of the ground-truth charts.
        # NOTE(review): assumes each topic column counts tweets — confirm against the data file.
        y_axis_label = 'Tweets',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                # Background colour for plot area
        outline_line_color = '#FAFAFA',             # Colour of line surrounding plot
        border_fill = '#FAFAFA',                    # Background colour for surrounding area
        x_axis_type = 'datetime',
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()),            # x-axis runs from the first to the last date of the dataset
        y_range = (0, (twitterDataSmallAgg[col].max() * 1.1)),  # y-axis runs from 0 to the highest value plus 10%
        # no `tools` argument is passed, so the default tools are used
    )
    fig.line(                                       # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index,                  # x-axis values (index = dates)
        twitterDataSmallAgg[col],                   # y-axis values (the current column)
        line_color = '#404040',                     # Colour of the line
        line_width = 10,                            # Width of the line
        line_alpha = 0.7,                           # Opacity of the line
    )

    # Legend text and border
    legend().label_text_font = 'Open Sans'
    legend().label_text_color = '#363636'
    legend().border_line_color = '#f6f6f6'

    # Labels and ticks shared by both axes
    axis().axis_label_text_font = "Open Sans"
    axis().axis_label_text_font_size = "12pt"
    axis().axis_label_text_color = "#363636"
    axis().major_label_text_font = "Open Sans"
    axis().major_label_text_font_size = "10pt"
    axis().minor_tick_line_color = "#d4d4d4"

    # x-axis keeps a light axis line and ticks; the y-axis line and major ticks are hidden
    xaxis().axis_line_color = '#d4d4d4'
    xaxis().major_tick_line_color = "#d4d4d4"
    yaxis().major_tick_line_color = None
    yaxis().axis_line_color = None

    # Only thin horizontal grid lines
    xgrid().grid_line_color = None
    ygrid().grid_line_color = "#d4d4d4"
    ygrid().grid_line_width = 0.5

    show(fig)
In [34]:
# Left-join the monthly Twitter aggregates onto the Curitiba table, matching rows on the index
# of both frames. on/left_on/right_on, suffixes and copy are left at their pandas defaults.
df = pd.merge(
    curitibaPublic,
    twitterDataSmallAgg,
    how='left',
    left_index=True,
    right_index=True,
    sort=True,
)
In [35]:
# Show the first rows of the merged table
df.head()
Out[35]:
In [36]:
# Pairwise correlations between all columns of the merged table; Pearson is also the .corr() default
dfCorr = df.corr(method='pearson')
dfCorr
Out[36]:
In [37]:
# Correlation plot of the merged ground-truth and Twitter columns.
# corrplot expects the raw observations (variables in the columns) and computes the
# correlation matrix itself. Passing the already-computed dfCorr would plot the
# correlations of the correlation matrix rather than the correlations of the data,
# so the merged frame is passed here.
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(df, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [38]:
# Regression joint plot of "Curitiba Total" (x) against "Twitter Total" (y) from the merged table
sns.jointplot(x="Curitiba Total", y="Twitter Total", data=df,
              kind="reg", color="#404040");
In [ ]:
# NOTE(review): this cell repeats the 'All Topics' chart of the five Curitiba measures that
# is already drawn earlier in the notebook; it plots no Twitter data.
fig = figure(
    plot_width=1000,
    plot_height=600,
    title='All Topics',
    y_axis_label='Tests',
    x_axis_label='Date',
    title_text_font='Oswald',
    title_text_color='#363636',
    background_fill='#FAFAFA',
    outline_line_color='#FAFAFA',
    border_fill='#FAFAFA',
    x_axis_type='datetime',
    x_range=(curitibaPublic.index.min(), curitibaPublic.index.max()),
    y_range=(0, 3500),
)

# Columns and their line colours, paired by position and drawn in this order
topic_names = ["Primary Care, Females", "Primary Care, Males", "Pregnant Women",
               "HIV Testing, Females", "HIV Testing, Males"]
line_colours = ['#00aeef', '#cf5c42', '#5d6263', '#00447c', '#e1d8ad']
for topic, colour in zip(topic_names, line_colours):
    fig.line(curitibaPublic.index, curitibaPublic[topic],
             line_color=colour, line_width=5, line_alpha=0.7, legend=topic)

# Legend text and border
legend().label_text_font = 'Open Sans'
legend().label_text_color = '#363636'
legend().border_line_color = '#f6f6f6'

# Labels and ticks shared by both axes
axis().axis_label_text_font = "Open Sans"
axis().axis_label_text_font_size = "12pt"
axis().axis_label_text_color = "#363636"
axis().major_label_text_font = "Open Sans"
axis().major_label_text_font_size = "10pt"
axis().minor_tick_line_color = "#d4d4d4"

# x-axis keeps a light axis line and ticks; the y-axis line and major ticks are hidden
xaxis().axis_line_color = '#d4d4d4'
xaxis().major_tick_line_color = "#d4d4d4"
yaxis().major_tick_line_color = None
yaxis().axis_line_color = None

# Only thin horizontal grid lines
xgrid().grid_line_color = None
ygrid().grid_line_color = "#d4d4d4"
ygrid().grid_line_width = 0.5

show(fig)
In [40]:
# Regression joint plot of "Primary Care, Females" (x) against the "Prevention" column (y) of the merged table
sns.jointplot(x="Primary Care, Females", y="Prevention", data=df,
              kind="reg", color="#404040");
In [1]:
# Render the contents of ../css/custom.css as HTML output
# (presumably a <style> block that restyles the notebook — confirm against the file).
from IPython.core.display import HTML

# Read the file inside a context manager so the handle is closed as soon as the text is loaded
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()
HTML(styles)
Out[1]:
In [ ]: