In [1]:
# Importing the Python libraries we will use below #
import sys
import numpy as np
import scipy
import scipy.stats  # needed explicitly for scipy.stats.mstats.normaltest below
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import ggplot as gg
import seaborn as sns
from bokeh.plotting import *  # brings figure, show, output_notebook, etc. into the namespace
In [2]:
# Watermark can be installed with `pip install watermark`; the %install_ext approach below is deprecated in newer IPython:
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark -a 'Rene Clausen Nielsen, UN Global Pulse' -p pandas,numpy,scipy,geolocator,ggplot,matplotlib,mpld3,seaborn,bokeh -d -n -t -z -v -m -g
In [3]:
# Setting the chosen graphical styles #
%matplotlib inline
output_notebook()
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)
sns.set_context("poster")
sns.despine()
#sns.set_style("whitegrid", {'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10, 'axes.titlesize': 12,
# 'xtick.labelsize': 10, 'ytick.labelsize': 10, 'grid.linewidth': .2, 'axes.facecolor': ".97",
# 'grid.color': '.9', 'axes.edgecolor': '.9', 'font.family': ['sans-serif'], 'lines.solid_capstyle': 'round',
# 'font.sans-serif': ['Liberation Sans','Bitstream Vera Sans','sans-serif','Arial'],})
sns.set_style("whitegrid", {'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10, 'axes.titlesize': 12,
'xtick.labelsize': 10, 'ytick.labelsize': 10, 'grid.linewidth': .2, 'grid.color': '.9',
'axes.edgecolor': '.9'})
In [4]:
populationData = pd.read_csv('../data/BrazilPopulation2014.csv', encoding='utf-8',)
populationData.sort("Population", ascending=False)
Out[4]:
In [5]:
populationData.sort("Population").plot(x = "City",
y = "Population",
kind = "barh",
title = "Number of Inhabitants per City",
legend = False,
# colormap = cmap,
color = "#00447c",
alpha=1)
Out[5]:
In [6]:
curitibaData = pd.ExcelFile("../data/groundtruth/curitiba.xls")
In [7]:
curitibaData.sheet_names
Out[7]:
In [8]:
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True, index_col=0)
In [9]:
curitibaPublic
Out[9]:
In [10]:
# curitibaPublic = curitibaPublic.loc[:,'2014-06':'2014-09'] # Including only months where we also have Twitter data
# curitibaPublic
In [11]:
curitibaPublic = curitibaPublic.transpose()
curitibaPublic
Out[11]:
In [12]:
curitibaPublic.columns = ['Primary Care, Females', 'Primary Care, Males', 'Pregnant Women', 'HIV Testing, Males',
                          'HIV Testing, Females'] # Giving the columns fairly short and explanatory names
curitibaPublic = curitibaPublic.drop('Pregnant Women', 1) # Excluding numbers from maternal health facilities, as they count tests delivered rather than tests performed
curitibaPublic
Out[12]:
In [13]:
curitibaPublic.dtypes # Figuring out what datatype each column is read as
Out[13]:
In [14]:
curitibaPublic.index = pd.to_datetime(curitibaPublic.index) # Making sure that months are read as such
In [15]:
curitibaPublic # Checking that data looks the same after the datatype shenanigans
Out[15]:
In [16]:
curitibaPublic['Curitiba Total'] = curitibaPublic.sum(axis=1) # Adding a column with monthly totals
curitibaPublic
Out[16]:
Now we'll start looking at the data graphically. First we'll create a time series graph for each topic and for the monthly totals (that is, one graph per column).
In [17]:
for col in curitibaPublic:
    fig = figure( # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = curitibaPublic[col].name, # Plot title
        y_axis_label = 'Tests',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FFFFFF', # Background colour for plot area (#FAFAFA)
        outline_line_color = '#FFFFFF', # Colour of line surrounding plot (#FAFAFA)
        border_fill = '#FFFFFF', # Background colour for surrounding area (#FAFAFA)
        x_axis_type = 'datetime', # NOTE: only need to define this on first graph
        x_range = (curitibaPublic.index.min(),
                   curitibaPublic.index.max()), # Setting x-axis to start and end on first and last date of dataset
        y_range = (0, (curitibaPublic[col].max() * 1.1)), # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools = "pan,wheel_zoom,box_zoom,reset,previewsave" # NOTE: only needed on first; if commented out, default tools are chosen
    )
    fig.line( # Inserting a line in the chart called "fig"
        curitibaPublic.index, # Variable values for the x-axis (index = dates)
        curitibaPublic[col], # Variable values for the y-axis (loops over all columns)
        line_color = '#404040', # Colour of the line
        line_width = 10, # Width of the line
        line_alpha = 0.7, # Opacity of the line
        #legend = curitibaPublic[col].name, # Label name for the legend (column name)
    )
    # legend().label_text_font='Open Sans'
    # legend().label_text_color='#363636'
    # legend().border_line_color='#f6f6f6'
    # axis().axis_label_text_font = "Open Sans"
    # axis().axis_label_text_font_size = "12pt"
    # axis().axis_label_text_color = "#363636"
    # axis().major_label_text_font="Open Sans"
    # axis().major_label_text_font_size="10pt"
    # axis().minor_tick_line_color = "#d4d4d4"
    # xaxis().axis_line_color = '#d4d4d4'
    # xaxis().major_tick_line_color = "#d4d4d4"
    # yaxis().major_tick_line_color = None
    # yaxis().axis_line_color = None
    # xgrid().grid_line_color = None
    # ygrid().grid_line_color = "#d4d4d4"
    show(fig)
Below we'll plot lines for all topics in a single chart to make them easier to compare.
In [18]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'All Groups', y_axis_label = 'Tests', x_axis_label = 'Date',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FFFFFF',
    outline_line_color = '#FFFFFF', border_fill = '#FFFFFF', x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(), curitibaPublic.index.max()), y_range = (0, 1800),
)
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Females"], line_color = '#00aeef', line_width = 5, line_alpha = 0.7,
         legend = "Primary Care, Females")
fig.line(curitibaPublic.index, curitibaPublic["Primary Care, Males"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7,
         legend = "Primary Care, Males")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Females"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Females")
fig.line(curitibaPublic.index, curitibaPublic["HIV Testing, Males"], line_color = '#e1d8ad', line_width = 5, line_alpha = 0.7,
         legend = "HIV Testing, Males")
#legend().label_text_font='Open Sans'
#legend().label_text_color='#363636'
#legend().border_line_color='#f6f6f6'
#axis().axis_label_text_font = "Open Sans"
#axis().axis_label_text_font_size = "12pt"
#axis().axis_label_text_color = "#363636"
#axis().major_label_text_font="Open Sans"
#axis().major_label_text_font_size="10pt"
#axis().minor_tick_line_color = "#d4d4d4"
#xaxis().axis_line_color = '#d4d4d4'
#xaxis().major_tick_line_color = "#d4d4d4"
#yaxis().major_tick_line_color = None
#yaxis().axis_line_color = None
#xgrid().grid_line_color = None
#ygrid().grid_line_color = "#d4d4d4"
#ygrid().grid_line_width = 0.5
show(fig)
We can see above that there is some covariance between the sub-groupings, but July also seems to be a bit of a dividing month. We'll therefore dig further into potential correlations to see whether changes within the sub-groupings are indeed similar. If they are, it would imply a large degree of robustness in the data: when one group gets tested more in a given month, so do the others, meaning that external factors, such as campaigns or increased risk behaviour across sub-groupings, could be at play.
First we'll make a simple correlation matrix using pandas' built-in DataFrame correlation function, .corr(), which correlates all columns pairwise using Pearson, Kendall, or Spearman.
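As an aside, switching method is just a keyword change. A quick sketch on toy data (hypothetical numbers, not part of this analysis):
demo = pd.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [2, 1, 4, 3, 6]})
print(demo.corr())                   # Pearson, the default
print(demo.corr(method='spearman'))  # rank-based alternative
print(demo.corr(method='kendall'))   # rank-based alternative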
As we're looking at time series correlations, we'll use the default, Pearson. Since Pearson correlation assumes roughly normally distributed data, we first run a normality test on each series.
In [19]:
normalTestPrimaryFemales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Females"])
normalTestPrimaryMales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Males"])
normalTestHIVTestFemales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Females"])
normalTestHIVTestMales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Males"])
normalTestTotal = scipy.stats.mstats.normaltest(curitibaPublic["Curitiba Total"])
print('Normal Distribution Test for "Primary Care, Females": %s' % (normalTestPrimaryFemales,))
print('Normal Distribution Test for "Primary Care, Males": %s' % (normalTestPrimaryMales,))
print('Normal Distribution Test for "HIV Testing, Females": %s' % (normalTestHIVTestFemales,))
print('Normal Distribution Test for "HIV Testing, Males": %s' % (normalTestHIVTestMales,))
print('Normal Distribution Test for "Curitiba Total": %s' % (normalTestTotal,))
#curitibaPublic["Primary Care, Females"].normaltest()
In [20]:
curitibaPublicCorr = curitibaPublic.corr() # Using default method: Pearson
curitibaPublicCorr
Out[20]:
In [21]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(curitibaPublic, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [22]:
sns.jointplot("HIV Testing, Males", "HIV Testing, Females", curitibaPublic, kind="reg", color="#404040");
In [23]:
# Checking that the data file looks right #
!cat spark/output-final/all.csv | head
In [24]:
# Read in Twitter data file #
twitterData = pd.read_csv('spark/output-final/all.csv',
                          encoding='utf-8',
                          #header=None,
                          na_values=['NaN', ''],
                          parse_dates=[3],
                          index_col=[3]
                          )
In [25]:
twitterData.head()
Out[25]:
In [26]:
twitterDataCounts = pd.DataFrame({"Tweets" : twitterData.groupby(["city"]).size()}).reset_index()
twitterDataCounts.sort("Tweets", ascending=False)
Out[26]:
In [27]:
twitterDataCounts.sort("Tweets").plot(x = "city", y = "Tweets", kind = "barh", title = "Number of Tweets per City",
legend = False, color="#00447c", alpha=0.8)
Out[27]:
In [28]:
cityData = pd.merge(twitterDataCounts, populationData, how='outer', left_on="city", right_on="City", copy=True)
cityData = cityData[["City","Population","Tweets"]]
cityData
Out[28]:
In [29]:
cityData["Tweets per 1,000 inhabitants"] = cityData["Tweets"]/cityData["Population"]*1000/15
cityData.sort("Tweets per 1,000 inhabitants").plot(x = "City", y = "Tweets per 1,000 inhabitants", kind = "barh",
title = "Monthly Tweets per 1,000 inhabitants", legend = False,
color="#00447c", alpha=0.8)
Out[29]:
In [30]:
twitterDataCuritiba = twitterData[twitterData['city'] == 'Curitiba'] # Getting Curitiba data only
twitterDataSmall = twitterDataCuritiba[['city','topic']] # Getting rid of columns we won't need
twitterDataSmall.head()
Out[30]:
In [31]:
twitterDataSmall.describe()
Out[31]:
In [32]:
twitterDataSmall = pd.get_dummies(twitterDataSmall['topic'])
twitterDataSmall.head()
Out[32]:
In [33]:
twitterDataSmallAgg = twitterDataSmall.resample('MS', how='sum') # Resampling by summing each topic over each month
twitterDataSmallAgg['Twitter Total'] = twitterDataSmallAgg.sum(axis=1) # Adding a column with monthly totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataSmallAgg
Out[33]:
In [34]:
for col in twitterDataSmallAgg:
    fig = figure( # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name, # Plot title
        y_axis_label = 'Tweets',
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FFFFFF', # Background colour for plot area
        outline_line_color = '#FFFFFF', # Colour of line surrounding plot
        border_fill = '#FFFFFF', # Background colour for surrounding area
        x_axis_type = 'datetime', # NOTE: only need to define this on first graph
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()), # Setting x-axis to start and end on first and last date of dataset
        y_range = (0, (twitterDataSmallAgg[col].max() * 1.1)), # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools = "pan,wheel_zoom,box_zoom,reset,previewsave" # NOTE: only needed on first; if commented out, default tools are chosen
    )
    fig.line( # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index, # Variable values for the x-axis (index = dates)
        twitterDataSmallAgg[col], # Variable values for the y-axis (loops over all columns)
        line_color = '#404040', # Colour of the line
        line_width = 10, # Width of the line
        line_alpha = 0.7, # Opacity of the line
        #legend = twitterDataSmallAgg[col].name, # Label name for the legend (column name)
    )
    #legend().label_text_font='Open Sans'
    #legend().label_text_color='#363636'
    #legend().border_line_color='#f6f6f6'
    #axis().axis_label_text_font = "Open Sans"
    #axis().axis_label_text_font_size = "12pt"
    #axis().axis_label_text_color = "#363636"
    #axis().major_label_text_font="Open Sans"
    #axis().major_label_text_font_size="10pt"
    #axis().minor_tick_line_color = "#d4d4d4"
    #xaxis().axis_line_color = '#d4d4d4'
    #xaxis().major_tick_line_color = "#d4d4d4"
    #yaxis().major_tick_line_color = None
    #yaxis().axis_line_color = None
    #xgrid().grid_line_color = None
    #ygrid().grid_line_color = "#d4d4d4"
    #ygrid().grid_line_width = 0.5
    show(fig)
In [35]:
curitibaTwitterCorr = twitterDataSmallAgg.corr() # Using default method: Pearson
curitibaTwitterCorr
Out[35]:
In [36]:
df = pd.merge(curitibaPublic, twitterDataSmallAgg, how='left', on=None, left_on=None, right_on=None,
              left_index=True, right_index=True, sort=True,
              suffixes=('_x', '_y'), copy=True)
In [37]:
df
Out[37]:
In [38]:
dfNoTotals = df.drop(df.columns[[4, 12]], axis=1)
dfNoTotalsCorr = dfNoTotals.corr() # Using default method: Pearson
dfNoTotalsCorr
Out[38]:
In [39]:
f, ax = plt.subplots(figsize=(12, 12))
sns.corrplot(dfNoTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [40]:
dfTotals = df[["Curitiba Total", "Twitter Total"]]
dfTotalsCorr = dfTotals.corr() # Using default method: Pearson
dfTotalsCorr
Out[40]:
In [41]:
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(dfTotals, annot=True, sig_stars=True, diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [42]:
dfTotals["Curitiba Tests Total"] = dfTotals["Curitiba Total"].sum()
dfTotals["Curitiba Tweets Total"] = dfTotals["Twitter Total"].sum()
dfTotals["Curitiba Tests Total %"] = dfTotals["Curitiba Total"]/dfTotals["Curitiba Tests Total"]*100
dfTotals["Curitiba Tweets Total %"] = dfTotals["Twitter Total"]/dfTotals["Curitiba Tweets Total"]*100
In [43]:
dfTotals
Out[43]:
In [44]:
fig = figure(
    plot_width = 1000, plot_height = 600, title = 'Tests vs. Tweets', x_axis_label = 'Month', # y_axis_label = 'Monthly %',
    title_text_font = 'Oswald', title_text_color = '#363636', background_fill = '#FFFFFF',
    outline_line_color = '#FFFFFF', border_fill = '#FFFFFF', x_axis_type = 'datetime',
    x_range = (dfTotals.index.min(), dfTotals.index.max()), y_range = (0, 10),
)
fig.line(dfTotals.index, dfTotals["Curitiba Tests Total %"], line_color = '#cf5c42', line_width = 5, line_alpha = 0.7,
         legend = "HIV Tests in Public Clinics, Curitiba")
fig.line(dfTotals.index, dfTotals["Curitiba Tweets Total %"], line_color = '#00447c', line_width = 5, line_alpha = 0.7,
         legend = "HIV and Discrimination Tweets, Curitiba")
show(fig)
In [45]:
sns.jointplot("Curitiba Total", "Twitter Total", df, kind="reg", color="#404040");
In [46]:
sns.jointplot("HIV Testing, Males", "Campaign_Portuguese", df, kind="reg", color="#404040");
In [47]:
sns.jointplot("HIV Testing, Females", "Campaign_Portuguese", df, kind="reg", color="#404040");
In [48]:
sns.jointplot("Primary Care, Females", "Prevention_Positive", df, kind="reg", color="#404040");
We can only do anomaly detection on the tweets, because we don't have enough data points in our ground-truth data. As we have very fine-grained temporal information for all tweets (down to the second), we can resample to hourly and daily aggregates. That should give us enough data for anomaly detection.
First we resample to daily aggregates.
In [55]:
twitterDataDailyAgg = twitterDataSmall.resample('D', how='sum') # Resampling by summing each topic over each day
twitterDataDailyAgg['Twitter Total'] = twitterDataDailyAgg.sum(axis=1) # Adding a column with daily totals
# twitterDataSmallAgg = twitterDataSmallAgg.iloc[:4,] # We don't have October data in ground-truth so get rid of that here
twitterDataDailyAgg.head()
Out[55]:
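The same pattern gives the hourly aggregates mentioned above; a minimal sketch (not run here, using the same old-style resample API as the rest of this notebook):
twitterDataHourlyAgg = twitterDataSmall.resample('H', how='sum') # Summing each topic over each hour
twitterDataHourlyAgg['Twitter Total'] = twitterDataHourlyAgg.sum(axis=1) # Adding a column with hourly totals
twitterDataHourlyAgg.head()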
As we'll be using R instead of Python for this, we'll use IPython's built-in R integration via the so-called magic functions.
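(In newer IPython/rpy2 versions the extension is loaded with %load_ext rpy2.ipython instead.) For reference, rmagic provides a line magic, %R, for single statements and a cell magic, %%R, for whole cells, and variables can be passed from Python with -i and back with -o. A minimal illustrative sketch (each magic would live in its own cell):
%R x <- c(1, 2, 3) # line magic: run a single R statement

%%R -i df_r -o df_summary
# cell magic: -i pulls df_r in from Python, -o pushes df_summary back out
df_summary <- summary(df_r)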
In [56]:
%load_ext rmagic
In [57]:
%R library(AnomalyDetection)
Out[57]:
In [58]:
# %%R
# help(AnomalyDetectionTs)
# help(AnomalyDetectionVec)
In [59]:
df_r = twitterDataDailyAgg['Twitter Total']
df_r.to_csv('TwitterDailyAgg.csv', header=['Twitter Total'], date_format='%Y-%m-%d')
In [60]:
!cat TwitterDailyAgg.csv | head
In [61]:
%%R
df_r = read.csv("TwitterDailyAgg.csv", stringsAsFactors=FALSE)
In [62]:
%R data(df_r) # NOTE: data() targets packaged datasets, so this call is a no-op here; df_r already exists from read.csv above
Out[62]:
In [63]:
%R df_r
Out[63]:
In [64]:
%%R
data(df_r) # NOTE: a no-op here (data() loads packaged datasets); df_r already exists from read.csv
res = AnomalyDetectionTs(df_r, max_anoms=0.02, direction='both', plot=TRUE) # Flag at most 2% of observations as anomalies, looking in both directions
res$plot
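Besides the plot, the returned list also carries the anomalies themselves in res$anoms; a minimal sketch of inspecting them:
%%R
res$anoms # Data frame of the flagged timestamps and their values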
In [51]:
%R raw_data
Out[51]:
In [52]:
%%R
data(raw_data) # raw_data is the example dataset shipped with the AnomalyDetection package
res = AnomalyDetectionTs(raw_data, max_anoms=0.02, direction='both', plot=TRUE)
res$plot
In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)
Out[1]:
In [ ]: