Setting the Stage



In [1]:

    
# Importing the Python libraries we will use below #
import scipy as scipy
import pandas as pd
import ggplot as gg
import seaborn as sns
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from bokeh.plotting import *



In [2]:

    
# Setting the chose graphical styles #
%matplotlib inline
output_notebook()
sns.set_style("darkgrid", {"grid.linewidth": .9, "axes.facecolor": ".98"})
sns.set_context("notebook") # paper, notebook, talk, poster
# colour_map = dict(unsafe="red", celebs="blue", general="yellow", awareness="grey", myths="purple", stigma="green", safe="#2ecc71", advocates="#34495e", race="#e74c3c", jokes="#3498db",needle="steelblue", questions="indianred")
cmap = sns.diverging_palette(19, 251, s=60, l=30, sep=100, n=11, as_cmap=True)









    




    
        
        
        
    
        
        BokehJS successfully loaded.

Getting the Ground-Truth Data



In [3]:

    
# Open the ground-truth Excel workbook; its sheets are listed and parsed in the next cells.
curitibaData = pd.ExcelFile("../data/groundtruth/curitiba.xls")



In [4]:

    
# List the sheets contained in the workbook so we know which one to parse.
curitibaData.sheet_names









    Out[4]:





[u'Curitiba HIV Tests (Public)', u'Curitiba HIV Tests (Private)']



In [5]:

    
# Parse the first sheet (position 0), taking its first row as the column headers.
curitibaPublic = curitibaData.parse(sheetname=0, header=0, parse_dates=True)



In [6]:

    
# Display the parsed sheet to inspect its layout (one row per test group, one column per month).
curitibaPublic









    Out[6]:






  
    
      
      HIV tests performed 
      2014-01
      2014-02
      2014-03
      2014-04
      2014-05
      2014-06
      2014-07
      2014-08
      2014-09
    
  
  
    
      0
       Primary Health Care services (females)
       1465
       1425
       1378
       1477
       1437
       1174
        712
       1267
       1438
    
    
      1
         Primary Health Care services (males)
       1057
       1056
        998
       1115
       1064
        840
       1197
       1009
       1250
    
    
      2
         Pregnant women in public health care
       2760
       2360
       2095
       2258
       2473
       2086
       3423
       2020
       2571
    
    
      3
                   HIV testing center (males)
        528
        358
        389
        383
        348
        307
        490
        404
        377
    
    
      4
                 HIV testing center (females)
        186
        133
        158
        155
        155
         94
        197
        158
        107



In [7]:

    
# Keep only the columns from '2014-06' through '2014-09' (label-based slice, both ends included).
# This also drops the text column holding the group names.
curitibaPublic = curitibaPublic.loc[:,'2014-06':'2014-09']
curitibaPublic

Turning the Table



In [8]:

    
# Swap rows and columns: afterwards each month is a row and each test group a column.
curitibaPublic = curitibaPublic.T
curitibaPublic



In [9]:

    
# Giving the columns fairly short and explanatory names.
# The order follows the original sheet rows 0-4.
curitibaPublic.columns = [
    'Primary Care, Females',
    'Primary Care, Males',
    'Pregnant Women',
    'HIV Testing, Males',
    'HIV Testing, Females',
]
curitibaPublic









    Out[9]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      Pregnant Women
      HIV Testing, Males
      HIV Testing, Females
    
  
  
    
      2014-06
       1174
        840
       2086
       307
        94
    
    
      2014-07
        712
       1197
       3423
       490
       197
    
    
      2014-08
       1267
       1009
       2020
       404
       158
    
    
      2014-09
       1438
       1250
       2571
       377
       107



In [10]:

    
curitibaPublic.dtypes # Show the dtype pandas assigned to each column after reading and transposing









    Out[10]:





Primary Care, Females    int64
Primary Care, Males      int64
Pregnant Women           int64
HIV Testing, Males       int64
HIV Testing, Females     int64
dtype: object



In [11]:

    
# Making sure that months are read as such: parse the 'YYYY-MM' index labels
# into timestamps.
curitibaPublic.index = pd.to_datetime(curitibaPublic.index)

# Convert every data column to integers in one step (the index is not touched).
curitibaPublic = curitibaPublic.astype('int')

# Checking that all columns are now integers. This has to be a statement of its
# own: as the tail of a comment it would never be evaluated or displayed.
curitibaPublic.dtypes



In [12]:

    
curitibaPublic # Checking that the values are unchanged after the index and dtype conversions









    Out[12]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      Pregnant Women
      HIV Testing, Males
      HIV Testing, Females
    
  
  
    
      2014-06-01
       1174
        840
       2086
       307
        94
    
    
      2014-07-01
        712
       1197
       3423
       490
       197
    
    
      2014-08-01
       1267
       1009
       2020
       404
       158
    
    
      2014-09-01
       1438
       1250
       2571
       377
       107



In [13]:

    
# Adding a column with monthly totals.
# The total column itself is excluded from the sum, so re-running this cell
# does not add the previous total on top of the group counts.
groupColumns = [c for c in curitibaPublic.columns if c != 'Curitiba Total']
curitibaPublic['Curitiba Total'] = curitibaPublic[groupColumns].sum(axis=1)
curitibaPublic









    Out[13]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      Pregnant Women
      HIV Testing, Males
      HIV Testing, Females
      Curitiba Total
    
  
  
    
      2014-06-01
       1174
        840
       2086
       307
        94
       4501
    
    
      2014-07-01
        712
       1197
       3423
       490
       197
       6019
    
    
      2014-08-01
       1267
       1009
       2020
       404
       158
       4858
    
    
      2014-09-01
       1438
       1250
       2571
       377
       107
       5743

Looking at the Ground-Truth Data

Now we'll take a graphical look at the data. First we'll create a time-series graph for each test group and for the monthly totals (that is, one time-series graph per column).



In [14]:

    
# One Bokeh time-series chart per column of curitibaPublic
# (every test group, plus the monthly total).
for col in curitibaPublic:
    # Figure-level settings: labels, axis ranges, size and colours.
    fig = figure(
        title = curitibaPublic[col].name,                   # The column name doubles as the plot title
        x_axis_label = 'Date',
        y_axis_label = 'Tests',
        x_axis_type = 'datetime',                           # NOTE: only need to define this on first graph
        x_range = (curitibaPublic.index.min(),              # x-axis runs from the first to the last date of the dataset
                   curitibaPublic.index.max()),
        y_range = (0, (curitibaPublic[col].max() * 1.1)),   # y-axis from 0 to the column maximum plus 10% headroom
        plot_width = 1000,
        plot_height = 600,
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                        # Background colour of the plot area
        border_fill = '#FAFAFA',                            # Background colour of the surrounding area
        outline_line_color = '#FAFAFA',                     # Colour of the line surrounding the plot
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"  # NOTE: only needed on first, if commented out, chooses default tools
        )

    # The data series itself: dates on x, this column's values on y.
    fig.line(
        curitibaPublic.index,
        curitibaPublic[col],
        line_color = '#404040',
        line_width = 10,
        line_alpha = 0.7,                                   # Slightly transparent line
        #legend = curitibaPublic[col].name,                 # Label name for the legend (column name)
        )

    # Legend text and border.
    legend().label_text_font = 'Open Sans'
    legend().label_text_color = '#363636'
    legend().border_line_color = '#f6f6f6'

    # Settings shared by both axes.
    axis().axis_label_text_font = "Open Sans"
    axis().axis_label_text_font_size = "12pt"
    axis().axis_label_text_color = "#363636"
    axis().major_label_text_font = "Open Sans"
    axis().major_label_text_font_size = "10pt"
    axis().minor_tick_line_color = "#d4d4d4"

    # x-axis keeps a light axis line and ticks; y-axis line and major ticks are hidden.
    xaxis().axis_line_color = '#d4d4d4'
    xaxis().major_tick_line_color = "#d4d4d4"
    yaxis().major_tick_line_color = None
    yaxis().axis_line_color = None

    # Only thin horizontal grid lines are drawn.
    xgrid().grid_line_color = None
    ygrid().grid_line_color = "#d4d4d4"
    ygrid().grid_line_width = 0.5

    show(fig)

Below we'll insert trendlines for all topics in one chart to better compare.



In [15]:

    
# A single chart with one line per test group, so the groups can be compared directly.
fig = figure(
    title = 'All Topics',
    x_axis_label = 'Date',
    y_axis_label = 'Tests',
    x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(), curitibaPublic.index.max()),
    y_range = (0, 3500),
    plot_width = 1000,
    plot_height = 600,
    title_text_font = 'Oswald',
    title_text_color = '#363636',
    background_fill = '#FAFAFA',
    border_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA',
    )

# (column, line colour) pairs, drawn in this order; the column name is also the legend label.
for groupName, groupColour in [
        ("Primary Care, Females", '#00aeef'),
        ("Primary Care, Males", '#cf5c42'),
        ("Pregnant Women", '#5d6263'),
        ("HIV Testing, Females", '#00447c'),
        ("HIV Testing, Males", '#e1d8ad'),
        ]:
    fig.line(curitibaPublic.index, curitibaPublic[groupName], line_color = groupColour,
             line_width = 5, line_alpha = 0.7, legend = groupName)

# Legend text and border.
legend().label_text_font = 'Open Sans'
legend().label_text_color = '#363636'
legend().border_line_color = '#f6f6f6'

# Settings shared by both axes.
axis().axis_label_text_font = "Open Sans"
axis().axis_label_text_font_size = "12pt"
axis().axis_label_text_color = "#363636"
axis().major_label_text_font = "Open Sans"
axis().major_label_text_font_size = "10pt"
axis().minor_tick_line_color = "#d4d4d4"

# x-axis keeps a light axis line and ticks; y-axis line and major ticks are hidden.
xaxis().axis_line_color = '#d4d4d4'
xaxis().major_tick_line_color = "#d4d4d4"
yaxis().major_tick_line_color = None
yaxis().axis_line_color = None

# Only thin horizontal grid lines are drawn.
xgrid().grid_line_color = None
ygrid().grid_line_color = "#d4d4d4"
ygrid().grid_line_width = 0.5

show(fig)

Correlation Between Test Groups

We can see above that there is some covariance between the sub-groupings, but July also seems to be a bit of a dividing month. We'll therefore dig a bit further into any potential correlations to see whether changes within the sub-groupings are indeed similar. If that is the case, it would imply that there is a large degree of robustness in the data. In other words, if one group gets tested more in a given month, so do the others, meaning that external factors, such as campaigns or increased risk behaviour across sub-groupings, could be at play.

Simple Matrix

First we'll make a simple correlation matrix using Pandas' built-in DataFrame correlations function, .corr. It correlates all columns pairwise using either pearson, kendall, or spearman.

As we're currently looking at timeseries correlations, we'll just use the default: Pearson.

Pearson assumes that the data are normally distributed. We can't really test that with only four data points, but code has been readied below for future use.

# Normality test for every column. Readied for future use only: the prose above
# notes that four monthly data points are too few to test.
import scipy.stats  # make sure the stats submodule is loaded; a bare `import scipy` may not expose it on every SciPy version

normalTestPrimaryFemales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Females"])
normalTestPrimaryMales = scipy.stats.mstats.normaltest(curitibaPublic["Primary Care, Males"])
normalTestPregnantWomen = scipy.stats.mstats.normaltest(curitibaPublic["Pregnant Women"])
normalTestHIVTestFemales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Females"])
normalTestHIVTestMales = scipy.stats.mstats.normaltest(curitibaPublic["HIV Testing, Males"])
# The totals column is named "Curitiba Total"; indexing with "Total" raises a KeyError.
normalTestTotal = scipy.stats.mstats.normaltest(curitibaPublic["Curitiba Total"])

print('Normal Distribution Test for "Primary Care, Females": %s' % (normalTestPrimaryFemales,))
print('Normal Distribution Test for "Primary Care, Males": %s' % (normalTestPrimaryMales,))
print('Normal Distribution Test for "Pregnant Women": %s' % (normalTestPregnantWomen,))
print('Normal Distribution Test for "HIV Testing, Females": %s' % (normalTestHIVTestFemales,))
print('Normal Distribution Test for "HIV Testing, Males": %s' % (normalTestHIVTestMales,))
print('Normal Distribution Test for "Total": %s' % (normalTestTotal,))
#curitibaPublic["Primary Care, Females"].normaltest()



In [16]:

    
# Histogram of the monthly values in "Primary Care, Females".
gg.ggplot(
    gg.aes(x="Primary Care, Females"),
    data=curitibaPublic,
) + gg.geom_histogram()









    



stat_bin: binwidth defaulted to range/30.
    Use 'binwidth = x' to adjust this.
/home/ubuntu/anaconda/lib/python2.7/site-packages/pandas/util/decorators.py:81: FutureWarning: the 'rows' keyword is deprecated, use 'index' instead
  warnings.warn(msg, FutureWarning)






    












    Out[16]:





<ggplot: (8789575508069)>



In [17]:

    
# Histogram of the monthly values in "Primary Care, Males".
gg.ggplot(
    gg.aes(x="Primary Care, Males"),
    data=curitibaPublic,
) + gg.geom_histogram()









    












    Out[17]:





<ggplot: (8789574913781)>



In [18]:

    
# Histogram of the monthly values in "Pregnant Women".
gg.ggplot(
    gg.aes(x="Pregnant Women"),
    data=curitibaPublic,
) + gg.geom_histogram()









    












    Out[18]:





<ggplot: (8789574811461)>



In [19]:

    
# Histogram of the monthly values in "HIV Testing, Females".
gg.ggplot(
    gg.aes(x="HIV Testing, Females"),
    data=curitibaPublic,
) + gg.geom_histogram()









    












    Out[19]:





<ggplot: (8789574720549)>



In [20]:

    
# Histogram of the monthly values in "HIV Testing, Males".
gg.ggplot(
    gg.aes(x="HIV Testing, Males"),
    data=curitibaPublic,
) + gg.geom_histogram()









    












    Out[20]:





<ggplot: (8789574609437)>



In [22]:

    
# Histogram of the monthly values in "Curitiba Total".
gg.ggplot(
    gg.aes(x="Curitiba Total"),
    data=curitibaPublic,
) + gg.geom_histogram()









    












    Out[22]:





<ggplot: (8789574442225)>



In [23]:

    
# Pairwise correlations between all columns. Pearson is pandas' default method;
# it is spelled out here for clarity.
curitibaPublicCorr = curitibaPublic.corr(method='pearson')
curitibaPublicCorr









    Out[23]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      Pregnant Women
      HIV Testing, Males
      HIV Testing, Females
      Curitiba Total
    
  
  
    
      Primary Care, Females
       1.000000
      -0.094142
      -0.746287
      -0.679287
      -0.756572
      -0.386639
    
    
      Primary Care, Males
      -0.094142
       1.000000
       0.699348
       0.671452
       0.404677
       0.948163
    
    
      Pregnant Women
      -0.746287
       0.699348
       1.000000
       0.807249
       0.658780
       0.889943
    
    
      HIV Testing, Males
      -0.679287
       0.671452
       0.807249
       1.000000
       0.947279
       0.777277
    
    
      HIV Testing, Females
      -0.756572
       0.404677
       0.658780
       0.947279
       1.000000
       0.538360
    
    
      Curitiba Total
      -0.386639
       0.948163
       0.889943
       0.777277
       0.538360
       1.000000

# Same pairwise correlation matrix, using Kendall's method instead of Pearson.
curitibaPublicCorrKendall = curitibaPublic.corr(method='kendall')
# Display the result; this must be a statement of its own, not the tail of a comment.
curitibaPublicCorrKendall

Correlation Matrix



In [24]:

    
# Correlation plot of the ground-truth groups.
# sns.corrplot expects the raw observations (variables in columns) and computes
# the correlations itself, so it is given curitibaPublic. Passing the already
# computed matrix curitibaPublicCorr would plot correlations of correlations.
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(curitibaPublic, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()

Main Group Correlations



In [25]:

    
# Scatter plot with regression fit and marginal distributions for the two female groups.
sns.jointplot(x="Primary Care, Females", y="HIV Testing, Females", data=curitibaPublic,
              kind="reg", color="#404040");



In [26]:

    
# Scatter plot with regression fit and marginal distributions for the two HIV testing-centre groups.
sns.jointplot(x="HIV Testing, Males", y="HIV Testing, Females", data=curitibaPublic,
              kind="reg", color="#404040");

Getting the Twitter Data



In [46]:

    
# Checking that the data file looks right #
!cat ../data/all.tsv | head









    












cat: write error: Broken pipe



In [43]:

    
# Read in the tab-separated Twitter data file. Column 1 is parsed as dates and
# used as the index; 'NaN' and empty strings are treated as missing values. #
twitterData = pd.read_table(
    '../data/all.tsv',
    encoding='utf-8',
    na_values=['NaN', ''],
    parse_dates=[1],
    index_col=[1],
)



In [44]:

    
# Preview the first rows of the Twitter data to check columns and index.
twitterData.head()









    Out[44]:






  
    
      
      city
      lat
      lon
      topic
    
    
      origdate
      
      
      
      
    
  
  
    
      2014-06-19 06:01:11
       Porto Alegre
      -30.11462
       -51.16393
           Prevention_Positive
    
    
      2014-06-19 09:06:28
          Fortaleza
       -3.72271
       -38.52465
       Discrimination_Negative
    
    
      2014-06-19 00:22:09
             Recife
       -8.01175
       -34.95291
       Discrimination_Negative
    
    
      2014-06-19 02:07:21
           Brasília
      -15.79159
       -47.89558
       Discrimination_Negative
    
    
      2014-06-19 23:55:34
          Fortaleza
       -3.72271
       -38.52465
       Discrimination_Negative



In [31]:

    
# Keep only the columns from 'Campaign' through 'Testing' (label-based slice, both ends included).
# NOTE(review): the twitterData preview shown in this notebook has the columns
# city/lat/lon/topic rather than 'Campaign'..'Testing', and the execution counts
# are out of order. This cell presumably ran against a differently shaped file;
# confirm it still works after Restart & Run All.
twitterDataSmall = twitterData.loc[:,'Campaign':'Testing']



In [32]:

    
# Resampling by summing each topic over each month ('MS' labels each bin by month start).
twitterDataSmallAgg = twitterDataSmall.resample('MS', how='sum')
# Adding a column with monthly totals across the topics.
twitterDataSmallAgg['Twitter Total'] = twitterDataSmallAgg.sum(axis=1)
# We don't have October data in ground-truth, so drop every month after the last
# ground-truth month. A label-based cut is used instead of a fixed row count, so
# it stays correct if the Twitter file starts earlier or runs longer.
twitterDataSmallAgg = twitterDataSmallAgg.loc[:curitibaPublic.index.max()]
twitterDataSmallAgg









    Out[32]:






  
    
      
      Campaign
      Discrimination
      Prevention
      Testing
      Twitter Total
    
  
  
    
      2014-06-01
       3754
       78987
       16729
       536
       100006
    
    
      2014-07-01
       4346
       73833
       20443
       563
        99185
    
    
      2014-08-01
       1667
       67650
       14883
       165
        84365
    
    
      2014-09-01
        812
       77534
       13699
       157
        92202

Looking at the Twitter Data



In [33]:

    
# One Bokeh time-series chart per column of twitterDataSmallAgg
# (every Twitter topic, plus the monthly total).
for col in twitterDataSmallAgg:
    fig = figure(                                                # "fig" holds all the global settings
        plot_width = 1000,
        plot_height = 600,
        title = twitterDataSmallAgg[col].name,                   # Plot title (the column name)
        y_axis_label = 'Tweets',                                 # Twitter volumes, not HIV tests ('Tests' was carried over from the ground-truth charts)
        x_axis_label = 'Date',
        title_text_font = 'Oswald',
        title_text_color = '#363636',
        background_fill = '#FAFAFA',                             # Background colour for plot area
        outline_line_color = '#FAFAFA',                          # Colour of line surrounding plot
        border_fill = '#FAFAFA',                                 # Background colour for surrounding area
        x_axis_type = 'datetime',                                # NOTE: only need to define this on first graph
        x_range = (twitterDataSmallAgg.index.min(),
                   twitterDataSmallAgg.index.max()),             # Setting x-axis to start and end on first and last date of dataset
        y_range = (0,(twitterDataSmallAgg[col].max() * 1.1)),    # Setting y-axis to start at 0 and end at highest value (plus 10% to make it prettier)
        #tools="pan,wheel_zoom,box_zoom,reset,previewsave"       # NOTE: only needed on first, if commented out, chooses default tools
        )
    fig.line(                                                    # Inserting a line in the chart called "fig"
        twitterDataSmallAgg.index,                               # Variable values for the x-axis (index = dates)
        twitterDataSmallAgg[col],                                # Variable values for the y-axis (loops over all columns)
        line_color = '#404040',                                  # Colour of the line
        line_width = 10,                                         # Width of the line
        line_alpha = 0.7,                                        # Opacity of the line
        #legend = twitterDataSmallAgg[col].name,                 # Label name for the legend (column name)
        )
    # Legend text and border.
    legend().label_text_font='Open Sans'
    legend().label_text_color='#363636'
    legend().border_line_color='#f6f6f6'
    # Settings shared by both axes.
    axis().axis_label_text_font = "Open Sans"
    axis().axis_label_text_font_size = "12pt"
    axis().axis_label_text_color = "#363636"
    axis().major_label_text_font="Open Sans"
    axis().major_label_text_font_size="10pt"
    axis().minor_tick_line_color = "#d4d4d4"
    # x-axis keeps a light axis line and ticks; y-axis line and major ticks are hidden.
    xaxis().axis_line_color = '#d4d4d4'
    xaxis().major_tick_line_color = "#d4d4d4"
    yaxis().major_tick_line_color = None
    yaxis().axis_line_color = None
    # Only thin horizontal grid lines are drawn.
    xgrid().grid_line_color = None
    ygrid().grid_line_color = "#d4d4d4"
    ygrid().grid_line_width = 0.5
    show(fig)

Merging Data



In [34]:

    
# Left-join the monthly Twitter aggregates onto the ground-truth table, matching
# rows on the two date indexes and sorting the result by that key.
# Arguments that merely restated pandas' defaults are omitted.
df = curitibaPublic.merge(
    twitterDataSmallAgg,
    how='left',
    left_index=True,
    right_index=True,
    sort=True,
)



In [35]:

    
# Preview the merged table: ground-truth columns followed by the Twitter columns.
df.head()









    Out[35]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      Pregnant Women
      HIV Testing, Males
      HIV Testing, Females
      Curitiba Total
      Campaign
      Discrimination
      Prevention
      Testing
      Twitter Total
    
  
  
    
      2014-06-01
       1174
        840
       2086
       307
        94
       4501
       3754
       78987
       16729
       536
       100006
    
    
      2014-07-01
        712
       1197
       3423
       490
       197
       6019
       4346
       73833
       20443
       563
        99185
    
    
      2014-08-01
       1267
       1009
       2020
       404
       158
       4858
       1667
       67650
       14883
       165
        84365
    
    
      2014-09-01
       1438
       1250
       2571
       377
       107
       5743
        812
       77534
       13699
       157
        92202

Comparisons



In [36]:

    
# Pairwise correlations across all ground-truth and Twitter columns.
# Pearson is pandas' default method; it is spelled out here for clarity.
dfCorr = df.corr(method='pearson')
dfCorr









    Out[36]:






  
    
      
      Primary Care, Females
      Primary Care, Males
      Pregnant Women
      HIV Testing, Males
      HIV Testing, Females
      Curitiba Total
      Campaign
      Discrimination
      Prevention
      Testing
      Twitter Total
    
  
  
    
      Primary Care, Females
       1.000000
      -0.094142
      -0.746287
      -0.679287
      -0.756572
      -0.386639
      -0.871252
       0.100282
      -0.990627
      -0.793788
      -0.556484
    
    
      Primary Care, Males
      -0.094142
       1.000000
       0.699348
       0.671452
       0.404677
       0.948163
      -0.328454
      -0.053807
       0.026313
      -0.311391
      -0.111999
    
    
      Pregnant Women
      -0.746287
       0.699348
       1.000000
       0.807249
       0.658780
       0.889943
       0.445380
       0.104931
       0.721673
       0.445322
       0.481301
    
    
      HIV Testing, Males
      -0.679287
       0.671452
       0.807249
       1.000000
       0.947279
       0.777277
       0.231663
      -0.500540
       0.583412
       0.111548
      -0.054721
    
    
      HIV Testing, Females
      -0.756572
       0.404677
       0.658780
       0.947279
       1.000000
       0.538360
       0.371501
      -0.649023
       0.660181
       0.207545
      -0.091602
    
    
      Curitiba Total
      -0.386639
       0.948163
       0.889943
       0.777277
       0.538360
       1.000000
      -0.011861
       0.028211
       0.334325
       0.002520
       0.152247
    
    
      Campaign
      -0.871252
      -0.328454
       0.445380
       0.231663
       0.371501
      -0.011861
       1.000000
       0.195708
       0.922476
       0.976264
       0.769774
    
    
      Discrimination
       0.100282
      -0.053807
       0.104931
      -0.500540
      -0.649023
       0.028211
       0.195708
       1.000000
       0.021836
       0.403337
       0.760921
    
    
      Prevention
      -0.990627
       0.026313
       0.721673
       0.583412
       0.660181
       0.334325
       0.922476
       0.021836
       1.000000
       0.868131
       0.659190
    
    
      Testing
      -0.793788
      -0.311391
       0.445322
       0.111548
       0.207545
       0.002520
       0.976264
       0.403337
       0.868131
       1.000000
       0.887252
    
    
      Twitter Total
      -0.556484
      -0.111999
       0.481301
      -0.054721
      -0.091602
       0.152247
       0.769774
       0.760921
       0.659190
       0.887252
       1.000000



In [37]:

    
# Correlation plot across the ground-truth and Twitter columns.
# sns.corrplot expects the raw observations (variables in columns) and computes
# the correlations itself, so it is given df. Passing the already computed
# matrix dfCorr would plot correlations of correlations.
f, ax = plt.subplots(figsize=(8, 8))
sns.corrplot(df, annot=False, sig_stars=True,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [38]:

    
# Scatter plot with regression fit and marginal distributions: monthly test total against monthly Twitter total.
sns.jointplot(x="Curitiba Total", y="Twitter Total", data=df,
              kind="reg", color="#404040");



In [ ]:

    
# A single chart with one line per ground-truth test group.
fig = figure(
    title = 'All Topics',
    x_axis_label = 'Date',
    y_axis_label = 'Tests',
    x_axis_type = 'datetime',
    x_range = (curitibaPublic.index.min(), curitibaPublic.index.max()),
    y_range = (0, 3500),
    plot_width = 1000,
    plot_height = 600,
    title_text_font = 'Oswald',
    title_text_color = '#363636',
    background_fill = '#FAFAFA',
    border_fill = '#FAFAFA',
    outline_line_color = '#FAFAFA',
    )

# (column, line colour) pairs, drawn in this order; the column name is also the legend label.
for groupName, groupColour in [
        ("Primary Care, Females", '#00aeef'),
        ("Primary Care, Males", '#cf5c42'),
        ("Pregnant Women", '#5d6263'),
        ("HIV Testing, Females", '#00447c'),
        ("HIV Testing, Males", '#e1d8ad'),
        ]:
    fig.line(curitibaPublic.index, curitibaPublic[groupName], line_color = groupColour,
             line_width = 5, line_alpha = 0.7, legend = groupName)

# Legend text and border.
legend().label_text_font = 'Open Sans'
legend().label_text_color = '#363636'
legend().border_line_color = '#f6f6f6'

# Settings shared by both axes.
axis().axis_label_text_font = "Open Sans"
axis().axis_label_text_font_size = "12pt"
axis().axis_label_text_color = "#363636"
axis().major_label_text_font = "Open Sans"
axis().major_label_text_font_size = "10pt"
axis().minor_tick_line_color = "#d4d4d4"

# x-axis keeps a light axis line and ticks; y-axis line and major ticks are hidden.
xaxis().axis_line_color = '#d4d4d4'
xaxis().major_tick_line_color = "#d4d4d4"
yaxis().major_tick_line_color = None
yaxis().axis_line_color = None

# Only thin horizontal grid lines are drawn.
xgrid().grid_line_color = None
ygrid().grid_line_color = "#d4d4d4"
ygrid().grid_line_width = 0.5

show(fig)



In [40]:

    
# Scatter plot with regression fit and marginal distributions: "Primary Care, Females" against the "Prevention" Twitter topic.
sns.jointplot(x="Primary Care, Females", y="Prevention", data=df,
              kind="reg", color="#404040");



In [1]:

    
# Read the notebook's custom stylesheet and hand its contents to IPython's HTML
# display object as the cell output.
from IPython.core.display import HTML
with open("../css/custom.css", "r") as cssFile:   # context manager closes the file handle after reading
    styles = cssFile.read()
HTML(styles)









    Out[1]:



In [ ]:

	HIV tests performed	2014-01	2014-02	2014-03	2014-04	2014-05	2014-06	2014-07	2014-08	2014-09
0	Primary Health Care services (females)	1465	1425	1378	1477	1437	1174	712	1267	1438
1	Primary Health Care services (males)	1057	1056	998	1115	1064	840	1197	1009	1250
2	Pregnant women in public health care	2760	2360	2095	2258	2473	2086	3423	2020	2571
3	HIV testing center (males)	528	358	389	383	348	307	490	404	377
4	HIV testing center (females)	186	133	158	155	155	94	197	158	107

	0	1	2	3	4
2014-06	1174	840	2086	307	94
2014-07	712	1197	3423	490	197
2014-08	1267	1009	2020	404	158
2014-09	1438	1250	2571	377	107

	Primary Care, Females	Primary Care, Males	Pregnant Women	HIV Testing, Males	HIV Testing, Females
2014-06-01	1174	840	2086	307	94
2014-07-01	712	1197	3423	490	197
2014-08-01	1267	1009	2020	404	158
2014-09-01	1438	1250	2571	377	107

	Primary Care, Females	Primary Care, Males	Pregnant Women	HIV Testing, Males	HIV Testing, Females	Curitiba Total
Primary Care, Females	1.000000	-0.094142	-0.746287	-0.679287	-0.756572	-0.386639
Primary Care, Males	-0.094142	1.000000	0.699348	0.671452	0.404677	0.948163
Pregnant Women	-0.746287	0.699348	1.000000	0.807249	0.658780	0.889943
HIV Testing, Males	-0.679287	0.671452	0.807249	1.000000	0.947279	0.777277
HIV Testing, Females	-0.756572	0.404677	0.658780	0.947279	1.000000	0.538360
Curitiba Total	-0.386639	0.948163	0.889943	0.777277	0.538360	1.000000

	city	lat	lon	topic
origdate
2014-06-19 06:01:11	Porto Alegre	-30.11462	-51.16393	Prevention_Positive
2014-06-19 09:06:28	Fortaleza	-3.72271	-38.52465	Discrimination_Negative
2014-06-19 00:22:09	Recife	-8.01175	-34.95291	Discrimination_Negative
2014-06-19 02:07:21	Brasília	-15.79159	-47.89558	Discrimination_Negative
2014-06-19 23:55:34	Fortaleza	-3.72271	-38.52465	Discrimination_Negative

	Campaign	Discrimination	Prevention	Testing	Twitter Total
2014-06-01	3754	78987	16729	536	100006
2014-07-01	4346	73833	20443	563	99185
2014-08-01	1667	67650	14883	165	84365
2014-09-01	812	77534	13699	157	92202