Testing functions in epydemiology

Import epydemiology

(All other packages will be imported or reported missing.)


In [ ]:
# Render matplotlib figures inline, directly beneath the cell that creates them.
%matplotlib inline

In [ ]:
import numpy as np
import pandas as pd

import epydemiology as epy

Some background details


In [ ]:
# Show the built-in help text for the imported epydemiology package.
help(epy)

In [ ]:
# List every name exposed in the epydemiology namespace.
print(dir(epy))

FILE: phjGetData.py

FUNCTION: phjReadDataFromExcelNamedCellRange()


In [ ]:
# Folder and file name of the example workbook; edit phjPath to match the local checkout.
phjPath = "/Users/philipjones/Documents/git_repositories/epydemiology"
phjFileName = "Test data.xlsx"

import pandas as pd
import openpyxl
import epydemiology as epy

# First named range: the "%d%b%Y" datetime format is passed to the reader.
print("RANGE: some_test_data")
print("=====================")
myDF = epy.phjReadDataFromExcelNamedCellRange(phjExcelPathAndFileName = '/'.join([phjPath,phjFileName]),
                                              phjExcelCellRangeName = 'some_test_data',
                                              phjDatetimeFormat = "%d%b%Y",
                                              phjMissingValue = "missing",
                                              phjHeaderRow = True,
                                              phjPrintResults = True)

print(myDF.dtypes)

print('\n')

# Second named range: the "%Y-%m-%d" datetime format is passed to the reader.
print("RANGE: some_more_test_data")
print("==========================")
myDF2 = epy.phjReadDataFromExcelNamedCellRange(phjExcelPathAndFileName = '/'.join([phjPath,phjFileName]),
                                               phjExcelCellRangeName = 'some_more_test_data',
                                               phjDatetimeFormat = "%Y-%m-%d",
                                               phjMissingValue = "missing",
                                               phjHeaderRow = True,
                                               phjPrintResults = True)

# Report the dtypes of the dataframe read from the second range (myDF2, not myDF).
print(myDF2.dtypes)

FILE: phjGetDBData.py

FUNCTION: phjConnectToDatabase()


In [ ]:
# Database driver packages are imported alongside epydemiology in this cell.
import pymysql
import pymssql
import epydemiology as epy

# Request a connection, passing 'mysql' as the database type.
tempConn = epy.phjConnectToDatabase('mysql')

# Display whatever object the function returned.
print(tempConn)

FUNCTION: phjGetDataFromDatabase()

Example 1 – Query stored in file


In [ ]:
# These external libraries are imported automatically, but are listed here
# for completeness.
import pandas as pd
import pymysql
import pymssql
import epydemiology as epy

# The query is supplied as the path to a file.
myDF = epy.phjGetDataFromDatabase(
    phjQueryPathAndFile="/Users/username/Desktop/theSQLQueryFile.mssql",
    phjPrintResults=True,
)

Example 2 – Query entered directly in function call


In [ ]:
# These external libraries are imported automatically, but are listed here
# for completeness.
import pandas as pd
import pymysql
import pymssql
import epydemiology as epy

# The query is supplied directly as a string.
myDF = epy.phjGetDataFromDatabase(
    phjQueryStr="SELECT * FROM Table1",
    phjPrintResults=True,
)

FILE: phjMiscFuncs.py

FUNCTION: phjGetStrFromArgOrFile()


In [ ]:

FUNCTION: phjReadTextFromFile()


In [ ]:
# Read a text file into a string; phjMaxAttempts is set to 3.
myStr = epy.phjReadTextFromFile(
    phjFilePathAndName="/Users/username/Desktop/myTextFile.txt",
    phjMaxAttempts=3,
    phjPrintResults=False,
)

FUNCTION: phjCreateNamedGroupRegex()

phjRegexPreCompile parameter set to False


In [ ]:
import numpy as np
import pandas as pd
import re
import epydemiology as epy

# One row per regex fragment, each with a group name and a numeric id.
df = pd.DataFrame(
    {
        "id": [2, 2, 2, 1, 1],
        "group": ["dog", "dog", "dog", "cat", "cat"],
        "regex": ["(?:dog)", "(?:canine)", "(?:k9)", "(?:cat)", "(?:feline)"],
    }
)

print("Dataframe\n---------")
print(df)

# Pre-compilation is switched off for this call.
regexStr = epy.phjCreateNamedGroupRegex(
    phjDF=df,
    phjGroupVarName="group",
    phjRegexVarName="regex",
    phjIDVarName="id",
    phjRegexPreCompile=False,
    phjPrintResults=False,
)

print("\nCombined Regex string\n---------------------")
print(regexStr)

phjRegexPreCompile parameter set to True


In [ ]:
# One row per regex fragment, each with a group name and a numeric id.
df = pd.DataFrame(
    {
        "id": [2, 2, 2, 1, 1],
        "group": ["dog", "dog", "dog", "cat", "cat"],
        "regex": ["(?:dog)", "(?:canine)", "(?:k9)", "(?:cat)", "(?:feline)"],
    }
)

print("Dataframe\n---------")
print(df)

# Pre-compilation is switched on for this call.
myCompiledRegexObj = epy.phjCreateNamedGroupRegex(
    phjDF=df,
    phjGroupVarName="group",
    phjRegexVarName="regex",
    phjIDVarName="id",
    phjRegexPreCompile=True,
    phjPrintResults=False,
)

print("\nCompiled Regex object\n---------------------")
print(myCompiledRegexObj)

FUNCTION: phjFindRegexNamedGroup()


In [ ]:

FUNCTION: phjMaxLevelOfTaxonomicDetail()


In [ ]:
import numpy as np
import pandas as pd
import collections

# Build the taxonomy table column by column. An OrderedDict keeps the columns in
# rank order (Phylum ... Subspecies); an empty string means that rank is not known
# for that descriptor.
myOrderedDict = collections.OrderedDict()
myOrderedDict['Descriptor'] = ['dog','ferret','cat','rabbit','horse','primate','rodent','gerbil','guinea pig','rat','mammal','lizard','snake','common basilisk','turtle','tortoise','spur-thighed tortoise']
myOrderedDict['Phylum'] = ['Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata','Chordata']
myOrderedDict['Class'] = ['Mammalia','Mammalia','Mammalia','Mammalia','Mammalia','Mammalia','Mammalia','Mammalia','Mammalia','Mammalia','Mammalia','Reptilia','Reptilia','Reptilia','Reptilia','Reptilia','Reptilia']
# The common basilisk belongs to the order Squamata, spelled the same as the lizard and snake rows.
myOrderedDict['Order'] = ['Carnivora','Carnivora','Carnivora','Lagomorpha','Perissodactyla','Primates','Rodentia','Rodentia','Rodentia','Rodentia','','Squamata','Squamata','Squamata','Testudines','Testudines','Testudines']
myOrderedDict['Suborder'] = ['','','Feliformia','','','','','','','','','Lacertilia','Serpentes','Iguania','','Cryptodira','Cryptodira']
myOrderedDict['Superfamily'] = ['','','','','','','','','','','','','','','','','']
myOrderedDict['Family'] = ['Canidae','Mustelidae','Felidae','Leporidae','Equidae','','','Muridae','Caviidae','Muridae','','','','Corytophanidae','','Testudinidae','Testudinidae']
myOrderedDict['Subfamily'] = ['','','','','','','','Gerbillinae','','Murinae','','','','','','','']
myOrderedDict['Genus'] = ['Canis','Mustela','Felis','Oryctolagus','Equus','','','','Cavia','Rattus','','','','Basiliscus','','','Testudo']
myOrderedDict['Species'] = ['lupus','putorius','silvestris','cuniculus','ferus','','','','porcellus','norvegicus','','','','basiliscus','','','graeca']
myOrderedDict['Subspecies'] = ['familiaris','furo','catus','','caballus','','','','','domestica','','','','','','','']

df = pd.DataFrame(myOrderedDict)

# Columns from 'Phylum' through 'Subspecies' are passed as the taxonomic range;
# the result is requested in a new column called 'max_tax_details'.
df = epy.phjMaxLevelOfTaxonomicDetail(phjDF = df,
                                      phjFirstCol = 'Phylum',
                                      phjLastCol = 'Subspecies',
                                      phjNewColName = 'max_tax_details',
                                      phjDropPreExisting = False,
                                      phjCleanup = True,
                                      phjPrintResults = False)

FUNCTION: phjReverseMap()

Example 1 – exact string matches


In [ ]:
# Raw categories to be mapped. Note that the dataframe already has a column
# called 'dog', which is also one of the mapping dictionary's keys.
myDF = pd.DataFrame(
    {
        "id": list(range(1, 8)),
        "var": ["dogg", "canine", "cannine", "catt", "felin", "cot", "feline"],
        "dog": list(range(1, 8)),
    }
)

print(myDF)

# Category -> list of exact strings that should map back to that category.
d = {
    "dog": ["dogg", "canine", "cannine"],
    "cat": ["catt", "felin", "feline"],
}

In [ ]:
# Map 'var' to a new 'spp' column using exact string matches; values with no
# entry in the dictionary are given the phjUnmapped value.
myDF = epy.phjReverseMap(
    phjDF=myDF,
    phjMappingDict=d,
    phjCategoryVarName="var",
    phjMappedVarName="spp",
    phjUnmapped="missing",
    phjTreatAsRegex=False,
    phjDropPreExisting=True,
    phjPrintResults=True,
)

Example 2 – regex


In [ ]:
# Same raw categories as the exact-match example.
myDF = pd.DataFrame(
    {
        "id": list(range(1, 8)),
        "var": ["dogg", "canine", "cannine", "catt", "felin", "cot", "feline"],
        "dog": list(range(1, 8)),
    }
)

print(myDF)
print("\n")

# Category -> list of regular-expression fragments for that category.
d = {
    "dog": ["(?:dog+)", "(?:can*ine)"],
    "cat": ["(?:cat+)", "(?:fel+ine?)"],
}

print(d)

In [ ]:
# Map 'var' to a new column called 'new', this time treating the dictionary
# values as regular expressions.
myDF = epy.phjReverseMap(
    phjDF=myDF,
    phjMappingDict=d,
    phjCategoryVarName="var",
    phjMappedVarName="new",
    phjUnmapped="missing",
    phjTreatAsRegex=True,
    phjDropPreExisting=True,
    phjPrintResults=True,
)

FUNCTION: phjRetrieveUniqueFromMultiDataFrames()

Single dataframe


In [ ]:
phjTempDF = pd.DataFrame(
    {
        "a": [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6],
        "b": ["a", "b", "c", "d", "e", "f", "a", "b", "w", "d", "e", "f"],
    }
)

print("Single variable")
print("---------------")

# The dataframe is wrapped in a list; the variable name is given as a bare string.
phjOutDF = epy.phjRetrieveUniqueFromMultiDataFrames(
    phjDFList=[phjTempDF],
    phjVarNameList="a",
    phjSort=True,
    phjPrintResults=True,
)

print("\n")
print("Multiple variables")
print("------------------")

# Here the dataframe is passed bare (not in a list) and the names as a list.
# NOTE(review): presumably the function accepts a single dataframe as well as a
# list of dataframes - confirm against the function's implementation.
phjOutDF = epy.phjRetrieveUniqueFromMultiDataFrames(
    phjDFList=phjTempDF,
    phjVarNameList=["a", "b"],
    phjSort=True,
    phjPrintResults=True,
)

Multiple dataframes of data


In [ ]:
# Two dataframes sharing the columns 'm' and 'n'; some rows appear in both.
df1 = pd.DataFrame(
    {
        "m": [1, 2, 3, 4, 5, 6],
        "n": ["a", "b", "c", "d", "e", "f"],
    }
)

df2 = pd.DataFrame(
    {
        "m": [2, 5, 7, 8],
        "n": ["b", "e", "g", "h"],
    }
)

phjOutDF = epy.phjRetrieveUniqueFromMultiDataFrames(
    phjDFList=[df1, df2],
    phjVarNameList=["m", "n"],
    phjSort=True,
    phjPrintResults=True,
)

FUNCTION: phjUpdateLUT()

Testing phjUpdateLUT() function with dataframe with single column


In [ ]:
# Existing look-up table.
old_df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5, 6],
        "m": ["a", "b", "c", "d", "e", "f"],
    }
)

# New values; 'E' and 'H' differ from existing entries only by case.
new_df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "m": ["b", "E", "g", "H"],
    }
)

update_df = epy.phjUpdateLUT(
    phjExistDF=old_df,
    phjNewDF=new_df,
    phjIDName="id",
    phjVarNameList=["m"],
    phjMissStr="missing",
    phjMissCode=999,
    phjIgnoreCase=True,
    phjPrintResults=True,
)

Testing phjUpdateLUT() function with dataframe with multiple columns


In [ ]:
# Existing look-up table with two value columns.
old_df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5, 6],
        "m": ["a", "b", "c", "d", "e", "f"],
        "n": ["A", "B", "C", "D", "E", "F"],
    }
)

# New rows to be merged into the look-up table.
new_df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5],
        "m": ["b", "e", "g", "h", "a"],
        "n": ["BB", "e", "GG", "H", "a"],
    }
)

update_df = epy.phjUpdateLUT(
    phjExistDF=old_df,
    phjNewDF=new_df,
    phjIDName="id",
    phjVarNameList=["m", "n"],
    phjMissStr="missing",
    phjMissCode=999,
    phjIgnoreCase=True,
    phjPrintResults=True,
)

print("Updated dataframe")
print("-----------------")
print(update_df)

FUNCTION: phjUpdateLUTToLatestValues()


In [ ]:
# Existing look-up rows.
df1 = pd.DataFrame({'id':[1,2,3,4,5,6,7,8],
                    'name':['a','b','c','d','e','f','g','h'],
                    'value':[999,22,33,44,55,66,999,88]})

# Later rows; some repeat names already present in df1 and one has a missing value.
df2 = pd.DataFrame({'id':[9,10,11,12],
                    'name':['a','i','d','g'],
                    'value':[11,99,None,77]})

# Stack the two frames and order by name then id, so rows sharing a name sit
# together in id order. pd.concat is used because DataFrame.append was removed
# in pandas 2.0; the original row labels are kept, as append() did.
df = pd.concat([df1, df2]).sort_values(by = ['name','id'])

In [ ]:
# Each call prints a title, an underline, the dataframe and (for the first two)
# a trailing '\n' spacer, one item per line, giving the same text as printing
# the items one call at a time.
print('First dataframe', '---------------', df1, '\n', sep='\n')

print('Second dataframe', '----------------', df2, '\n', sep='\n')

print('Joined dataframes', '-----------------', df, sep='\n')

In [ ]:
# Rows are grouped by 'name', with 'id' as the identifier column; a count
# column is requested.
df = epy.phjUpdateLUTToLatestValues(
    phjDF=df,
    phjIDVarName="id",
    phjGroupbyVarName="name",
    phjAddCountCol=True,
    phjPrintResults=True,
)

FILE: phjMatrices.py

FUNCTION: phjBinaryVarsToSquareMatrix()

Output a numpy array


In [ ]:
# Five binary (0/1) variables observed over eight rows.
rawDataDF = pd.DataFrame(
    {
        "a": [0, 1, 1, 1, 0, 0, 1, 0],
        "b": [1, 1, 0, 0, 1, 0, 0, 1],
        "c": [0, 0, 1, 0, 1, 1, 1, 1],
        "d": [1, 0, 0, 0, 1, 0, 0, 0],
        "e": [1, 0, 0, 0, 0, 1, 0, 0],
    }
)

columns = ["a", "b", "c", "d", "e"]

print("Raw data")
print(rawDataDF)
print("\n")

# Request the 'arr' output format.
phjMatrix = epy.phjBinaryVarsToSquareMatrix(
    phjDataDF=rawDataDF,
    phjColumnNamesList=columns,
    phjOutputFormat="arr",
    phjPrintResults=False,
)

print("Returned square matrix")
print(phjMatrix)

Output a Pandas dataframe


In [ ]:
# Five binary (0/1) variables observed over eight rows.
rawDataDF = pd.DataFrame(
    {
        "a": [0, 1, 1, 1, 0, 0, 1, 0],
        "b": [1, 1, 0, 0, 1, 0, 0, 1],
        "c": [0, 0, 1, 0, 1, 1, 1, 1],
        "d": [1, 0, 0, 0, 1, 0, 0, 0],
        "e": [1, 0, 0, 0, 0, 1, 0, 0],
    }
)

columns = ["a", "b", "c", "d", "e"]

print("Raw data")
print(rawDataDF)
print("\n")

# Request the 'df' output format.
phjMatrixDF = epy.phjBinaryVarsToSquareMatrix(
    phjDataDF=rawDataDF,
    phjColumnNamesList=columns,
    phjOutputFormat="df",
    phjPrintResults=False,
)

print("Returned square matrix dataframe")
print(phjMatrixDF)

FUNCTION: phjLongToWideBinary()


In [ ]:
# Long-format data: one row per (X, Y) pairing.
df = pd.DataFrame(
    {
        "X": [1, 1, 1, 2, 2, 3, 3, 3, 3, 4],
        "Y": ["a", "b", "d", "b", "c", "d", "e", "a", "f", "b"],
    }
)

# Rows are grouped on 'X'; the values in 'Y' name the variables.
newDF = epy.phjLongToWideBinary(
    phjDF=df,
    phjGroupbyVarName="X",
    phjVariablesVarName="Y",
    phjValuesDict={0: 0, 1: 1},
    phjPrintResults=False,
)

print("Original dataframe\n")
print(df)

print("\n")

print("New wide dataframe\n")
print(newDF)

FILE: phjCalculateProportions.py

FUNCTION: phjCalculateBinomialProportions()

Example of calculating binomial proportions (using phjCalculateBinomialProportions() function)


In [ ]:
# Three yes/no variables (with some missing values) recorded for two groups.
phjTempDF = pd.DataFrame(
    {
        "group": ["g1", "g1", "g2", "g1", "g2", "g2", "g1", "g1", "g2", "g1"],
        "A": ["yes", "yes", "no", "no", "no", "no", "no", "yes", np.nan, "yes"],
        "B": ["no", np.nan, np.nan, "yes", "yes", "yes", "yes", "no", "no", "no"],
        "C": ["yes", "yes", "yes", np.nan, "no", "yes", "yes", "yes", "no", "no"],
    }
)

print(phjTempDF)
print("\n")

# 'yes' is the success value; plotting is switched on for all groups.
phjPropDF = epy.phjCalculateBinomialProportions(
    phjDF=phjTempDF,
    phjColumnsList=["A", "B", "C"],
    phjSuccess="yes",
    phjGroupVarName="group",
    phjMissingValue="missing",
    phjBinomialConfIntMethod="normal",
    phjAlpha=0.05,
    phjPlotProportions=True,
    phjGroupsToPlotList="all",
    phjSortProportions=True,
    phjGraphTitle=None,
    phjPrintResults=False,
)

print(phjPropDF)

FUNCTION: phjCalculateBinomialConfInts()


In [ ]:
# Yearly success and failure counts. Each failure count is written as
# total - successes; the final year has zero of both.
phjTempDF = pd.DataFrame(
    {
        "year": [2005, 2006, 2007, 2008, 2009, 2010, 2011,
                 2012, 2013, 2014, 2015, 2016, 2017, 2018],
        "success": [109, 77, 80, 57, 29, 31, 29, 19, 10, 16, 6, 8, 4, 0],
        "failure": [784 - 109, 840 - 77, 715 - 80, 780 - 57, 743 - 29, 743 - 31, 752 - 29,
                    645 - 19, 509 - 10, 562 - 16, 471 - 6, 471 - 8, 472 - 4, 0 - 0],
        # 'total': [784, 840, 715, 780, 743, 743, 752, 645, 509, 562, 471, 471, 472, 0]
    }
)

print("Original dataframe\n")
print(phjTempDF)
print("\n")

# Success and failure columns are supplied; no total column is given.
phjPropDF = epy.phjCalculateBinomialConfInts(
    phjDF=phjTempDF,
    phjSuccVarName="success",
    phjFailVarName="failure",
    phjTotalVarName=None,
    phjBinomialConfIntMethod="normal",
    phjAlpha=0.05,
    phjPrintResults=False,
)

print("Dataframe of confidence intervals\n")
print(phjPropDF)

FUNCTION: phjCalculateMultinomialProportions()

Example of calculating multinomial proportions (using phjCalculateMultinomialProportions() function)


In [ ]:
# Case/control records with a breed category. Missing values appear both as
# np.nan and as the literal string 'missing'.
phjTempDF = pd.DataFrame(
    {
        "group": [
            "case", "case", "case", "control", "control", "case", "case", "case",
            "control", "control", "control", "control", "case", "case", "case",
            "control", "control", "control", "control", "case", "case", "case",
            "case", "case", np.nan, np.nan,
        ],
        "category": [
            np.nan, "spaniel", "missing", "terrier", "collie", "labrador", "labrador",
            "collie", "spaniel", "spaniel", "labrador", "collie", "terrier", "terrier",
            "terrier", "collie", "labrador", "labrador", "labrador", "spaniel",
            "spaniel", "collie", "collie", "collie", "terrier", "spaniel",
        ],
        "catint": [
            1, 2, 3, 2, 3, 2, 1, 2, 1, 2, 3, 2, 3,
            2, 3, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2,
        ],
    }
)

print(phjTempDF)
print("\n")

phjRelFreqDF = epy.phjCalculateMultinomialProportions(
    phjDF=phjTempDF,
    phjCategoryVarName="category",
    phjGroupVarName="group",
    phjMissingValue="missing",
    phjMultinomialConfIntMethod="goodman",
    phjAlpha=0.05,
    phjPlotRelFreq=True,
    phjCategoriesToPlotList="all",
    phjGroupsToPlotList="all",  # Currently not implemented
    phjGraphTitle="Relative frequencies (Goodman CI)",
    phjPrintResults=True,
)

print(phjRelFreqDF)

FUNCTION: phjSummaryTableToBinaryOutcomes()


In [ ]:
# Generate the dataframe used in the original description of the function
df = pd.DataFrame({'year':[2010,2011,2012,2013,2014],
                   'cases':[23,34,41,57,62],
                   'controls':[1023,1243,1145,2017,1876],
                   'comment':['Small number of cases',
                              'Proportion increase',
                              'Trend continues',
                              'Decreased proportion',
                              'Increased again']})

# Reorder the columns a little
df = df[['year','cases','controls','comment']]

# Convert to dataframe containing binary outcome data
newDF = epy.phjSummaryTableToBinaryOutcomes(phjDF = df,
                                            phjVarsToIncludeList = ['year','cases','controls'],
                                            phjSuccVarName = 'cases',
                                            phjFailVarName = 'controls',
                                            phjTotalVarName = None,
                                            phjOutcomeVarName = 'outcome',
                                            phjPrintResults = False)

# Print results
print('Original table of summary results\n')
print(df)

print('\n')

print('Dataframe of binary outcomes\n')
with pd.option_context('display.max_rows',6, 'display.max_columns',2):
    print(newDF)

FUNCTION: phjAnnualDiseaseTrend()


In [ ]:
# Yearly counts of positive and negative results.
phjDiseaseDF = pd.DataFrame(
    {
        "year": [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018],
        "positive": [18, 34, 24, 26, 30, 27, 36, 17, 18, 15, 4],
        "negative": [1695, 1733, 1929, 1517, 1449, 1329, 1130, 928, 753, 496, 325],
    }
)

# Only rows with a year before 2018 are passed to the function.
phjDiseaseDF = epy.phjAnnualDiseaseTrend(
    phjDF=phjDiseaseDF.loc[phjDiseaseDF["year"] < 2018, :],
    phjYearVarName="year",
    phjPositivesVarName="positive",
    phjNegativesVarName="negative",
    phjTotalVarName=None,
    phjConfIntMethod="normal",
    phjAlpha=0.05,
    phjPlotProportions=True,
    phjPlotPrediction=True,
    phjGraphTitleStr=None,
    phjPrintResults=False,
)

FILE: phjCleanUKPostcodes.py

FUNCTION: phjCleanUKPostcodeVariable()

Clean postcodes based on format alone


In [ ]:
# Build a test dataframe holding a postcode column plus some empty columns whose
# names match columns the function will create. Setting 'phjDropExisting' to True
# drops those pre-existing columns before the function runs. The other columns,
# whose names are not duplicated, are included to show that the function leaves
# them intact.

import numpy as np
import pandas as pd
import re

# Create test dataframe
myTestPostcodeDF = pd.DataFrame(
    {
        "postcode": [
            "NP45DG",
            "CH647TE",
            "CH5 4HE",
            "GIR 0AA",
            "NOT NOWN",
            "GIR0AB",
            "NOR12A",
            "no idea",
            "W1A 1AA",
            "missin",
            "NP4  OGH",
            "P012 OLL",
            "p01s",
            "ABCD",
            "",
            "ab123cd",
            "un-known",
            "B1    INJ",
            "AB123CD",
            "No idea what the postcode is",
            "    ???NP4-5DG_*#   ",
        ],
        "pcdClean": np.nan,
        "pcd7": np.nan,
        "postcodeOutward": np.nan,
        "someOtherCol": np.nan,
    }
)

# Show the starting dataframe, then run the function to extract postcode data.
print("\nStart dataframe\n===============\n")
print(myTestPostcodeDF)
print("\n")

# No series of real postcodes is supplied; the check option is 'format'.
myTestPostcodeDF = epy.phjCleanUKPostcodeVariable(
    phjDF=myTestPostcodeDF,
    phjRealPostcodeSer=None,
    phjOrigPostcodeVarName="postcode",
    phjNewPostcodeVarName="pcdClean",
    phjNewPostcodeStrLenVarName="pcdCleanStrLen",
    phjPostcodeCheckVarName="pcdFormatCheck",
    phjMissingValueCode="missing",
    phjMinDamerauLevenshteinDistanceVarName="minDamLevDist",
    phjBestAlternativesVarName="bestAlternatives",
    phjPostcode7VarName="pcd7",
    phjPostcodeAreaVarName="pcdArea",
    phjSalvageOutwardPostcodeComponent=True,
    phjCheckByOption="format",
    phjDropExisting=True,
    phjPrintResults=True,
)

print("\nReturned dataframe\n==================\n")
print(myTestPostcodeDF)

Clean postcodes based on real postcode and identify closest matches


In [ ]:
import re

# N.B. When calculating best alternative postcodes, only postcodes that are
# within 1 DL distance are considered.

# Create a Pandas series that could contain all the postcodes in the UK
realPostcodesSer = pd.Series(
    [
        "NP4 5DG",
        "CH647TE",
        "CH5 4HE",
        "W1A 1AA",
        "NP4 0GH",
        "PO120LL",
        "AB123CF",
        "AB124DF",
        "AB123CV",
    ]
)

# Create test dataframe
myTestPostcodeDF = pd.DataFrame(
    {
        "postcode": [
            "NP45DG",
            "CH647TE",
            "CH5 4HE",
            "GIR 0AA",
            "NOT NOWN",
            "GIR0AB",
            "NOR12A",
            "no idea",
            "W1A 1AA",
            "missin",
            "NP4  OGH",
            "P012 OLL",
            "p01s",
            "ABCD",
            "",
            "ab123cd",
            "un-known",
            "B1    INJ",
            "AB123CD",
            "No idea what the postcode is",
            "    ???NP4-5DG_*#   ",
        ],
        "pcdClean": np.nan,
        "pcd7": np.nan,
        "postcodeOutward": np.nan,
        "someOtherCol": np.nan,
    }
)

# Show the starting dataframe, then run the function to extract postcode data.
print("\nStart dataframe\n===============\n")
print(myTestPostcodeDF)
print("\n")

# The series of real postcodes is supplied; the check option is 'dictionary'.
myTestPostcodeDF = epy.phjCleanUKPostcodeVariable(
    phjDF=myTestPostcodeDF,
    phjRealPostcodeSer=realPostcodesSer,
    phjOrigPostcodeVarName="postcode",
    phjNewPostcodeVarName="pcdClean",
    phjNewPostcodeStrLenVarName="pcdCleanStrLen",
    phjPostcodeCheckVarName="pcdFormatCheck",
    phjMissingValueCode="missing",
    phjMinDamerauLevenshteinDistanceVarName="minDamLevDist",
    phjBestAlternativesVarName="bestAlternatives",
    phjPostcode7VarName="pcd7",
    phjPostcodeAreaVarName="pcdArea",
    phjSalvageOutwardPostcodeComponent=True,
    phjCheckByOption="dictionary",
    phjDropExisting=True,
    phjPrintResults=True,
)

print("\nReturned dataframe\n==================\n")
print(myTestPostcodeDF)

FUNCTION: phjPostcodeFormat7()


In [ ]:


FILE: phjSelectData.py

FUNCTION: phjGenerateCaseControlDataset()


In [ ]:

FUNCTION: phjSelectCaseControlDataset()

Unmatched controls


In [ ]:
# Five cases, all dogs.
casesDF = pd.DataFrame(
    {
        "animalID": [1, 2, 3, 4, 5],
        "var1": [43, 45, 34, 45, 56],
        "sp": ["dog", "dog", "dog", "dog", "dog"],
    }
)

# Twenty potential controls of mixed species.
potControlsDF = pd.DataFrame(
    {
        "animalID": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                     21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
        "var1": [34, 54, 34, 23, 34, 45, 56, 67, 56, 67,
                 78, 98, 65, 54, 34, 76, 87, 56, 45, 34],
        "sp": ["dog", "cat", "dog", "dog", "cat", "dog", "cat", "dog", "cat", "dog",
               "dog", "dog", "dog", "cat", "dog", "cat", "dog", "dog", "dog", "cat"],
    }
)

print("This dataframe contains all the cases of disease\n")
print(casesDF)
print("\n")
print("This dataframe contains all the animals you could potentially use as controls\n")
print(potControlsDF)
print("\n")

# Select unmatched controls: no matching variables, two controls per case.
unmatchedDF = epy.phjSelectCaseControlDataset(
    phjCasesDF=casesDF,
    phjPotentialControlsDF=potControlsDF,
    phjUniqueIdentifierVarName="animalID",
    phjMatchingVariablesList=None,
    phjControlsPerCaseInt=2,
    phjPrintResults=False,
)

print(unmatchedDF)

Matched controls


In [ ]:
# Five cases, all dogs.
casesDF = pd.DataFrame(
    {
        "animalID": [1, 2, 3, 4, 5],
        "var1": [43, 45, 34, 45, 56],
        "sp": ["dog", "dog", "dog", "dog", "dog"],
    }
)

# Twenty potential controls of mixed species.
potControlsDF = pd.DataFrame(
    {
        "animalID": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                     21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
        "var1": [34, 54, 34, 23, 34, 45, 56, 67, 56, 67,
                 78, 98, 65, 54, 34, 76, 87, 56, 45, 34],
        "sp": ["dog", "cat", "dog", "dog", "cat", "dog", "cat", "dog", "cat", "dog",
               "dog", "dog", "dog", "cat", "dog", "cat", "dog", "dog", "dog", "cat"],
    }
)

print("This dataframe contains all the cases of disease\n")
print(casesDF)
print("\n")
print("This dataframe contains all the animals you could potentially use as controls\n")
print(potControlsDF)
print("\n")

# Select controls matched to cases on the 'sp' variable, two per case.
matchedDF = epy.phjSelectCaseControlDataset(
    phjCasesDF=casesDF,
    phjPotentialControlsDF=potControlsDF,
    phjUniqueIdentifierVarName="animalID",
    phjMatchingVariablesList=["sp"],
    phjControlsPerCaseInt=2,
    phjPrintResults=False,
)

print(matchedDF)

FUNCTION: phjCollapseOnPatientID()


In [ ]:


FILE: phjCleanData.py

FUNCTION: phjParseDateVar()


In [ ]:


FILE: phjExploreData.py

FUNCTION: phjViewLogOdds()

Example of viewing log odds plotted against mid-point of categories.

Categorise using Jenks breaks and using True and False as binary outcome


In [ ]:
# Define example dataset
phjTempDF = pd.DataFrame({'binDepVar':[False]*50000 + [True]*50000,
                          'riskFactorCont':np.random.uniform(0,1,100000)})

with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print(phjTempDF)

    
# View log odds
phjTempDF = epy.phjViewLogOdds(phjDF = phjTempDF,
                               phjBinaryDepVarName = 'binDepVar',
                               phjCaseValue = False,
                               phjContIndepVarName = 'riskFactorCont',
                               phjMissingValue = 'missing',
                               phjNumberOfCategoriesInt = 3,
                               phjNewCategoryVarName = 'categoricalVar',
                               phjCategorisationMethod = 'jenks',
                               phjGroupVarName = None,
                               phjAlpha = 0.05,
                               phjPrintResults = False)

with pd.option_context('display.max_rows', 10, 'display.max_columns', 10):
    print('Log odds for categorised variable')
    print(phjTempDF)

Categorise using quantile breaks and using 1 and 0 as binary outcome


In [ ]:
# Define example dataset: 50,000 rows for each outcome value (1 and 0), with a
# risk factor drawn uniformly between 0 and 1.
# The draw comes from a locally seeded generator so that the cell produces the
# same output on every run without altering NumPy's global random state.
phjTempDF = pd.DataFrame({'binDepVar':[1]*50000 + [0]*50000,
                          'riskFactorCont':np.random.RandomState(42).uniform(0,1,100000)})

# Preview the raw data (display limited to 10 rows and 5 columns)
with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print(phjTempDF)


# View log odds
# The dataframe is passed positionally so the call does not depend on how the
# first parameter's name is spelled. The grouping keyword is written
# phjGroupVarName, following the '...VarName' spelling used by the other
# keywords in this notebook.
phjTempDF = epy.phjViewLogOdds(phjTempDF,
                               phjBinaryDepVarName = 'binDepVar',
                               phjContIndepVarName = 'riskFactorCont',
                               phjCaseValue = 1,
                               phjMissingValue = 'missing',
                               phjNumberOfCategoriesInt = 8,
                               phjNewCategoryVarName = 'categoricalVar',
                               phjCategorisationMethod = 'quantile',
                               phjGroupVarName = None,
                               phjPrintResults = False)

with pd.option_context('display.max_rows', 10, 'display.max_columns', 10):
    print('Log odds for categorised variable')
    print(phjTempDF)

FUNCTION: phjCategoriseContinuousVariable()

Return dataframe alone


In [ ]:
# Define example dataset: 50,000 'yes' rows and 50,000 'no' rows, with a risk
# factor drawn uniformly between 0 and 1.
# The draw comes from a locally seeded generator so that the cell produces the
# same output on every run without altering NumPy's global random state.
phjTempDF = pd.DataFrame({'binDepVar':['yes']*50000 + ['no']*50000,
                          'riskFactorCont':np.random.RandomState(42).uniform(0,1,100000)})

# Preview the raw data (display limited to 10 rows and 5 columns)
with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print(phjTempDF)


# Categorise a continuous variable
# The dataframe is passed positionally so the call does not depend on how the
# first parameter's name is spelled (other calls in this notebook use phjDF).
# With phjReturnBreaks = False the result is assigned to a single name.
phjTempDF = epy.phjCategoriseContinuousVariable(phjTempDF,
                                                phjContinuousVarName = 'riskFactorCont',
                                                phjMissingValue = 'missing',
                                                phjNumberOfCategoriesInt = 6,
                                                phjNewCategoryVarName = 'catVar',
                                                phjCategorisationMethod = 'jenks',
                                                phjReturnBreaks = False,
                                                phjPrintResults = False)

# The caption describes what this cell produces: a categorised variable, not
# log odds.
with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print('\nCategorised variable')
    print(phjTempDF)

Return dataframe and list of breaks


In [ ]:
# Define example dataset: 50,000 'yes' rows and 50,000 'no' rows, with a risk
# factor drawn uniformly between 0 and 1.
# The draw comes from a locally seeded generator so that the cell produces the
# same output on every run without altering NumPy's global random state.
phjTempDF = pd.DataFrame({'binDepVar':['yes']*50000 + ['no']*50000,
                          'riskFactorCont':np.random.RandomState(42).uniform(0,1,100000)})

# Preview the raw data (display limited to 10 rows and 5 columns)
with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print(phjTempDF)


# Categorise a continuous variable
# The dataframe is passed positionally so the call does not depend on how the
# first parameter's name is spelled (other calls in this notebook use phjDF).
# With phjReturnBreaks = True the result is unpacked into two names: the
# dataframe and the breaks.
phjTempDF, phjBreaksList = epy.phjCategoriseContinuousVariable(phjTempDF,
                                                               phjContinuousVarName = 'riskFactorCont',
                                                               phjMissingValue = 'missing',
                                                               phjNumberOfCategoriesInt = 6,
                                                               phjNewCategoryVarName = 'catVar',
                                                               phjCategorisationMethod = 'jenks',
                                                               phjReturnBreaks = True,
                                                               phjPrintResults = False)

with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print('\nCategorised variable')
    print(phjTempDF)
    print('\n')
    print('Breaks')
    print(phjBreaksList)

FILE: phjRROR.py

FUNCTION: phjOddsRatio()


In [ ]:
# Small 20-row example dataframe; only the 'caseA' and 'catA' columns are
# passed to phjOddsRatio() below.
tempDF = pd.DataFrame({
    'caseN': [1]*10 + [0]*10,
    'caseA': ['y']*8 + ['n']*12,
    'catN': [1,2,3,2,3,4,3,2,3,4,3,2,1,2,1,2,3,2,3,4],
    'catA': list('aabbcdaccdabcadabcad'),
    'floatN': [1.2,4.3,2.3,4.3,5.3,4.3,2.4,6.5,4.5,7.6,5.6,5.6,4.8,5.2,7.4,5.4,6.5,5.7,6.8,4.5],
})

# Odds ratios for 'catA' with 'a' as the base value, where 'caseA' == 'y'
# marks a case.
phjORTable = epy.phjOddsRatio(
    phjDF=tempDF,
    phjCaseVarName='caseA',
    phjCaseValue='y',
    phjRiskFactorVarName='catA',
    phjRiskFactorBaseValue='a',
    phjMissingValue=np.nan,
    phjAlpha=0.05,
    phjPrintResults=False,
)

# Show floats with three decimal places (this display option stays set for
# the rest of the session).
pd.set_option('display.float_format', '{:,.3f}'.format)

print(phjORTable)

FUNCTION: phjRelativeRisk()


In [ ]:
# Small 20-row example dataframe; only the 'caseA' and 'catA' columns are
# passed to phjRelativeRisk() below.
tempDF = pd.DataFrame({
    'caseN': [1]*10 + [0]*10,
    'caseA': ['y']*8 + ['n']*12,
    'catN': [1,2,3,2,3,4,3,2,3,4,3,2,1,2,1,2,3,2,3,4],
    'catA': list('aabbcdaccdabcadabcad'),
    'floatN': [1.2,4.3,2.3,4.3,5.3,4.3,2.4,6.5,4.5,7.6,5.6,5.6,4.8,5.2,7.4,5.4,6.5,5.7,6.8,4.5],
})

# Relative risks for 'catA' with 'a' as the base value, where 'caseA' == 'y'
# marks a case.
phjRRTable = epy.phjRelativeRisk(
    phjDF=tempDF,
    phjCaseVarName='caseA',
    phjCaseValue='y',
    phjRiskFactorVarName='catA',
    phjRiskFactorBaseValue='a',
    phjMissingValue=np.nan,
    phjAlpha=0.05,
    phjPrintResults=False,
)

# Show floats with three decimal places (this display option stays set for
# the rest of the session).
pd.set_option('display.float_format', '{:,.3f}'.format)

print(phjRRTable)