In [ ]:
%matplotlib inline
In [ ]:
import numpy as np
import pandas as pd
import epydemiology as epy
In [ ]:
# Show the interactive help for the epydemiology package (bound to `epy` by an earlier cell).
help(epy)
In [ ]:
# List every name available on the `epy` package object.
print(dir(epy))
In [ ]:
# Read two named cell ranges from the same Excel workbook into dataframes
# and print the column dtypes of each result.
import pandas as pd
import openpyxl
import epydemiology as epy

phjPath = "/Users/philipjones/Documents/git_repositories/epydemiology"
phjFileName = "Test data.xlsx"

print("RANGE: some_test_data")
print("=====================")
myDF = epy.phjReadDataFromExcelNamedCellRange(
    phjExcelPathAndFileName='/'.join([phjPath, phjFileName]),
    phjExcelCellRangeName='some_test_data',
    phjDatetimeFormat="%d%b%Y",
    phjMissingValue="missing",
    phjHeaderRow=True,
    phjPrintResults=True,
)
print(myDF.dtypes)
print('\n')

print("RANGE: some_more_test_data")
print("==========================")
myDF2 = epy.phjReadDataFromExcelNamedCellRange(
    phjExcelPathAndFileName='/'.join([phjPath, phjFileName]),
    phjExcelCellRangeName='some_more_test_data',
    phjDatetimeFormat="%Y-%m-%d",
    phjMissingValue="missing",
    phjHeaderRow=True,
    phjPrintResults=True,
)
# Report the dtypes of the second range; the earlier version printed myDF again here.
print(myDF2.dtypes)
In [ ]:
# The two database driver packages are imported explicitly alongside epydemiology.
import pymysql
import pymssql
import epydemiology as epy
# Request a connection of type 'mysql' and display whatever object is returned.
# NOTE(review): presumably this prompts for connection details — confirm against the library docs.
tempConn = epy.phjConnectToDatabase('mysql')
print(tempConn)
In [ ]:
# The following external libraries are imported automatically but are
# included here for completeness.
import pandas as pd
import pymysql
import pymssql
import epydemiology as epy

# Fetch data using a query stored in a file on disk.
myDF = epy.phjGetDataFromDatabase(
    phjPrintResults=True,
    phjQueryPathAndFile='/Users/username/Desktop/theSQLQueryFile.mssql',
)
In [ ]:
# The following external libraries are imported automatically but are
# included here for completeness.
import pandas as pd
import pymysql
import pymssql
import epydemiology as epy

# Fetch data using a query supplied directly as a string.
myDF = epy.phjGetDataFromDatabase(
    phjPrintResults=True,
    phjQueryStr='SELECT * FROM Table1',
)
In [ ]:
In [ ]:
# Read the contents of a text file into myStr, allowing up to three attempts.
myStr = epy.phjReadTextFromFile(
    phjFilePathAndName='/Users/username/Desktop/myTextFile.txt',
    phjPrintResults=False,
    phjMaxAttempts=3,
)
In [ ]:
import numpy as np
import pandas as pd
import re
import epydemiology as epy

# One row per regex fragment; 'group' names the category and 'id' numbers it.
df = pd.DataFrame(
    {
        'id': [2, 2, 2, 1, 1],
        'group': ['dog', 'dog', 'dog', 'cat', 'cat'],
        'regex': ['(?:dog)', '(?:canine)', '(?:k9)', '(?:cat)', '(?:feline)'],
    }
)
print("Dataframe\n---------")
print(df)

# Combine the fragments into one named-group regex, returned uncompiled.
regexStr = epy.phjCreateNamedGroupRegex(
    phjDF=df,
    phjIDVarName='id',
    phjGroupVarName='group',
    phjRegexVarName='regex',
    phjRegexPreCompile=False,
    phjPrintResults=False,
)
print("\nCombined Regex string\n---------------------")
print(regexStr)
In [ ]:
# Same regex fragments as the previous example, this time requesting a
# pre-compiled result (phjRegexPreCompile is True).
df = pd.DataFrame(
    {
        'id': [2, 2, 2, 1, 1],
        'group': ['dog', 'dog', 'dog', 'cat', 'cat'],
        'regex': ['(?:dog)', '(?:canine)', '(?:k9)', '(?:cat)', '(?:feline)'],
    }
)
print("Dataframe\n---------")
print(df)

myCompiledRegexObj = epy.phjCreateNamedGroupRegex(
    phjDF=df,
    phjIDVarName='id',
    phjGroupVarName='group',
    phjRegexVarName='regex',
    phjRegexPreCompile=True,
    phjPrintResults=False,
)
print("\nCompiled Regex object\n---------------------")
print(myCompiledRegexObj)
In [ ]:
In [ ]:
import numpy as np
import pandas as pd
import collections

# Build a taxonomy table with one column per rank, ordered from Phylum to
# Subspecies. An empty string means that rank is not recorded for the row.
# The OrderedDict keeps the columns in rank order.
myOrderedDict = collections.OrderedDict()
myOrderedDict['Descriptor'] = ['dog','ferret','cat','rabbit','horse','primate','rodent','gerbil','guinea pig','rat','mammal','lizard','snake','common basilisk','turtle','tortoise','spur-thighed tortoise']
myOrderedDict['Phylum'] = ['Chordata'] * 17
myOrderedDict['Class'] = ['Mammalia'] * 11 + ['Reptilia'] * 6
# 'Squmata' for the common basilisk was a misspelling of 'Squamata'.
myOrderedDict['Order'] = ['Carnivora','Carnivora','Carnivora','Lagomorpha','Perissodactyla','Primates','Rodentia','Rodentia','Rodentia','Rodentia','','Squamata','Squamata','Squamata','Testudines','Testudines','Testudines']
myOrderedDict['Suborder'] = ['','','Feliformia','','','','','','','','','Lacertilia','Serpentes','Iguania','','Cryptodira','Cryptodira']
myOrderedDict['Superfamily'] = [''] * 17
myOrderedDict['Family'] = ['Canidae','Mustelidae','Felidae','Leporidae','Equidae','','','Muridae','Caviidae','Muridae','','','','Corytophanidae','','Testudinidae','Testudinidae']
myOrderedDict['Subfamily'] = ['','','','','','','','Gerbillinae','','Murinae','','','','','','','']
myOrderedDict['Genus'] = ['Canis','Mustela','Felis','Oryctolagus','Equus','','','','Cavia','Rattus','','','','Basiliscus','','','Testudo']
myOrderedDict['Species'] = ['lupus','putorius','silvestris','cuniculus','ferus','','','','porcellus','norvegicus','','','','basiliscus','','','graeca']
myOrderedDict['Subspecies'] = ['familiaris','furo','catus','','caballus','','','','','domestica','','','','','','','']
df = pd.DataFrame(myOrderedDict)

# Add a 'max_tax_details' column based on the rank columns from Phylum to Subspecies.
df = epy.phjMaxLevelOfTaxonomicDetail(
    phjDF=df,
    phjFirstCol='Phylum',
    phjLastCol='Subspecies',
    phjNewColName='max_tax_details',
    phjDropPreExisting=False,
    phjCleanup=True,
    phjPrintResults=False,
)
In [ ]:
# Example data for the reverse-mapping demo. 'var' holds the raw spellings
# to be categorised; the frame also has a column that is already called 'dog'.
myDF = pd.DataFrame(
    {
        'id': list(range(1, 8)),
        'var': ['dogg', 'canine', 'cannine', 'catt', 'felin', 'cot', 'feline'],
        'dog': list(range(1, 8)),
    }
)
print(myDF)

# Category label -> exact spellings belonging to that category.
# 'cot' is deliberately absent from both lists.
d = {
    'dog': ['dogg', 'canine', 'cannine'],
    'cat': ['catt', 'felin', 'feline'],
}
In [ ]:
# Map each value of 'var' back to its category key in d, writing the result
# to 'spp'. Dictionary entries are treated as plain strings, not regexes.
myDF = epy.phjReverseMap(
    phjDF=myDF,
    phjMappingDict=d,
    phjCategoryVarName='var',
    phjMappedVarName='spp',
    phjTreatAsRegex=False,
    phjUnmapped='missing',
    phjDropPreExisting=True,
    phjPrintResults=True,
)
In [ ]:
# Same example data as the exact-match demo.
myDF = pd.DataFrame(
    {
        'id': list(range(1, 8)),
        'var': ['dogg', 'canine', 'cannine', 'catt', 'felin', 'cot', 'feline'],
        'dog': list(range(1, 8)),
    }
)
print(myDF)
print('\n')

# Category label -> regex fragments describing the spellings in that category.
d = {
    'dog': ['(?:dog+)', '(?:can*ine)'],
    'cat': ['(?:cat+)', '(?:fel+ine?)'],
}
print(d)
In [ ]:
# Map each value of 'var' back to its category key in d, writing the result
# to 'new'. Dictionary entries are treated as regular expressions this time.
myDF = epy.phjReverseMap(
    phjDF=myDF,
    phjMappingDict=d,
    phjCategoryVarName='var',
    phjMappedVarName='new',
    phjTreatAsRegex=True,
    phjUnmapped='missing',
    phjDropPreExisting=True,
    phjPrintResults=True,
)
In [ ]:
# Retrieve the unique values (then the unique value combinations) held in a
# single dataframe that contains repeated rows.
phjTempDF = pd.DataFrame(
    {
        'a': [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6],
        'b': ['a', 'b', 'c', 'd', 'e', 'f', 'a', 'b', 'w', 'd', 'e', 'f'],
    }
)

print('Single variable')
print('---------------')
phjOutDF = epy.phjRetrieveUniqueFromMultiDataFrames(
    phjDFList=[phjTempDF],
    phjVarNameList='a',
    phjSort=True,
    phjPrintResults=True,
)
print('\n')

print('Multiple variables')
print('------------------')
# phjDFList takes a list of dataframes, so the single frame is wrapped in a
# list here to match the call above (it was previously passed bare).
phjOutDF = epy.phjRetrieveUniqueFromMultiDataFrames(
    phjDFList=[phjTempDF],
    phjVarNameList=['a', 'b'],
    phjSort=True,
    phjPrintResults=True,
)
In [ ]:
# Two dataframes sharing the columns 'm' and 'n'; some rows appear in both.
df1 = pd.DataFrame(
    {
        'm': list(range(1, 7)),
        'n': ['a', 'b', 'c', 'd', 'e', 'f'],
    }
)
df2 = pd.DataFrame(
    {
        'm': [2, 5, 7, 8],
        'n': ['b', 'e', 'g', 'h'],
    }
)

# Collect the unique ('m', 'n') combinations found across both frames.
phjOutDF = epy.phjRetrieveUniqueFromMultiDataFrames(
    phjDFList=[df1, df2],
    phjVarNameList=['m', 'n'],
    phjSort=True,
    phjPrintResults=True,
)
In [ ]:
# Existing look-up table and a table of candidate new entries. Some new
# values differ from existing ones only by letter case.
old_df = pd.DataFrame(
    {
        'id': list(range(1, 7)),
        'm': ['a', 'b', 'c', 'd', 'e', 'f'],
    }
)
new_df = pd.DataFrame(
    {
        'id': list(range(1, 5)),
        'm': ['b', 'E', 'g', 'H'],
    }
)

# Update the existing table from the new one on variable 'm', ignoring case.
update_df = epy.phjUpdateLUT(
    phjExistDF=old_df,
    phjNewDF=new_df,
    phjIDName='id',
    phjVarNameList=['m'],
    phjMissStr='missing',
    phjMissCode=999,
    phjIgnoreCase=True,
    phjPrintResults=True,
)
In [ ]:
# Existing look-up table and candidate new entries, this time keyed on two
# variables ('m' and 'n').
old_df = pd.DataFrame(
    {
        'id': list(range(1, 7)),
        'm': ['a', 'b', 'c', 'd', 'e', 'f'],
        'n': ['A', 'B', 'C', 'D', 'E', 'F'],
    }
)
new_df = pd.DataFrame(
    {
        'id': list(range(1, 6)),
        'm': ['b', 'e', 'g', 'h', 'a'],
        'n': ['BB', 'e', 'GG', 'H', 'a'],
    }
)

update_df = epy.phjUpdateLUT(
    phjExistDF=old_df,
    phjNewDF=new_df,
    phjIDName='id',
    phjVarNameList=['m', 'n'],
    phjMissStr='missing',
    phjMissCode=999,
    phjIgnoreCase=True,
    phjPrintResults=True,
)

print('Updated dataframe')
print('-----------------')
print(update_df)
In [ ]:
# Two versions of a look-up table. The second contains later ids for some of
# the names in the first, plus one new name and one missing value.
df1 = pd.DataFrame(
    {
        'id': [1, 2, 3, 4, 5, 6, 7, 8],
        'name': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
        'value': [999, 22, 33, 44, 55, 66, 999, 88],
    }
)
df2 = pd.DataFrame(
    {
        'id': [9, 10, 11, 12],
        'name': ['a', 'i', 'd', 'g'],
        'value': [11, 99, None, 77],
    }
)

# Stack the two tables and order by name then id.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (original index labels are kept, as they were with append).
df = pd.concat([df1, df2]).sort_values(by=['name', 'id'])
In [ ]:
# Print each dataframe under an underlined heading, with a blank separator
# between consecutive sections (but not after the last one).
_phjSections = [
    ('First dataframe', df1),
    ('Second dataframe', df2),
    ('Joined dataframes', df),
]
for _phjPos, (_phjHeading, _phjFrame) in enumerate(_phjSections):
    if _phjPos > 0:
        print('\n')
    print(_phjHeading)
    print('-' * len(_phjHeading))
    print(_phjFrame)
In [ ]:
# Process the joined table grouped by 'name', using 'id' as the identifier,
# and request an extra count column.
df = epy.phjUpdateLUTToLatestValues(
    phjDF=df,
    phjGroupbyVarName='name',
    phjIDVarName='id',
    phjAddCountCol=True,
    phjPrintResults=True,
)
In [ ]:
# Five binary (0/1) variables observed on eight rows.
rawDataDF = pd.DataFrame(
    {
        'a': [0, 1, 1, 1, 0, 0, 1, 0],
        'b': [1, 1, 0, 0, 1, 0, 0, 1],
        'c': [0, 0, 1, 0, 1, 1, 1, 1],
        'd': [1, 0, 0, 0, 1, 0, 0, 0],
        'e': [1, 0, 0, 0, 0, 1, 0, 0],
    }
)
columns = ['a', 'b', 'c', 'd', 'e']

print('Raw data')
print(rawDataDF)
print('\n')

# Request the square matrix in 'arr' output format.
phjMatrix = epy.phjBinaryVarsToSquareMatrix(
    phjDataDF=rawDataDF,
    phjColumnNamesList=columns,
    phjOutputFormat='arr',
    phjPrintResults=False,
)

print('Returned square matrix')
print(phjMatrix)
In [ ]:
# Five binary (0/1) variables observed on eight rows.
rawDataDF = pd.DataFrame(
    {
        'a': [0, 1, 1, 1, 0, 0, 1, 0],
        'b': [1, 1, 0, 0, 1, 0, 0, 1],
        'c': [0, 0, 1, 0, 1, 1, 1, 1],
        'd': [1, 0, 0, 0, 1, 0, 0, 0],
        'e': [1, 0, 0, 0, 0, 1, 0, 0],
    }
)
columns = ['a', 'b', 'c', 'd', 'e']

print('Raw data')
print(rawDataDF)
print('\n')

# Request the square matrix in 'df' output format.
phjMatrixDF = epy.phjBinaryVarsToSquareMatrix(
    phjDataDF=rawDataDF,
    phjColumnNamesList=columns,
    phjOutputFormat='df',
    phjPrintResults=False,
)

print('Returned square matrix dataframe')
print(phjMatrixDF)
In [ ]:
# Long-format data: each row pairs a group id (X) with one variable name (Y).
df = pd.DataFrame(
    {
        'X': [1, 1, 1, 2, 2, 3, 3, 3, 3, 4],
        'Y': ['a', 'b', 'd', 'b', 'c', 'd', 'e', 'a', 'f', 'b'],
    }
)

# Convert to wide format, grouping on X with the variable names taken from Y.
newDF = epy.phjLongToWideBinary(
    phjDF=df,
    phjGroupbyVarName='X',
    phjVariablesVarName='Y',
    phjValuesDict={0: 0, 1: 1},
    phjPrintResults=False,
)

print('Original dataframe\n')
print(df)
print('\n')
print('New wide dataframe\n')
print(newDF)
In [ ]:
# Three yes/no variables (with some missing values) recorded for two groups.
phjTempDF = pd.DataFrame(
    {
        'group': ['g1', 'g1', 'g2', 'g1', 'g2', 'g2', 'g1', 'g1', 'g2', 'g1'],
        'A': ['yes', 'yes', 'no', 'no', 'no', 'no', 'no', 'yes', np.nan, 'yes'],
        'B': ['no', np.nan, np.nan, 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'no'],
        'C': ['yes', 'yes', 'yes', np.nan, 'no', 'yes', 'yes', 'yes', 'no', 'no'],
    }
)
print(phjTempDF)
print('\n')

# Proportion of 'yes' per column and group, with plotting switched on.
phjPropDF = epy.phjCalculateBinomialProportions(
    phjDF=phjTempDF,
    phjColumnsList=['A', 'B', 'C'],
    phjSuccess='yes',
    phjGroupVarName='group',
    phjMissingValue='missing',
    phjBinomialConfIntMethod='normal',
    phjAlpha=0.05,
    phjPlotProportions=True,
    phjGroupsToPlotList='all',
    phjSortProportions=True,
    phjGraphTitle=None,
    phjPrintResults=False,
)
print(phjPropDF)
In [ ]:
# Yearly counts of successes, with failures derived as total minus successes.
# The final year has a total of zero.
phjSuccessList = [109, 77, 80, 57, 29, 31, 29, 19, 10, 16, 6, 8, 4, 0]
phjTotalList = [784, 840, 715, 780, 743, 743, 752, 645, 509, 562, 471, 471, 472, 0]
phjTempDF = pd.DataFrame(
    {
        'year': list(range(2005, 2019)),
        'success': phjSuccessList,
        'failure': [t - s for t, s in zip(phjTotalList, phjSuccessList)],
    }
)

print('Original dataframe\n')
print(phjTempDF)
print('\n')

# Confidence intervals from the success and failure columns (no total column supplied).
phjPropDF = epy.phjCalculateBinomialConfInts(
    phjDF=phjTempDF,
    phjSuccVarName='success',
    phjFailVarName='failure',
    phjTotalVarName=None,
    phjBinomialConfIntMethod='normal',
    phjAlpha=0.05,
    phjPrintResults=False,
)

print('Dataframe of confidence intervals\n')
print(phjPropDF)
In [ ]:
# Case/control data with a categorical breed variable. Both columns contain
# NaN entries and 'category' also contains the literal string 'missing'.
phjTempDF = pd.DataFrame(
    {
        'group': ['case','case','case','control','control','case','case','case','control','control','control','control','case','case','case','control','control','control','control','case','case','case','case','case',np.nan,np.nan],
        'category': [np.nan,'spaniel','missing','terrier','collie','labrador','labrador','collie','spaniel','spaniel','labrador','collie','terrier','terrier','terrier','collie','labrador','labrador','labrador','spaniel','spaniel','collie','collie','collie','terrier','spaniel'],
        'catint': [1,2,3,2,3,2,1,2,1,2,3,2,3,2,3,1,2,3,2,3,2,3,2,3,1,2],
    }
)
print(phjTempDF)
print('\n')

# Relative frequencies of 'category' by 'group', with plotting switched on.
phjRelFreqDF = epy.phjCalculateMultinomialProportions(
    phjDF=phjTempDF,
    phjCategoryVarName='category',
    phjGroupVarName='group',
    phjMissingValue='missing',
    phjMultinomialConfIntMethod='goodman',
    phjAlpha=0.05,
    phjPlotRelFreq=True,
    phjCategoriesToPlotList='all',
    phjGroupsToPlotList='all',  # Currently not implemented
    phjGraphTitle='Relative frequencies (Goodman CI)',
    phjPrintResults=True,
)
print(phjRelFreqDF)
In [ ]:
# Recreate the summary table used in the original description of the function.
df = pd.DataFrame(
    {
        'year': [2010, 2011, 2012, 2013, 2014],
        'cases': [23, 34, 41, 57, 62],
        'controls': [1023, 1243, 1145, 2017, 1876],
        'comment': [
            'Small number of cases',
            'Proportion increase',
            'Trend continues',
            'Decreased proportion',
            'Increased again',
        ],
    }
)

# Put the columns in a fixed order.
df = df[['year', 'cases', 'controls', 'comment']]

# Convert the summary counts into a dataframe of binary outcome data.
newDF = epy.phjSummaryTableToBinaryOutcomes(
    phjDF=df,
    phjVarsToIncludeList=['year', 'cases', 'controls'],
    phjSuccVarName='cases',
    phjFailVarName='controls',
    phjTotalVarName=None,
    phjOutcomeVarName='outcome',
    phjPrintResults=False,
)

# Show both tables; the display limits apply only to the second printout.
print('Original table of summary results\n')
print(df)
print('\n')
print('Dataframe of binary outcomes\n')
with pd.option_context('display.max_rows', 6, 'display.max_columns', 2):
    print(newDF)
In [ ]:
# Annual counts of positive and negative results.
phjDiseaseDF = pd.DataFrame(
    {
        'year': list(range(2008, 2019)),
        'positive': [18, 34, 24, 26, 30, 27, 36, 17, 18, 15, 4],
        'negative': [1695, 1733, 1929, 1517, 1449, 1329, 1130, 928, 753, 496, 325],
    }
)

# Only years before 2018 are passed to the trend function.
phjDiseaseDF = epy.phjAnnualDiseaseTrend(
    phjDF=phjDiseaseDF.loc[phjDiseaseDF['year'] < 2018, :],
    phjYearVarName='year',
    phjPositivesVarName='positive',
    phjNegativesVarName='negative',
    phjTotalVarName=None,
    phjConfIntMethod='normal',
    phjAlpha=0.05,
    phjPlotProportions=True,
    phjPlotPrediction=True,
    phjGraphTitleStr=None,
    phjPrintResults=False,
)
In [ ]:
# Create a test dataframe that contains a postcode variable and some other
# empty variables that share names with the new variables that will be
# created. Setting 'phjDropExisting' to True drops those pre-existing
# variables automatically before the function runs. The remaining variables
# are not duplicated and are there to show that the function leaves them intact.
import numpy as np
import pandas as pd
import re

# Create test dataframe
myTestPostcodeDF = pd.DataFrame(
    {
        'postcode': [
            'NP45DG', 'CH647TE', 'CH5 4HE', 'GIR 0AA', 'NOT NOWN', 'GIR0AB', 'NOR12A',
            'no idea', 'W1A 1AA', 'missin', 'NP4 OGH', 'P012 OLL', 'p01s', 'ABCD',
            '', 'ab123cd', 'un-known', 'B1 INJ', 'AB123CD',
            'No idea what the postcode is',
            ' ???NP4-5DG_*# ',
        ],
        'pcdClean': np.nan,
        'pcd7': np.nan,
        'postcodeOutward': np.nan,
        'someOtherCol': np.nan,
    }
)

# Run function to extract postcode data (postcodes are checked by format)
print('\nStart dataframe\n===============\n')
print(myTestPostcodeDF)
print('\n')
myTestPostcodeDF = epy.phjCleanUKPostcodeVariable(
    phjDF=myTestPostcodeDF,
    phjRealPostcodeSer=None,
    phjOrigPostcodeVarName='postcode',
    phjNewPostcodeVarName='pcdClean',
    phjNewPostcodeStrLenVarName='pcdCleanStrLen',
    phjPostcodeCheckVarName='pcdFormatCheck',
    phjMissingValueCode='missing',
    phjMinDamerauLevenshteinDistanceVarName='minDamLevDist',
    phjBestAlternativesVarName='bestAlternatives',
    phjPostcode7VarName='pcd7',
    phjPostcodeAreaVarName='pcdArea',
    phjSalvageOutwardPostcodeComponent=True,
    phjCheckByOption='format',
    phjDropExisting=True,
    phjPrintResults=True,
)
print('\nReturned dataframe\n==================\n')
print(myTestPostcodeDF)
In [ ]:
import re

# N.B. When calculating best alternative postcodes, only postcodes that are
# within 1 DL distance are considered.

# Create a Pandas series that could contain all the postcodes in the UK
realPostcodesSer = pd.Series(
    ['NP4 5DG', 'CH647TE', 'CH5 4HE', 'W1A 1AA', 'NP4 0GH', 'PO120LL', 'AB123CF', 'AB124DF', 'AB123CV']
)

# Create test dataframe
myTestPostcodeDF = pd.DataFrame(
    {
        'postcode': [
            'NP45DG', 'CH647TE', 'CH5 4HE', 'GIR 0AA', 'NOT NOWN', 'GIR0AB', 'NOR12A',
            'no idea', 'W1A 1AA', 'missin', 'NP4 OGH', 'P012 OLL', 'p01s', 'ABCD',
            '', 'ab123cd', 'un-known', 'B1 INJ', 'AB123CD',
            'No idea what the postcode is',
            ' ???NP4-5DG_*# ',
        ],
        'pcdClean': np.nan,
        'pcd7': np.nan,
        'postcodeOutward': np.nan,
        'someOtherCol': np.nan,
    }
)

# Run function to extract postcode data (postcodes are checked against the
# series of real postcodes)
print('\nStart dataframe\n===============\n')
print(myTestPostcodeDF)
print('\n')
myTestPostcodeDF = epy.phjCleanUKPostcodeVariable(
    phjDF=myTestPostcodeDF,
    phjRealPostcodeSer=realPostcodesSer,
    phjOrigPostcodeVarName='postcode',
    phjNewPostcodeVarName='pcdClean',
    phjNewPostcodeStrLenVarName='pcdCleanStrLen',
    phjPostcodeCheckVarName='pcdFormatCheck',
    phjMissingValueCode='missing',
    phjMinDamerauLevenshteinDistanceVarName='minDamLevDist',
    phjBestAlternativesVarName='bestAlternatives',
    phjPostcode7VarName='pcd7',
    phjPostcodeAreaVarName='pcdArea',
    phjSalvageOutwardPostcodeComponent=True,
    phjCheckByOption='dictionary',
    phjDropExisting=True,
    phjPrintResults=True,
)
print('\nReturned dataframe\n==================\n')
print(myTestPostcodeDF)
In [ ]:
In [ ]:
In [ ]:
# Five cases (all dogs) and twenty potential controls (dogs and cats).
casesDF = pd.DataFrame(
    {
        'animalID': list(range(1, 6)),
        'var1': [43, 45, 34, 45, 56],
        'sp': ['dog'] * 5,
    }
)
potControlsDF = pd.DataFrame(
    {
        'animalID': list(range(11, 31)),
        'var1': [34, 54, 34, 23, 34, 45, 56, 67, 56, 67, 78, 98, 65, 54, 34, 76, 87, 56, 45, 34],
        'sp': ['dog', 'cat', 'dog', 'dog', 'cat', 'dog', 'cat', 'dog', 'cat', 'dog',
               'dog', 'dog', 'dog', 'cat', 'dog', 'cat', 'dog', 'dog', 'dog', 'cat'],
    }
)

print("This dataframe contains all the cases of disease\n")
print(casesDF)
print("\n")
print("This dataframe contains all the animals you could potentially use as controls\n")
print(potControlsDF)
print("\n")

# Selecting unmatched controls (no matching variables supplied)
unmatchedDF = epy.phjSelectCaseControlDataset(
    phjCasesDF=casesDF,
    phjPotentialControlsDF=potControlsDF,
    phjUniqueIdentifierVarName='animalID',
    phjMatchingVariablesList=None,
    phjControlsPerCaseInt=2,
    phjPrintResults=False,
)
print(unmatchedDF)
In [ ]:
# Five cases (all dogs) and twenty potential controls (dogs and cats).
casesDF = pd.DataFrame(
    {
        'animalID': list(range(1, 6)),
        'var1': [43, 45, 34, 45, 56],
        'sp': ['dog'] * 5,
    }
)
potControlsDF = pd.DataFrame(
    {
        'animalID': list(range(11, 31)),
        'var1': [34, 54, 34, 23, 34, 45, 56, 67, 56, 67, 78, 98, 65, 54, 34, 76, 87, 56, 45, 34],
        'sp': ['dog', 'cat', 'dog', 'dog', 'cat', 'dog', 'cat', 'dog', 'cat', 'dog',
               'dog', 'dog', 'dog', 'cat', 'dog', 'cat', 'dog', 'dog', 'dog', 'cat'],
    }
)

print("This dataframe contains all the cases of disease\n")
print(casesDF)
print("\n")
print("This dataframe contains all the animals you could potentially use as controls\n")
print(potControlsDF)
print("\n")

# Selecting controls that are matched to cases on variable 'sp'
matchedDF = epy.phjSelectCaseControlDataset(
    phjCasesDF=casesDF,
    phjPotentialControlsDF=potControlsDF,
    phjUniqueIdentifierVarName='animalID',
    phjMatchingVariablesList=['sp'],
    phjControlsPerCaseInt=2,
    phjPrintResults=False,
)
print(matchedDF)
In [ ]:
In [ ]:
In [ ]:
# Define example dataset: a boolean outcome (half False, half True) and a
# uniformly distributed continuous risk factor.
phjTempDF = pd.DataFrame(
    {
        'binDepVar': [False] * 50000 + [True] * 50000,
        'riskFactorCont': np.random.uniform(0, 1, 100000),
    }
)
with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print(phjTempDF)

# View log odds, categorising the continuous variable into 3 classes by 'jenks'
phjTempDF = epy.phjViewLogOdds(
    phjDF=phjTempDF,
    phjBinaryDepVarName='binDepVar',
    phjCaseValue=False,
    phjContIndepVarName='riskFactorCont',
    phjMissingValue='missing',
    phjNumberOfCategoriesInt=3,
    phjNewCategoryVarName='categoricalVar',
    phjCategorisationMethod='jenks',
    phjGroupVarName=None,
    phjAlpha=0.05,
    phjPrintResults=False,
)

with pd.option_context('display.max_rows', 10, 'display.max_columns', 10):
    print('Log odds for categorised variable')
    print(phjTempDF)
In [ ]:
# Define example dataset: an integer outcome (half 1, half 0) and a uniformly
# distributed continuous risk factor.
phjTempDF = pd.DataFrame(
    {
        'binDepVar': [1] * 50000 + [0] * 50000,
        'riskFactorCont': np.random.uniform(0, 1, 100000),
    }
)
with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print(phjTempDF)

# View log odds, categorising the continuous variable into 8 quantile classes.
# The keyword names now match the other phjViewLogOdds call in this notebook:
# 'phjDF' (was 'phjTempDF') and 'phjGroupVarName' (was 'phjGroupNameVar').
phjTempDF = epy.phjViewLogOdds(
    phjDF=phjTempDF,
    phjBinaryDepVarName='binDepVar',
    phjContIndepVarName='riskFactorCont',
    phjCaseValue=1,
    phjMissingValue='missing',
    phjNumberOfCategoriesInt=8,
    phjNewCategoryVarName='categoricalVar',
    phjCategorisationMethod='quantile',
    phjGroupVarName=None,
    phjPrintResults=False,
)

with pd.option_context('display.max_rows', 10, 'display.max_columns', 10):
    print('Log odds for categorised variable')
    print(phjTempDF)
In [ ]:
# Define example dataset: a yes/no outcome and a uniformly distributed
# continuous risk factor.
phjTempDF = pd.DataFrame(
    {
        'binDepVar': ['yes'] * 50000 + ['no'] * 50000,
        'riskFactorCont': np.random.uniform(0, 1, 100000),
    }
)
with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print(phjTempDF)

# Categorise a continuous variable into 6 classes by 'jenks', writing the
# result to 'catVar'. Break points are not requested.
# NOTE(review): other calls in this notebook pass the dataframe as 'phjDF';
# confirm that 'phjTempDF' is the keyword this function expects.
phjTempDF = epy.phjCategoriseContinuousVariable(
    phjTempDF=phjTempDF,
    phjContinuousVarName='riskFactorCont',
    phjMissingValue='missing',
    phjNumberOfCategoriesInt=6,
    phjNewCategoryVarName='catVar',
    phjCategorisationMethod='jenks',
    phjReturnBreaks=False,
    phjPrintResults=False,
)

with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    # This cell categorises a variable and computes no log odds, so the
    # heading is 'Categorised variable' (it previously read 'Log odds for
    # categorised variable').
    print('\nCategorised variable')
    print(phjTempDF)
In [ ]:
# Define example dataset: a yes/no outcome and a uniformly distributed
# continuous risk factor.
phjTempDF = pd.DataFrame(
    {
        'binDepVar': ['yes'] * 50000 + ['no'] * 50000,
        'riskFactorCont': np.random.uniform(0, 1, 100000),
    }
)
with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print(phjTempDF)

# Categorise a continuous variable; with phjReturnBreaks set to True the call
# is unpacked into the dataframe and the list of breaks.
# NOTE(review): other calls in this notebook pass the dataframe as 'phjDF';
# confirm that 'phjTempDF' is the keyword this function expects.
phjTempDF, phjBreaksList = epy.phjCategoriseContinuousVariable(
    phjTempDF=phjTempDF,
    phjContinuousVarName='riskFactorCont',
    phjMissingValue='missing',
    phjNumberOfCategoriesInt=6,
    phjNewCategoryVarName='catVar',
    phjCategorisationMethod='jenks',
    phjReturnBreaks=True,
    phjPrintResults=False,
)

with pd.option_context('display.max_rows', 10, 'display.max_columns', 5):
    print('\nCategorised variable')
    print(phjTempDF)

print('\n')
print('Breaks')
print(phjBreaksList)
In [ ]:
# Twenty observations with case status coded both numerically (caseN) and as
# letters (caseA), plus numeric, letter and float risk-factor columns.
tempDF = pd.DataFrame(
    {
        'caseN': [1] * 10 + [0] * 10,
        'caseA': ['y'] * 8 + ['n'] * 12,
        'catN': [1, 2, 3, 2, 3, 4, 3, 2, 3, 4, 3, 2, 1, 2, 1, 2, 3, 2, 3, 4],
        'catA': ['a', 'a', 'b', 'b', 'c', 'd', 'a', 'c', 'c', 'd',
                 'a', 'b', 'c', 'a', 'd', 'a', 'b', 'c', 'a', 'd'],
        'floatN': [1.2, 4.3, 2.3, 4.3, 5.3, 4.3, 2.4, 6.5, 4.5, 7.6,
                   5.6, 5.6, 4.8, 5.2, 7.4, 5.4, 6.5, 5.7, 6.8, 4.5],
    }
)

# Odds ratios for 'catA' relative to base value 'a', with cases coded 'y'.
phjORTable = epy.phjOddsRatio(
    phjDF=tempDF,
    phjCaseVarName='caseA',
    phjCaseValue='y',
    phjRiskFactorVarName='catA',
    phjRiskFactorBaseValue='a',
    phjMissingValue=np.nan,
    phjAlpha=0.05,
    phjPrintResults=False,
)

# Show floats to three decimal places (this display setting stays in force afterwards).
pd.set_option('display.float_format', '{:,.3f}'.format)
print(phjORTable)
In [ ]:
# Twenty observations with case status coded both numerically (caseN) and as
# letters (caseA), plus numeric, letter and float risk-factor columns.
tempDF = pd.DataFrame(
    {
        'caseN': [1] * 10 + [0] * 10,
        'caseA': ['y'] * 8 + ['n'] * 12,
        'catN': [1, 2, 3, 2, 3, 4, 3, 2, 3, 4, 3, 2, 1, 2, 1, 2, 3, 2, 3, 4],
        'catA': ['a', 'a', 'b', 'b', 'c', 'd', 'a', 'c', 'c', 'd',
                 'a', 'b', 'c', 'a', 'd', 'a', 'b', 'c', 'a', 'd'],
        'floatN': [1.2, 4.3, 2.3, 4.3, 5.3, 4.3, 2.4, 6.5, 4.5, 7.6,
                   5.6, 5.6, 4.8, 5.2, 7.4, 5.4, 6.5, 5.7, 6.8, 4.5],
    }
)

# Relative risks for 'catA' relative to base value 'a', with cases coded 'y'.
phjRRTable = epy.phjRelativeRisk(
    phjDF=tempDF,
    phjCaseVarName='caseA',
    phjCaseValue='y',
    phjRiskFactorVarName='catA',
    phjRiskFactorBaseValue='a',
    phjMissingValue=np.nan,
    phjAlpha=0.05,
    phjPrintResults=False,
)

# Show floats to three decimal places (this display setting stays in force afterwards).
pd.set_option('display.float_format', '{:,.3f}'.format)
print(phjRRTable)