HEOR workflow

Programmer: Degang Wang

Date: 11/01/2017

Version: 0.1

1. Load modules


In [ ]:
import numpy as np
import pandas as pd
from pandasql import sqldf

2. Read in SAS data


In [2]:
path = 'F:\\GHEOR_Rapastinel_Tung_201704_001\\_DATA\\'

In [3]:
df01 = pd.read_sas(path + 'p1p02_cohort_1a.sas7bdat')
df01.head()


Out[3]:
ENROLID CASEID PDX ADMDATE DISDATE DSTATUS DAYS TOTPAY AGE SEX REGION PLANTYP suicide_attempt suicide_ideation cohort2 source cohort1
0 16801.0 8438.0 b'29632' 2013-06-25 2013-06-25 b'01' 1.0 2122.01 74.0 b'2' b'2' 6.0 0.0 1.0 b'2C' b'mdcr' b'1Ab'
1 19001.0 8444.0 b'29632' 2013-05-28 2013-05-28 b'01' 1.0 8139.70 76.0 b'1' b'2' 6.0 0.0 0.0 b'2D' b'mdcr' b'1Ac'
2 43301.0 8499.0 b'29633' 2013-03-13 2013-03-22 b'01' 9.0 10653.13 69.0 b'1' b'2' 6.0 1.0 0.0 b'2B' b'mdcr' b'1Aa'
3 190102.0 8881.0 b'29633' 2013-01-01 2013-01-07 b'01' 6.0 5239.60 67.0 b'2' b'2' 6.0 0.0 0.0 b'2D' b'mdcr' b'1Ac'
4 202402.0 8929.0 b'29624' 2013-12-04 2013-12-18 b'03' 14.0 17189.10 80.0 b'2' b'2' 6.0 0.0 0.0 b'2D' b'mdcr' b'1Ac'

In [35]:
enr = list(df01['ENROLID'])
type(enr)
enr[:6]

enr[1] == 19001
pdx = list(df01['PDX'])
pdx[:6]
pdx[3] == b'29633'


Out[35]:
True

In [43]:
df01.head()


Out[43]:
ENROLID CASEID PDX ADMDATE DISDATE DSTATUS DAYS TOTPAY AGE SEX REGION PLANTYP suicide_attempt suicide_ideation cohort2 source cohort1
0 16801.0 8438.0 b'29632' 2013-06-25 2013-06-25 b'01' 1.0 2122.01 74.0 b'2' b'2' 6.0 0.0 1.0 b'2C' b'mdcr' b'1Ab'
1 19001.0 8444.0 b'29632' 2013-05-28 2013-05-28 b'01' 1.0 8139.70 76.0 b'1' b'2' 6.0 0.0 0.0 b'2D' b'mdcr' b'1Ac'
2 43301.0 8499.0 b'29633' 2013-03-13 2013-03-22 b'01' 9.0 10653.13 69.0 b'1' b'2' 6.0 1.0 0.0 b'2B' b'mdcr' b'1Aa'
3 190102.0 8881.0 b'29633' 2013-01-01 2013-01-07 b'01' 6.0 5239.60 67.0 b'2' b'2' 6.0 0.0 0.0 b'2D' b'mdcr' b'1Ac'
4 202402.0 8929.0 b'29624' 2013-12-04 2013-12-18 b'03' 14.0 17189.10 80.0 b'2' b'2' 6.0 0.0 0.0 b'2D' b'mdcr' b'1Ac'

In [47]:
q = """
    select ENROLID, DAYS
    from df01
    where days = 1 ;
"""

df02 = sqldf(q)
df02.head()


Out[47]:
ENROLID DAYS
0 16801.0 1.0
1 19001.0 1.0
2 305902.0 1.0
3 479602.0 1.0
4 621901.0 1.0

In [9]:
path = "C:\\Users\\Wang_Degang\\Documents\\Python Scripts\\"

In [7]:
df01 = pd.read_sas(path + 'ccaed11test.sas7bdat')
df01.tail()


Out[7]:
SEQNUM VERSION EFAMID ENROLID NDCNUM SVCDATE DOBYR YEAR AGE AWP ... EECLASS EESTATU EGEOLOC EIDFLAG EMPREL ENRFLAG PHYFLAG SEX HLTHPLAN INDSTRY
4995 325908297.0 b'10' 27447188.0 NaN b'00591333230' 2011-01-07 1971.0 2011.0 40.0 53.829895 ... b'7' b'1' b'01' b'3' b'4' b'1' b'1' b'2' b'0' NaN
4996 325908298.0 b'10' 27447188.0 NaN b'49884087211' 2011-01-07 1971.0 2011.0 40.0 160.089844 ... b'7' b'1' b'01' b'3' b'4' b'1' b'1' b'2' b'0' NaN
4997 325908299.0 b'10' 27451483.0 NaN b'00555913167' 2011-01-07 1975.0 2011.0 36.0 76.683289 ... b'7' b'1' b'01' b'3' b'4' b'1' b'1' b'1' b'0' NaN
4998 325908300.0 b'10' 27451483.0 NaN b'50111033401' 2011-01-07 1975.0 2011.0 36.0 10.190598 ... b'7' b'1' b'01' b'3' b'4' b'1' b'1' b'1' b'0' NaN
4999 325908301.0 b'10' 27453473.0 NaN b'00406035705' 2011-01-07 1966.0 2011.0 45.0 11.811600 ... b'7' b'1' b'01' b'3' b'4' b'1' b'1' b'1' b'0' NaN

5 rows × 54 columns


In [3]:
# ccaed111 = pd.read_sas("G:\ccaed111.sas7bdat")

In [ ]:


In [18]:
from sas7bdat import SAS7BDAT
rows = 0

with SAS7BDAT(path + "ccaed11test.sas7bdat") as f:
    ccaed111 = f.to_data_frame()
    rows += len(ccaed111)
    
print("read in{: ,} rows".format(rows))


read in 5,000 rows

In [19]:
ccaed111.head()


Out[19]:
SEQNUM VERSION EFAMID ENROLID NDCNUM SVCDATE DOBYR YEAR AGE AWP ... EECLASS EESTATU EGEOLOC EIDFLAG EMPREL ENRFLAG PHYFLAG SEX HLTHPLAN INDSTRY
0 90330405.0 10 18371167.0 None 59310057920 2011-01-01 2007.0 2011.0 3.0 0.000000 ... 9 9 62 3 3 1 1 0 6
1 90330406.0 10 18371854.0 None 00078038566 2011-01-01 1953.0 2011.0 57.0 0.000000 ... 9 9 62 3 2 1 1 0 6
2 90330407.0 10 18371854.0 None 62175011843 2011-01-01 1953.0 2011.0 57.0 0.000000 ... 9 9 62 3 2 1 1 0 6
3 90330408.0 10 18371854.0 None 68180051503 2011-01-01 1953.0 2011.0 57.0 0.000000 ... 9 9 62 3 2 1 1 0 6
4 90330409.0 10 24706619.0 None 00143988775 2011-01-01 NaN 2011.0 NaN 14.659492 ... 9 9 62 3 4 1 1 0 6

5 rows × 54 columns


In [13]:
df = SAS7BDAT(path + "ccaed11test.sas7bdat").to_data_frame()

In [15]:
len(df)


Out[15]:
5000

In [20]:
df.tail()


Out[20]:
SEQNUM VERSION EFAMID ENROLID NDCNUM SVCDATE DOBYR YEAR AGE AWP ... EECLASS EESTATU EGEOLOC EIDFLAG EMPREL ENRFLAG PHYFLAG SEX HLTHPLAN INDSTRY
4995 325908297.0 10 27447188.0 None 00591333230 2011-01-07 1971.0 2011.0 40.0 53.829895 ... 7 1 01 3 4 1 1 2 0
4996 325908298.0 10 27447188.0 None 49884087211 2011-01-07 1971.0 2011.0 40.0 160.089844 ... 7 1 01 3 4 1 1 2 0
4997 325908299.0 10 27451483.0 None 00555913167 2011-01-07 1975.0 2011.0 36.0 76.683289 ... 7 1 01 3 4 1 1 1 0
4998 325908300.0 10 27451483.0 None 50111033401 2011-01-07 1975.0 2011.0 36.0 10.190598 ... 7 1 01 3 4 1 1 1 0
4999 325908301.0 10 27453473.0 None 00406035705 2011-01-07 1966.0 2011.0 45.0 11.811600 ... 7 1 01 3 4 1 1 1 0

5 rows × 54 columns


In [ ]:
df = SAS7BDAT("G:\ccaed111.sas7bdat").to_data_frame()

In [5]:
import pandas as pd
from io import StringIO

csv = r"""dummy,date,loc,x
bar,20090101,a,1
bar,20090102,a,3
bar,20090103,a,5
bar,20090101,b,1
bar,20090102,b,3
bar,20090103,b,5"""

df = pd.read_csv(StringIO(csv),
        header=0,
        index_col=["date", "loc"], 
        usecols=["date", "loc", "x"],
        parse_dates=["date"])

In [6]:
df


Out[6]:
x
date loc
2009-01-01 a 1
2009-01-02 a 3
2009-01-03 a 5
2009-01-01 b 1
2009-01-02 b 3
2009-01-03 b 5

In [2]:
from time import sleep, perf_counter as pc
t0 = pc()
sleep(70)
print(pc()-t0)


69.99840661509089

In [3]:
from datetime import datetime 
startTime= datetime.now() 

t0 = pc()
sleep(70)
print(pc()-t0)

timeElapsed=datetime.now()-startTime 
print(timeElapsed)
print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))


69.9981883656646
0:01:10.001000
Time elpased (hh:mm:ss.ms) 0:01:10.001000

In [1]:
# python seeks modules under these pathes
import sys
from pprint import pprint as p

p(sys.path)


['',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\python36.zip',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\DLLs',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\lib',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\Sphinx-1.5.6-py3.6.egg',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\win32',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\win32\\lib',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\Pythonwin',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\setuptools-27.2.0-py3.6.egg',
 'C:\\Users\\wang_degang\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\wang_degang\\.ipython']

In [ ]:
import