In [1]:
import pandas as pd
import sys
import os, os.path

# Put the PatientPicker directory on the import path (presumably where the
# LoadingTools module imported in the next cell lives -- TODO confirm).
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this directory exists.
sys.path.append('/home/will/PatientPicker/')

In [2]:
import LoadingTools

In [3]:
# Clinical table from LoadingTools, keyed by (Patient ID, VisitNum) so the
# later merges can join on the index.
redcap_data = LoadingTools.load_redcap_data().set_index(['Patient ID', 'VisitNum'])
# Raw cytokine table; the file is tab-separated despite its .csv extension.
cyto_data = pd.read_csv('/home/will/HIVSystemsBio/NewCytokineAnalysis/CytoRawData.csv', sep='\t')
# Flag every cytokine row, then collapse to one boolean per (patient, visit):
# any visit that appears in the cytokine table ends up with HasCyto == True.
cyto_data['HasCyto'] = True
has_cyto = cyto_data.groupby(['Patient ID', 'VisitNum'])[['HasCyto']].all()

In [19]:
# Quick cohort check: range of the 'Age' column in the cytokine patient table
# (tab-separated despite its .csv extension).
tmp = pd.read_csv('/home/will/HIVSystemsBio/NewCytokineAnalysis/CytoPatData.csv', sep='\t')
# One 'Age' value per 'Patient ID', taken with groupby .first().
ages = tmp.groupby('Patient ID')['Age'].first()
# Python 2 print statement: maximum age, then minimum age.
print ages.max(), ages.min()


71 20

In [4]:
import glob
# Scan the per-patient FASTA files and record which (patient, visit, protein)
# combinations have a sequence file on disk.
files = glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*.fasta')
seqs = []
for f in files:
    # The parsing below expects names shaped like "<pid>-<vn>-<prot>.<ext>".
    fname = os.path.basename(f)
    try:
        pid, vn, prot = fname.split('.')[0].split('-', 2)
    except ValueError:
        # Report the unparseable name and skip it. Without the `continue`, the
        # append below would re-add the previous file's tuple (or raise
        # NameError when the very first file fails to parse).
        print(fname)
        continue
    seqs.append((pid, vn, prot, 1))

df = pd.DataFrame(seqs, columns = ['Patient ID', 'VisitNum', 'Prot', 'HasSeq'])
# One row per (Patient ID, VisitNum) and one column per 'Prot' value, filled
# from 'HasSeq'; combinations with no file are left missing.
# NOTE(review): `rows=` / `cols=` are the legacy pivot_table keyword names;
# current pandas spells them `index=` / `columns=`.
has_seq = pd.pivot_table(df, rows = ['Patient ID', 'VisitNum'], cols = 'Prot', values='HasSeq')

In [5]:
# Fold the per-visit availability tables (cytokine first, then sequence) into
# the clinical table. Each is an outer join on the index, so visits that are
# present in only one of the sources are kept.
for extra in (has_cyto, has_seq):
    redcap_data = pd.merge(redcap_data, extra,
                           how='outer',
                           left_index=True, right_index=True)
# Remove the 'VisitNum' and 'Patient ID' columns from the merged frame.
redcap_data = redcap_data.drop(['VisitNum', 'Patient ID'], axis=1)

In [6]:
import numpy
def safe_mean(col):
    """Return the mean of the non-missing values in `col`.

    Parameters
    ----------
    col : pandas.Series
        Values to average; missing entries are dropped before averaging.

    Returns
    -------
    float
        Mean of the remaining values, or NaN when nothing remains after
        dropping missing entries.
    """
    ncol = col.dropna()
    if len(ncol) == 0:
        # Use float('nan') rather than np.nan: the cell only runs
        # `import numpy`, so the `np` alias is undefined on a fresh kernel
        # and this branch would raise NameError.
        return float('nan')
    return ncol.mean()

def safe_col_apply_mean(col, func, indf):
    """Average `func` applied to the non-missing entries of one column.

    Parameters
    ----------
    col : column label
        Column of `indf` to read.
    func : callable
        Applied element-wise (via Series.map) to each non-missing value.
    indf : pandas.DataFrame
        Frame that holds the column.

    Returns
    -------
    The mean of the mapped values.
    """
    present = indf[col].dropna()
    mapped = present.map(func)
    return mapped.mean()

In [7]:
from functools import partial
# Column groups that feed the per-patient summary.
seq_cols = list(has_seq.columns)
all_cols = redcap_data.columns
test_drug_cols = [name for name in all_cols if name.startswith('Test-')]
admit_drug_cols = [name for name in all_cols if name.startswith('Admit-')]
race_cols = [name for name in all_cols if name.startswith('Race-')]

# Every grouped column is averaged with safe_mean; 'VisitNum' is counted
# instead, giving the number of visits per patient.
mean_cols = test_drug_cols + admit_drug_cols + seq_cols + race_cols
agg_dict = dict((name, safe_mean) for name in mean_cols)
agg_dict['VisitNum'] = 'count'

# (new summary column, source column, cutoff): the summary cell compares the
# source column against the cutoff with `<`.
cut_list = [
    ('%LowVL', 'Latest viral load', 50),
    ('%LowCD4', 'Latest CD4 count (cells/uL)', 200),
]

In [8]:
# Per-patient summary: one row per 'Patient ID', aggregated as specified in
# agg_dict (safe_mean for the grouped columns, a count for 'VisitNum').
pat_sum = redcap_data.reset_index().groupby('Patient ID').agg(agg_dict)
for ncol, tcol, cut in cut_list:
    # Fraction of each patient's visits whose value in `tcol` is below `cut`.
    # Uses the groupby's own .mean(): the bare name `mean` passed to .agg()
    # before is not imported or defined in the visible cells (presumably it
    # came from a pylab namespace), so it raised NameError on a fresh kernel.
    # NOTE(review): a missing measurement compares as False, so visits with no
    # value still count in the denominator as "not below cutoff" -- confirm
    # that this is intended.
    pat_sum[ncol] = (redcap_data[tcol]<cut).groupby(level='Patient ID').mean()
pat_sum


Out[8]:
<class 'pandas.core.frame.DataFrame'>
Index: 508 entries, A0001 to A0514
Data columns (total 33 columns):
Admit-Cocaine           507  non-null values
Admit-Cannabinoid       507  non-null values
Test-Cannabinoid        470  non-null values
Admit-Heroin            507  non-null values
Race-Indian             507  non-null values
Admit-Narcotics         507  non-null values
Race-Black              507  non-null values
Vpr                     57  non-null values
Test-Opiates            470  non-null values
V3                      60  non-null values
Admit-None              507  non-null values
Admit-Ritalin           507  non-null values
Admit-Benzodiazapine    507  non-null values
LTR                     454  non-null values
Test-Amphetamines       470  non-null values
Admit-Ecstasy           507  non-null values
Test-Phencyclidine      470  non-null values
Race-Multiple           507  non-null values
Race-Unknown            507  non-null values
Race-Hawaiian           507  non-null values
Admit-Amphetamines      507  non-null values
Race-White              507  non-null values
Test-Barbiturates       470  non-null values
Race-Asian              507  non-null values
VisitNum                508  non-null values
Admit-Other             507  non-null values
Admit-PCP               507  non-null values
Test-Cocaine            470  non-null values
Test-Benzodiazepine     470  non-null values
Tat-1                   57  non-null values
Tat-2                   58  non-null values
%LowVL                  508  non-null values
%LowCD4                 508  non-null values
dtypes: float64(32), int64(1)

In [11]:
# Spot-check the 'LTR' and 'V3' summary columns for a hand-picked list of
# patient IDs.
twpats = ['A0014','A0037','A0041','A0052','A0071','A0110','A0128','A0278','A0281','A0365']
# NOTE(review): `.ix` is the legacy indexer and no longer exists in current
# pandas; switch to `.loc` when porting this notebook.
pat_sum.ix[twpats][['LTR', 'V3']]


Out[11]:
LTR V3
A0014 1 NaN
A0037 1 1
A0041 1 1
A0052 1 NaN
A0071 1 NaN
A0110 1 1
A0128 1 1
A0278 1 NaN
A0281 1 NaN
A0365 1 NaN

In [13]:
# Keep the patients whose every 'Test-' summary column equals exactly 0.
# A missing value does not compare equal to 0, so patients lacking any of
# these values are excluded too.
all_tests_zero = (pat_sum[test_drug_cols] == 0).all(axis=1)
non_users = pat_sum[all_tests_zero]

# The same patients with the 'Test-' columns removed.
PN_pats = non_users.drop(test_drug_cols, axis=1)

In [15]:
#PN_pats.to_excel('/home/will/HIVTropism/PN_pats.xlsx')

In [20]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if its origin is trusted.
# Open in binary mode: pickle streams are bytes, and text mode can corrupt
# them on some platforms (and fails outright on Python 3).
with open('/home/will/HIVTropism/trop_dict.pkl', 'rb') as handle:
    trop_data = pickle.load(handle)
# Materialise the (key, value) pairs so the next cell can iterate over them.
tmp = list(trop_data.items())

In [17]:
for key, val in tmp:
    if key.startswith('A0'):
        print key, val


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-3885af77b414> in <module>()
      1 for key, val in trop_data.items():
----> 2     if key.startswith('A0'):
      3         print key, val

AttributeError: 'numpy.int64' object has no attribute 'startswith'

In [27]:
# Number of PN_pats rows whose 'VisitNum' value (the per-patient count
# produced by agg_dict) is at least 3.
(PN_pats['VisitNum']>=3).sum()


Out[27]:
105

In [25]:
# Total number of rows (one per patient) in PN_pats.
len(PN_pats)


Out[25]:
275

In [ ]: