In [1]:
import pandas as pd
import sys
import os, os.path
# Extend the module search path with the PatientPicker checkout; presumably
# this is where the LoadingTools module imported in the next cell lives.
sys.path += ['/home/will/PatientPicker/']
In [2]:
import LoadingTools
In [3]:
# Records returned by LoadingTools.load_redcap_data(), indexed by patient and visit.
redcap_raw = LoadingTools.load_redcap_data()
redcap_data = redcap_raw.set_index(['Patient ID', 'VisitNum'])

# Cytokine measurements; the file is tab-separated despite its .csv extension.
cyto_data = pd.read_csv(
    '/home/will/HIVSystemsBio/NewCytokineAnalysis/CytoRawData.csv',
    sep='\t',
)

# Tag every cytokine row, then collapse to a single one-column frame holding
# one flag per (patient, visit) pair that appears in the cytokine file.
cyto_data['HasCyto'] = True
cyto_by_visit = cyto_data.groupby(['Patient ID', 'VisitNum'])
has_cyto = cyto_by_visit[['HasCyto']].all()
In [19]:
# Per-patient age: take the first 'Age' value recorded for each patient in
# the (tab-separated) cytokine patient table, then report the age range.
tmp = pd.read_csv('/home/will/HIVSystemsBio/NewCytokineAnalysis/CytoPatData.csv', sep='\t')
ages = tmp.groupby('Patient ID')['Age'].first()
# Formatted through a single string so the output ("<max> <min>") is the same
# under the Python 2 print statement and the Python 3 print function.
print('%s %s' % (ages.max(), ages.min()))
In [4]:
import glob
# Scan the FASTA directory and build a (patient, visit) x protein table that
# flags which sequences exist on disk.
files = glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*.fasta')
seqs = []
for f in files:
    fname = f.split('/')[-1]
    try:
        # Names are parsed as <PatientID>-<VisitNum>-<Prot>.fasta; maxsplit=2
        # lets the protein label itself contain '-'.
        pid, vn, prot = fname.split('.')[0].split('-', 2)
    except ValueError:
        # Report the unparseable name and skip it. Without the `continue` the
        # append below would re-record the previous file's (pid, vn, prot),
        # or raise NameError if the very first file is malformed.
        print(fname)
        continue
    seqs.append((pid, vn, prot, 1))

df = pd.DataFrame(seqs, columns=['Patient ID', 'VisitNum', 'Prot', 'HasSeq'])
# `index=`/`columns=` replace the removed `rows=`/`cols=` keywords
# (accepted by pandas >= 0.14). Cells are 1 where a file exists, NaN otherwise.
has_seq = pd.pivot_table(df, index=['Patient ID', 'VisitNum'], columns='Prot', values='HasSeq')
In [5]:
# Outer-join the availability tables (cytokine flag, then per-protein sequence
# flags) onto the records by index, so visits present in any of the three
# sources are kept.
for flag_table in (has_cyto, has_seq):
    redcap_data = pd.merge(
        redcap_data, flag_table,
        how='outer',
        left_index=True, right_index=True,
    )

# NOTE(review): 'Patient ID' and 'VisitNum' were turned into index levels when
# redcap_data was loaded; confirm they also exist as columns at this point,
# since drop() is asked to remove them as columns here.
redcap_data = redcap_data.drop(['VisitNum', 'Patient ID'], axis=1)
In [6]:
import numpy
def safe_mean(col):
    """Return the mean of the non-missing values in ``col``.

    Parameters
    ----------
    col : pandas.Series
        Values to average; missing entries are dropped first.

    Returns
    -------
    float
        The mean of the remaining values, or NaN when nothing remains
        after dropping missing entries.
    """
    ncol = col.dropna()
    if len(ncol) == 0:
        # The cell imports ``numpy`` (not ``np``), so the module has to be
        # referenced by its full name to avoid a NameError on this path.
        return numpy.nan
    return ncol.mean()
def safe_col_apply_mean(col, func, indf):
    """Average ``func`` applied to the non-missing entries of one column.

    Parameters
    ----------
    col : column label
        Column of ``indf`` to read.
    func : callable
        Applied element-wise (via ``Series.map``) to each non-missing value.
    indf : pandas.DataFrame
        Frame that contains ``col``.

    Returns
    -------
    The mean of the mapped values.
    """
    present = indf[col].dropna()
    mapped = present.map(func)
    return mapped.mean()
In [7]:
from functools import partial
# Protein columns produced by the sequence pivot.
seq_cols = list(has_seq.columns)

# Column groups in the merged frame are picked out by their name prefix.
all_cols = list(redcap_data.columns)
test_drug_cols = [name for name in all_cols if name.startswith('Test-')]
admit_drug_cols = [name for name in all_cols if name.startswith('Admit-')]
race_cols = [name for name in all_cols if name.startswith('Race-')]

mean_cols = test_drug_cols + admit_drug_cols + seq_cols + race_cols

# Aggregation spec: NaN-tolerant mean for every selected column, plus a count
# of the 'VisitNum' entries.
agg_dict = {}
for name in mean_cols:
    agg_dict[name] = safe_mean
agg_dict['VisitNum'] = 'count'

# (new column name, source column, threshold) triples.
cut_list = [
    ('%LowVL', 'Latest viral load', 50),
    ('%LowCD4', 'Latest CD4 count (cells/uL)', 200),
]
In [8]:
# Collapse the per-visit records to one row per patient using agg_dict.
pat_sum = redcap_data.reset_index().groupby('Patient ID').agg(agg_dict)

# For each (new column, source column, threshold) triple, store the fraction
# of a patient's visits whose value is strictly below the threshold.
for ncol, tcol, cut in cut_list:
    below_cut = redcap_data[tcol] < cut
    # The original passed a bare `mean` to .agg(), a name that is never
    # defined or imported in this notebook; the groupby's own mean() computes
    # the same per-patient average without relying on an injected namespace.
    pat_sum[ncol] = below_cut.groupby(level='Patient ID').mean()
pat_sum
Out[8]:
In [11]:
# Patient IDs of interest.
twpats = [
    'A0014', 'A0037', 'A0041', 'A0052', 'A0071',
    'A0110', 'A0128', 'A0278', 'A0281', 'A0365',
]
# Show the LTR and V3 columns for those patients. `.ix` no longer exists in
# pandas; reindex() performs the same label lookup and, like `.ix` did for a
# list of labels, yields an all-NaN row for an ID that is absent from pat_sum.
pat_sum.reindex(twpats)[['LTR', 'V3']]
Out[11]:
In [13]:
# Keep patients whose every 'Test-' column averages exactly 0 across visits,
# then drop those (now constant) columns from the resulting table.
all_tests_zero = (pat_sum[test_drug_cols] == 0).all(axis=1)
non_users = pat_sum[all_tests_zero]
PN_pats = non_users.drop(test_drug_cols, axis=1)
In [15]:
# Excel export of PN_pats, left disabled; uncomment to (re)write the file.
#PN_pats.to_excel('/home/will/HIVTropism/PN_pats.xlsx')
In [20]:
import pickle
# Load the pickled tropism dictionary and snapshot its (key, value) pairs.
# NOTE: pickle.load can execute arbitrary code while unpickling; only use it
# on files from a trusted source.
# The file is opened in binary mode: pickle data is bytes, and text mode fails
# on Python 3 (and can corrupt the stream on platforms that translate newlines).
with open('/home/will/HIVTropism/trop_dict.pkl', 'rb') as handle:
    trop_data = pickle.load(handle)
tmp = list(trop_data.items())
In [17]:
# List the entries whose key starts with 'A0'.
for key, val in tmp:
    if key.startswith('A0'):
        # Single pre-formatted string: prints "<key> <val>" under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (key, val))
In [27]:
# Number of patients in PN_pats whose 'VisitNum' count is at least 3.
has_three_visits = PN_pats['VisitNum'] >= 3
has_three_visits.sum()
Out[27]:
In [25]:
# Total number of rows (patients) in PN_pats.
PN_pats.shape[0]
Out[25]:
In [ ]: