In [1]:
import pandas as pd
import sys
import os, os.path

sys.path.append('/home/will/PatientPicker/')

In [2]:
import LoadingTools

In [3]:
redcap_data = LoadingTools.load_redcap_data().set_index(['Patient ID', 'VisitNum'])
cyto_data = pd.read_csv('/home/will/HIVSystemsBio/NewCytokineAnalysis/CytoRawData.csv', sep='\t')
cyto_data['HasCyto'] = True
has_cyto = cyto_data.groupby(['Patient ID', 'VisitNum'])[['HasCyto']].all()

In [17]:
cols = ['Psychomotor Speed Score',
 'Memory Recall Score',
 'Constructional Score',
 'TMHDS']
redcap_data['Psychomotor Speed Score'].unique()


Out[17]:
array([ nan,  0. ,  1. ,  6. ,  3. ,  5. ,  4. ,  2. ,  5.5,  3.5])

In [4]:
import glob
files = glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*.fasta')
seqs = []
for f in files:
    fname = f.split('/')[-1]
    try:
        pid, vn, prot = fname.split('.')[0].split('-', 2)
    except ValueError:
        print fname
    seqs.append((pid, vn, prot, 1))
    
df = pd.DataFrame(seqs, columns = ['Patient ID', 'VisitNum', 'Prot', 'HasSeq'])
has_seq = pd.pivot_table(df, rows = ['Patient ID', 'VisitNum'], cols = 'Prot', values='HasSeq')

In [5]:
import sys
sys.path.append('/home/will/PySeqUtils/')
import GeneralSeqTools

with open('/home/will/DrugStuff/pat_data.fasta') as handle:
    seqs = list(GeneralSeqTools.fasta_reader(handle))
    out = GeneralSeqTools.WebPSSM_V3_fasta(seqs)

In [6]:
tmp = []
for row in out:
    parts = row[0].split('-')
    if len(parts) == 2:
        pat, vnum = parts
    else:
        pat, vnum, _ = parts
    tmp.append({'Patient ID':pat,
                'VisitNum':vnum,
                'IsR5':row[2]=='0',
                'IsX4':row[2]=='1',
                })
tropism = pd.DataFrame(tmp).groupby(['Patient ID', 'VisitNum']).first()

In [7]:
redcap_data = pd.merge(redcap_data, has_cyto,
                       left_index=True, right_index=True,
                       how='outer')
redcap_data = pd.merge(redcap_data, has_seq,
                       left_index=True, right_index=True,
                       how='outer')
redcap_data = pd.merge(redcap_data, tropism,
                       left_index=True, right_index=True,
                       how='outer')
redcap_data = redcap_data.drop(['VisitNum', 'Patient ID'], axis=1)

In [8]:
import numpy
def safe_sum(col):
    ncol = col.dropna()
    if len(ncol) == 0:
        return np.nan
    return ncol.sum()

def safe_col_apply_mean(col, func, indf):
    return indf[col].dropna().map(func).mean()

In [12]:
from functools import partial
seq_cols = list(has_seq.columns)
test_drug_cols = [col for col in redcap_data.columns if col.startswith('Test-')]
admit_drug_cols = [col for col in redcap_data.columns if col.startswith('Admit-')]
race_cols = [col for col in redcap_data.columns if col.startswith('Race-')]
gender_cols = ['Male', 'Female']
mean_cols = test_drug_cols+admit_drug_cols+race_cols+seq_cols+['HasCyto', 'IsR5', 'IsX4', 'Hepatitis C status (HCV)']
agg_dict = dict([(col, safe_sum) for col in mean_cols])
agg_dict['VisitNum'] = 'count'

cut_list = [('%LowVL', 'Latest viral load', 50),
              ('%LowCD4', 'Latest CD4 count (cells/uL)', 200)]

In [13]:
redcap_data['Male'] = redcap_data['Gender'] == 'Male'
redcap_data['Female'] = redcap_data['Gender'] == 'Female'
redcap_data[mean_cols] = redcap_data[mean_cols].applymap(float)
pat_sum = redcap_data.reset_index().groupby('Patient ID').agg(agg_dict)
for ncol, tcol, cut in cut_list:
    pat_sum[ncol] = (redcap_data[tcol]<cut).groupby(level='Patient ID').agg(mean)
pat_sum


Out[13]:
&ltclass 'pandas.core.frame.DataFrame'>
Index: 508 entries, A0001 to A0514
Data columns (total 37 columns):
Admit-Cocaine               507  non-null values
Admit-Cannabinoid           507  non-null values
IsR5                        60  non-null values
IsX4                        60  non-null values
Admit-Heroin                507  non-null values
Race-Indian                 507  non-null values
Admit-Narcotics             507  non-null values
Race-Black                  507  non-null values
Vpr                         57  non-null values
HasCyto                     113  non-null values
Test-Opiates                470  non-null values
V3                          60  non-null values
Hepatitis C status (HCV)    479  non-null values
Test-Cannabinoid            470  non-null values
Admit-None                  507  non-null values
Admit-Ritalin               507  non-null values
Admit-Benzodiazapine        507  non-null values
LTR                         454  non-null values
Test-Amphetamines           470  non-null values
Admit-Ecstasy               507  non-null values
Test-Phencyclidine          470  non-null values
Race-Multiple               507  non-null values
Race-Unknown                507  non-null values
Race-Hawaiian               507  non-null values
Admit-Amphetamines          507  non-null values
Race-White                  507  non-null values
Test-Barbiturates           470  non-null values
Race-Asian                  507  non-null values
VisitNum                    508  non-null values
Admit-Other                 507  non-null values
Admit-PCP                   507  non-null values
Test-Cocaine                470  non-null values
Test-Benzodiazepine         470  non-null values
Tat-1                       57  non-null values
Tat-2                       58  non-null values
%LowVL                      508  non-null values
%LowCD4                     508  non-null values
dtypes: float64(36), int64(1)

In [14]:
pat_sum[sorted(pat_sum.columns)].to_excel('/home/will/DrugStuff/large_pat_group.xlsx')

In [13]:
non_users = pat_sum[(pat_sum[test_drug_cols]==0).all(axis=1)]

PN_pats = non_users.drop(test_drug_cols, axis=1)

In [15]:
#PN_pats.to_excel('/home/will/HIVTropism/PN_pats.xlsx')

In [20]:
import pickle

with open('/home/will/HIVTropism/trop_dict.pkl') as handle:
    trop_data = pickle.load(handle)
tmp = list(trop_data.items())

In [17]:
for key, val in tmp:
    if key.startswith('A0'):
        print key, val


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-3885af77b414> in <module>()
      1 for key, val in trop_data.items():
----> 2     if key.startswith('A0'):
      3         print key, val

AttributeError: 'numpy.int64' object has no attribute 'startswith'

In [27]:
(PN_pats['VisitNum']>=3).sum()


Out[27]:
105

In [25]:


In [28]:



Out[28]:
[('A0017-R00', '1'),
 ('A0017-R02', '1'),
 ('A0017-R03', '1'),
 ('A0037-R01', '1'),
 ('A0041-R00', '1'),
 ('A0041-R00', '1'),
 ('A0041-R04', '1'),
 ('A0059-R01', '1'),
 ('A0067-R03', '1'),
 ('A0067-R04', '1'),
 ('A0089-R00', '1'),
 ('A0094-R03', '1'),
 ('A0094-R03', '1'),
 ('A0098-R00', '1'),
 ('A0107-R00', '1'),
 ('A0107-R02', '1'),
 ('A0107-R03', '1'),
 ('A0107-R04', '1'),
 ('A0107-R05', '1'),
 ('A0107-R07', '1'),
 ('A0110-R03', '1'),
 ('A0110-R04', '1'),
 ('A0110-R05', '1'),
 ('A0110-R06', '1'),
 ('A0132-R04', '1'),
 ('A0159-R00', '1'),
 ('A0200-R01', '1'),
 ('A0200-R02', '1'),
 ('A0200-R02', '1'),
 ('A0208-R00', '1'),
 ('A0208-R01', '1'),
 ('A0208-R02', '1'),
 ('A0213-R00', '1'),
 ('A0238-R00', '1'),
 ('A0242-R00', '1'),
 ('A0370-R00', '1'),
 ('A0403-R00', '1'),
 ('A0403-R01', '1')]

In [ ]: