In [1]:
import os, os.path
import sys
sys.path.append('/home/will/PatientPicker/')
import LoadingTools

In [2]:
import glob
import pandas as pd

files = glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta')
#redcap_data = LoadingTools.load_redcap_data()
has_ltr = []
for f in files:
    fname = f.rsplit('/',1)[-1]
    pid, vnum, _ = fname.split('-')
    has_ltr.append({
                    'PatientID':pid,
                    'VisitNum':vnum,
                    'HasLTR':'has_ltr'
                    })
has_ltr = pd.DataFrame(has_ltr).groupby(['PatientID', 'VisitNum']).first()

In [3]:
redcap_data = LoadingTools.load_redcap_data().groupby(['Patient ID', 'VisitNum']).first()

In [4]:
jean_comments = pd.read_csv('/home/will/HIVVariation/ProblemPCRpatientsamples.csv').groupby(['Patient ID', 'VisitNum']).first()

In [5]:
left, right = jean_comments.align(has_ltr, join='outer')
ltr_comments = left.copy()
ltr_comments['HasLTR'] = ltr_comments['HasLTR'].combine_first(right['HasLTR'])

In [6]:
red_data = pd.merge(ltr_comments, redcap_data,
                    left_index=True, 
                    right_index=True,
                    how='outer')
red_data


Out[6]:
&ltclass 'pandas.core.frame.DataFrame'>
MultiIndex: 1434 entries, (A0001, R00) to (A0514, R00)
Columns: 429 entries, HasLTR to HAART-Missing
dtypes: float64(149), object(280)

In [7]:
group_key = 'HasLTR'
check_cols = ['Latest CD4 count (cells/uL)', 'Latest CD8 count (cells/uL)', 'LVL']

In [8]:
red_data['LVL'].describe()


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-8-fabe921485c8> in <module>()
----> 1 red_data['LVL'].describe()

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1926         else:
   1927             # get column
-> 1928             return self._get_item_cache(key)
   1929 
   1930     def _getitem_slice(self, key):

/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
    568             return cache[item]
    569         except Exception:
--> 570             values = self._data.get(item)
    571             res = self._box_item_values(item, values)
    572             cache[item] = res

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in get(self, item)
   1381 
   1382     def get(self, item):
-> 1383         _, block = self._find_block(item)
   1384         return block.get(item)
   1385 

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in _find_block(self, item)
   1523 
   1524     def _find_block(self, item):
-> 1525         self._check_have(item)
   1526         for i, block in enumerate(self.blocks):
   1527             if item in block:

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in _check_have(self, item)
   1530     def _check_have(self, item):
   1531         if item not in self.items:
-> 1532             raise KeyError('no item named %s' % com.pprint_thing(item))
   1533 
   1534     def reindex_axis(self, new_axis, method=None, axis=0, copy=True):

KeyError: u'no item named LVL'

In [10]:
from statsmodels.graphics.boxplots import violinplot
import numpy as np
red_data['LVL'] = red_data['Latest viral load'].map(np.log10)
fig, axs = plt.subplots(3,1, figsize=(10,10))

for col, ax in zip(check_cols, axs.flatten()):
    boxes = []
    labels = []
    for key, group in red_data.groupby(group_key):
        labels.append(key)
        print key, len(group[col].dropna().unique())
        boxes.append(group[col].dropna())
    #iolinplot(boxes, ax=ax, labels=labels)
    ax.boxplot(boxes)


Blank or poor peaks 4
Few failed traces 32
Many failed traces 19
Many traces 6
Mostly blank traces 2
Several failed traces 30
has_ltr 618
Blank or poor peaks 1
Few failed traces 16
Many failed traces 13
Many traces 2
Mostly blank traces 1
Several failed traces 19
has_ltr 675
Blank or poor peaks 4
Few failed traces 15
Many failed traces 11
Many traces 4
Mostly blank traces 2
Several failed traces 13
has_ltr 364

In [ ]:
list(red_data.columns)

In [ ]: