In [307]:
## Simulating an EKG by application 
## of the concept of Fourier Series
## http://en.wikipedia.org/wiki/Fourier_series

from numpy import sin

x = np.linspace(0,100,1000)

### consitituent waves ###
# might I suggest: if you want to turn off one of the sine waves,
# just set its a variable to 0

#this one is big and sets the overall waveform
a3=10
f3 = .5
y3 =  a3*sin(f3*x)
#second biggest wave
a2 = 5
f2 = 2
y2 = a2*sin(f2*x)
#this one didn't seem useful, so I ditched it using the a=0 technique
a1=0 #1
f1=7
y1 = a1*sin(f1*x)
#this one adds periodic noise.
a4 = .5
f4 = 20
y4 =a4*sin(f4*x) 

#random noise
n = len(x)
random_noise = np.random.rand(n)*np.random.randint(0,10,n)

# ekg signal
EKG = y1+y2+y3+y4+random_noise

#plot the complete ‘EKG'
plt.plot(x, EKG,'b',lw=3)

#plot the component waves
plt.plot(x, y4, 'm')
plt.plot(x, y3, 'r')
plt.plot(x, y2, 'y')
plt.plot(x, y1, 'g')
plt.plot(x, random_noise, 'c')


plt.show()

Non-Bass Way to do peak detect

requires installation of pypeaks (https://github.com/gopalkoduri/pypeaks)


In [177]:
import pypeaks as pypk
import pickle
import sys
import os
import os.path as path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from matplotlib import colors
import seaborn as sns
sns.set_style("whitegrid")


'''
data = pickle.load(file("./sample-histogram.pickle"))
#print data
hist = pypk.Data(data[0], data[1])
'''
#ji_intervals = pickle.load(file('./ji-intervals.pickle'))
#print ji_intervals

#pypk.Data.get_peaks??
#pd.DataFrame.from_csv(path, header=0, sep=',', index_col=0, parse_dates=True, 
#                        encoding=None, tupleize_cols=False, infer_datetime_format=False)
all_data = pd.DataFrame.from_csv('p01_r01_f_2.txt', header=[0,1], sep="\t")
data = all_data[0.5:]

#data.plot()
#plt.show()
data.head(2)


Out[177]:
Elapsed time iltered
(seconds) (mV)
0.500 0.040
0.502 0.035
Signature: hist.get_peaks(method='slope', peak_amp_thresh=5e-05, 
                          valley_thresh=3e-05, intervals=None, lookahead=20, avg_interval=100)
Docstring:
This function expects SMOOTHED histogram. If you run it on a raw histogram,
there is a high chance that it returns no peaks.

method can be interval/slope/hybrid.
    The interval-based method simply steps through the whole histogram
    and pick up the local maxima in each interval, from which irrelevant
    peaks are filtered out by looking at the proportion of points on 
    either side of the detected peak in each interval, and by applying
    peal_amp_thresh and valley_thresh bounds.

    Slope approach uses, of course slope information, to find peaks, 
    which are then filtered by applying peal_amp_thresh and 
    valley_thresh bounds. 

    Hybrid approach combines the peaks obtained using slope method and
    interval-based approach. It retains the peaks/valleys from slope method
    if there should be a peak around the same region from each of the methods.

peak_amp_thresh is the minimum amplitude/height that a peak should have
in a normalized smoothed histogram, to be qualified as a peak. 
valley_thresh is viceversa for valleys!

If the method is interval/hybrid, then the intervals argument must be passed
and it should be an instance of Intervals class.

If the method is slope/hybrid, then the lookahead and avg_window
arguments should be changed based on the application. 
They have some default values though.

The method stores peaks in self.peaks in the following format:
{"peaks":[[peak positions], [peak amplitudes]],
"valleys": [[valley positions], [valley amplitudes]]}

In [186]:
#things you can change
smooth_amount = 9
peak_amp_threshold = 3e-05
valley_amp_threshold = -2e-05

Extract Peaks and Valleys from Data


In [187]:
### Get peaks and valleys from data and make dataframes

hist = pypk.Data(data.index, data.values, smoothness=smooth_amount)
hist.get_peaks(method='slope', peak_amp_thresh=peak_amp_threshold, 
               valley_thresh=valley_amp_threshold)
valleys, peaks = hist.peaks.items()

def flatten_list_of_iterables(iterable_of_iterables):
    """
    iterable_of_iterables is an iterable of single-value iterables
    
    some of the data is a list or array of single-value lists or arrays
    this flattens them by iterating thru the iterable of arrays/lists 
    and append the first (and only) element from each into a new 1-D array.
    """
    return np.array([iterable[0] for iterable in iterable_of_iterables])

vx, vy = valleys[1]
px, py = peaks[1]
vy = flatten_list_of_iterables(vy)
py = flatten_list_of_iterables(py)

peaks_df = pd.DataFrame(data=zip(px,py), columns=["t","A"]).sort('t')
valleys_df = pd.DataFrame(data=zip(vx,vy), columns=["t","A"]).sort('t')
valleys_df.to_csv('valleys.csv')
peaks_df.to_csv('peaks.csv')

In [194]:
# plot peaks and valleys over original data
plt.plot(data.index, data.values)
plt.plot(hist.x, hist.y)
plt.plot(vx, vy, "o",c="tomato")
plt.plot(px, py, "o",c="orchid")
plt.show()

In [180]:
def map_event_time_to_amp(combined):
    event_times = []
    orig_amps = []
    types = []
    for k,df in combined.T.iteritems():

        time = float(df['t'])
        event_times.append(time)

        amp = data.loc[time].values
        orig_amps.append(amp)

        feature_type = df['type']
        types.append(feature_type)

    amps = flatten_list_of_iterables(orig_amps)
    temp_data = list(zip(event_times, amps, types))
    index_list = range(len(event_times))
    pv_orig_amps = pd.DataFrame(data = temp_data, index = index_list, 
                                columns=['t','A','type'])
    return pv_orig_amps

peaks_df['type'] = 'p'
valleys_df['type'] = 'v'
combined = pd.concat([peaks_df,valleys_df]).sort('t')
orig_amps = map_event_time_to_amp(combined)
orig_amps.head(3)


Out[180]:
t A type
0 0.500 0.040 v
1 0.552 0.055 p
2 0.644 -0.010 v

In [184]:
# plot peaks and valleys over original data
plt.plot(data.index, data.values)
plt.plot(orig_amps.t, orig_amps.A, "o")
plt.show()

In [45]:
# see the distribution of peaks
# if the R peaks are much larger than the other peaks,
# you can use calc_R_peak_lower_thresh to find the
# lower bound of the R peaks
# otherwise you may need to set 
# the variable R_peak_lower_bound (in the cell below) yourself.
groupby_type = orig_amps.groupby('type')
p = groupby_type.get_group('p')
plt.hist(p['A'].values,2)
plt.hist(p['A'].values)
plt.show()

In [75]:
import copy
# separate peaks from valleys
groupby_type = orig_amps.groupby('type')
p, v = [groupby_type.get_group(i) for i in groupby_type.groups]

def calc_R_peak_lower_thresh(peak_amplitudes):
    #now find R features using the np.histogram
    two_bin =  np.histogram(peak_amplitudes, 2)
    bin_edges = two_bin[1]
    mid = bin_edges[1]
    return mid

# you may need to set this yourself.
R_peak_lower_bound = calc_R_peak_lower_thresh(p['A'])

# separate out the R peaks, 
# all other peaks goes in other_peaks
# some are T peaks, some are noise
R_peaks = copy.deepcopy(p[p['A'] >= R_peak_lower_bound])
other_peaks = copy.deepcopy(p[p['A'] <= R_peak_lower_bound])
all_valleys = copy.deepcopy(v)

In [164]:
# check that R peaks were properly id'ed 
# (circles marking R peaks should now be different color from other peaks)
plt.plot(data.index, data.values, 'r', c='deepskyblue')
plt.plot(orig_amps.t, orig_amps.A, "o", c='cadetblue')
plt.plot(R_peaks.t, R_peaks.A, "o",c='teal')
plt.plot(all_valleys.t, all_valleys.A, "o",c='royalblue')
plt.plot(other_peaks.t, other_peaks.A, "o", c="turquoise")
plt.show()

In [121]:
#label R peaks
R_peaks.loc[:,'feature'] = 'R'

recombined = pd.concat([R_peaks, other_peaks, all_valleys]).fillna('')
sort_by_time = recombined.sort("t")
sort_by_time_T = sort_by_time.T

sort_by_time.head()


Out[121]:
t A type feature
0 0.500 0.040 v
1 0.552 0.055 p
2 0.644 -0.010 v
3 0.646 -0.010 v
4 0.702 0.620 p R

In [196]:
s = pd.Series(index=sort_by_time.index)
T_ends = []
vs = []
QT_intervals = []
skipped = []
for i in R_peaks.t.index:
    
    #print sort_by_time.loc[i-1:i+4]#, sort_by_time.loc[i+3]
    
    #try to locate the Q feature for this complex
    #if not found, skip this complex
    prev = sort_by_time.loc[i-1]
    if prev['type'] == 'v' and prev['feature'] == "":
        Qt = prev['t']
    else:
        skipped.append(i-1)
        continue
    
    # starting at i+1 (one after the R peak)
    # iterate until next peak (T feature) is found 
    while sort_by_time.loc[i+1]['type'] != 'p':
        i += 1
    T = sort_by_time.loc[i]['t']
    
    # starting one after i (b/c i is a peak)
    # iterate until next valley is found
    # optimally, this will be a valley 
    # immediately after T, but it doesn't matter that much
    # unless that data has a linear trend
    while sort_by_time.loc[i+1]['type'] != 'v':
        i += 1
    v = sort_by_time.loc[i]['t']
    
    if Q and v:
        
        T_end = v+ (v-T)/2
    else:
        continue

    QT_intervals.append(T_end-Qt)
    T_ends.append(T_end)
    vs.append(v)
        


T_ends


Out[196]:
[1.012,
 1.758,
 2.569,
 3.5119999999999996,
 4.4370000000000003,
 5.3669999999999991,
 6.2799999999999994,
 7.1910000000000007,
 8.0510000000000002,
 8.9000000000000021,
 9.7319999999999993,
 10.546000000000001,
 11.310000000000002,
 12.035,
 12.760000000000002,
 13.510999999999999,
 14.227,
 15.004999999999999,
 15.773999999999999,
 16.585000000000001,
 17.396999999999998,
 18.260000000000002,
 19.106999999999999,
 19.966999999999999]

In [198]:
# check that R peaks were properly id'ed 
# (circles marking R peaks should now be different color from other peaks)
plt.plot(data.index, data.values, 'r', c='deepskyblue')
plt.plot(orig_amps.t, orig_amps.A, "o", c='cadetblue')
plt.plot(R_peaks.t, R_peaks.A, "o",c='teal')
plt.plot(all_valleys.t, all_valleys.A, "o",c='royalblue')
plt.plot(other_peaks.t, other_peaks.A, "o", c="turquoise")

ymin, ymax = plt.ylim()
plt.vlines(vs,ymin, ymax, color='orange')
plt.vlines(T_ends,ymin, ymax, color='orchid')

#plt.plot()
plt.show()

Requires installation of PyBursts (https://github.com/rpoddighe/pybursts)


In [150]:
import pybursts
#this doesn't group entire QRST complex, but it does seem to get the QRS complex pretty well (h =1)

bursts = pybursts.kleinberg(combined.t, s=2, gamma=.001)
#print bursts
#b = [i[1:] for i in bursts]
bursts_df =  pd.DataFrame(data = bursts, index = range(len(bursts)), columns=['h','start','end'])
print len(bursts_df)
bursts_df.head()


211
Out[150]:
h start end
0 0 0.002 19.996
1 1 0.184 0.246
2 2 0.244 0.246
3 3 0.244 0.246
4 4 0.244 0.246

In [312]:
ymin, ymax = plt.ylim()
maxh = max(bursts_df['h'])
sorted_bursts = bursts_df.sort("h")
"""
for r,c in zip(sorted_bursts.iterrows(),colors.cnames):#zip(bursts_df.start, bursts_df.end, colors.cnames):#bursts_df.iterrows():
    #print r
    h,s,e = r[1]
    #print h
    if h != 1:
        continue
    plt.vlines(s, ymin, ymax, color=c, lw=maxh-h)
    plt.vlines(e, ymin, ymax, color=c, lw=maxh-h)
    plt.hlines(ymin, s,e, color=c, lw=maxh-h)
    plt.hlines(ymax, s, e, color=c, lw=maxh-h)
    #break"""
plt.plot(data.index, data.values,'y', label = "original")
plt.plot(hist.x,hist.y, "k",label="smooth", lw=2)
plt.plot(vx, vy ,"ro", label="valleys")
plt.plot(px, py, "co", label="peaks")

plt.xlim(5,10)
plt.legend()
plt.show()

In [15]:
import pybursts

offsets = [4, 17, 23, 27, 33, 35, 37, 76, 77, 82, 84, 88, 90, 92]
print pybursts.kleinberg(offsets, s=2, gamma=0.1)


[[0 4 92]
 [1.0 33 37]
 [1.0 76 92]
 [2.0 76 77]]

Using BASS


In [9]:
for key in ['Mean1']:
    #represent what a complete feature looks like for this particular ECG
    # s = burst start, e = burst end, v = valley, p=peak
    features_key = 'vspevspe'
    ''
    
    #count number of peaks and valleys in the features key
    peak_count = features_key.count('p')
    valley_count = features_key.count('v')

    
    peaks_df = Results['Peaks'][key].dropna() # R and T wave locations
    valleys_df = Results['Valleys'][key].dropna() # Q and S wave locations
    
    # make bursts_df which contains 'Burst Start' vals and 'Burst End' vals
    try:
        burst_ends = Results['Bursts'][key]['Burst End']
        burst_starts = Results['Bursts'][key]['Burst Start']
        bursts_df = pd.concat([burst_starts, burst_ends], axis=1)
        
        bursts_df.columns = ['Start','End']
    except:
        print "No bursts."
        
    #get peak and valley times
    print (peaks_df.head())
    peak_time = peaks_df.index.values
    valley_time = valleys_df.index.values
    
print peaks_df["Peaks Amplitude"].head()
n, pEdges = np.histogram(peaks_df["Peaks Amplitude"],bins=peak_count)
n, vEdges = np.histogram(valleys_df,bins=valley_count)
print pEdges, vEdges


       Peaks Amplitude  Intervals
Time                             
0.704            0.625      0.234
0.938            0.175      0.516
1.454            0.595      0.220
1.674            0.175      0.594
2.268            0.735      0.222
Time
0.704    0.625
0.938    0.175
1.454    0.595
1.674    0.175
2.268    0.735
Name: Peaks Amplitude, dtype: float64
0.175    4
0.620    4
0.210    3
0.550    3
0.165    3
0.650    2
0.195    2
0.610    2
0.575    1
0.615    1
0.690    1
0.150    1
0.285    1
0.595    1
0.495    1
0.180    1
0.490    1
0.600    1
0.160    1
0.670    1
0.245    1
0.655    1
0.215    1
0.200    1
0.795    1
0.185    1
0.480    1
0.225    1
0.325    1
0.735    1
0.190    1
0.625    1
dtype: int64
[ 0.15    0.4725  0.795 ] [-0.235 -0.135 -0.035]

In [ ]:

Getting Q, R, S, and T in the DataFrame and Ordered according to Time

Once you can see that Q, R, S, and T are being consistently detected (T should be marked by the burst ends) you can do the below process to label them.


In [193]:
peaks_df = Results['Peaks']['Mean1'] # R wave location
valleys_df = Results['Valleys']['Mean1'] # Q and S wave locations
burst_ends = Results['Bursts']['Mean1']['Burst End'] #burst ends indicate approx. T wave location

# make new DataFrame for Burst Ends which is indexed by Burst Ends 
# b.c. Results['Bursts']['Mean1'] is indexed by Burst Start.
bursts_df = pd.DataFrame(burst_ends.values, index=burst_ends.values, columns=['Burst Ends'])

#combine peaks and valleys dfs and sort by time
# this will allow us to easily find peaks, 
# and find valleys before and after.
peaks_df['time'] = peaks_df.index
valleys_df['time'] = valleys_df.index
bursts_df['time'] = bursts_df.index
peaks_df['amplitude'] = peaks_df["Peaks Amplitude"]
valleys_df['amplitude'] = valleys_df["Valley Amplitude"]
bursts_df['amplitude'] = float(Settings['Threshold'])

#Label R and T. Unfortunatly, setting Q and S (the valleys) is much harder (See next cell).
peaks_df['letter'] = 'R'
bursts_df['letter'] = "T"

combined = peaks_df.append(valleys_df).sort('time')

#now combine w. Burst Ends and remove unnecessary columns, sort by time again
times = combined.append(bursts_df).drop([u'Burst Ends', u'Intervals', u'Peaks Amplitude', u'Valley Amplitude'],axis=1)
sorted_times = times.sort('time')
#remove na vals only from amplitude col.
sorted_times['amplitude'] = sorted_times['amplitude']
sorted_times.head()


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-193-30ee97772687> in <module>()
      1 peaks_df = Results['Peaks']['Mean1'] # R wave location
      2 valleys_df = Results['Valleys']['Mean1'] # Q and S wave locations
----> 3 burst_ends = Results['Bursts']['Mean1']['Burst End'] #burst ends indicate approx. T wave location
      4 
      5 # make new DataFrame for Burst Ends which is indexed by Burst Ends

KeyError: 'Mean1'

The following is a hack-y way to set the Q and S valleys, but it works. Later I might try to do it with booleans. Feel free to try another way to do this!!! (Actually, really, please do. This method is pretty slow.)


In [165]:
#reindex by integer
sorted_times['num'] = range(len(sorted_times))
sorted_times_reindexed = sorted_times.set_index('num')

letters = pd.DataFrame(index=sorted_times_reindexed.index, columns=['letters'], dtype=str)
for i in sorted_times_reindexed.index:
    
    #try to get the previous and next values with error handeling for edge cases
    try:
        prev_value = sorted_times_reindexed.loc[i-1, "letter"]
    except:
        prev_value=''
    try:
        next_value = sorted_times_reindexed.loc[i+1,"letter"]
    except:
        next_value=""
    
    #letters['time'][i] = sorted_times_reindexed.loc[i,'time']
    if (next_value == "T") and (prev_value == "R"):
        letters['letters'][i] = "S"
    elif next_value == "R":
        letters['letters'][i] = "Q"
    else:
        letters['letters'][i] = sorted_times_reindexed.loc[i,'letter']
        
sorted_times_reindexed['letter'] = letters['letters']
sorted_times_reindexed.head(13) #see that valleys which are not immediately before or after R are left as NaN!


Out[165]:
amplitude letter time
num
0 -0.487 Q 0.12950
1 1.410 R 0.13475
2 -0.883 S 0.13825
3 0.030 T 0.15575
4 -0.455 Q 0.30125
5 1.514 R 0.30700
6 -0.934 S 0.31050
7 0.030 T 0.32575
8 -0.376 Q 0.47350
9 1.404 R 0.47900
10 -1.003 S 0.48225
11 0.030 T 0.49500
12 -0.116 NaN 0.59850

In [147]:
sorted_times_time_indexed = sorted_times_reindexed.set_index('time')
final = sorted_times_time_indexed.dropna()
final.head()


Out[147]:
amplitude letter
time
0.12950 -0.487 Q
0.13475 1.410 R
0.13825 -0.883 S
0.15575 0.030 T
0.30125 -0.455 Q

In [153]:
s1=0
e1=20
final[s1:e1].plot()
plt.show()