In [307]:
## Simulating an EKG by application
## of the concept of Fourier Series
## http://en.wikipedia.org/wiki/Fourier_series
from numpy import sin
x = np.linspace(0,100,1000)
### consitituent waves ###
# might I suggest: if you want to turn off one of the sine waves,
# just set its a variable to 0
#this one is big and sets the overall waveform
a3=10
f3 = .5
y3 = a3*sin(f3*x)
#second biggest wave
a2 = 5
f2 = 2
y2 = a2*sin(f2*x)
#this one didn't seem useful, so I ditched it using the a=0 technique
a1=0 #1
f1=7
y1 = a1*sin(f1*x)
#this one adds periodic noise.
a4 = .5
f4 = 20
y4 =a4*sin(f4*x)
#random noise
n = len(x)
random_noise = np.random.rand(n)*np.random.randint(0,10,n)
# ekg signal
EKG = y1+y2+y3+y4+random_noise
#plot the complete ‘EKG'
plt.plot(x, EKG,'b',lw=3)
#plot the component waves
plt.plot(x, y4, 'm')
plt.plot(x, y3, 'r')
plt.plot(x, y2, 'y')
plt.plot(x, y1, 'g')
plt.plot(x, random_noise, 'c')
plt.show()
In [177]:
import pypeaks as pypk
import pickle
import sys
import os
import os.path as path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from matplotlib import colors
import seaborn as sns
sns.set_style("whitegrid")
'''
data = pickle.load(file("./sample-histogram.pickle"))
#print data
hist = pypk.Data(data[0], data[1])
'''
#ji_intervals = pickle.load(file('./ji-intervals.pickle'))
#print ji_intervals
#pypk.Data.get_peaks??
#pd.DataFrame.from_csv(path, header=0, sep=',', index_col=0, parse_dates=True,
# encoding=None, tupleize_cols=False, infer_datetime_format=False)
all_data = pd.DataFrame.from_csv('p01_r01_f_2.txt', header=[0,1], sep="\t")
data = all_data[0.5:]
#data.plot()
#plt.show()
data.head(2)
Out[177]:
Signature: hist.get_peaks(method='slope', peak_amp_thresh=5e-05,
valley_thresh=3e-05, intervals=None, lookahead=20, avg_interval=100)
Docstring:
This function expects SMOOTHED histogram. If you run it on a raw histogram,
there is a high chance that it returns no peaks.
method can be interval/slope/hybrid.
The interval-based method simply steps through the whole histogram
and pick up the local maxima in each interval, from which irrelevant
peaks are filtered out by looking at the proportion of points on
either side of the detected peak in each interval, and by applying
peal_amp_thresh and valley_thresh bounds.
Slope approach uses, of course slope information, to find peaks,
which are then filtered by applying peal_amp_thresh and
valley_thresh bounds.
Hybrid approach combines the peaks obtained using slope method and
interval-based approach. It retains the peaks/valleys from slope method
if there should be a peak around the same region from each of the methods.
peak_amp_thresh is the minimum amplitude/height that a peak should have
in a normalized smoothed histogram, to be qualified as a peak.
valley_thresh is viceversa for valleys!
If the method is interval/hybrid, then the intervals argument must be passed
and it should be an instance of Intervals class.
If the method is slope/hybrid, then the lookahead and avg_window
arguments should be changed based on the application.
They have some default values though.
The method stores peaks in self.peaks in the following format:
{"peaks":[[peak positions], [peak amplitudes]],
"valleys": [[valley positions], [valley amplitudes]]}
In [186]:
#things you can change
smooth_amount = 9
peak_amp_threshold = 3e-05
valley_amp_threshold = -2e-05
In [187]:
### Get peaks and valleys from data and make dataframes
hist = pypk.Data(data.index, data.values, smoothness=smooth_amount)
hist.get_peaks(method='slope', peak_amp_thresh=peak_amp_threshold,
valley_thresh=valley_amp_threshold)
valleys, peaks = hist.peaks.items()
def flatten_list_of_iterables(iterable_of_iterables):
"""
iterable_of_iterables is an iterable of single-value iterables
some of the data is a list or array of single-value lists or arrays
this flattens them by iterating thru the iterable of arrays/lists
and append the first (and only) element from each into a new 1-D array.
"""
return np.array([iterable[0] for iterable in iterable_of_iterables])
vx, vy = valleys[1]
px, py = peaks[1]
vy = flatten_list_of_iterables(vy)
py = flatten_list_of_iterables(py)
peaks_df = pd.DataFrame(data=zip(px,py), columns=["t","A"]).sort('t')
valleys_df = pd.DataFrame(data=zip(vx,vy), columns=["t","A"]).sort('t')
valleys_df.to_csv('valleys.csv')
peaks_df.to_csv('peaks.csv')
In [194]:
# plot peaks and valleys over original data
plt.plot(data.index, data.values)
plt.plot(hist.x, hist.y)
plt.plot(vx, vy, "o",c="tomato")
plt.plot(px, py, "o",c="orchid")
plt.show()
In [180]:
def map_event_time_to_amp(combined):
event_times = []
orig_amps = []
types = []
for k,df in combined.T.iteritems():
time = float(df['t'])
event_times.append(time)
amp = data.loc[time].values
orig_amps.append(amp)
feature_type = df['type']
types.append(feature_type)
amps = flatten_list_of_iterables(orig_amps)
temp_data = list(zip(event_times, amps, types))
index_list = range(len(event_times))
pv_orig_amps = pd.DataFrame(data = temp_data, index = index_list,
columns=['t','A','type'])
return pv_orig_amps
peaks_df['type'] = 'p'
valleys_df['type'] = 'v'
combined = pd.concat([peaks_df,valleys_df]).sort('t')
orig_amps = map_event_time_to_amp(combined)
orig_amps.head(3)
Out[180]:
In [184]:
# plot peaks and valleys over original data
plt.plot(data.index, data.values)
plt.plot(orig_amps.t, orig_amps.A, "o")
plt.show()
In [45]:
# see the distribution of peaks
# if the R peaks are much larger than the other peaks,
# you can use calc_R_peak_lower_thresh to find the
# lower bound of the R peaks
# otherwise you may need to set
# the variable R_peak_lower_bound (in the cell below) yourself.
groupby_type = orig_amps.groupby('type')
p = groupby_type.get_group('p')
plt.hist(p['A'].values,2)
plt.hist(p['A'].values)
plt.show()
In [75]:
import copy
# separate peaks from valleys
groupby_type = orig_amps.groupby('type')
p, v = [groupby_type.get_group(i) for i in groupby_type.groups]
def calc_R_peak_lower_thresh(peak_amplitudes):
#now find R features using the np.histogram
two_bin = np.histogram(peak_amplitudes, 2)
bin_edges = two_bin[1]
mid = bin_edges[1]
return mid
# you may need to set this yourself.
R_peak_lower_bound = calc_R_peak_lower_thresh(p['A'])
# separate out the R peaks,
# all other peaks goes in other_peaks
# some are T peaks, some are noise
R_peaks = copy.deepcopy(p[p['A'] >= R_peak_lower_bound])
other_peaks = copy.deepcopy(p[p['A'] <= R_peak_lower_bound])
all_valleys = copy.deepcopy(v)
In [164]:
# check that R peaks were properly id'ed
# (circles marking R peaks should now be different color from other peaks)
plt.plot(data.index, data.values, 'r', c='deepskyblue')
plt.plot(orig_amps.t, orig_amps.A, "o", c='cadetblue')
plt.plot(R_peaks.t, R_peaks.A, "o",c='teal')
plt.plot(all_valleys.t, all_valleys.A, "o",c='royalblue')
plt.plot(other_peaks.t, other_peaks.A, "o", c="turquoise")
plt.show()
In [121]:
#label R peaks
R_peaks.loc[:,'feature'] = 'R'
recombined = pd.concat([R_peaks, other_peaks, all_valleys]).fillna('')
sort_by_time = recombined.sort("t")
sort_by_time_T = sort_by_time.T
sort_by_time.head()
Out[121]:
In [196]:
s = pd.Series(index=sort_by_time.index)
T_ends = []
vs = []
QT_intervals = []
skipped = []
for i in R_peaks.t.index:
#print sort_by_time.loc[i-1:i+4]#, sort_by_time.loc[i+3]
#try to locate the Q feature for this complex
#if not found, skip this complex
prev = sort_by_time.loc[i-1]
if prev['type'] == 'v' and prev['feature'] == "":
Qt = prev['t']
else:
skipped.append(i-1)
continue
# starting at i+1 (one after the R peak)
# iterate until next peak (T feature) is found
while sort_by_time.loc[i+1]['type'] != 'p':
i += 1
T = sort_by_time.loc[i]['t']
# starting one after i (b/c i is a peak)
# iterate until next valley is found
# optimally, this will be a valley
# immediately after T, but it doesn't matter that much
# unless that data has a linear trend
while sort_by_time.loc[i+1]['type'] != 'v':
i += 1
v = sort_by_time.loc[i]['t']
if Q and v:
T_end = v+ (v-T)/2
else:
continue
QT_intervals.append(T_end-Qt)
T_ends.append(T_end)
vs.append(v)
T_ends
Out[196]:
In [198]:
# check that R peaks were properly id'ed
# (circles marking R peaks should now be different color from other peaks)
plt.plot(data.index, data.values, 'r', c='deepskyblue')
plt.plot(orig_amps.t, orig_amps.A, "o", c='cadetblue')
plt.plot(R_peaks.t, R_peaks.A, "o",c='teal')
plt.plot(all_valleys.t, all_valleys.A, "o",c='royalblue')
plt.plot(other_peaks.t, other_peaks.A, "o", c="turquoise")
ymin, ymax = plt.ylim()
plt.vlines(vs,ymin, ymax, color='orange')
plt.vlines(T_ends,ymin, ymax, color='orchid')
#plt.plot()
plt.show()
In [150]:
import pybursts
#this doesn't group entire QRST complex, but it does seem to get the QRS complex pretty well (h =1)
bursts = pybursts.kleinberg(combined.t, s=2, gamma=.001)
#print bursts
#b = [i[1:] for i in bursts]
bursts_df = pd.DataFrame(data = bursts, index = range(len(bursts)), columns=['h','start','end'])
print len(bursts_df)
bursts_df.head()
Out[150]:
In [312]:
ymin, ymax = plt.ylim()
maxh = max(bursts_df['h'])
sorted_bursts = bursts_df.sort("h")
"""
for r,c in zip(sorted_bursts.iterrows(),colors.cnames):#zip(bursts_df.start, bursts_df.end, colors.cnames):#bursts_df.iterrows():
#print r
h,s,e = r[1]
#print h
if h != 1:
continue
plt.vlines(s, ymin, ymax, color=c, lw=maxh-h)
plt.vlines(e, ymin, ymax, color=c, lw=maxh-h)
plt.hlines(ymin, s,e, color=c, lw=maxh-h)
plt.hlines(ymax, s, e, color=c, lw=maxh-h)
#break"""
plt.plot(data.index, data.values,'y', label = "original")
plt.plot(hist.x,hist.y, "k",label="smooth", lw=2)
plt.plot(vx, vy ,"ro", label="valleys")
plt.plot(px, py, "co", label="peaks")
plt.xlim(5,10)
plt.legend()
plt.show()
In [15]:
import pybursts
offsets = [4, 17, 23, 27, 33, 35, 37, 76, 77, 82, 84, 88, 90, 92]
print pybursts.kleinberg(offsets, s=2, gamma=0.1)
In [9]:
for key in ['Mean1']:
#represent what a complete feature looks like for this particular ECG
# s = burst start, e = burst end, v = valley, p=peak
features_key = 'vspevspe'
''
#count number of peaks and valleys in the features key
peak_count = features_key.count('p')
valley_count = features_key.count('v')
peaks_df = Results['Peaks'][key].dropna() # R and T wave locations
valleys_df = Results['Valleys'][key].dropna() # Q and S wave locations
# make bursts_df which contains 'Burst Start' vals and 'Burst End' vals
try:
burst_ends = Results['Bursts'][key]['Burst End']
burst_starts = Results['Bursts'][key]['Burst Start']
bursts_df = pd.concat([burst_starts, burst_ends], axis=1)
bursts_df.columns = ['Start','End']
except:
print "No bursts."
#get peak and valley times
print (peaks_df.head())
peak_time = peaks_df.index.values
valley_time = valleys_df.index.values
print peaks_df["Peaks Amplitude"].head()
n, pEdges = np.histogram(peaks_df["Peaks Amplitude"],bins=peak_count)
n, vEdges = np.histogram(valleys_df,bins=valley_count)
print pEdges, vEdges
In [ ]:
Once you can see that Q, R, S, and T are being consistently detected (T should be marked by the burst ends) you can do the below process to label them.
In [193]:
peaks_df = Results['Peaks']['Mean1'] # R wave location
valleys_df = Results['Valleys']['Mean1'] # Q and S wave locations
burst_ends = Results['Bursts']['Mean1']['Burst End'] #burst ends indicate approx. T wave location
# make new DataFrame for Burst Ends which is indexed by Burst Ends
# b.c. Results['Bursts']['Mean1'] is indexed by Burst Start.
bursts_df = pd.DataFrame(burst_ends.values, index=burst_ends.values, columns=['Burst Ends'])
#combine peaks and valleys dfs and sort by time
# this will allow us to easily find peaks,
# and find valleys before and after.
peaks_df['time'] = peaks_df.index
valleys_df['time'] = valleys_df.index
bursts_df['time'] = bursts_df.index
peaks_df['amplitude'] = peaks_df["Peaks Amplitude"]
valleys_df['amplitude'] = valleys_df["Valley Amplitude"]
bursts_df['amplitude'] = float(Settings['Threshold'])
#Label R and T. Unfortunatly, setting Q and S (the valleys) is much harder (See next cell).
peaks_df['letter'] = 'R'
bursts_df['letter'] = "T"
combined = peaks_df.append(valleys_df).sort('time')
#now combine w. Burst Ends and remove unnecessary columns, sort by time again
times = combined.append(bursts_df).drop([u'Burst Ends', u'Intervals', u'Peaks Amplitude', u'Valley Amplitude'],axis=1)
sorted_times = times.sort('time')
#remove na vals only from amplitude col.
sorted_times['amplitude'] = sorted_times['amplitude']
sorted_times.head()
The following is a hack-y way to set the Q and S valleys, but it works. Later I might try to do it with booleans. Feel free to try another way to do this!!! (Actually, really, please do. This method is pretty slow.)
In [165]:
#reindex by integer
sorted_times['num'] = range(len(sorted_times))
sorted_times_reindexed = sorted_times.set_index('num')
letters = pd.DataFrame(index=sorted_times_reindexed.index, columns=['letters'], dtype=str)
for i in sorted_times_reindexed.index:
#try to get the previous and next values with error handeling for edge cases
try:
prev_value = sorted_times_reindexed.loc[i-1, "letter"]
except:
prev_value=''
try:
next_value = sorted_times_reindexed.loc[i+1,"letter"]
except:
next_value=""
#letters['time'][i] = sorted_times_reindexed.loc[i,'time']
if (next_value == "T") and (prev_value == "R"):
letters['letters'][i] = "S"
elif next_value == "R":
letters['letters'][i] = "Q"
else:
letters['letters'][i] = sorted_times_reindexed.loc[i,'letter']
sorted_times_reindexed['letter'] = letters['letters']
sorted_times_reindexed.head(13) #see that valleys which are not immediately before or after R are left as NaN!
Out[165]:
In [147]:
sorted_times_time_indexed = sorted_times_reindexed.set_index('time')
final = sorted_times_time_indexed.dropna()
final.head()
Out[147]:
In [153]:
s1=0
e1=20
final[s1:e1].plot()
plt.show()