In [1]:

    
import warnings
import pandas as pd
import numpy as np
import os
import sys # error msg
import operator # sorting
from math import *

from read_trace import *
from avgblkmodel import *

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

GPU device info



In [2]:

    
# Describe the target GPU for the model (the trace below was captured on a
# "GeForce GTX 950"). DeviceInfo comes from one of the star imports above.
# NOTE(review): field meanings/units are inferred from their names -- confirm
# against the DeviceInfo definition.
gtx950 = DeviceInfo()
gtx950.sm_num = 6                   # presumably the number of streaming multiprocessors
gtx950.sharedmem_per_sm = 49152     # presumably bytes of shared memory per SM (48 * 1024)
gtx950.reg_per_sm = 65536           # presumably registers available per SM
gtx950.maxthreads_per_sm = 2048     # presumably max resident threads per SM

single stream info



In [3]:

    
# Select the single-stream trace to model; trace files are named after the
# input size they were recorded with.
data_size = 23000
trace_file = './1cke/trace_{}.csv'.format(data_size)
df_trace = trace2dataframe(trace_file)  # load the trace file into a DataFrame



In [4]:

    
# Show the raw trace as loaded; row 0 carries the unit of each column
# (ms, us, B, KB, GB/s) rather than a measurement.
df_trace









    Out[4]:






  
    
      
      Start
      Duration
      Grid X
      Grid Y
      Grid Z
      Block X
      Block Y
      Block Z
      Registers Per Thread
      Static SMem
      Dynamic SMem
      Size
      Throughput
      Device
      Context
      Stream
      Name
    
  
  
    
      0
      ms
      us
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      B
      B
      KB
      GB/s
      NaN
      NaN
      NaN
      NaN
    
    
      1
      526.961828
      16.672000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      89.843750
      5.139256
      GeForce GTX 950 (0)
      1.0
      13.0
      [CUDA memcpy HtoD]
    
    
      2
      526.979716
      16.224000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      89.843750
      5.281168
      GeForce GTX 950 (0)
      1.0
      13.0
      [CUDA memcpy HtoD]
    
    
      3
      527.157829
      61.056000
      90.0
      1.0
      1.0
      256.0
      1.0
      1.0
      28.0
      0
      0
      NaN
      NaN
      GeForce GTX 950 (0)
      1.0
      13.0
      kernel_vectorAdd(float const *, float const *,...
    
    
      4
      527.221349
      15.904000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      89.843750
      5.387429
      GeForce GTX 950 (0)
      1.0
      13.0
      [CUDA memcpy DtoH]



In [5]:

    
# Turn the raw trace into the model's timeline table. As the output below
# shows, each API call (h2d / kern / d2h) becomes one row, and the gap between
# two consecutive calls becomes an explicit "<a>_<b>_ovhd" row.
df_single_stream = model_param_from_trace(df_trace)
df_single_stream.head(20)









    Out[5]:






  
    
      
      seq
      api_type
      start
      end
      duration
    
  
  
    
      0
      0.0
      h2d
      526.961828
      526.978500
      0.016672
    
    
      1
      1.0
      h2d_h2d_ovhd
      526.978500
      526.979716
      0.001216
    
    
      2
      2.0
      h2d
      526.979716
      526.995940
      0.016224
    
    
      3
      3.0
      h2d_kern_ovhd
      526.995940
      527.157829
      0.161889
    
    
      4
      4.0
      kern
      527.157829
      527.218885
      0.061056
    
    
      5
      5.0
      kern_d2h_ovhd
      527.218885
      527.221349
      0.002464
    
    
      6
      6.0
      d2h
      527.221349
      527.237253
      0.015904

Model the 2-stream case

We need a trace table per stream to track the timing of each kernel.



In [6]:

    
# Work on a deep copy so the absolute-time timeline in df_single_stream is
# left untouched by the edits made in the following cells.
df_single_stream_update = df_single_stream.copy(deep=True)



In [7]:

    
# Inspect the working copy (timestamps are still absolute trace times here).
df_single_stream_update









    Out[7]:






  
    
      
      seq
      api_type
      start
      end
      duration
    
  
  
    
      0
      0.0
      h2d
      526.961828
      526.978500
      0.016672
    
    
      1
      1.0
      h2d_h2d_ovhd
      526.978500
      526.979716
      0.001216
    
    
      2
      2.0
      h2d
      526.979716
      526.995940
      0.016224
    
    
      3
      3.0
      h2d_kern_ovhd
      526.995940
      527.157829
      0.161889
    
    
      4
      4.0
      kern
      527.157829
      527.218885
      0.061056
    
    
      5
      5.0
      kern_d2h_ovhd
      527.218885
      527.221349
      0.002464
    
    
      6
      6.0
      d2h
      527.221349
      527.237253
      0.015904



In [8]:

    
# Re-base the timeline so that the first event starts at t = 0.
# The first row is taken by position (.iloc) rather than by label 0, so this
# keeps working if the frame does not carry a default 0-based index.
offset = df_single_stream_update['start'].iloc[0]
df_single_stream_update['start'] = df_single_stream_update['start'] - offset
df_single_stream_update['end'] = df_single_stream_update['end'] - offset
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df_single_stream_update)









    



   seq       api_type     start       end  duration
0  0.0            h2d  0.000000  0.016672  0.016672
1  1.0   h2d_h2d_ovhd  0.016672  0.017888  0.001216
2  2.0            h2d  0.017888  0.034112  0.016224
3  3.0  h2d_kern_ovhd  0.034112  0.196001  0.161889
4  4.0           kern  0.196001  0.257057  0.061056
5  5.0  kern_d2h_ovhd  0.257057  0.259521  0.002464
6  6.0            d2h  0.259521  0.275425  0.015904



In [9]:

    
# Replicate the zero-based single-stream timeline once per stream. Deep
# copies are used so each stream's frame can be shifted independently.
stream_num = 2

df_cke_list = [df_single_stream_update.copy(deep=True) for x in range(stream_num)]



In [10]:

    
# 1st stream trace: a copy of the zero-based single-stream timeline.
df_cke_list[0]









    Out[10]:






  
    
      
      seq
      api_type
      start
      end
      duration
    
  
  
    
      0
      0.0
      h2d
      0.000000
      0.016672
      0.016672
    
    
      1
      1.0
      h2d_h2d_ovhd
      0.016672
      0.017888
      0.001216
    
    
      2
      2.0
      h2d
      0.017888
      0.034112
      0.016224
    
    
      3
      3.0
      h2d_kern_ovhd
      0.034112
      0.196001
      0.161889
    
    
      4
      4.0
      kern
      0.196001
      0.257057
      0.061056
    
    
      5
      5.0
      kern_d2h_ovhd
      0.257057
      0.259521
      0.002464
    
    
      6
      6.0
      d2h
      0.259521
      0.275425
      0.015904



In [11]:

    
# 2nd stream trace: identical to the 1st at this point; its start offset is
# only added further down in the notebook.
df_cke_list[1]









    Out[11]:






  
    
      
      seq
      api_type
      start
      end
      duration
    
  
  
    
      0
      0.0
      h2d
      0.000000
      0.016672
      0.016672
    
    
      1
      1.0
      h2d_h2d_ovhd
      0.016672
      0.017888
      0.001216
    
    
      2
      2.0
      h2d
      0.017888
      0.034112
      0.016224
    
    
      3
      3.0
      h2d_kern_ovhd
      0.034112
      0.196001
      0.161889
    
    
      4
      4.0
      kern
      0.196001
      0.257057
      0.061056
    
    
      5
      5.0
      kern_d2h_ovhd
      0.257057
      0.259521
      0.002464
    
    
      6
      6.0
      d2h
      0.259521
      0.275425
      0.015904



In [17]:

    
df_prev_stream = df_cke_list[0]

# Threshold on the duration of the initial host-to-device transfer.
# NOTE(review): presumably in the same time unit as the 'start'/'end'
# columns; the origin of this constant is not shown here -- confirm.
H2D_H2D_OVLP_TH = 3.158431

# We need the data transfer time before the 1st kernel call: from the start
# of the first 'h2d' row to the end of the row just before the first
# 'h2d_kern_ovhd' row. Rows are located by position so the result does not
# depend on the frame's index labels.
api_types = df_prev_stream['api_type'].tolist()
if 'h2d' not in api_types or 'h2d_kern_ovhd' not in api_types:
    raise ValueError("trace needs both an 'h2d' and an 'h2d_kern_ovhd' row")

h2d_first_ind = api_types.index('h2d')
h2d_last_ind = api_types.index('h2d_kern_ovhd') - 1
if h2d_last_ind < h2d_first_ind:
    raise ValueError("'h2d_kern_ovhd' appears before the first 'h2d' row")

h2d_start = df_prev_stream['start'].iloc[h2d_first_ind]
h2d_finish = df_prev_stream['end'].iloc[h2d_last_ind]
h2d_duration = h2d_finish - h2d_start
print("h2d : {} - {} = {}".format(h2d_start, h2d_finish, h2d_duration))

# Decide when the next stream may begin its own transfer: at the threshold if
# the previous transfer runs longer than that, otherwise once it has finished.
# NOTE(review): the threshold is used as an absolute time, which matches
# "h2d_start + threshold" only because the trace was re-based to t = 0.
if h2d_duration > H2D_H2D_OVLP_TH:
    current_stream_start = H2D_H2D_OVLP_TH
else:
    current_stream_start = h2d_finish

print(current_stream_start)









    



h2d : 0.0 - 0.0341120000001 = 0.0341120000001
0.0341120000001



In [19]:

    
# Add the start offset to the 2nd stream trace.
# The shifted frame is rebuilt from the zero-based single-stream timeline
# instead of being incremented in place, so re-running this cell does not
# accumulate the offset.
df_cke_list[1] = df_single_stream_update.assign(
    start=df_single_stream_update['start'] + current_stream_start,
    end=df_single_stream_update['end'] + current_stream_start,
)



In [20]:

    
# 2nd stream trace after its start/end columns were shifted by current_stream_start.
df_cke_list[1]









    Out[20]:






  
    
      
      seq
      api_type
      start
      end
      duration
    
  
  
    
      0
      0.0
      h2d
      0.034112
      0.050784
      0.016672
    
    
      1
      1.0
      h2d_h2d_ovhd
      0.050784
      0.052000
      0.001216
    
    
      2
      2.0
      h2d
      0.052000
      0.068224
      0.016224
    
    
      3
      3.0
      h2d_kern_ovhd
      0.068224
      0.230113
      0.161889
    
    
      4
      4.0
      kern
      0.230113
      0.291169
      0.061056
    
    
      5
      5.0
      kern_d2h_ovhd
      0.291169
      0.293633
      0.002464
    
    
      6
      6.0
      d2h
      0.293633
      0.309537
      0.015904

Figure out whether the h2d (host-to-device) transfers of the two streams overlap



In [21]:

    
def find_h2d_timing(df_trace):
    """
    Find the start and end of the initial h2d phase of a stream trace.

    The phase runs from the first 'h2d' row to the row immediately before the
    first 'h2d_kern_ovhd' row, i.e. the data transfer that happens before the
    1st kernel call.

    Parameters
    ----------
    df_trace : pandas.DataFrame
        Timeline with at least the columns 'api_type', 'start' and 'end'.

    Returns
    -------
    (h2d_start, h2d_finish)
        'start' of the first 'h2d' row and 'end' of the row preceding the
        first 'h2d_kern_ovhd' row.

    Raises
    ------
    ValueError
        If the trace has no 'h2d' row, no 'h2d_kern_ovhd' row, or the
        'h2d_kern_ovhd' row comes before the first 'h2d' row.
    """
    api_types = df_trace['api_type'].tolist()

    if 'h2d' not in api_types:
        raise ValueError("trace has no 'h2d' row")
    if 'h2d_kern_ovhd' not in api_types:
        raise ValueError("trace has no 'h2d_kern_ovhd' row")

    # Positions (not index labels) are used so the lookup is independent of
    # how the frame happens to be indexed.
    h2d_first_pos = api_types.index('h2d')
    h2d_last_pos = api_types.index('h2d_kern_ovhd') - 1
    if h2d_last_pos < h2d_first_pos:
        raise ValueError("'h2d_kern_ovhd' appears before the first 'h2d' row")

    h2d_start = df_trace['start'].iloc[h2d_first_pos]
    h2d_finish = df_trace['end'].iloc[h2d_last_pos]

    return h2d_start, h2d_finish



In [24]:

    
# Initial h2d window of the 1st (previous) stream.
prev_h2d_start, prev_h2d_end = find_h2d_timing(df_cke_list[0])
print("prev : {} - {}".format(prev_h2d_start, prev_h2d_end))

# Initial h2d window of the 2nd (current) stream.
current_h2d_start, current_h2d_end = find_h2d_timing(df_cke_list[1])
print("current : {} - {}".format(current_h2d_start, current_h2d_end))









    



prev : 0.0 - 0.0341120000001
current : 0.0341120000001 - 0.0682240000001



In [25]:

    
# h2d_ovlp is 1 when the current stream's h2d phase begins inside the
# half-open window [prev_h2d_start, prev_h2d_end) of the previous stream,
# and 0 otherwise.
h2d_ovlp = 1 if prev_h2d_start <= current_h2d_start < prev_h2d_end else 0
print("h2d ovlap : {}".format(h2d_ovlp))









    



h2d ovlap : 0

Figure out whether the kernels of the two streams overlap



In [26]:

    
def find_kernel_timing(df_trace):
    """
    Collect the [start, end] interval of every kernel row in a stream trace.

    Rows are selected where 'api_type' equals 'kern'; the result is a list of
    two-element lists in the order the rows appear in df_trace (empty when
    the trace holds no kernel row).
    """
    is_kernel = df_trace['api_type'] == 'kern'
    kernel_rows = df_trace.loc[is_kernel]

    return [[rec['start'], rec['end']] for _, rec in kernel_rows.iterrows()]



In [33]:

    
# Kernel intervals ([start, end]) of the previous and the current stream.
# print() with a single argument behaves the same on Python 2 and Python 3.
prev_kernel_timing = find_kernel_timing(df_cke_list[0])
print(prev_kernel_timing)
curr_kernel_timing = find_kernel_timing(df_cke_list[1])
print(curr_kernel_timing)

# Only the first kernel of each stream is compared here.
curr_start = curr_kernel_timing[0][0]
prev_start = prev_kernel_timing[0][0]
prev_end = prev_kernel_timing[0][1]

# The kernels overlap when the current one starts inside [prev_start, prev_end).
if prev_start <= curr_start < prev_end:
    print("Overlapping betweent the kernel")









    



[[0.19600100000002385, 0.2570570000000316]]
[[0.23011300000007395, 0.2911690000000817]]
Overlapping betweent the kernel

	Start	Duration	Grid X	Grid Y	Grid Z	Block X	Block Y	Block Z	Registers Per Thread	Static SMem	Dynamic SMem	Size	Throughput	Device	Context	Stream	Name
0	ms	us	NaN	NaN	NaN	NaN	NaN	NaN	NaN	B	B	KB	GB/s	NaN	NaN	NaN	NaN
1	526.961828	16.672000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	89.843750	5.139256	GeForce GTX 950 (0)	1.0	13.0	[CUDA memcpy HtoD]
2	526.979716	16.224000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	89.843750	5.281168	GeForce GTX 950 (0)	1.0	13.0	[CUDA memcpy HtoD]
3	527.157829	61.056000	90.0	1.0	1.0	256.0	1.0	1.0	28.0	0	0	NaN	NaN	GeForce GTX 950 (0)	1.0	13.0	kernel_vectorAdd(float const , float const ,...
4	527.221349	15.904000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	89.843750	5.387429	GeForce GTX 950 (0)	1.0	13.0	[CUDA memcpy DtoH]

	seq	api_type	start	end	duration
0	0.0	h2d	526.961828	526.978500	0.016672
1	1.0	h2d_h2d_ovhd	526.978500	526.979716	0.001216
2	2.0	h2d	526.979716	526.995940	0.016224
3	3.0	h2d_kern_ovhd	526.995940	527.157829	0.161889
4	4.0	kern	527.157829	527.218885	0.061056
5	5.0	kern_d2h_ovhd	527.218885	527.221349	0.002464
6	6.0	d2h	527.221349	527.237253	0.015904

	seq	api_type	start	end	duration
0	0.0	h2d	0.000000	0.016672	0.016672
1	1.0	h2d_h2d_ovhd	0.016672	0.017888	0.001216
2	2.0	h2d	0.017888	0.034112	0.016224
3	3.0	h2d_kern_ovhd	0.034112	0.196001	0.161889
4	4.0	kern	0.196001	0.257057	0.061056
5	5.0	kern_d2h_ovhd	0.257057	0.259521	0.002464
6	6.0	d2h	0.259521	0.275425	0.015904

	seq	api_type	start	end	duration
0	0.0	h2d	0.034112	0.050784	0.016672
1	1.0	h2d_h2d_ovhd	0.050784	0.052000	0.001216
2	2.0	h2d	0.052000	0.068224	0.016224
3	3.0	h2d_kern_ovhd	0.068224	0.230113	0.161889
4	4.0	kern	0.230113	0.291169	0.061056
5	5.0	kern_d2h_ovhd	0.291169	0.293633	0.002464
6	6.0	d2h	0.293633	0.309537	0.015904