In [1]:

    
%load_ext autoreload
%autoreload 2

import warnings
import pandas as pd
import numpy as np
import os
import sys # error msg, add the modules
import operator # sorting
from math import *
import matplotlib.pyplot as plt

sys.path.append('../../')

import cuda_timeline
import read_trace
import avgblk
import cke
from model_param import *
from df_util import *

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

GPU info



In [2]:

    
# Describe the target GPU for the model (the traces below report the device
# as "GeForce GTX 950").
# NOTE(review): the per-SM limits are hard-coded; units are presumably bytes
# for shared memory and plain counts for the rest — confirm against the
# device specification.
gtx950 = DeviceInfo()
gtx950.sm_num = 6                   # number of SMs
gtx950.sharedmem_per_sm = 49152     # shared memory per SM
gtx950.reg_per_sm = 65536           # registers per SM
gtx950.maxthreads_per_sm = 2048     # max threads per SM



In [3]:

    
# init SM resources
# init_gpu returns two lists; both are later handed to avgblk.cke_model,
# which returns their updated versions.
SM_resList, SM_traceList = init_gpu(gtx950)



In [4]:

    
# Inspect the first entry of SM_resList (an SM_Stat instance, see the output).
SM_resList[0]









    Out[4]:





<model_param.SM_Stat instance at 0x7f3ccd9b7bd8>



In [5]:

    
# Inspect the first entry of SM_traceList (a table; its columns are listed in the output).
SM_traceList[0]









    Out[5]:






  
    
      
      sm_id
      block_id
      block_start
      block_end
      batch_id
      kernel_id
      active

kernel info



In [6]:

    
# Trace of the single-stream run; the path is relative to the notebook's
# working directory.
trace_file = '1stream_23000.csv'
df_trace = read_trace.Trace2dataframe(trace_file) # read the trace to the dataframe



In [7]:

    
#df_trace



In [8]:

    
# Extract the kernel info from the single-stream trace.
# Warning: currently limited to one kernel.
kernel = read_trace.GetKernelInfo(df_trace, gtx950)

# Echo a few fields of the extracted kernel info.
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the call form already used elsewhere in this notebook.
print(kernel.runtime_ms)
print(kernel.avg_blk_time)
print(kernel.blockDim)
print(kernel.start_ms)



In [9]:

    
# For each stream, keep a dict of its kernels:
#   key   -> kernel order within the stream
#   value -> kernel info
stream_kernel_list = []

stream_num = 2
for sid in range(stream_num):
    print(sid)
    # Copy_kern_info is called once per stream, so every stream holds its own
    # result for the single traced kernel, stored at kernel order 0.
    kern_dd = {0: Copy_kern_info(kernel)}
    stream_kernel_list.append(kern_dd)

0
1



In [10]:

    
# Sanity check: echo the kernel info stored for the first stream.
s1_kern_dd = stream_kernel_list[0]
print(s1_kern_dd[0].runtime_ms)
print(s1_kern_dd[0].avg_blk_time)
print(s1_kern_dd[0].blockDim)
print(s1_kern_dd[0].start_ms)



In [11]:

    
# Sanity check: number of streams, then the kernel info stored for the
# second stream.
print(len(stream_kernel_list))
print(stream_kernel_list[1][0].runtime_ms)
print(stream_kernel_list[1][0].avg_blk_time)
print(stream_kernel_list[1][0].blockDim)
print(stream_kernel_list[1][0].start_ms)

2 streams



In [12]:

    
# Trace of the two-stream run with overlapping kernels; the path is relative
# to the notebook's working directory.
trace_file_2cke = '2stream_kern_ovlp_23000.csv'
df_trace_2cke = read_trace.Trace2dataframe(trace_file_2cke)



In [13]:

    
# Show the raw two-stream trace (row 0 of the table holds the column units).
df_trace_2cke









    Out[13]:






  
    
      
      Start
      Duration
      Grid X
      Grid Y
      Grid Z
      Block X
      Block Y
      Block Z
      Registers Per Thread
      Static SMem
      Dynamic SMem
      Size
      Throughput
      Device
      Context
      Stream
      Name
    
  
  
    
      0
      ms
      us
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      B
      B
      KB
      GB/s
      NaN
      NaN
      NaN
      NaN
    
    
      1
      806.383704
      16.704000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      89.843750
      5.129411
      GeForce GTX 950 (0)
      1.0
      13.0
      [CUDA memcpy HtoD]
    
    
      2
      806.401624
      16.417000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      89.843750
      5.219082
      GeForce GTX 950 (0)
      1.0
      13.0
      [CUDA memcpy HtoD]
    
    
      3
      806.419321
      16.576000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      89.843750
      5.169020
      GeForce GTX 950 (0)
      1.0
      14.0
      [CUDA memcpy HtoD]
    
    
      4
      806.437113
      20.993000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      89.843750
      4.081440
      GeForce GTX 950 (0)
      1.0
      14.0
      [CUDA memcpy HtoD]
    
    
      5
      806.716992
      69.089000
      90.0
      1.0
      1.0
      256.0
      1.0
      1.0
      28.0
      0
      0
      NaN
      NaN
      GeForce GTX 950 (0)
      1.0
      13.0
      kernel_vectorAdd(float const *, float const *,...
    
    
      6
      806.755233
      73.793000
      90.0
      1.0
      1.0
      256.0
      1.0
      1.0
      28.0
      0
      0
      NaN
      NaN
      GeForce GTX 950 (0)
      1.0
      14.0
      kernel_vectorAdd(float const *, float const *,...
    
    
      7
      806.788705
      15.937000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      89.843750
      5.376274
      GeForce GTX 950 (0)
      1.0
      13.0
      [CUDA memcpy DtoH]
    
    
      8
      806.832162
      15.873000
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      89.843750
      5.397951
      GeForce GTX 950 (0)
      1.0
      14.0
      [CUDA memcpy DtoH]



In [14]:

    
# Plot the two-stream trace with the project's cuda_timeline helper.
cuda_timeline.plot_trace(df_trace_2cke)









    



/home/leiming/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_base.py:1292: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if aspect == 'normal':
/home/leiming/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_base.py:1297: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  elif aspect in ('equal', 'auto'):

Ground truth

1 cke



In [15]:

    
# Timing table derived from the single-stream trace.
df_single_stream = read_trace.Get_timing_from_trace(df_trace)

# Total runtime of the single-stream run.
tot_runtime = read_trace.GetTotalRuntime(df_single_stream)
print(tot_runtime)

2 cke



In [16]:

    
# Timing table derived from the two-stream trace.
df_2stream = read_trace.Get_timing_from_trace(df_trace_2cke)
print(df_2stream)

# Total runtime of the two-stream run.
# NOTE: this rebinds tot_runtime, which previously held the single-stream total.
tot_runtime = read_trace.GetTotalRuntime(df_2stream)
print(tot_runtime)









    



   stream api_type       start         end      size  duration
0     0.0      h2d  806.383704  806.400408  89.84375  0.016704
1     0.0      h2d  806.401624  806.418041  89.84375  0.016417
2     0.0     kern  806.716992  806.786081   0.00000  0.069089
3     0.0      d2h  806.788705  806.804642  89.84375  0.015937
4     1.0      h2d  806.419321  806.435897  89.84375  0.016576
5     1.0      h2d  806.437113  806.458106  89.84375  0.020993
6     1.0     kern  806.755233  806.829026   0.00000  0.073793
7     1.0      d2h  806.832162  806.848035  89.84375  0.015873
0.464331

Start Modeling CKE



In [17]:

    
# Single-stream timing after Reset_starting (in the output the first API
# call starts at 0).
df_s1 = read_trace.Reset_starting(df_single_stream)
print(df_s1)









    



   stream api_type     start       end      size  duration
0     0.0      h2d  0.000000  0.016641  89.84375  0.016641
1     0.0      h2d  0.018081  0.034209  89.84375  0.016128
2     0.0     kern  0.200611  0.257572   0.00000  0.056961
3     0.0      d2h  0.263236  0.279141  89.84375  0.015905



In [18]:

    
# Two-stream timing after Reset_starting (in the output the first API call
# starts at 0); this is the measured timeline the model is compared against.
df_2stream_trace = read_trace.Reset_starting(df_2stream)
print(df_2stream_trace)









    



   stream api_type     start       end      size  duration
0     0.0      h2d  0.000000  0.016704  89.84375  0.016704
1     0.0      h2d  0.017920  0.034337  89.84375  0.016417
2     0.0     kern  0.333288  0.402377   0.00000  0.069089
3     0.0      d2h  0.405001  0.420938  89.84375  0.015937
4     1.0      h2d  0.035617  0.052193  89.84375  0.016576
5     1.0      h2d  0.053409  0.074402  89.84375  0.020993
6     1.0     kern  0.371529  0.445322   0.00000  0.073793
7     1.0      d2h  0.448458  0.464331  89.84375  0.015873

model two streams



In [19]:

    
# Number of streams to model.
# NOTE(review): stream_kernel_list was also built for 2 streams; keep the two
# in sync if this value changes.
stream_num = 2

# find when to start the stream and update the starting pos for the trace
# NOTE(review): magic threshold passed as h2d_ovlp_th; its unit and origin
# are not shown in this notebook — document where 3.158431 comes from.
H2D_H2D_OVLP_TH = 3.158431

# Build the per-stream trace list from the single-stream timing table.
df_cke_list = cke.init_trace_list(df_s1, stream_num = stream_num, h2d_ovlp_th = H2D_H2D_OVLP_TH)



In [20]:

    
# df_cke_list[0]



In [21]:

    
# df_cke_list[1]



In [22]:

    
# Merge the per-stream traces into one API table; per the output of the next
# cell it is ordered by start time and carries extra bookkeeping columns
# (status, bw, bytes_done, bytes_left, current_pos, pred_end).
df_all_api = cke.init_sort_api_with_extra_cols(df_cke_list)



In [23]:

    
# Initial state of the API table, before the modeling loop runs.
df_all_api









    Out[23]:






  
    
      
      start
      end
      api_type
      size_kb
      stream_id
      status
      bw
      bytes_done
      bytes_left
      current_pos
      pred_end
    
  
  
    
      0
      0.000000
      0.016641
      h2d
      89.84375
      0.0
      sleep
      5398.939367
      0.0
      89.84375
      0.0
      0.0
    
    
      1
      0.018081
      0.034209
      h2d
      89.84375
      0.0
      sleep
      5570.669023
      0.0
      89.84375
      0.0
      0.0
    
    
      4
      0.036209
      0.052850
      h2d
      89.84375
      1.0
      sleep
      5398.939367
      0.0
      89.84375
      0.0
      0.0
    
    
      5
      0.054290
      0.070418
      h2d
      89.84375
      1.0
      sleep
      5570.669023
      0.0
      89.84375
      0.0
      0.0
    
    
      2
      0.200611
      0.257572
      kern
      0.00000
      0.0
      sleep
      0.000000
      0.0
      0.00000
      0.0
      0.0
    
    
      6
      0.236820
      0.293781
      kern
      0.00000
      1.0
      sleep
      0.000000
      0.0
      0.00000
      0.0
      0.0
    
    
      3
      0.263236
      0.279141
      d2h
      89.84375
      0.0
      sleep
      5648.773970
      0.0
      89.84375
      0.0
      0.0
    
    
      7
      0.299445
      0.315350
      d2h
      89.84375
      1.0
      sleep
      5648.773970
      0.0
      89.84375
      0.0
      0.0

Start the algorithm



In [24]:

    
# Iteration counter of the modeling loop (debug aid).
count = 0
# Debug aid: to stop after a given round, insert
# `if count == break_count: break` at the point of interest in the loop.
break_count = 7

# Value passed as `ways` to the transfer-overlap helpers.
# NOTE(review): presumably the number of same-direction transfers sharing
# the link — confirm against cke.Predict_transferOvlp.
TRANSFER_OVLP_WAYS = 2.0

while not cke.AllDone(df_all_api):
    count += 1

    #-----------------------
    # pick two api to model
    #-----------------------
    df_all_api, r1, r2 = cke.PickTwo(df_all_api)

    #-----------------------
    # check the last api or not: when no pair is returned, go directly to
    # updating the last wake api and stop
    #-----------------------
    last_api = r1 is None and r2 is None
    if last_api:
        df_all_api = cke.UpdateStream_lastapi(df_all_api)
        break

    #-----------------------
    # move the current_pos to the starting of coming api r2, and update r1 status
    #-----------------------
    df_all_api = cke.StartNext_byType(df_all_api, [r1, r2])

    #-----------------------------
    # if one call is done, continue the next round
    #-----------------------------
    if cke.CheckRowDone(df_all_api, r1, r2):
        continue

    #-----------------------------
    # both calls are active: predict their end times according to how they
    # interact
    #-----------------------------
    whichType = cke.CheckType(df_all_api, r1, r2) # check whether the same api
    if whichType is None:
        df_all_api = cke.Predict_noConflict(df_all_api, r1, r2)
    elif whichType in ['h2d', 'd2h']: # data transfer in the same direction
        df_all_api = cke.Predict_transferOvlp(df_all_api, r1, r2, ways = TRANSFER_OVLP_WAYS)
    else: # concurrent kernel: todo
        print('run cke model')

        r1_sid, r1_kid = cke.FindStreamAndKernID(df_all_api, r1)
        r2_sid, r2_kid = cke.FindStreamAndKernID(df_all_api, r2)
        r1_start_ms = cke.GetStartTime(df_all_api, r1)
        r2_start_ms = cke.GetStartTime(df_all_api, r2)

        # Update the start time on the very kernels that are handed to the
        # model. Indexing by the looked-up stream/kernel ids (instead of the
        # fixed [0][0] and [1][0]) keeps this correct when r1/r2 do not
        # belong to stream 0 and stream 1 respectively.
        r1_kernel = stream_kernel_list[r1_sid][r1_kid]
        r2_kernel = stream_kernel_list[r2_sid][r2_kid]
        r1_kernel.start_ms = r1_start_ms
        r2_kernel.start_ms = r2_start_ms

        kernels_ = [r1_kernel, r2_kernel]

        SM_resList, SM_traceList = avgblk.cke_model(gtx950, SM_resList, SM_traceList, kernels_)

        # find the kernel execution time from the sm trace table
        result_kernel_runtime_dd = avgblk.Get_KernTime(SM_traceList)

        # r1 will be the 1st in dd, r2 will be the 2nd
        result_r1_start = result_kernel_runtime_dd[0][0]
        result_r1_end = result_kernel_runtime_dd[0][1]

        result_r2_start = result_kernel_runtime_dd[1][0]
        result_r2_end = result_kernel_runtime_dd[1][1]

        # Warning: it is better to have a pred_start,
        # but we care about the end timing for now.
        # `.at` is the label-based scalar setter; it replaces the deprecated
        # DataFrame.set_value.
        df_all_api.at[r1, 'pred_end'] = result_r1_end
        df_all_api.at[r2, 'pred_end'] = result_r2_end

    # time range covered by the current prediction
    rangeT = cke.Get_pred_range(df_all_api)
    print(rangeT)

    extra_conc = cke.Check_cc_by_time(df_all_api, rangeT) # check whether there is conc during the rangeT
    print('extra_conc {}'.format(extra_conc))

    if extra_conc == 0:
        if whichType in ['h2d', 'd2h']:
            df_all_api = cke.Update_wake_transferOvlp(df_all_api, rangeT, ways = TRANSFER_OVLP_WAYS)
        elif whichType == 'kern':
            df_all_api = cke.Update_wake_kernOvlp(df_all_api)
        else: # no overlapping
            df_all_api = cke.Update_wake_noConflict(df_all_api, rangeT)

        # check if any api is done, and update the timing for the other apis in that stream
        df_all_api = cke.UpdateStreamTime(df_all_api)

    else: # todo : when there is additional overlapping
        # NOTE(review): nothing is updated on this path; if the calls at the
        # top of the loop do not advance the state either, the loop will not
        # terminate.
        pass









    



row:0 row:1
row:1 row:4
row:4 row:5
row:5 row:2
row:2 row:6
run cke model
[0.23682000000001335, 0.25757199999998193]
extra_conc 0
row:3 row:7
row:None row:None



In [25]:

    
# State of the API table after the modeling loop; pred_end holds the
# predicted end time of each call.
df_all_api









    Out[25]:






  
    
      
      start
      end
      api_type
      size_kb
      stream_id
      status
      bw
      bytes_done
      bytes_left
      current_pos
      pred_end
    
  
  
    
      0
      0.000000
      0.016641
      h2d
      89.84375
      0.0
      done
      5398.939367
      89.84375
      0.0
      0.016641
      0.016641
    
    
      1
      0.018081
      0.034209
      h2d
      89.84375
      0.0
      done
      5570.669023
      89.84375
      0.0
      0.034209
      0.034209
    
    
      4
      0.036209
      0.052850
      h2d
      89.84375
      1.0
      done
      5398.939367
      89.84375
      0.0
      0.052850
      0.052850
    
    
      5
      0.054290
      0.070418
      h2d
      89.84375
      1.0
      done
      5570.669023
      89.84375
      0.0
      0.070418
      0.070418
    
    
      2
      0.200611
      0.257572
      kern
      0.00000
      0.0
      done
      0.000000
      0.00000
      0.0
      0.257572
      0.257572
    
    
      6
      0.236820
      0.314533
      kern
      0.00000
      1.0
      done
      0.000000
      0.00000
      0.0
      0.314533
      0.314533
    
    
      3
      0.263236
      0.279141
      d2h
      89.84375
      0.0
      done
      5648.773970
      89.84375
      0.0
      0.279141
      0.279141
    
    
      7
      0.320197
      0.336102
      d2h
      89.84375
      1.0
      done
      5648.773970
      89.84375
      0.0
      0.336102
      0.336102



In [26]:

    
# Measured two-stream timeline, for comparison with the prediction above.
df_2stream_trace



In [27]:

    
# Single-stream timeline that the model was built from.
df_s1



In [28]:

    
#
# run above
#

	Start	Duration	Grid X	Grid Y	Grid Z	Block X	Block Y	Block Z	Registers Per Thread	Static SMem	Dynamic SMem	Size	Throughput	Device	Context	Stream	Name
0	ms	us	NaN	NaN	NaN	NaN	NaN	NaN	NaN	B	B	KB	GB/s	NaN	NaN	NaN	NaN
1	806.383704	16.704000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	89.843750	5.129411	GeForce GTX 950 (0)	1.0	13.0	[CUDA memcpy HtoD]
2	806.401624	16.417000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	89.843750	5.219082	GeForce GTX 950 (0)	1.0	13.0	[CUDA memcpy HtoD]
3	806.419321	16.576000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	89.843750	5.169020	GeForce GTX 950 (0)	1.0	14.0	[CUDA memcpy HtoD]
4	806.437113	20.993000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	89.843750	4.081440	GeForce GTX 950 (0)	1.0	14.0	[CUDA memcpy HtoD]
5	806.716992	69.089000	90.0	1.0	1.0	256.0	1.0	1.0	28.0	0	0	NaN	NaN	GeForce GTX 950 (0)	1.0	13.0	kernel_vectorAdd(float const , float const ,...
6	806.755233	73.793000	90.0	1.0	1.0	256.0	1.0	1.0	28.0	0	0	NaN	NaN	GeForce GTX 950 (0)	1.0	14.0	kernel_vectorAdd(float const , float const ,...
7	806.788705	15.937000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	89.843750	5.376274	GeForce GTX 950 (0)	1.0	13.0	[CUDA memcpy DtoH]
8	806.832162	15.873000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	89.843750	5.397951	GeForce GTX 950 (0)	1.0	14.0	[CUDA memcpy DtoH]

	start	end	api_type	size_kb	stream_id	status	bw	bytes_left
0	0.000000	0.016641	h2d	89.84375	0.0	sleep	5398.939367	89.84375
1	0.018081	0.034209	h2d	89.84375	0.0	sleep	5570.669023	89.84375
4	0.036209	0.052850	h2d	89.84375	1.0	sleep	5398.939367	89.84375
5	0.054290	0.070418	h2d	89.84375	1.0	sleep	5570.669023	89.84375
2	0.200611	0.257572	kern	0.00000	0.0	sleep	0.000000	0.00000
6	0.236820	0.293781	kern	0.00000	1.0	sleep	0.000000	0.00000
3	0.263236	0.279141	d2h	89.84375	0.0	sleep	5648.773970	89.84375
7	0.299445	0.315350	d2h	89.84375	1.0	sleep	5648.773970	89.84375

	stream	api_type	start	end	size	duration
0	0.0	h2d	0.000000	0.016704	89.84375	0.016704
1	0.0	h2d	0.017920	0.034337	89.84375	0.016417
2	0.0	kern	0.333288	0.402377	0.00000	0.069089
3	0.0	d2h	0.405001	0.420938	89.84375	0.015937
4	1.0	h2d	0.035617	0.052193	89.84375	0.016576
5	1.0	h2d	0.053409	0.074402	89.84375	0.020993
6	1.0	kern	0.371529	0.445322	0.00000	0.073793
7	1.0	d2h	0.448458	0.464331	89.84375	0.015873