In [1038]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [1039]:
import warnings
import pandas as pd
import numpy as np
import os
import sys # error msg, add the modules
import operator # sorting
from math import *
import matplotlib.pyplot as plt

sys.path.append('../')

import read_trace
import cuda_timeline
from avgblkmodel import *
import cke
from df_util import *
#from model_cke import *

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

gpu info


In [1040]:
gtx950 = DeviceInfo()
gtx950.sm_num = 6
gtx950.sharedmem_per_sm = 49152
gtx950.reg_per_sm = 65536
gtx950.maxthreads_per_sm = 2048

2 stream info


In [1041]:
# 10M for mem_mem : where the h2d between streams are overlapped
trace_file = 'trace_10M_s1.csv'
trace_file_2cke = 'trace_h2d_h2d_ovlp.csv'

df_trace = read_trace.trace2dataframe(trace_file) # read the trace to the dataframe
df_trace_2cke = read_trace.trace2dataframe(trace_file_2cke)

In [1042]:
#df_trace

In [1043]:
#cuda_timeline.plot_trace(df_trace)

In [1044]:
df_trace_2cke


Out[1044]:
Start Duration Grid X Grid Y Grid Z Block X Block Y Block Z Registers Per Thread Static SMem Dynamic SMem Size Throughput Device Context Stream Name
0 ms ms NaN NaN NaN NaN NaN NaN NaN B B MB GB/s NaN NaN NaN NaN
1 665.032627 10.783099 NaN NaN NaN NaN NaN NaN NaN NaN NaN 38.146973 3.454749 GeForce GTX 950 (0) 1.0 13.0 [CUDA memcpy HtoD]
2 668.190628 12.898855 NaN NaN NaN NaN NaN NaN NaN NaN NaN 38.146973 2.888078 GeForce GTX 950 (0) 1.0 14.0 [CUDA memcpy HtoD]
3 675.816942 12.913127 NaN NaN NaN NaN NaN NaN NaN NaN NaN 38.146973 2.884886 GeForce GTX 950 (0) 1.0 13.0 [CUDA memcpy HtoD]
4 681.090667 10.213144 NaN NaN NaN NaN NaN NaN NaN NaN NaN 38.146973 3.647545 GeForce GTX 950 (0) 1.0 14.0 [CUDA memcpy HtoD]
5 688.741397 1.377896 39063.0 1.0 1.0 256.0 1.0 1.0 8.0 0 0 NaN NaN GeForce GTX 950 (0) 1.0 13.0 kernel_vectorAdd(float const *, float const *,...
6 690.126461 10.658490 NaN NaN NaN NaN NaN NaN NaN NaN NaN 38.146973 3.495139 GeForce GTX 950 (0) 1.0 13.0 [CUDA memcpy DtoH]
7 691.316867 1.366120 39063.0 1.0 1.0 256.0 1.0 1.0 8.0 0 0 NaN NaN GeForce GTX 950 (0) 1.0 14.0 kernel_vectorAdd(float const *, float const *,...
8 693.294222 9.522868 NaN NaN NaN NaN NaN NaN NaN NaN NaN 38.146973 3.911942 GeForce GTX 950 (0) 1.0 14.0 [CUDA memcpy DtoH]

In [1045]:
cuda_timeline.plot_trace(df_trace_2cke)


1cke - read trace and reset the timeline


In [1046]:
df_single_stream = read_trace.get_timing(df_trace)

In [1047]:
df_single_stream


Out[1047]:
stream api_type start end size duration
0 0.0 h2d 610.840271 617.277086 38146.973 6.436815
1 0.0 h2d 617.278302 623.708301 38146.973 6.429999
2 0.0 kern 623.719789 624.996407 0.000 1.276618
3 0.0 d2h 625.003191 631.272837 38146.973 6.269646

In [1048]:
df_s1 = read_trace.reset_starting(df_single_stream)

In [1049]:
df_s1


Out[1049]:
stream api_type start end size duration
0 0.0 h2d 0.000000 6.436815 38146.973 6.436815
1 0.0 h2d 6.438031 12.868030 38146.973 6.429999
2 0.0 kern 12.879518 14.156136 0.000 1.276618
3 0.0 d2h 14.162920 20.432566 38146.973 6.269646

2cke case


In [1050]:
df_2stream = read_trace.get_timing(df_trace_2cke)

In [1051]:
df_2stream


Out[1051]:
stream api_type start end size duration
0 0.0 h2d 665.032627 675.815726 38146.973 10.783099
1 0.0 h2d 675.816942 688.730069 38146.973 12.913127
2 0.0 kern 688.741397 690.119293 0.000 1.377896
3 0.0 d2h 690.126461 700.784951 38146.973 10.658490
4 1.0 h2d 668.190628 681.089483 38146.973 12.898855
5 1.0 h2d 681.090667 691.303811 38146.973 10.213144
6 1.0 kern 691.316867 692.682987 0.000 1.366120
7 1.0 d2h 693.294222 702.817090 38146.973 9.522868

In [1052]:
tot_runtime = read_trace.getTotalRuntime(df_2stream)
print tot_runtime


37.784463

2 cke


In [1053]:
stream_num = 2

# find when to start the stream and update the starting pos for the trace
H2D_H2D_OVLP_TH = 3.158431

df_cke_list = cke.init_trace_list(df_s1, stream_num = stream_num, h2d_ovlp_th = H2D_H2D_OVLP_TH)

In [1054]:
df_cke_list[0]


Out[1054]:
stream api_type start end size duration
0 0 h2d 0.000000 6.436815 38146.973 6.436815
1 0 h2d 6.438031 12.868030 38146.973 6.429999
2 0 kern 12.879518 14.156136 0.000 1.276618
3 0 d2h 14.162920 20.432566 38146.973 6.269646

In [1055]:
df_cke_list[1]


Out[1055]:
stream api_type start end size duration
0 1 h2d 3.158431 9.595246 38146.973 6.436815
1 1 h2d 9.596462 16.026461 38146.973 6.429999
2 1 kern 16.037949 17.314567 0.000 1.276618
3 1 d2h 17.321351 23.590997 38146.973 6.269646

sort


In [1056]:
df_all_api = cke.init_sort_api_with_extra_cols(df_cke_list)

In [1057]:
df_all_api


Out[1057]:
start end api_type size_kb stream_id status bw bytes_done bytes_left current_pos time_left pred_end
0 0.000000 6.436815 h2d 38146.973 0.0 sleep 5926.373991 0.0 38146.973 0.0 0.0 0.0
4 3.158431 9.595246 h2d 38146.973 1.0 sleep 5926.373991 0.0 38146.973 0.0 0.0 0.0
1 6.438031 12.868030 h2d 38146.973 0.0 sleep 5932.656133 0.0 38146.973 0.0 0.0 0.0
5 9.596462 16.026461 h2d 38146.973 1.0 sleep 5932.656133 0.0 38146.973 0.0 0.0 0.0
2 12.879518 14.156136 kern 0.000 0.0 sleep 0.000000 0.0 0.000 0.0 0.0 0.0
3 14.162920 20.432566 d2h 38146.973 0.0 sleep 6084.390251 0.0 38146.973 0.0 0.0 0.0
6 16.037949 17.314567 kern 0.000 1.0 sleep 0.000000 0.0 0.000 0.0 0.0 0.0
7 17.321351 23.590997 d2h 38146.973 1.0 sleep 6084.390251 0.0 38146.973 0.0 0.0 0.0

start algo


In [1058]:
# count = 1
# break_count = 7

while not cke.AllDone(df_all_api):
    # pick two api to learn 
    df_all_api, r1, r2 = cke.PickTwo(df_all_api)
    
    if r1 == None and r2 == None:                          # go directly updating the last wake api
        df_all_api = cke.UpdateStream_lastapi(df_all_api)
    else:
        df_all_api = cke.StartNext_byType(df_all_api, [r1, r2])

        whichType = cke.CheckType(df_all_api, r1, r2) # check whether the same api
#         print whichType

        if whichType == None:
            df_all_api = cke.Predict_noConflict(df_all_api, r1, r2)
        elif whichType in ['h2d', 'd2h']: # data transfer in the same direction
            df_all_api = cke.Predict_transferOvlp(df_all_api, r1, r2, ways = 2.0)
        else: # concurrent kernel: todo
            pass

    #     if count == break_count:
    #         break

        rangeT = cke.Get_pred_range(df_all_api)
#         print rangeT

    #     if count == break_count:
    #         break

        extra_conc = cke.Check_cc_by_time(df_all_api, rangeT) # check whether there is conc during the rangeT

        if extra_conc == 0:
            if whichType in ['h2d', 'd2h']:
                df_all_api = cke.Update_wake_transferOvlp(df_all_api, rangeT, ways = 2.0)
            elif whichType == 'kern':
                pass
            else: # no overlapping
                df_all_api = cke.Update_wake_noConflict(df_all_api, rangeT)

            # check if any api is done, and update the timing for the other apis in that stream
            df_all_api = cke.UpdateStreamTime(df_all_api)

        else: # todo : when there is additional overlapping
            pass

#         if count == break_count:
#             break
        
    # next call
    count = count + 1


r1:0 r2:4
r1:4 r2:1
r1:1 r2:5
r1:5 r2:2
r1:5 r2:3
r1:3 r2:6
r1:3 r2:7
r1:None r2:None

In [1059]:
df_all_api


Out[1059]:
start end api_type size_kb stream_id status bw bytes_done bytes_left current_pos time_left pred_end
0 0.000000 9.715199 h2d 38146.973 0.0 done 5926.373991 38146.973 0.0 9.715199 0.0 9.715199
4 3.158431 16.030845 h2d 38146.973 1.0 done 5926.373991 38146.973 0.0 16.030845 0.0 16.030845
1 9.716415 22.575197 h2d 38146.973 0.0 done 5932.656133 38146.973 0.0 22.575197 0.0 22.575197
5 16.032061 25.733628 h2d 38146.973 1.0 done 5932.656133 38146.973 0.0 25.733628 0.0 25.733628
2 22.586685 23.863303 kern 0.000 0.0 done 0.000000 0.000 0.0 23.863303 0.0 23.863303
3 23.870087 33.250948 d2h 38146.973 0.0 done 6084.390251 38146.973 0.0 33.250948 0.0 33.250948
6 25.745116 27.021734 kern 0.000 1.0 done 0.000000 0.000 0.0 27.021734 0.0 27.021734
7 27.028518 36.409379 d2h 38146.973 1.0 done 6084.390251 38146.973 0.0 36.409379 0.0 36.409379

In [1060]:
df_all_api.loc[df_all_api.stream_id == 0]


Out[1060]:
start end api_type size_kb stream_id status bw bytes_done bytes_left current_pos time_left pred_end
0 0.000000 9.715199 h2d 38146.973 0.0 done 5926.373991 38146.973 0.0 9.715199 0.0 9.715199
1 9.716415 22.575197 h2d 38146.973 0.0 done 5932.656133 38146.973 0.0 22.575197 0.0 22.575197
2 22.586685 23.863303 kern 0.000 0.0 done 0.000000 0.000 0.0 23.863303 0.0 23.863303
3 23.870087 33.250948 d2h 38146.973 0.0 done 6084.390251 38146.973 0.0 33.250948 0.0 33.250948

In [1061]:
df_all_api.loc[df_all_api.stream_id == 1]


Out[1061]:
start end api_type size_kb stream_id status bw bytes_done bytes_left current_pos time_left pred_end
4 3.158431 16.030845 h2d 38146.973 1.0 done 5926.373991 38146.973 0.0 16.030845 0.0 16.030845
5 16.032061 25.733628 h2d 38146.973 1.0 done 5932.656133 38146.973 0.0 25.733628 0.0 25.733628
6 25.745116 27.021734 kern 0.000 1.0 done 0.000000 0.000 0.0 27.021734 0.0 27.021734
7 27.028518 36.409379 d2h 38146.973 1.0 done 6084.390251 38146.973 0.0 36.409379 0.0 36.409379

In [23]:
#
# run above
#

In [24]:
# # pick the 1st sleep call and wake up
# r1 = cke.Pick_first_in_sleep(df_all_api)
# if r1 is not None:
#     df_all_api = cke.SetWake(df_all_api, r1) 

# # if r1 == None:
# #     df_all_api = cke.UpdateStream_lastapi(df_all_api) # where all calls are either done or active

# # pick another in the sleep mode, if it is from the same stream, there is non ovlp
# r2 = cke.Pick_first_in_sleep(df_all_api)

# if r2 is not None:
#     df_all_api = cke.SetWake(df_all_api, r2)

# from_same_stream = cke.Check_stream_id(df_all_api, r1, r2)

# if from_same_stream == True:
#     # finish r1, update r2 (no need to update others, since it is the only active working stream.)
#     pass
# else: # two apis are from different stream
#     # check ovlp
#     ovlp = cke.Check_ovlp(df_all_api, r1, r2)
#     if ovlp == False: # if two apis no overlapp, finish the r1, update r2 status
#         pass
#     else: # when there is overlapping
#         df_all_api = cke.Update_before_ovlp(df_all_api, r1, r2)
#         # predict with concurrency for rows 0 and 1
#         df_all_api = cke.Predict_end(df_all_api, r1, r2, ways = 2.0) # given two way overlapping, predict end time
        
#         # get the time range from wake api, to check the next concurrent api
#         rangeT = cke.Get_predict_range(df_all_api)
        
#         extra_conc = cke.Check_cc_by_time(df_all_api, rangeT) # check whether there is conc during the rangeT
        
#         if extra_conc == 0: # update timing using the pred_end
#             df_all_api = cke.Update_ovlpTrans(df_all_api, rangeT, ways = 2.0)
#             # check if any api is done, and update the timing for the other apis in that stream
#             df_all_api = cke.UpdateStreamTime(df_all_api)

In [25]:
## select the next api/call to wake it up
r3 = cke.pick_first_in_sleep(df_all_api)
df_all_api = SetWake(df_all_api, r3)

df_all_api = cke.StartNext(df_all_api, [r2, r3])

df_all_api = cke.Predict_checkCC(df_all_api, r2, r3)

rangeT = cke.Get_next_range(df_all_api)

extra_conc = cke.check_cc_by_time(df_all_api, rangeT) # check whether there is conc during the rangeT
if extra_conc == 0: # update timing using the pred_end
    df_all_api = cke.Update_with_pred_end(df_all_api, rangeT, ways = 2.0)
    df_all_api = cke.UpdateStreamTime(df_all_api)


------------------------------------------------------------------
AttributeError                   Traceback (most recent call last)
<ipython-input-25-7f0d1a62ae08> in <module>()
      1 ## select the next api/call to wake it up
----> 2 r3 = cke.pick_first_in_sleep(df_all_api)
      3 df_all_api = SetWake(df_all_api, r3)
      4 
      5 df_all_api = cke.StartNext(df_all_api, [r2, r3])

AttributeError: 'module' object has no attribute 'pick_first_in_sleep'

In [ ]:
# pick another in the sleep mode, if it is from the same stream, there is non ovlp
r4 = cke.pick_first_in_sleep(df_all_api)
df_all_api = SetWake(df_all_api, r4)
df_all_api = cke.StartNext(df_all_api, [r3, r4])
df_all_api = cke.Predict_checkCC(df_all_api, r3, r4)
rangeT = cke.Get_next_range(df_all_api)
extra_conc = cke.check_cc_by_time(df_all_api, rangeT) # check whether there is conc during the rangeT
if extra_conc == 0: # update timing using the pred_end
    df_all_api = cke.Update_with_pred_end(df_all_api, rangeT, ways = 2.0)
    df_all_api = cke.UpdateStreamTime(df_all_api)

In [ ]:
# pick another in the sleep mode, if it is from the same stream, there is non ovlp
r5 = cke.pick_first_in_sleep(df_all_api)
df_all_api = SetWake(df_all_api, r5)
df_all_api = cke.StartNext_checktype(df_all_api, [r4, r5])
whichType = cke.checkType(df_all_api, r4, r5)

if whichType == None:
    df_all_api = cke.Predict_noConflict(df_all_api, r4, r5)

    rangeT = cke.Get_next_range(df_all_api)
extra_conc = cke.check_cc_by_time(df_all_api, rangeT) # check whether there is conc during the rangeT
if extra_conc == 0: # update timing using the pred_end
    df_all_api = cke.Update_wake_noConflict(df_all_api, rangeT)
    df_all_api = cke.UpdateStreamTime(df_all_api)

In [ ]:
# pick another in the sleep mode, if it is from the same stream, there is non ovlp
r6 = cke.pick_first_in_sleep(df_all_api)
df_all_api = SetWake(df_all_api, r6)
df_all_api = cke.StartNext_checktype(df_all_api, [r4, r6])  # look for earliest wake api

whichType = cke.checkType(df_all_api, r4, r6)
print whichType

if whichType == None:
    df_all_api = cke.Predict_noConflict(df_all_api, r4, r6) # api type is different, there is no conflict
    

rangeT = cke.Get_next_range(df_all_api)
# print rangeT

extra_conc = cke.check_cc_by_time(df_all_api, rangeT) # check whether there is conc during the rangeT
# print extra_conc

if extra_conc == 0: # update timing using the pred_end
    df_all_api = cke.Update_wake_noConflict(df_all_api, rangeT)
    # check if any api is done, and update the timing for the other apis in that stream
    df_all_api = cke.UpdateStreamTime(df_all_api)

In [ ]:
# pick another in the sleep mode, if it is from the same stream, there is non ovlp
r7 = cke.pick_first_in_sleep(df_all_api)
df_all_api = SetWake(df_all_api, r7)
print r7

# the current_pos is ahead of coming call start: move to the next start
df_all_api = cke.StartNext_checktype(df_all_api, [r6, r7])

# we need to check whether they are the same api type
# if not, there will be no conflict (in this case, we can directly predict their end time)
whichType = cke.checkType(df_all_api, r6, r7)
print whichType

if whichType == None:
    df_all_api = cke.Predict_noConflict(df_all_api, r6, r7) # api type is different, there is no conflict
    
# get the time range from wake api, to check the next concurrent api
rangeT = cke.Get_next_range(df_all_api)
print rangeT    

extra_conc = cke.check_cc_by_time(df_all_api, rangeT) # check whether there is conc during the rangeT
print extra_conc

if extra_conc == 0: # update timing using the pred_end
    df_all_api = cke.Update_wake_noConflict(df_all_api, rangeT)
    # check if any api is done, and update the timing for the other apis in that stream
    df_all_api = cke.UpdateStreamTime(df_all_api)

In [ ]:
# pick another in the sleep mode, if it is from the same stream, there is non ovlp
r8 = cke.pick_first_in_sleep(df_all_api)
df_all_api = SetWake(df_all_api, r8)
print r8

# the current_pos is ahead of coming call start: move to the next start
df_all_api = cke.StartNext_checktype(df_all_api, [r6, r8])

# we need to check whether r6 and r8 are the same api type
# if not, there will be no conflict (in this case, we can directly predict their end time)
# if they are overlapping, use conflict version
whichType = cke.checkType(df_all_api, r6, r8)
print whichType

if whichType in ['h2d', 'd2h']:
    df_all_api = cke.Predict_transferOvlp(df_all_api, r6, r8, ways = 2.0)
    
# get the time range from wake api, to check the next concurrent api
rangeT = cke.Get_next_range(df_all_api)
print rangeT    

extra_conc = cke.check_cc_by_time(df_all_api, rangeT) # check whether there is conc during the rangeT
print extra_conc

if extra_conc == 0: # update timing using the pred_end
    df_all_api = cke.Update_wake_transferOvlp(df_all_api, rangeT, ways = 2.0)
    # check if any api is done, and update the timing for the other apis in that stream
    df_all_api = cke.UpdateStreamTime(df_all_api)

In [ ]:
# pick another in the sleep mode, if it is from the same stream, there is non ovlp
r9 = cke.pick_first_in_sleep(df_all_api)
print r9
# df_all_api = SetWake(df_all_api, r8)
# print r9

if r9 == None:
    # work on current wake api (this is the last api call)
    # current_wake
    df_all_api = cke.UpdateStream_lastapi(df_all_api)

In [ ]:
df_all_api

In [ ]:
#
# run above
#

In [ ]:
df_all_api.loc[df_all_api.stream_id == 0]

In [ ]:
df_all_api.loc[df_all_api.stream_id == 1]

3cke

plot all the stream timeline


In [ ]:
cuda_timeline.plot_cke_list(df_cke_list, savefig=True)

In [ ]:
cuda_timeline.plot_cke_list(df_cke_list[0:2])

In [ ]:
tot_runtime = read_trace.getTotalRuntime(df_cke_list[0:2])
print tot_runtime