In [1]:
import json
import os
import re
import sys
from collections import OrderedDict

import matplotlib as mp
import numpy as np
import pandas as pd

In [16]:
# Parse the progress log at IN_F into one DataFrame per stage.
#
# Each usable log line is expected to contain a HH:MM:SS.ffffff timestamp and,
# from its first '{' to the end of the line, a JSON object that maps a stage
# key to a dict of task counters.  The result, `stage_to_prgs`, maps a stage id
# (built from the 2nd and 3rd ':'-separated fields of the key, e.g.
# 'stages-7') to a DataFrame indexed by timestamp.
IN_F = './sparkvent-tpch.log'

json_pt = r'(\{).*$'
time_pt = r'\d{2}\:\d{2}:\d{2}\.\d{0,6}'

# Column order of every per-stage DataFrame.
PROGRESS_COLUMNS = ["numFailedTasks", "numActiveTasks", "numCompleteTasks"]

# Not used by this cell; kept so that any other cell referring to it still works.
raw_data_df = pd.DataFrame(columns=['time', 'data'])


def _progress_row(counts):
    """Return the counters of one stage snapshot in PROGRESS_COLUMNS order.

    Looks the values up by name when the dict uses the column names as keys,
    so the result does not depend on dict iteration order.

    NOTE(review): the JSON key names are not visible in this notebook; if they
    differ from PROGRESS_COLUMNS the values are taken positionally, exactly as
    the previous implementation did -- confirm against the log format.
    """
    if all(col in counts for col in PROGRESS_COLUMNS):
        return [counts[col] for col in PROGRESS_COLUMNS]
    return list(counts.values())


# stage_id -> OrderedDict(timestamp -> row).  Keying rows by timestamp keeps the
# old `.loc[ts] = ...` semantics: a repeated timestamp overwrites the earlier
# row instead of adding a duplicate index entry.
_rows_by_stage = {}

# `with` guarantees the log file is closed even if a line fails to parse.
with open(IN_F, 'r') as fd:
    for line in fd:
        ts_match = re.search(time_pt, line)
        js_match = re.search(json_pt, line)
        if ts_match is None or js_match is None:
            # Blank or non-record line: nothing to extract, skip it rather
            # than crash on `None.group()`.
            continue

        ts = ts_match.group()
        js = json.loads(js_match.group())

        for stage, counts in js.items():
            key_parts = stage.split(":")
            stage_id = key_parts[1] + "-" + key_parts[2]
            stage_rows = _rows_by_stage.setdefault(stage_id, OrderedDict())
            stage_rows[ts] = _progress_row(counts)

# Build each DataFrame once from the collected rows instead of growing it
# row by row, which re-allocates the frame on every insert.
stage_to_prgs = {
    stage_id: pd.DataFrame(
        list(rows.values()),
        index=list(rows.keys()),
        columns=PROGRESS_COLUMNS,
    )
    for stage_id, rows in _rows_by_stage.items()
}

stage_to_prgs['stages-7']


Out[16]:
numFailedTasks numActiveTasks numCompleteTasks
09:48:46.470027 0 0 0
09:48:47.510468 0 0 0
09:48:48.532865 0 0 0
09:48:49.557331 0 0 0
09:48:50.574422 0 0 0
09:48:51.629449 0 0 0
09:48:52.644684 0 0 0
09:48:53.664475 0 0 0
09:48:54.679609 0 0 0
09:48:55.694119 0 0 0
09:48:56.712309 0 0 0
09:48:57.730337 0 0 0
09:48:58.745462 0 0 0
09:48:59.759380 0 0 0
09:49:00.775344 0 0 0
09:49:01.793182 0 0 0
09:49:02.808683 0 0 0
09:49:03.824751 0 0 0
09:49:04.840113 0 0 0
09:49:05.856149 0 0 0
09:49:06.872012 0 0 0
09:49:07.898271 0 0 0
09:49:08.921121 0 0 0
09:49:09.942043 0 0 0
09:49:10.963247 0 0 0
09:49:11.981609 0 0 0
09:49:12.998297 0 4 30
09:49:14.019512 0 4 62
09:49:15.039779 0 4 100
09:49:16.059218 0 4 134
09:49:17.075285 0 4 175
09:49:18.091944 0 0 200
09:49:19.112538 0 0 200
09:49:20.132433 0 0 200
09:49:21.161295 0 0 200
09:49:22.183232 0 0 200
09:49:23.205336 0 0 200
09:49:24.227507 0 0 200
09:49:25.250286 0 0 200
09:49:26.288429 0 0 200
09:49:27.312403 0 0 200
09:49:28.344880 0 0 200
09:49:29.375188 0 0 200
09:49:30.403300 0 0 200
09:49:31.440044 0 0 200
09:49:32.472808 0 0 200
09:49:33.493956 0 0 200
09:49:34.520613 0 0 200

In [22]:
# Display the timestamp-indexed task-counter table stored under the stage id 'stages-6'.
stage_to_prgs['stages-6']


Out[22]:
numFailedTasks numActiveTasks numCompleteTasks
09:48:46.470027 0 0 0
09:48:47.510468 0 0 0
09:48:48.532865 0 0 0
09:48:49.557331 0 0 0
09:48:50.574422 0 0 0
09:48:51.629449 0 0 0
09:48:52.644684 0 0 0
09:48:53.664475 0 0 0
09:48:54.679609 0 0 0
09:48:55.694119 0 0 0
09:48:56.712309 0 0 0
09:48:57.730337 0 0 0
09:48:58.745462 0 0 0
09:48:59.759380 0 4 0
09:49:00.775344 0 4 0
09:49:01.793182 0 4 0
09:49:02.808683 0 4 0
09:49:03.824751 0 4 0
09:49:04.840113 0 4 0
09:49:05.856149 0 4 0
09:49:06.872012 0 4 2
09:49:07.898271 0 2 4
09:49:08.921121 0 2 4
09:49:09.942043 0 2 4
09:49:10.963247 0 1 5
09:49:11.981609 0 1 5
09:49:12.998297 0 0 6
09:49:14.019512 0 0 6
09:49:15.039779 0 0 6
09:49:16.059218 0 0 6
09:49:17.075285 0 0 6
09:49:18.091944 0 0 6
09:49:19.112538 0 0 6
09:49:20.132433 0 0 6
09:49:21.161295 0 0 6
09:49:22.183232 0 0 6
09:49:23.205336 0 0 6
09:49:24.227507 0 0 6
09:49:25.250286 0 0 6
09:49:26.288429 0 0 6
09:49:27.312403 0 0 6
09:49:28.344880 0 0 6
09:49:29.375188 0 0 6
09:49:30.403300 0 0 6
09:49:31.440044 0 0 6
09:49:32.472808 0 0 6
09:49:33.493956 0 0 6
09:49:34.520613 0 0 6

In [ ]: