Example Scheme:

[{ "name": "kind", "mode": "nullable", "type": "string" }, { "name": "fullName", "type": "string", "mode": "required" }, { "name": "age", "type": "integer", "mode": "nullable" }, { "name": "gender", "type": "string", "mode": "nullable" }, { "name": "phoneNumber", "type": "record", "mode": "nullable", "fields": [ { "name": "areaCode", "type": "integer", "mode": "nullable" }, { "name": "number", "type": "integer", "mode": "nullable" } ] }, { "name": "children", "type": "record", "mode": "repeated", "fields": [ { "name": "name", "type": "string", "mode": "nullable" }, { "name": "gender", "type": "string", "mode": "nullable" }, { "name": "age", "type": "integer", "mode": "nullable" } ] }, { "name": "citiesLived", "type": "record", "mode": "repeated", "fields": [ { "name": "place", "type": "string", "mode": "nullable" }, { "name": "yearsLived", "type": "integer", "mode": "repeated" } ] } ]


In [2]:
import json
od = json.loads('''{"mmsi": 411041797, "labels": {"sublabel": {"max_label": "Squid", "label_scores": {"Cargo": 1.49017853612321e-09, "Supply": 1.202220656359998e-09, "Sailing": 1.2120705550344724e-09, "Trawlers": 1.2140348726319417e-09, "Seismic vessel": 1.237318580926683e-09, "Set gillnets": 1.194789045477762e-09, "Set longlines": 1.194789045477762e-09, "Squid": 0.9999995231628418, "Motor passenger": 1.1965610724473663e-09, "Reefer": 4.342152806202648e-07, "Pole and line": 1.5954383369987113e-09, "Purse seines": 1.3982492941266855e-09, "Pots and traps": 1.194789045477762e-09, "Drifting longlines": 1.4433721995388282e-09, "Tanker": 3.244188695461503e-09, "Pilot": 1.194793597392163e-09, "Tug": 1.19712029178487e-09, "Trollers": 1.194789045477762e-09}, "max_label_probability": 0.9999995231628418, "name": "Vessel detailed class"}, "length": {"name": "Vessel length regression", "value": 28.348167419433594}, "is_fishing": {"max_label": "Fishing", "label_scores": {"Fishing": 0.9995298385620117, "Non-fishing": 0.00047023396473377943}, "max_label_probability": 0.9995298385620117, "name": "Fishing"}, "label": {"max_label": "Squid", "label_scores": {"Passenger": 1.463510534982504e-09, "Cargo/Tanker": 5.384665602292671e-09, "Trawlers": 1.4110513868459407e-09, "Seismic vessel": 1.7059856860512923e-09, "Pole and line": 3.6103113831842393e-09, "Fixed gear": 1.3594144698814148e-09, "Squid": 0.9999984502792358, "Reefer": 1.5216498923109611e-06, "Purse seines": 1.9657095950975645e-09, "Drifting longlines": 3.154397631988104e-09, "Tug/Pilot/Supply": 1.36162026098674e-09, "Trollers": 1.3594196879296305e-09}, "max_label_probability": 0.9999984502792358, "name": "Vessel class"}}, "start_time": "2013-01-01T00:00:00", "end_time": "2013-07-01T00:00:00"}''')

In [55]:
schema = []

the_type = {'int':'integer','dict':'record','float':'float','unicode':'string'}
the_mode = {'int':'nullable','dict':'repeated','float':'nullable','unicode':'nullable'}

def get_fields(d):
    sub_schema = []
    for o in d:
        v = get_field(o,d)
        sub_schema.append(v)
    return sub_schema


def get_field(o, od):
    v = {}
    name = o.replace(" ","_").replace("-","_").replace("/","_")
    if " " in o or "-" in o or '/' in o:
        print 'line = line.replace("{}","{}")'.format(o,name)
    the_class = od[o].__class__.__name__
    tt = the_type[the_class]
    tm = the_mode[the_class]    
    v['name'] = name
    v['type'] = tt
    v['mode'] = tm
    if the_class == 'dict':
        v['fields'] = get_fields(od[o])
    return v

for o in od:
    ve = get_field(o,od)    
    schema.append(ve)
            
            
#     if isinstance(od[o], dict):


line = line.replace("Seismic vessel","Seismic_vessel")
line = line.replace("Set gillnets","Set_gillnets")
line = line.replace("Set longlines","Set_longlines")
line = line.replace("Motor passenger","Motor_passenger")
line = line.replace("Pole and line","Pole_and_line")
line = line.replace("Purse seines","Purse_seines")
line = line.replace("Pots and traps","Pots_and_traps")
line = line.replace("Drifting longlines","Drifting_longlines")
line = line.replace("Non-fishing","Non_fishing")
line = line.replace("Cargo/Tanker","Cargo_Tanker")
line = line.replace("Seismic vessel","Seismic_vessel")
line = line.replace("Pole and line","Pole_and_line")
line = line.replace("Fixed gear","Fixed_gear")
line = line.replace("Purse seines","Purse_seines")
line = line.replace("Drifting longlines","Drifting_longlines")
line = line.replace("Tug/Pilot/Supply","Tug_Pilot_Supply")

In [50]:
schema


Out[50]:
[{'mode': 'nullable', 'name': u'mmsi', 'type': 'integer'},
 {'fields': [{'fields': [{'mode': 'nullable',
      'name': u'max_label',
      'type': 'string'},
     {'fields': [{'mode': 'nullable', 'name': u'Cargo', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Squid', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Sailing', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Trawlers', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Seismic_vessel', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Set_gillnets', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Trollers', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Set_longlines', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Supply', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Motor_passenger', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Reefer', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Pole_and_line', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Purse_seines', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Pots_and_traps', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Drifting_longlines', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Tanker', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Tug', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Pilot', 'type': 'float'}],
      'mode': 'repeated',
      'name': u'label_scores',
      'type': 'record'},
     {'mode': 'nullable', 'name': u'max_label_probability', 'type': 'float'},
     {'mode': 'nullable', 'name': u'name', 'type': 'string'}],
    'mode': 'repeated',
    'name': u'sublabel',
    'type': 'record'},
   {'fields': [{'mode': 'nullable', 'name': u'name', 'type': 'string'},
     {'mode': 'nullable', 'name': u'value', 'type': 'float'}],
    'mode': 'repeated',
    'name': u'length',
    'type': 'record'},
   {'fields': [{'mode': 'nullable', 'name': u'max_label', 'type': 'string'},
     {'fields': [{'mode': 'nullable', 'name': u'Fishing', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Non_fishing', 'type': 'float'}],
      'mode': 'repeated',
      'name': u'label_scores',
      'type': 'record'},
     {'mode': 'nullable', 'name': u'max_label_probability', 'type': 'float'},
     {'mode': 'nullable', 'name': u'name', 'type': 'string'}],
    'mode': 'repeated',
    'name': u'is_fishing',
    'type': 'record'},
   {'fields': [{'mode': 'nullable', 'name': u'max_label', 'type': 'string'},
     {'fields': [{'mode': 'nullable', 'name': u'Passenger', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Cargo_Tanker', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Trawlers', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Seismic_vessel', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Pole_and_line', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Fixed_gear', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Squid', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Reefer', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Purse_seines', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Drifting_longlines', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Trollers', 'type': 'float'},
       {'mode': 'nullable', 'name': u'Tug_Pilot_Supply', 'type': 'float'}],
      'mode': 'repeated',
      'name': u'label_scores',
      'type': 'record'},
     {'mode': 'nullable', 'name': u'max_label_probability', 'type': 'float'},
     {'mode': 'nullable', 'name': u'name', 'type': 'string'}],
    'mode': 'repeated',
    'name': u'label',
    'type': 'record'}],
  'mode': 'repeated',
  'name': u'labels',
  'type': 'record'},
 {'mode': 'nullable', 'name': u'start_time', 'type': 'string'},
 {'mode': 'nullable', 'name': u'end_time', 'type': 'string'}]

In [47]:
with open("data_schema.json", 'w') as f:
    f.write(json.dumps(schema))

In [48]:
od


Out[48]:
{u'end_time': u'2013-07-01T00:00:00',
 u'labels': {u'is_fishing': {u'label_scores': {u'Fishing': 0.9995298385620117,
    u'Non-fishing': 0.00047023396473377943},
   u'max_label': u'Fishing',
   u'max_label_probability': 0.9995298385620117,
   u'name': u'Fishing'},
  u'label': {u'label_scores': {u'Cargo/Tanker': 5.384665602292671e-09,
    u'Drifting longlines': 3.154397631988104e-09,
    u'Fixed gear': 1.3594144698814148e-09,
    u'Passenger': 1.463510534982504e-09,
    u'Pole and line': 3.6103113831842393e-09,
    u'Purse seines': 1.9657095950975645e-09,
    u'Reefer': 1.5216498923109611e-06,
    u'Seismic vessel': 1.7059856860512923e-09,
    u'Squid': 0.9999984502792358,
    u'Trawlers': 1.4110513868459407e-09,
    u'Trollers': 1.3594196879296305e-09,
    u'Tug/Pilot/Supply': 1.36162026098674e-09},
   u'max_label': u'Squid',
   u'max_label_probability': 0.9999984502792358,
   u'name': u'Vessel class'},
  u'length': {u'name': u'Vessel length regression',
   u'value': 28.348167419433594},
  u'sublabel': {u'label_scores': {u'Cargo': 1.49017853612321e-09,
    u'Drifting longlines': 1.4433721995388282e-09,
    u'Motor passenger': 1.1965610724473663e-09,
    u'Pilot': 1.194793597392163e-09,
    u'Pole and line': 1.5954383369987113e-09,
    u'Pots and traps': 1.194789045477762e-09,
    u'Purse seines': 1.3982492941266855e-09,
    u'Reefer': 4.342152806202648e-07,
    u'Sailing': 1.2120705550344724e-09,
    u'Seismic vessel': 1.237318580926683e-09,
    u'Set gillnets': 1.194789045477762e-09,
    u'Set longlines': 1.194789045477762e-09,
    u'Squid': 0.9999995231628418,
    u'Supply': 1.202220656359998e-09,
    u'Tanker': 3.244188695461503e-09,
    u'Trawlers': 1.2140348726319417e-09,
    u'Trollers': 1.194789045477762e-09,
    u'Tug': 1.19712029178487e-09},
   u'max_label': u'Squid',
   u'max_label_probability': 0.9999995231628418,
   u'name': u'Vessel detailed class'}},
 u'mmsi': 411041797,
 u'start_time': u'2013-01-01T00:00:00'}

In [1]:
with open('vessel-classification-all_v2.json','w') as big_outfile:
    with open('vessel-classification-all.json','rU') as bigfile:
        lines = bigfile.readlines()
        for line in lines:
            line = line.replace("Seismic vessel","Seismic_vessel")
            line = line.replace("Set gillnets","Set_gillnets")
            line = line.replace("Set longlines","Set_longlines")
            line = line.replace("Motor passenger","Motor_passenger")
            line = line.replace("Pole and line","Pole_and_line")
            line = line.replace("Purse seines","Purse_seines")
            line = line.replace("Pots and traps","Pots_and_traps")
            line = line.replace("Drifting longlines","Drifting_longlines")
            line = line.replace("Non-fishing","Non_fishing")
            line = line.replace("Cargo/Tanker","Cargo_Tanker")
            line = line.replace("Seismic vessel","Seismic_vessel")
            line = line.replace("Pole and line","Pole_and_line")
            line = line.replace("Fixed gear","Fixed_gear")
            line = line.replace("Purse seines","Purse_seines")
            line = line.replace("Drifting longlines","Drifting_longlines")
            line = line.replace("Tug/Pilot/Supply","Tug_Pilot_Supply")
            big_outfile.write(line)

In [59]:
import os
command = "gsutil cp -z json vessel_classification_all_20161122_v2.json gs://david-scratch/vessel-classification-all_v2.json"
os.system(command)


Out[59]:
32512

In [ ]: