Example Scheme:

[{ "name": "kind", "mode": "nullable", "type": "string" }, { "name": "fullName", "type": "string", "mode": "required" }, { "name": "age", "type": "integer", "mode": "nullable" }, { "name": "gender", "type": "string", "mode": "nullable" }, { "name": "phoneNumber", "type": "record", "mode": "nullable", "fields": [ { "name": "areaCode", "type": "integer", "mode": "nullable" }, { "name": "number", "type": "integer", "mode": "nullable" } ] }, { "name": "children", "type": "record", "mode": "repeated", "fields": [ { "name": "name", "type": "string", "mode": "nullable" }, { "name": "gender", "type": "string", "mode": "nullable" }, { "name": "age", "type": "integer", "mode": "nullable" } ] }, { "name": "citiesLived", "type": "record", "mode": "repeated", "fields": [ { "name": "place", "type": "string", "mode": "nullable" }, { "name": "yearsLived", "type": "integer", "mode": "repeated" } ] } ]


In [6]:
import json
od = json.loads('''{"sublabel": {"max_label": "Tug", "label_scores": {"Cargo": 0.06757091730833054, "Sailing": 0.018900442868471146, "Trawlers": 0.07967807352542877, "Seismic_vessel": 0.02684229239821434, "Set_gillnets": 0.014477293007075787, "Set_longlines": 0.01454586535692215, "Squid": 0.015574329532682896, "Reefer": 0.2641301453113556, "Pole_and_line": 0.01445907261222601, "Purse_seines": 0.04525424912571907, "Pots_and_traps": 0.01445205882191658, "Drifting_longlines": 0.01447854470461607, "Tanker": 0.04711627960205078, "Pilot": 0.014536352828145027, "Tug": 0.34798410534858704}, "max_label_probability": 0.34798410534858704, "name": "Vessel detailed class"}, "length": {"name": "Vessel length regression", "value": 48.71369171142578}, "start_time": "2013-06-24T00:00:00", "end_time": "2013-12-21T00:00:00", "mmsi": 14, "is_fishing": {"max_label": "Non_fishing", "label_scores": {"Fishing": 0.1785549521446228, "Non_fishing": 0.8214449882507324}, "max_label_probability": 0.8214449882507324, "name": "Fishing"}, "label": {"max_label": "Tug/Pilot", "label_scores": {"Passenger": 0.06774556636810303, "Cargo_Tanker": 0.09529703855514526, "Trawlers": 0.07941653579473495, "Seismic_vessel": 0.023908106610178947, "Pole_and_line": 0.009239018894731998, "Fixed_gear": 0.009984415955841541, "Squid": 0.010400119237601757, "Reefer": 0.2778398096561432, "Purse_seines": 0.04067355766892433, "Tug/Pilot": 0.37623798847198486, "Drifting_longlines": 0.009257831610739231}, "max_label_probability": 0.37623798847198486, "name": "Vessel class"}}''')

In [7]:
schema = []

the_type = {'int':'integer','dict':'record','float':'float','unicode':'string'}
the_mode = {'int':'nullable','dict':'repeated','float':'nullable','unicode':'nullable'}

def get_fields(d):
    sub_schema = []
    for o in d:
        v = get_field(o,d)
        sub_schema.append(v)
    return sub_schema


def get_field(o, od):
    v = {}
    name = o.replace(" ","_").replace("-","_").replace("/","_")
    if " " in o or "-" in o or '/' in o:
        print 'line = line.replace("{}","{}")'.format(o,name)
    the_class = od[o].__class__.__name__
    tt = the_type[the_class]
    tm = the_mode[the_class]    
    v['name'] = name
    v['type'] = tt
    v['mode'] = tm
    if the_class == 'dict':
        v['fields'] = get_fields(od[o])
    return v

for o in od:
    ve = get_field(o,od)    
    schema.append(ve)
            
            
#     if isinstance(od[o], dict):


line = line.replace("Tug/Pilot","Tug_Pilot")

In [8]:
schema


Out[8]:
[{'mode': 'nullable', 'name': u'start_time', 'type': 'string'},
 {'mode': 'nullable', 'name': u'mmsi', 'type': 'integer'},
 {'fields': [{'mode': 'nullable', 'name': u'max_label', 'type': 'string'},
   {'fields': [{'mode': 'nullable', 'name': u'Passenger', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Tug_Pilot', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Trawlers', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Pole_and_line', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Squid', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Reefer', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Cargo_Tanker', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Seismic_vessel', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Purse_seines', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Fixed_gear', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Drifting_longlines', 'type': 'float'}],
    'mode': 'repeated',
    'name': u'label_scores',
    'type': 'record'},
   {'mode': 'nullable', 'name': u'max_label_probability', 'type': 'float'},
   {'mode': 'nullable', 'name': u'name', 'type': 'string'}],
  'mode': 'repeated',
  'name': u'label',
  'type': 'record'},
 {'fields': [{'mode': 'nullable', 'name': u'max_label', 'type': 'string'},
   {'fields': [{'mode': 'nullable', 'name': u'Cargo', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Tanker', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Sailing', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Trawlers', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Set_gillnets', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Pole_and_line', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Squid', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Reefer', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Set_longlines', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Pots_and_traps', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Seismic_vessel', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Purse_seines', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Drifting_longlines', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Tug', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Pilot', 'type': 'float'}],
    'mode': 'repeated',
    'name': u'label_scores',
    'type': 'record'},
   {'mode': 'nullable', 'name': u'max_label_probability', 'type': 'float'},
   {'mode': 'nullable', 'name': u'name', 'type': 'string'}],
  'mode': 'repeated',
  'name': u'sublabel',
  'type': 'record'},
 {'fields': [{'mode': 'nullable', 'name': u'name', 'type': 'string'},
   {'mode': 'nullable', 'name': u'value', 'type': 'float'}],
  'mode': 'repeated',
  'name': u'length',
  'type': 'record'},
 {'mode': 'nullable', 'name': u'end_time', 'type': 'string'},
 {'fields': [{'mode': 'nullable', 'name': u'max_label', 'type': 'string'},
   {'fields': [{'mode': 'nullable', 'name': u'Fishing', 'type': 'float'},
     {'mode': 'nullable', 'name': u'Non_fishing', 'type': 'float'}],
    'mode': 'repeated',
    'name': u'label_scores',
    'type': 'record'},
   {'mode': 'nullable', 'name': u'max_label_probability', 'type': 'float'},
   {'mode': 'nullable', 'name': u'name', 'type': 'string'}],
  'mode': 'repeated',
  'name': u'is_fishing',
  'type': 'record'}]

In [9]:
with open("data_schema_2.json", 'w') as f:
    f.write(json.dumps(schema))

In [10]:
od


Out[10]:
{u'end_time': u'2013-12-21T00:00:00',
 u'is_fishing': {u'label_scores': {u'Fishing': 0.1785549521446228,
   u'Non_fishing': 0.8214449882507324},
  u'max_label': u'Non_fishing',
  u'max_label_probability': 0.8214449882507324,
  u'name': u'Fishing'},
 u'label': {u'label_scores': {u'Cargo_Tanker': 0.09529703855514526,
   u'Drifting_longlines': 0.009257831610739231,
   u'Fixed_gear': 0.009984415955841541,
   u'Passenger': 0.06774556636810303,
   u'Pole_and_line': 0.009239018894731998,
   u'Purse_seines': 0.04067355766892433,
   u'Reefer': 0.2778398096561432,
   u'Seismic_vessel': 0.023908106610178947,
   u'Squid': 0.010400119237601757,
   u'Trawlers': 0.07941653579473495,
   u'Tug/Pilot': 0.37623798847198486},
  u'max_label': u'Tug/Pilot',
  u'max_label_probability': 0.37623798847198486,
  u'name': u'Vessel class'},
 u'length': {u'name': u'Vessel length regression',
  u'value': 48.71369171142578},
 u'mmsi': 14,
 u'start_time': u'2013-06-24T00:00:00',
 u'sublabel': {u'label_scores': {u'Cargo': 0.06757091730833054,
   u'Drifting_longlines': 0.01447854470461607,
   u'Pilot': 0.014536352828145027,
   u'Pole_and_line': 0.01445907261222601,
   u'Pots_and_traps': 0.01445205882191658,
   u'Purse_seines': 0.04525424912571907,
   u'Reefer': 0.2641301453113556,
   u'Sailing': 0.018900442868471146,
   u'Seismic_vessel': 0.02684229239821434,
   u'Set_gillnets': 0.014477293007075787,
   u'Set_longlines': 0.01454586535692215,
   u'Squid': 0.015574329532682896,
   u'Tanker': 0.04711627960205078,
   u'Trawlers': 0.07967807352542877,
   u'Tug': 0.34798410534858704},
  u'max_label': u'Tug',
  u'max_label_probability': 0.34798410534858704,
  u'name': u'Vessel detailed class'}}

In [15]:
with open('vessel-classification-all_v2.json','w') as big_outfile:
    with open('vessel-classification-all.json','rU') as bigfile:
        lines = bigfile.readlines()
        for line in lines:
            line = line.replace("Seismic vessel","Seismic_vessel")
            line = line.replace("Set gillnets","Set_gillnets")
            line = line.replace("Set longlines","Set_longlines")
            line = line.replace("Motor passenger","Motor_passenger")
            line = line.replace("Pole and line","Pole_and_line")
            line = line.replace("Purse seines","Purse_seines")
            line = line.replace("Pots and traps","Pots_and_traps")
            line = line.replace("Drifting longlines","Drifting_longlines")
            line = line.replace("Non-fishing","Non_fishing")
            line = line.replace("Cargo/Tanker","Cargo_Tanker")
            line = line.replace("Seismic vessel","Seismic_vessel")
            line = line.replace("Pole and line","Pole_and_line")
            line = line.replace("Fixed gear","Fixed_gear")
            line = line.replace("Purse seines","Purse_seines")
            line = line.replace("Drifting longlines","Drifting_longlines")
            line = line.replace("Tug/Pilot/Supply","Tug_Pilot_Supply")
            line = line.replace("Tug/Pilot","Tug_Pilot")
            big_outfile.write(line)

In [11]:
import os
command = "gsutil cp -z json vessel-classification-all_v2.json gs://david-scratch/vessel-classification-all_v2.json"
os.system(command)


Out[11]:
0

In [14]:
# bq load <destination_table> <data_source_uri> <table_schema>
command = "bq load --source_format=NEWLINE_DELIMITED_JSON world-fishing-827:scratch_david_mmsi_lists.nn_labels_20161201 gs://david-scratch/vessel-classification-all_v2.json data_schema_2.json"
os.system(command)


bq load --source_format=NEWLINE_DELIMITED_JSON world-fishing-827:scratch_david_mmsi_lists.nn_labels_20161201 gs://david-scratch/vessel-classification-all_v2.json data_schema_2.json

In [ ]: