Example schema (BigQuery JSON schema format):
[{ "name": "kind", "mode": "nullable", "type": "string" }, { "name": "fullName", "type": "string", "mode": "required" }, { "name": "age", "type": "integer", "mode": "nullable" }, { "name": "gender", "type": "string", "mode": "nullable" }, { "name": "phoneNumber", "type": "record", "mode": "nullable", "fields": [ { "name": "areaCode", "type": "integer", "mode": "nullable" }, { "name": "number", "type": "integer", "mode": "nullable" } ] }, { "name": "children", "type": "record", "mode": "repeated", "fields": [ { "name": "name", "type": "string", "mode": "nullable" }, { "name": "gender", "type": "string", "mode": "nullable" }, { "name": "age", "type": "integer", "mode": "nullable" } ] }, { "name": "citiesLived", "type": "record", "mode": "repeated", "fields": [ { "name": "place", "type": "string", "mode": "nullable" }, { "name": "yearsLived", "type": "integer", "mode": "repeated" } ] } ]
In [6]:
import json
import subprocess
# Sample record parsed from an inline JSON string. The schema cell walks this
# dict (recursing into nested objects) to infer field names and types.
od = json.loads('''{"sublabel": {"max_label": "Tug", "label_scores": {"Cargo": 0.06757091730833054, "Sailing": 0.018900442868471146, "Trawlers": 0.07967807352542877, "Seismic_vessel": 0.02684229239821434, "Set_gillnets": 0.014477293007075787, "Set_longlines": 0.01454586535692215, "Squid": 0.015574329532682896, "Reefer": 0.2641301453113556, "Pole_and_line": 0.01445907261222601, "Purse_seines": 0.04525424912571907, "Pots_and_traps": 0.01445205882191658, "Drifting_longlines": 0.01447854470461607, "Tanker": 0.04711627960205078, "Pilot": 0.014536352828145027, "Tug": 0.34798410534858704}, "max_label_probability": 0.34798410534858704, "name": "Vessel detailed class"}, "length": {"name": "Vessel length regression", "value": 48.71369171142578}, "start_time": "2013-06-24T00:00:00", "end_time": "2013-12-21T00:00:00", "mmsi": 14, "is_fishing": {"max_label": "Non_fishing", "label_scores": {"Fishing": 0.1785549521446228, "Non_fishing": 0.8214449882507324}, "max_label_probability": 0.8214449882507324, "name": "Fishing"}, "label": {"max_label": "Tug/Pilot", "label_scores": {"Passenger": 0.06774556636810303, "Cargo_Tanker": 0.09529703855514526, "Trawlers": 0.07941653579473495, "Seismic_vessel": 0.023908106610178947, "Pole_and_line": 0.009239018894731998, "Fixed_gear": 0.009984415955841541, "Squid": 0.010400119237601757, "Reefer": 0.2778398096561432, "Purse_seines": 0.04067355766892433, "Tug/Pilot": 0.37623798847198486, "Drifting_longlines": 0.009257831610739231}, "max_label_probability": 0.37623798847198486, "name": "Vessel class"}}''')
In [7]:
schema = []

# Maps the class name of a parsed JSON value to a BigQuery column type.
# 'unicode' and 'long' are what Python 2's json module produces; 'str' is the
# Python 3 equivalent, so the same tables work under either interpreter.
the_type = {
    'int': 'integer',
    'long': 'integer',
    'dict': 'record',
    'float': 'float',
    'unicode': 'string',
    'str': 'string',
    'bool': 'boolean',
}

# Maps the class name of a parsed JSON value to a BigQuery column mode.
# A JSON object is a single nested record, so it is 'nullable'; 'repeated'
# is only valid for fields whose JSON value is an array.
the_mode = {
    'int': 'nullable',
    'long': 'nullable',
    'dict': 'nullable',
    'float': 'nullable',
    'unicode': 'nullable',
    'str': 'nullable',
    'bool': 'nullable',
}


def get_fields(d):
    """Return a list with one schema field definition per key of dict ``d``."""
    return [get_field(key, d) for key in d]


def get_field(o, od):
    """Build the schema field definition for key ``o`` of dict ``od``.

    The field name is ``o`` with spaces, hyphens and slashes replaced by
    underscores. When that changes the name, the matching
    ``line = line.replace(...)`` statement is printed so the same rename can
    be applied to the data file.

    Returns a dict with 'name', 'type' and 'mode', plus a recursive 'fields'
    list when the value is itself a dict.

    Raises KeyError when the value's class has no entry in ``the_type``
    (for example a list or None).
    """
    name = o.replace(" ", "_").replace("-", "_").replace("/", "_")
    if name != o:
        # Single-argument call form prints identically on Python 2 and 3.
        print('line = line.replace("{}","{}")'.format(o, name))
    value = od[o]
    the_class = value.__class__.__name__
    v = {
        'name': name,
        'type': the_type[the_class],
        'mode': the_mode[the_class],
    }
    if the_class == 'dict':
        v['fields'] = get_fields(value)
    return v
# Build the top-level schema from the sample record. get_fields already
# produces one field definition per key, so no hand-written loop is needed
# and no loop variables are left behind in the notebook namespace.
schema.extend(get_fields(od))
In [8]:
# Show the inferred schema so it can be checked before it is written to disk.
schema
Out[8]:
In [9]:
# Save the inferred schema as JSON; this file is the schema argument of the
# bq load command.
with open("data_schema_2.json", 'w') as schema_file:
    json.dump(schema, schema_file)
In [10]:
# Display the parsed sample record for reference.
od
Out[10]:
In [15]:
# Ordered (old, new) label renames applied to every line of the data file so
# its labels contain no spaces, hyphens or slashes.
# Order matters: "Tug/Pilot/Supply" must be handled before its prefix
# "Tug/Pilot", otherwise it would become "Tug_Pilot/Supply".
LABEL_REPLACEMENTS = [
    ("Seismic vessel", "Seismic_vessel"),
    ("Set gillnets", "Set_gillnets"),
    ("Set longlines", "Set_longlines"),
    ("Motor passenger", "Motor_passenger"),
    ("Pole and line", "Pole_and_line"),
    ("Purse seines", "Purse_seines"),
    ("Pots and traps", "Pots_and_traps"),
    ("Drifting longlines", "Drifting_longlines"),
    ("Non-fishing", "Non_fishing"),
    ("Cargo/Tanker", "Cargo_Tanker"),
    ("Fixed gear", "Fixed_gear"),
    ("Tug/Pilot/Supply", "Tug_Pilot_Supply"),
    ("Tug/Pilot", "Tug_Pilot"),
]

# Open the input first: if it is missing, the output file is not created or
# truncated.
# NOTE(review): 'rU' is Python 2 universal-newline mode. The 'U' flag is
# deprecated on Python 3 and rejected from 3.11 on, where plain 'r' gives the
# same newline handling.
with open('vessel-classification-all.json', 'rU') as bigfile:
    with open('vessel-classification-all_v2.json', 'w') as big_outfile:
        # Iterate the file object directly so the whole file is never held
        # in memory at once.
        for line in bigfile:
            for old, new in LABEL_REPLACEMENTS:
                line = line.replace(old, new)
            big_outfile.write(line)
In [11]:
import os
# Copy the rewritten data file to the GCS bucket with gsutil.
command = "gsutil cp -z json vessel-classification-all_v2.json gs://david-scratch/vessel-classification-all_v2.json"
# check_call runs the same shell command as os.system but raises
# CalledProcessError on a non-zero exit, so a failed upload stops the notebook
# instead of letting the load step run against a missing or stale object.
subprocess.check_call(command, shell=True)
Out[11]:
In [14]:
# bq load <destination_table> <data_source_uri> <table_schema>
# Loads the uploaded newline-delimited JSON file into the destination table
# using the schema file written earlier in this notebook.
command = "bq load --source_format=NEWLINE_DELIMITED_JSON world-fishing-827:scratch_david_mmsi_lists.nn_labels_20161201 gs://david-scratch/vessel-classification-all_v2.json data_schema_2.json"
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# load is reported as an error rather than as an easily missed return code.
subprocess.check_call(command, shell=True)
In [ ]: