Example Schema:
[{ "name": "kind", "mode": "nullable", "type": "string" }, { "name": "fullName", "type": "string", "mode": "required" }, { "name": "age", "type": "integer", "mode": "nullable" }, { "name": "gender", "type": "string", "mode": "nullable" }, { "name": "phoneNumber", "type": "record", "mode": "nullable", "fields": [ { "name": "areaCode", "type": "integer", "mode": "nullable" }, { "name": "number", "type": "integer", "mode": "nullable" } ] }, { "name": "children", "type": "record", "mode": "repeated", "fields": [ { "name": "name", "type": "string", "mode": "nullable" }, { "name": "gender", "type": "string", "mode": "nullable" }, { "name": "age", "type": "integer", "mode": "nullable" } ] }, { "name": "citiesLived", "type": "record", "mode": "repeated", "fields": [ { "name": "place", "type": "string", "mode": "nullable" }, { "name": "yearsLived", "type": "integer", "mode": "repeated" } ] } ]
In [2]:
import json
# Sample record, parsed from an inline JSON string; the cells below walk its
# keys and value types to derive a schema.
od = json.loads('''{"mmsi": 411041797, "labels": {"sublabel": {"max_label": "Squid", "label_scores": {"Cargo": 1.49017853612321e-09, "Supply": 1.202220656359998e-09, "Sailing": 1.2120705550344724e-09, "Trawlers": 1.2140348726319417e-09, "Seismic vessel": 1.237318580926683e-09, "Set gillnets": 1.194789045477762e-09, "Set longlines": 1.194789045477762e-09, "Squid": 0.9999995231628418, "Motor passenger": 1.1965610724473663e-09, "Reefer": 4.342152806202648e-07, "Pole and line": 1.5954383369987113e-09, "Purse seines": 1.3982492941266855e-09, "Pots and traps": 1.194789045477762e-09, "Drifting longlines": 1.4433721995388282e-09, "Tanker": 3.244188695461503e-09, "Pilot": 1.194793597392163e-09, "Tug": 1.19712029178487e-09, "Trollers": 1.194789045477762e-09}, "max_label_probability": 0.9999995231628418, "name": "Vessel detailed class"}, "length": {"name": "Vessel length regression", "value": 28.348167419433594}, "is_fishing": {"max_label": "Fishing", "label_scores": {"Fishing": 0.9995298385620117, "Non-fishing": 0.00047023396473377943}, "max_label_probability": 0.9995298385620117, "name": "Fishing"}, "label": {"max_label": "Squid", "label_scores": {"Passenger": 1.463510534982504e-09, "Cargo/Tanker": 5.384665602292671e-09, "Trawlers": 1.4110513868459407e-09, "Seismic vessel": 1.7059856860512923e-09, "Pole and line": 3.6103113831842393e-09, "Fixed gear": 1.3594144698814148e-09, "Squid": 0.9999984502792358, "Reefer": 1.5216498923109611e-06, "Purse seines": 1.9657095950975645e-09, "Drifting longlines": 3.154397631988104e-09, "Tug/Pilot/Supply": 1.36162026098674e-09, "Trollers": 1.3594196879296305e-09}, "max_label_probability": 0.9999984502792358, "name": "Vessel class"}}, "start_time": "2013-01-01T00:00:00", "end_time": "2013-07-01T00:00:00"}''')
In [55]:
schema = []

# Maps the Python class name of a parsed JSON value to a schema field type.
# 'unicode' and 'long' are what Python 2's json module produces; 'str' is the
# Python 3 equivalent, and 'bool' covers JSON true/false.
the_type = {
    'int': 'integer',
    'long': 'integer',
    'float': 'float',
    'bool': 'boolean',
    'unicode': 'string',
    'str': 'string',
    'dict': 'record',
}

# Maps the same class names to a field mode. A nested dict is a single
# record, so it is 'nullable' (like phoneNumber in the example schema at the
# top of this notebook); 'repeated' would describe an array of records.
the_mode = {
    'int': 'nullable',
    'long': 'nullable',
    'float': 'nullable',
    'bool': 'nullable',
    'unicode': 'nullable',
    'str': 'nullable',
    'dict': 'nullable',
}


def get_fields(d):
    """Return the list of field descriptions for every key of dict ``d``.

    Each entry is the dict produced by ``get_field`` for that key, in the
    iteration order of ``d``. An empty dict yields an empty list.
    """
    return [get_field(o, d) for o in d]


def get_field(o, od):
    """Describe the schema field for key ``o`` of dict ``od``.

    Args:
        o: a key of ``od``.
        od: the dict (parsed JSON object) that contains ``o``.

    Returns:
        A dict with 'name' (the key with spaces, '-' and '/' replaced by
        '_'), 'type' and 'mode'. When the value is itself a dict, a 'fields'
        list describing its keys is added recursively.

    Raises:
        KeyError: if the value's type has no entry in ``the_type``
            (for example a list or None).
    """
    name = o.replace(" ", "_").replace("-", "_").replace("/", "_")
    if name != o:
        # The key had to be sanitized: print a ready-to-paste statement that
        # applies the same rename to the raw data file.
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('line = line.replace("{}","{}")'.format(o, name))

    # Dispatch on the exact class name (not isinstance) so that bool is not
    # mistaken for int.
    the_class = od[o].__class__.__name__
    if the_class not in the_type:
        raise KeyError(
            "unsupported value type '{}' for key '{}'".format(the_class, o))

    v = {
        'name': name,
        'type': the_type[the_class],
        'mode': the_mode[the_class],
    }
    if the_class == 'dict':
        v['fields'] = get_fields(od[o])
    return v
# Build the top-level schema: one field description per key of the sample
# record, appended to the module-level ``schema`` list.
schema.extend(get_field(key, od) for key in od)
In [50]:
# Display the schema list built from the sample record.
schema
Out[50]:
In [47]:
# Persist the schema list as JSON text in data_schema.json.
with open("data_schema.json", 'w') as schema_file:
    json.dump(schema, schema_file)
In [48]:
# Display the parsed sample record.
od
Out[48]:
In [1]:
# Rewrite the classification dump so that label names containing a space,
# "-" or "/" use underscores instead. This is a plain text substitution on
# each line, so every occurrence of a label is rewritten, whether it appears
# as a key or as a value.
LABEL_REPLACEMENTS = [
    ("Seismic vessel", "Seismic_vessel"),
    ("Set gillnets", "Set_gillnets"),
    ("Set longlines", "Set_longlines"),
    ("Motor passenger", "Motor_passenger"),
    ("Pole and line", "Pole_and_line"),
    ("Purse seines", "Purse_seines"),
    ("Pots and traps", "Pots_and_traps"),
    ("Drifting longlines", "Drifting_longlines"),
    ("Non-fishing", "Non_fishing"),
    ("Cargo/Tanker", "Cargo_Tanker"),
    ("Fixed gear", "Fixed_gear"),
    ("Tug/Pilot/Supply", "Tug_Pilot_Supply"),
]

# The input is opened first so that a missing input file raises before the
# output file is created or truncated.
# 'rU' is Python 2's universal-newlines read mode.
# NOTE(review): Python 3.11+ rejects 'rU'; use 'r' if this notebook is ported.
with open('vessel-classification-all.json', 'rU') as bigfile, \
        open('vessel-classification-all_v2.json', 'w') as big_outfile:
    # Stream the file line by line rather than loading it all with
    # readlines(), so memory use does not grow with the file size.
    for line in bigfile:
        for old_label, new_label in LABEL_REPLACEMENTS:
            line = line.replace(old_label, new_label)
        big_outfile.write(line)
In [59]:
import os
# Upload the rewritten file to the Cloud Storage bucket by shelling out to
# gsutil; the exit status returned by os.system is the cell's output.
# NOTE(review): the local source name here
# (vessel_classification_all_20161122_v2.json) differs from the file written
# by the rewrite cell (vessel-classification-all_v2.json) - confirm which
# file is meant to be uploaded.
command = (
    "gsutil cp -z json "
    "vessel_classification_all_20161122_v2.json "
    "gs://david-scratch/vessel-classification-all_v2.json"
)
os.system(command)
Out[59]:
In [ ]: