Extract specific parts from JSON documents and store them as columns in a CSV/TSV for further data analysis.
Configuration example:
mongo:
url : "mongodb://mongo:27017"
database : "articles"
collection : "comments"
json2csv:
# extract all article comments
item_selector : "$.article[*].comment[*]"
item_fields : [ "id", "author", "text" ]
item_count : False
output_path : "../data/comments.tsv"
In [ ]:
from pymongo import MongoClient
from jsonpath_rw import jsonpath, parse
import json
import yaml
import pandas as pd
import re
In [ ]:
config_file = 'config_json2csv.yaml'
In [ ]:
with open(config_file) as yaml_file:
cfg = yaml.load(yaml_file)
In [ ]:
mongo_cfg = cfg['mongo']
client = MongoClient(mongo_cfg['url'])
db = client[mongo_cfg['database']][mongo_cfg['collection']]
print "%d entries in database." % db.find().count()
In [ ]:
def clean(data):
"""Replace new line and tab; remove leading and trailing spaces."""
if isinstance(data, (str, unicode)):
return re.sub('[\t\n\r]', ' ', data).strip().encode('utf-8')
else:
return data
def as_list(value):
if isinstance(value, list):
return value
else:
return [value]
def extract(parser, json):
match = parser.find(json)
if (len(match) == 0):
return None
else:
return match[0].value
def extract_data(cursor):
return [ tuple(clean(extract(parser, item)) for parser in field_parser)
for result in cursor
for match in item_selector.find(result)
for item in as_list(match.value) ]
In [ ]:
item_selector = parse(cfg['json2csv']['item_selector'])
item_fields = cfg['json2csv']['item_fields']
field_parser = [ parse(field) for field in item_fields ]
In [ ]:
from collections import Counter
if cfg['json2csv']['item_count']:
counter = Counter(extract_data(db.find()))
df = pd.DataFrame.from_records([ list(x) + [count] for (x, count) in counter.iteritems()], columns=item_fields+["count"])
else:
records = extract_data(db.find())
df = pd.DataFrame.from_records(records, columns=item_fields)
In [ ]:
print "%d items." % len(df)
In [ ]:
output_file = cfg['json2csv']['output_file']
df.to_csv(output_file, sep="\t", index=False, quoting=3, encoding="utf-8")