Extracting data from the full low-level JSON dump. In this case, the field of interest is the release year of each track.
In [57]:
from __future__ import print_function
from collections import defaultdict, OrderedDict
from datetime import date
import tarfile
import simplejson
import os
import re
# Histogram of release years: year -> number of tracks carrying that year.
years = defaultdict(int)
# Number of JSON documents from which no acceptable release year was obtained.
missing_count = 0
# Raw string so the backslash in \d reaches the regex engine untouched;
# a plain '\d' literal is an invalid escape sequence on modern Python.
year_regex = re.compile(r'\d{4}')
archive_path = os.path.join('data', 'dump', 'acousticbrainz-lowlevel-json-20141119-json.tar')


def _release_year(tags, latest_year):
    """Return the release year read from ``tags['date']``, or None.

    Only the first entry of the ``date`` tag is inspected, and it must
    *start* with four digits (``year_regex.match`` anchors at position 0).
    The year is accepted only if ``1890 < year <= latest_year``.

    NOTE(review): assumes ``tags['date']`` is a list of strings -- confirm
    against the dump format.
    """
    dates = tags.get('date')
    if not dates:
        # Tag absent, or present but empty: nothing to parse.
        return None
    found = year_regex.match(dates[0])
    if found is None:
        return None
    year = int(found.group())
    # Reject values outside the accepted window (e.g. '0000' or a year
    # in the future).
    if 1890 < year <= latest_year:
        return year
    return None


# Evaluated once: the upper bound does not need to be recomputed per track.
current_year = date.today().year

with tarfile.open(archive_path, mode='r') as tar:
    for member in tar:
        # Skip anything that is not a regular *.json file; extractfile()
        # returns None for non-file members such as directories.
        if not (member.isfile() and member.name.endswith('.json')):
            continue
        document = simplejson.load(tar.extractfile(member))
        # Extracting year from JSON
        year = _release_year(document['metadata']['tags'], current_year)
        if year is None:
            missing_count += 1
        else:
            years[year] += 1
In [58]:
%matplotlib inline
import matplotlib.pyplot as plt
# Plot the number of tracks per release year as a line chart.
# Sorting by year makes the line run in chronological order.
ordered = OrderedDict(sorted(years.items()))
# Materialise as lists: on Python 3, keys()/values() return dict views
# rather than sequences, so convert them before handing them to matplotlib.
keys = list(ordered.keys())
values = list(ordered.values())
plt.plot(keys, values)
plt.title('Release year distribution')
if keys:
    # min()/max() raise ValueError on an empty sequence, so the axis limits
    # are set only when at least one year was collected.
    # The y axis gets 20% headroom above the highest count.
    plt.axis([min(keys), max(keys), 0, max(values) * 1.2])
plt.xlabel('Years')
plt.ylabel('Tracks')
plt.show()