Experiments with low-level data dumps

Trying to extract data from the full low-level JSON dump. In this case, the extracted field is the release year.


In [57]:
from __future__ import print_function
from collections import defaultdict, OrderedDict
from datetime import date
import tarfile
import simplejson
import os
import re

# Number of tracks seen per release year, and the number of tracks for which
# no plausible release year could be extracted.
years = defaultdict(int)
missing_count = 0

# Raw string so that ``\d`` reaches the regex engine untouched instead of
# relying on Python passing an unrecognised string escape through.
year_regex = re.compile(r'\d{4}')

# A year is accepted only if it lies in (MIN_YEAR, current_year]; anything
# else is counted as missing.
MIN_YEAR = 1890
current_year = date.today().year  # loop-invariant, so computed once up front

archive_path = os.path.join('data', 'dump', 'acousticbrainz-lowlevel-json-20141119-json.tar')
with tarfile.open(archive_path, mode='r') as tar:
    for member in tar:
        if not member.name.endswith('.json'):
            continue

        # extractfile() returns None for members that are not regular files
        # (e.g. a directory whose name happens to end in ".json"); such
        # members are not tracks, so they are skipped without being counted.
        member_file = tar.extractfile(member)
        if member_file is None:
            continue
        try:
            record = simplejson.load(member_file)
        finally:
            member_file.close()

        # Extracting year from JSON. Documents without 'metadata'/'tags', or
        # with an empty 'date' list, are counted as missing rather than
        # aborting the whole scan with a KeyError/IndexError.
        tags = record.get('metadata', {}).get('tags', {})
        dates = tags.get('date')
        if dates:
            # match() anchors at the start of the string, so only the first
            # 'date' value is inspected and it must begin with four digits.
            m = year_regex.match(dates[0])
            if m:
                year = int(m.group())
                if MIN_YEAR < year <= current_year:
                    years[year] += 1
                    continue
        missing_count += 1


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-57-449d0f1454bf> in <module>()
     14 archive_path = os.path.join('data', 'dump', 'acousticbrainz-lowlevel-json-20141119-json.tar')
     15 with tarfile.open(archive_path, mode='r') as tar:
---> 16     for member in tar:
     17 
     18         if member.name.endswith('.json'):

C:\Python27\lib\tarfile.pyc in next(self)
   2487         # which will cause TarIter to stop prematurely.
   2488 
-> 2489         if self.index == 0 and self.tarfile.firstmember is not None:
   2490             tarinfo = self.tarfile.next()
   2491         elif self.index < len(self.tarfile.members):

KeyboardInterrupt: 

In [58]:
%matplotlib inline
import matplotlib.pyplot as plt

# Sort the per-year counts chronologically so the line is drawn left to right.
ordered = OrderedDict(sorted(years.items()))
# Materialise as lists: on Python 3 keys()/values() are views, not sequences.
keys = list(ordered.keys())
values = list(ordered.values())

if keys:
    plt.plot(keys, values)
    plt.title('Release year distribution')
    # Leave 20% headroom above the tallest point.
    plt.axis([min(keys), max(keys), 0, max(values) * 1.2])
    plt.xlabel('Years')
    plt.ylabel('Tracks')
    plt.show()
else:
    # min()/max() raise ValueError on an empty sequence, which happens when
    # the extraction cell collected no years (e.g. it was interrupted early).
    print('No release years collected; nothing to plot.')