Catalogue Statistics


In [1]:
%matplotlib inline
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import eqcat.parsers.isf_catalogue_reader as icr
import eqcat.catalogue_query_tools as cqt
from IPython.utils import io

In [2]:
# Read the raw ISF catalogue, moving events whose comment flags a likely
# non-tectonic source into reader.rejected_catalogue.  The HDF5 name
# derived here is used by later cells as a cache key, so the slow parse
# and database build only happen on the first run.
raw_file_name = "Marmaries-catalogue1.txt"
base = os.path.basename(raw_file_name)
db_file_name = os.path.splitext(base)[0] + '.hdf5'

# Keywords that mark anthropogenic / induced events for rejection
# (matching presumably done inside ISFReader -- see also the manual
# lower-cased matching in the rejection-summary cell below).
rejection_keywords = ["mining", "geothermal", "explosion", "quarry", 
                      "reservoir", "induced", "rockburst"]
reader = icr.ISFReader(raw_file_name, 
                       rejection_keywords=rejection_keywords)
# "TUR"/"ISC" look like a region/agency selection -- TODO confirm against
# eqcat.parsers.isf_catalogue_reader.ISFReader.read_file.
catalogue = reader.read_file("TUR", "ISC")

In [3]:
# summarize reasons for rejections
if not os.path.isfile(db_file_name) and len(reader.rejected_catalogue) > 0:
    num_rejections = len(reader.rejected_catalogue.events)
    keyword_counts = dict(zip(rejection_keywords, [0]*num_rejections))
    for event in reader.rejected_catalogue.events:
        for keyword in rejection_keywords:
            if keyword.lower() in event.comment.lower():
                keyword_counts[keyword] = keyword_counts[keyword] + 1

    num_keywords_found = sum(keyword_counts.values())
    print "Note: %d rejection keywords found in %d rejected events." % \
        (num_keywords_found, num_rejections)

    print keyword_counts

In [4]:
# Build the HDF5 database on the first run only; build_dataframe
# presumably writes db_file_name as a side effect (return value unused)
# -- TODO confirm against eqcat.  Then open it via the query interface.
if not os.path.isfile(db_file_name):
    _ = catalogue.build_dataframe(db_file_name)
db1 = cqt.CatalogueDB(db_file_name)

In [5]:
db1.origins
temp = db1.origins["eventID"]
temp = list(temp)
uniqueIDs = set(temp)
print "Number of magnitudes: ", len(db1.magnitudes)
print "Number of origins: ", len(db1.origins)
print "Number of unique events: ", len(uniqueIDs)


Number of magnitudes:  116530
Number of origins:  128554
Number of unique events:  36179

In [ ]:
# Mine agency/magnitude-scale pairs that share at least this many events.
overlap_threshold = 50
agency_mag_file_name = os.path.splitext(base)[0] + ('_%d_AgencyPairs.txt' % overlap_threshold)

# NOTE(review): agency_magnitude_stats is never defined anywhere in this
# notebook -- presumably it should come from an eqcat helper applied to
# db1; confirm against cqt's API before running.
# Capture the mining step's verbose stdout instead of flooding the
# notebook, then persist it to the pairs file that the following cells
# parse with np.genfromtxt.  (The original cell ended in a dangling
# `with` -- a syntax error -- and never wrote the file it later reads;
# the commented-out sys.stdout redirection shows this was the intent.)
with io.capture_output() as captured:
    cqt.mine_agency_magnitude_combinations(
        db1, agency_magnitude_stats, overlap_threshold,
        no_case=True, quiet=True)
with open(agency_mag_file_name, 'w') as pairs_file:
    pairs_file.write(captured.stdout)

In [ ]:
# Reload the mined pairs as a structured record array: eight S64 string
# fields plus one integer overlap count per record.
pair_dtype = "S64,S64,S64,S64,S64,S64,S64,i,S64"
pairs = np.genfromtxt(agency_mag_file_name, dtype=pair_dtype)

In [ ]:
# Peek at the first few parsed records (last expression of the cell, so
# it is rich-displayed without print).
pairs[:5]

In [ ]:
# parse the data and look at sets
agencies1 = [pair[1][1:-1].replace('_','') for pair in pairs]
mags1 = [pair[2][:-1].upper() for pair in pairs]
agencies2 = [pair[4][1:-1].replace('_','') for pair in pairs]
mags2 = [pair[5][:-1].upper() for pair in pairs]
nums_shared = [pair[7] for pair in pairs]

print set(mags1)
print set(mags2)
print list(set(mags1) - set(mags2))
print list(set(mags2) - set(mags1))

print set(agencies1)
print set(agencies2)
print list(set(agencies1) - set(agencies2))
print list(set(agencies2) - set(agencies1))

In [ ]:
# prioritize magnitudes and print pairs in order of size of overlap
mag_priority = ['MW','MS','MD','MB','ML']
tex_format = '%s, %s <- [%d] -> %s, %s'

data = zip(agencies1, mags1, agencies2, mags2, nums_shared)
data = list(set(data))
data = sorted(data, key=lambda pair: pair[4], reverse=True)

for i, m1 in enumerate(mag_priority):
    for m2 in mag_priority[i:]:
        for agency1, mag1, agency2, mag2, n_shared in data:
            if (mag1 == m1 and mag2 == m2) or (mag1 == m2 and mag2 == m1):
                n1 = agency_magnitude_stats[agency1]['Magnitudes'][mag1]
                n2 = agency_magnitude_stats[agency2]['Magnitudes'][mag2]
                
                if mag_priority.index(m1) >= mag_priority.index(m2):
                    print tex_format % (agency1, mag1,  n_shared,agency2, mag2)
                else:
                    print tex_format % ( gency2, mag2, n_shared,
                                        agency1, mag1)

In [ ]: