GBT Raster Scan 1.4-1.5GHz Data Analysis

First Run: Clustering Signals in Frequency:

  • Histogram of Frequencies as motivation for clustering.
    • Uniformly distribute frequencies into bins with equiprobability.
  • K-Means Clustering to strengthen the original evidence.
    • Minimize within cluster variance
  • Clusters weighted by inverse-variance.
    • More Scattering: Lower Score
  • Feature Extraction of Spectra.
    • PCA-Whitening with dimensionality reduction factor of 3.33x and information threshold of 92%
  • Anomaly Detection.
    • Local Outlier Factor to identify basis for spectral clustering
  • Nearest-neighbor subpopulation clustering.
    • Agglomerative Clustering with average linkage to isolate iso-spectra families

Second Run: Spectral Clusters within Subpopulation

  • Feature Extraction of Spectra with Spectral Band as features followed by Whitening the Data Matrix.
  • Re-apply Clustering to Isolate similar Spectral Bands with Parent Cluster.
  • Apply K-Nearest Neighbors to identify distribution of Iso-Spectral Bands in the Sky.
  • Identify Distribution and Likelihood of Signals given their respective Sky Coordinates.

In [179]:
# General Imports.
import os

# Analytics Imports.
import numpy as np
import scipy.linalg as sp
import scipy.optimize as sciopt
import pandas as pd
from sklearn import cluster as skc

#Visualization Imports.
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [2]:
DATADIR = './KeplerGBT1.4-1.5GHz'
CATFILE = 'KeplerRasterScan_1.4-1.5Ghz.trimmed.txt'

In [3]:
catalog = pd.read_csv(CATFILE,sep=' ',skipinitialspace=True,usecols=[0,1,2,3],skiprows=1,header=None,names=['path','freq','ra','dec'])

In [4]:
# Get General Statistics.
catalog.describe()


Out[4]:
freq ra dec
count 94884.000000 94884.000000 94884.000000
mean 1454.306281 291.819570 44.976942
std 19.191726 5.363247 3.375471
min 1400.000122 279.845500 36.770200
25% 1453.466792 287.114025 42.296575
50% 1459.362224 292.357700 45.284400
75% 1465.927102 296.571600 47.780000
max 1499.999994 301.643200 52.196300

In [5]:
# Use Histogram (Uniform Density Kernel) to Gauge Total Clusters.
plt.figure(figsize=(14,3))

hist = catalog.freq.hist(bins=500,edgecolor='none')
plt.ylabel('count')
plt.xlabel('frequency (MHz)')

plt.show()



In [6]:
# Apply K-Means (Inertia-Minimizing Kernel).
NUM_CLUSTERS = 25
kmeans = skc.KMeans(n_clusters=NUM_CLUSTERS)
data = catalog.freq.reshape(len(catalog.freq),1)
kmeans.fit(data)


Out[6]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=25, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [7]:
plt.figure(figsize=(14,3))

hist = plt.bar(kmeans.cluster_centers_,np.bincount(kmeans.labels_),edgecolor='none')
# plt.yscale('log')
plt.xlim([1400,1500])
plt.xlabel('Frequency (MHz)')
plt.ylabel('count')

plt.show()



In [8]:
catalog.insert(4,'cluster',kmeans.labels_)

In [9]:
scatter_func = lambda g: 10/(np.var(g['ra'])+np.var(g['dec']))
res = catalog.groupby('cluster').apply(scatter_func)

In [10]:
cluster_stat = pd.DataFrame({'mean_freq': kmeans.cluster_centers_.flatten(),
                             'scatter': res.values})

In [23]:
plt.figure(figsize=(14,3))

sbars = plt.bar(cluster_stat.mean_freq.values,cluster_stat.scatter.values,edgecolor='none')
for label, rect in enumerate(sbars):
    plt.gca().text(rect.get_x() + rect.get_width()/2, rect.get_height() + 0.05, label, ha='center', va='bottom')

plt.xlim([1400,1500])
plt.ylim([0,2])
plt.xlabel('Mean Frequency (MHz)')
plt.ylabel('Score')
plt.title('Subpopulation Spread in Sky (Higher Score: Lower Spread)')

plt.show()



In [197]:
def plotSpectra(s,fname):
    # Initialize Figure.
    freqs,s = s[0,:],s[1:,:]
    plt.figure(figsize=(8,5))
    plt.imshow(s, aspect='auto',cmap='Blues')
    cbar = plt.colorbar(pad=.2)
    cbar.set_label('SNR',rotation='270',verticalalignment='bottom')

    # Configure X-Axis.
    band = float(fname.rsplit('_')[-1].rsplit('.',1)[0])
    xpts = np.round((freqs[np.r_[:100:20,-1]]-band)*1000,2)
    plt.gca().set_xticks(np.r_[:100:20,100])
    plt.gca().set_xticklabels(xpts)
    plt.gca().xaxis.grid(False)
    plt.xlabel('Relative Frequency (kHz)')

    # Configure Left Y-Axis.
    yl_pts = np.linspace(-0.15,0.15,6)
    plt.gca().set_yticks(np.r_[:100:20,100])
    plt.gca().set_yticklabels(yl_pts)
    plt.gca().yaxis.grid(False)
    plt.ylabel('Time (s)')

    # Configure Right Y-Axis.
    axr = plt.gca().twinx()
    axr.set_yticks(np.r_[:100:20,100])
    axr.yaxis.tick_right()
    yr_pts = np.linspace(-0.265,0.265,6)
    axr.set_yticklabels(yr_pts)
    axr.yaxis.grid(False)
    axr.set_ylabel('Angular Separation (arcmin)',rotation='270',verticalalignment='bottom')

    plt.title(fname.rsplit('.',1)[0],verticalalignment='bottom')
    plt.show()
    
def plotSpecBand(s):
    # Initialize Figure.
    k = np.argmax(np.var(s[1:,:],0))
    freq,s = s[0,k],s[1:,k]
    plt.figure(figsize=(10,5))
    xpts = np.linspace(-0.265,0.265,100)
    bools = [True] + [False if s[i] < s[i-1] or s[i] < s[i+1] else True for i in range(1,99)] + [True]
    gauss_func = lambda x,a,mu,sigma,b: a * np.exp(-0.5*(((x-mu)/sigma)**2)) + b
    
    # Generate Data & Gaussian Fit.
    cfs,_ = sciopt.curve_fit(gauss_func,xpts[np.nonzero(bools)],s[np.nonzero(bools)],p0=(s.max(),0.02,0.1,0.))
    # cfs,_ = sciopt.curve_fit(gauss_func,xpts,s,p0=(s.max(),0.02,0.1,0.))
    cfs[1] = 0.5*(xpts[s.argmax()] + cfs[1])
    
    # Render Plot.
    plt.plot(xpts,s)
    plt.plot(xpts,gauss_func(xpts,*cfs))
    plt.xlabel('Angular Separation (arcmin)')
    plt.ylabel('Amplitude')
    plt.title('Spectral Band: {0} MHz\n Gaussian Width: {1} arcmin'.format(freq,np.abs(round(cfs[2]*2,3))))
    plt.show()

In [156]:
# Examine Spectra of Cluster Member.
N = 11
selCluster = catalog[catalog.cluster == N]
clusterPaths = selCluster.path.values
sample = clusterPaths[np.random.randint(0,clusterPaths.size)]
spectra = np.load(os.path.join(DATADIR,sample))
plotSpectra(spectra,sample)



In [51]:
# PCA-Whitening on Spectogram for Feature Extraction.
def extractSpectraFeatures(s):
    X = s[1:,:]
    D,U = sp.eigh(X.dot(X.T),eigvals=(70,99))
    W = (U.dot(np.diag(D ** (-0.5)))).T
    return W.dot(X).flatten()

# Euclidean Similarity Matrix for kNN.
def nearestNeighbor(M):
    b = M.dot(M.T)
    d = np.diag(b)
    a,c = np.meshgrid(d,d)
    return a - 2*b + c

# Local Outlier Factor (Anomaly Detection).
def localOutlierFactor(k,S):
    D,I = np.sort(S)[:,1:],np.argsort(S)[:,1:]
    reachDist = np.vectorize(lambda a,b: max(D[b,k-1],S[a,b]))
    lrd_func = lambda a: float(k)/np.sum(reachDist(a,I[a,:k]))
    lrd = np.vectorize(lrd_func)
    lof = np.vectorize(lambda a: np.sum(lrd(I[a,:k]))/(k*lrd_func(a)))
    return lof(np.r_[0:S.shape[0]])

In [46]:
dataMatrix = np.empty([clusterPaths.size,3000])
for i,path in enumerate(clusterPaths):
    dataMatrix[i,:] = extractSpectraFeatures(np.load(os.path.join(DATADIR,path)))

In [48]:
S = nearestNeighbor(dataMatrix)

In [69]:
LOF = localOutlierFactor(5,S)

In [72]:
plt.figure(figsize=(14,3))
plt.hist(LOF,bins=100,edgecolor='none')
plt.xlabel('Local Outlier Factor')
plt.ylabel('Count')
plt.show()



In [123]:
agc = skc.AgglomerativeClustering(n_clusters=3,linkage='average')
agc.fit(dataMatrix)


Out[123]:
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='average',
            memory=Memory(cachedir=None), n_clusters=3, n_components=None,
            pooling_func=<function mean at 0x10b5a70c8>)

In [124]:
AGCatalog = pd.DataFrame({'path':clusterPaths,'cluster':agc.labels_})

In [178]:
# Examine Agglomerated Cluster
K = 0
cpaths = AGCatalog[AGCatalog.cluster == K].path.values
cpath = cpaths[np.random.randint(0,cpaths.size)]
plotSpectra(np.load(os.path.join(DATADIR,cpath)),cpath)



In [153]:
localDist = AGCatalog.groupby('cluster').count()
plt.figure(figsize=(6,3))
dbars = plt.bar(np.r_[0:3],localDist.path.values,edgecolor='none',width=0.25,align='center')
for i, rect in enumerate(dbars):
    plt.gca().text(rect.get_x() + rect.get_width()/2, rect.get_height() + 0.05, localDist.path.values[i], ha='center', va='bottom')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.title('Distribution of Spectra with Cluster 11')
plt.show()



In [198]:
plotSpecBand(np.load(os.path.join(DATADIR,cpath)))



In [199]:
selCluster[selCluster.path == cpath]


Out[199]:
path freq ra dec cluster
18011 Row_33.1448.437500/56_1439.77122.npy 1439.771218 283.2495 48.6069 11

In [ ]: