In [1]:
#Sept 8 2015
#This code is to examine the ferengi data to see whether a smoothing function can be applied to determine
# the 'correctable' regions of p_features, z, and surface brightness space. 
%matplotlib inline
from astropy.io import fits as pyfits
from astropy.table import Table,Column

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import random
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial import ConvexHull
import requests
from datetime import datetime
import os

#mpl.rcParams['text.usetex']=True
#mpl.rcParams['axes.linewidth'] = 3

In [2]:
def download_from_dropbox(url):
    
    local_filename = "{:}".format(url.split("/")[-1].split("?")[0])
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024): 
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
            
    return local_filename

In [3]:
ferengi_filename = download_from_dropbox("https://www.dropbox.com/s/688ej834jwrdr3l/ferengi_data_with_categories_slope_method.fits?dl=1")
data = pyfits.open(ferengi_filename)  
data=data[1].data

In [5]:
#Column names:
#p_features = p_features 
#redshift = sim_redshift
#surface brightness = mu_max_i

In [4]:
correctable=(data['Correctable_Category']=='correctable')
uncorrectable=(data['Correctable_Category']=='uncorrectable')
nei=(data['Correctable_Category']=='nei')
not_nei=(data['Correctable_Category']!='nei')

In [5]:
correctable_array=np.zeros((3,len(data[correctable])))
uncorrectable_array=np.zeros((3,len(data[uncorrectable])))
nei_array=np.zeros((3,len(data[nei])))
not_nei_array=np.zeros((3,len(data[not_nei])))

In [6]:
#Create array to store redshift, SB and p_features info for each category
#change discrete redshift into continuous for visualization purposes 
#ex: if sim_redshift = 0.3, chose random number between 0.25 and 0.35 instead. 
for i,row in enumerate(data[correctable]):
    z=round(row['sim_redshift']*1000,1)
#    correctable_array[0][i]=random.randrange(z-50,z+50)/1000.
    correctable_array[0][i]=row['sim_redshift']
    correctable_array[1][i]=row['mu_max_i']
    correctable_array[2][i]=row['p_features']
for i,row in enumerate(data[uncorrectable]):
    z=round(row['sim_redshift']*1000,1)
#    uncorrectable_array[0][i]=random.randrange(z-50,z+50)/1000.
    uncorrectable_array[0][i]=row['sim_redshift']
    uncorrectable_array[1][i]=row['mu_max_i']
    uncorrectable_array[2][i]=row['p_features']
for i,row in enumerate(data[not_nei]):
    z=round(row['sim_redshift']*1000,1)
#    not_nei_array[0][i]=random.randrange(z-50,z+50)/1000.
    not_nei_array[0][i]=row['sim_redshift']
    not_nei_array[1][i]=row['mu_max_i']
    not_nei_array[2][i]=row['p_features']
for i,row in enumerate(data[nei]):
    z=round(row['sim_redshift']*1000,1)
#    nei_array[0][i]=random.randrange(z-50,z+50)/1000.
    nei_array[0][i]=row['sim_redshift']
    nei_array[1][i]=row['mu_max_i']
    nei_array[2][i]=row['p_features']

In [7]:
#Below: quick check to make sure the arrays were put together properly

In [8]:
f=plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2, 2)
ax1 = f.add_subplot(gs[0,0])
plt.scatter(data[correctable]['mu_max_i'],data[correctable]['p_features'])
plt.xlabel('mu~max~i')
plt.ylabel('p~features')
plt.title('table~data')
ax2 = f.add_subplot(gs[0,1])
plt.scatter(correctable_array[1],correctable_array[2])
plt.xlabel('mu~max~i')
plt.ylabel('p~features')
plt.title('new~array~data')
ax1 = f.add_subplot(gs[1,0])
plt.scatter(data[correctable]['sim_redshift'],data[correctable]['mu_max_i'])
plt.xlabel('sim~redshift')
plt.ylabel('mu~max~i')
plt.title('table~data')
ax2 = f.add_subplot(gs[1,1])
plt.scatter(correctable_array[0],correctable_array[1])
plt.xlabel('sim~redshift')
plt.ylabel('mu~max~i')
plt.title('new~array~data')


Out[8]:
<matplotlib.text.Text at 0x7f9fdf8762d0>

In [9]:
#First define a hull to encompass all non-nei galaxies: 
#Anything outside this hull in Hubble will be not be corrected 
#Anything inside this hull in Hubble will either be correctable or uncorrectable
all_points=not_nei_array.T
all_hull=ConvexHull(all_points)

#Just uncorrectable:
un_points=uncorrectable_array.T
hull=ConvexHull(un_points)

#3D plot of hull 
fig = plt.figure(figsize=(20,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(correctable_array[0], correctable_array[1], correctable_array[2], facecolors='none',edgecolors='b',alpha=.2,s=100,label='correctable')
ax.scatter(uncorrectable_array[0], uncorrectable_array[1], uncorrectable_array[2], facecolors='none',edgecolors='g',alpha=.2,s=100,label='uncorrectable')
ax.scatter(nei_array[0], nei_array[1], nei_array[2],facecolors='m',alpha=.8,s=100,label='nei')

for s in all_hull.simplices:
	ax.plot(all_points[s,0],all_points[s,1],all_points[s,2],'k-',alpha=.2)
ax.set_xlabel('redshift')
ax.set_ylabel('mu max i')
ax.set_zlabel('p features')
plt.title('correctable + uncorrectable region')


Out[9]:
<matplotlib.text.Text at 0x7f9fdf4ff690>

In [10]:
#function to determine whether a point 'p' is inside convex hull defined by points 'points'. 
def is_p_inside_hull(points,p):
    hull=ConvexHull(points)
    new_points=np.concatenate((points,p),axis=0)
    new_hull=ConvexHull(new_points)
    if list(hull.vertices)==list(new_hull.vertices):
        return True
    else:
        return False

In [11]:
#Function to count how many points are in hull:

def number_of_points_in_hull(this_array,hull_array):
    n_in_hull=0
    for row in this_array:
        this_point = np.array([row])
        if is_p_inside_hull(hull_array,this_point)==True:
            n_in_hull+=1
    return n_in_hull

In [12]:
#Function to remove points from uncorrectable hull until there is minimal overlap between the two regions
def new_convex_hull(total_points_to_remove,points_at_a_time):
    #This is the original array. 
    un_points=uncorrectable_array.T
    hull=ConvexHull(un_points)

    #first find the center of the 3D data: 
    z_center=np.median(un_points[:,0])
    mu_center=np.median(un_points[:,1])
    p_center=np.median(un_points[:,2])

    #now figure out distance to center for each point:
    distance_to_center=[]
    for row in un_points:
        distance_to_center.append(np.sqrt((row[0]-z_center)**2+(row[1]-mu_center)**2+(row[2]-p_center)**2))
    un_points=np.insert(un_points,3,distance_to_center,axis=1)


    #Eliminate the point farthest from the center, repeat until things are cool 

    #start by sorting array by distance from center 
    new_un_array=un_points[np.argsort(un_points[:,3])]
    #now delete that column because convexhull hates it
    new_un_array=np.delete(new_un_array,3,axis=1)


    #Now loop - eliminate point by point, recalculate hull, then check the ratio of correctable:uncorrectable points. Find the minimum. 
    i_list=[]
    blue_outside_hull_fraction=[]
    green_inside_hull_fraction=[]
    N=len(new_un_array) # number of points in original hull 
    for i in range(0,total_points_to_remove/points_at_a_time):
        #delete furthest point from center
        for j in range(0,points_at_a_time):
            new_un_array=np.delete(new_un_array,len(new_un_array)-1,0)
        #new_un_array is the new uncorrectable points considered for hull
        #new hull with furthest point removed
        new_hull=ConvexHull(new_un_array)
        #now we want to know how many points are in new hull (uncorrectable + correctable)
        #number of green in hull is just the length of new_un_array
        #number of blue in hull:
        N_blue_in_hull = number_of_points_in_hull(correctable_array.T,new_un_array)
        N_in_hull = len(new_un_array) + N_blue_in_hull
        #fraction of points in hull that are green:
        green_frac_in_hull = float(len(new_un_array))/N_in_hull
        #now want fraction of blue points outside of hull:
        #total number of points outside hull:
        total_outside_hull = len(correctable_array.T)+len(uncorrectable_array.T) - len(new_un_array)
        N_blue_outside_hull = len(correctable_array.T) - N_blue_in_hull
        blue_frac_outside_hull = float(N_blue_outside_hull)/total_outside_hull
        #track how many correctable and uncorrectable points are in hull with each iteration: 
        i_list.append(points_at_a_time*i)
        green_inside_hull_fraction.append(green_frac_in_hull)
        blue_outside_hull_fraction.append(blue_frac_outside_hull)
    return i_list,blue_outside_hull_fraction,green_inside_hull_fraction,new_hull,new_un_array

In [12]:
#Try removing a 200 points, 10 at a time:
i_list,co_fraction,un_fraction,new_hull,new_un_array=new_convex_hull(400,10)

In [13]:
f=plt.figure(figsize=(10,5))
plt.scatter(i_list,co_fraction,label='correctable points',s=60)
plt.scatter(i_list,un_fraction,label='uncorrectable points',s=60,c='g')
plt.xlabel('number of uncorrectable points removed')
plt.ylabel('fraction of points in hull') 
plt.axhline(y=0.95,ls='dashed',label='95%',c='r')
plt.axhline(y=0.9,ls='dashed',label='90%',c='g')
plt.axhline(y=0.15,ls='dashed',label='15%',c='b')
plt.legend(loc='center right')
plt.ylim(0,1.1)
plt.xlim(-3,400)

max_frac=np.max(co_fraction)
for i,val in enumerate(co_fraction):
    if val==max_frac:
        max_points=i*10
print 'points to remove for highest purity in correctible region is %i'%max_points


points to remove for highest purity in correctible region is 180

In [13]:
#Based on above plot, determine how many points are appropriate to cut from hull to make final version:
N_to_cut=180
i_list,co_fraction,un_fraction,new_hull,new_un_array=new_convex_hull(N_to_cut,1)
print 'By excluding %s of %s uncorrectable galaxies when computing the uncorrectable hull region, the correctable region consists of %s%% correctable galaxies and the uncorrectable region consists of %s%% of uncorrectable galaxies.' %(N_to_cut,len(uncorrectable_array.T),round(np.max(co_fraction)*100,2),round(np.max(un_fraction)*100,2)
)


By excluding 180 of 1826 uncorrectable galaxies when computing the uncorrectable hull region, the correctable region consists of 84.17% correctable galaxies and the uncorrectable region consists of 90.44% of uncorrectable galaxies.

In [14]:
#3D plot of old hull 
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(121, projection='3d')
ax.scatter(correctable_array[0], correctable_array[1], correctable_array[2], facecolors='none',edgecolors='b',alpha=.5,s=100,label='correctable')
ax.scatter(uncorrectable_array[0], uncorrectable_array[1], uncorrectable_array[2], facecolors='none', edgecolors='g',label='uncorrectable')
for s in hull.simplices:
	ax.plot(un_points[s,0],un_points[s,1],un_points[s,2],'k-')
for s in all_hull.simplices:
	ax.plot(all_points[s,0],all_points[s,1],all_points[s,2],'m-',alpha=.5)

#ax.scatter(nei_array[0], nei_array[1], nei_array[2], facecolors='none',edgecolors='m',label='nei')
ax.set_xlabel('redshift')
ax.set_ylabel('mu max i')
ax.set_zlabel('p features')
plt.title('old')

#3D plot of new hull  
ax = fig.add_subplot(122, projection='3d')
ax.scatter(correctable_array[0], correctable_array[1], correctable_array[2], facecolors='none',edgecolors='b',alpha=.5,s=100,label='correctable')
#ax.scatter(new_un_array[:,0], new_un_array[:,1], new_un_array[:,2], facecolors='none', edgecolors='g',label='uncorrectable')
ax.scatter(uncorrectable_array[0], uncorrectable_array[1], uncorrectable_array[2], facecolors='none', edgecolors='g',label='uncorrectable')
for s in new_hull.simplices:
	ax.plot(new_un_array[s,0],new_un_array[s,1],new_un_array[s,2],'k-')
for s in all_hull.simplices:
	ax.plot(all_points[s,0],all_points[s,1],all_points[s,2],'m-',alpha=.5)
#ax.scatter(nei_array[0], nei_array[1], nei_array[2], facecolors='none',edgecolors='m',label='nei')
ax.set_xlabel('redshift')
ax.set_ylabel('mu max i')
ax.set_zlabel('p features')
plt.title('new')
#plt.savefig('3D_hull.pdf')
plt.show()



In [29]:
#Okay, we have a convex hull that defines the uncorrectable region pretty well. 
#Anything outside pink = nei
#Anything inside pink but outside black = correctable
#Anything inside black = uncorrectable

In [20]:
hubble_data_file=download_from_dropbox('https://www.dropbox.com/s/ds5ltq0fvpl2iky/gzh_task01_categorized.fits?dl=1')
hubble_data=pyfits.open(hubble_data_file)
hubble_data=hubble_data[1].data

In [21]:
#make new table with extra column of the new categories based on hull
#have to make a whole new table because python doesn't recognize bintablehdu.from_columns for some reason and I'm sick of dealing with it

#first redefine old table:
new_category_table=Table()
for name in hubble_data.columns.names:
    c0 = Column([x[name] for x in hubble_data], name=name) 
    new_category_table.add_columns([c0])
c1 = Column([x['Correctable_Category'] for x in hubble_data],name='new_hull_category')
new_category_table.add_columns([c1])

In [27]:
#now determine new category for each point based on hull
z = 'Z_BEST_COMBINED'
for r,row in enumerate(new_category_table):
    this_point = np.array([[row[z],row['MU_HI'],row['p_features']]])

    if row[z] < 0.3 and row[z] > 0: #redshift less than 0.3, not getting corrected
        row['new_hull_category']='z_lt_3'
    elif row[z] <= 0 or row[z] > 9: #galaxy doesn't have redshift info
        row['new_hull_category']='nei_needs_redshift'
    #if galaxy is in the all_hull, check if correctable or un. Else, is nei.:
    elif is_p_inside_hull(all_points,this_point) == True:
            if is_p_inside_hull(uncorrectable_array.T,this_point)==True:
                row['new_hull_category']='uncorrectable'
            else: #in all_hull, but not in uncorrectable, therefore correctable
                row['new_hull_category']='correctable'
    else: #not nei_needs_redshift, not z < .3, and not in all_hull - just nei. 
        row['new_hull_category']='nei'

In [28]:
#Count how many of each category we have using the old (discrete) method
imaging_list=set(new_category_table['IMAGING'])

for survey in imaging_list:
    print '%s: correctable with old method = %i, new method = %i' %(survey,((new_category_table['IMAGING'] == survey) & (new_category_table['Correctable_Category'] == 'correctable')).sum(),((new_category_table['IMAGING'] == survey) & (new_category_table['new_hull_category'] == 'correctable')).sum())
    print '%s: uncorrectable with old method = %i, new method = %i'%(survey,((new_category_table['IMAGING'] == survey) & (new_category_table['Correctable_Category'] == 'uncorrectable')).sum(),((new_category_table['IMAGING'] == survey) & (new_category_table['new_hull_category'] == 'uncorrectable')).sum())
    print '%s: z < 0.3 with old method = %i, new method = %i' %(survey,((new_category_table['IMAGING'] == survey) & (new_category_table['Correctable_Category'] == 'z_lt_3')).sum(),((new_category_table['IMAGING'] == survey) & (new_category_table['new_hull_category'] == 'z_lt_3')).sum())
    print '%s: NEI (due to not enough Ferengi galaxies in bin) with old method =  %i, new method = %i'%(survey,((new_category_table['IMAGING'] == survey) & (new_category_table['Correctable_Category'] == 'nei')).sum(),((new_category_table['IMAGING'] == survey) & (new_category_table['new_hull_category'] == 'nei')).sum())
    print '%s: NEI galaxies (due to needing redshift measurements) with old method = %i, new mtehod = %i' %(survey,((new_category_table['IMAGING'] == survey) & (new_category_table['Correctable_Category'] == 'nei_needs_redshift')).sum(),((new_category_table['IMAGING'] == survey) & (new_category_table['new_hull_category'] == 'nei_needs_redshift')).sum())

print 'total correctable: old method: %i, new method: %i' %((new_category_table['Correctable_Category']=='correctable').sum(),(new_category_table['new_hull_category']=='correctable').sum())
print 'total uncorrectable: old method: %i, new method: %i'%((new_category_table['Correctable_Category']=='uncorrectable').sum(),(new_category_table['new_hull_category']=='uncorrectable').sum())
print 'total z less than .3: old method: %i, new method: %i' %((new_category_table['Correctable_Category']=='z_lt_3').sum(),(new_category_table['new_hull_category']=='z_lt_3').sum())
print 'total nei: old method: %i, new method: %i' %((new_category_table['Correctable_Category']=='nei').sum(),(new_category_table['new_hull_category']=='nei').sum())
print 'total nr: old method: %i, new method: %i' %((new_category_table['Correctable_Category']=='nei_needs_redshift').sum(),(new_category_table['new_hull_category']=='nei_needs_redshift').sum())
print 'total: %i' %len(new_category_table)


GEMS: correctable with old method = 2748, new method = 2242
GEMS: uncorrectable with old method = 1832, new method = 2993
GEMS: z < 0.3 with old method = 1190, new method = 1199
GEMS: NEI (due to not enough Ferengi galaxies in bin) with old method =  1886, new method = 2295
GEMS: NEI galaxies (due to needing redshift measurements) with old method = 1648, new mtehod = 575
GOODS-S: correctable with old method = 1022, new method = 691
GOODS-S: uncorrectable with old method = 980, new method = 1081
GOODS-S: z < 0.3 with old method = 247, new method = 259
GOODS-S: NEI (due to not enough Ferengi galaxies in bin) with old method =  2023, new method = 2707
GOODS-S: NEI galaxies (due to needing redshift measurements) with old method = 641, new mtehod = 175
COSMOS: correctable with old method = 20669, new method = 16863
COSMOS: uncorrectable with old method = 19626, new method = 23891
COSMOS: z < 0.3 with old method = 11693, new method = 12142
COSMOS: NEI (due to not enough Ferengi galaxies in bin) with old method =  33727, new method = 34860
COSMOS: NEI galaxies (due to needing redshift measurements) with old method = 7093, new mtehod = 5052
AEGIS: correctable with old method = 2325, new method = 2014
AEGIS: uncorrectable with old method = 1403, new method = 1646
AEGIS: z < 0.3 with old method = 967, new method = 1032
AEGIS: NEI (due to not enough Ferengi galaxies in bin) with old method =  2465, new method = 2762
AEGIS: NEI galaxies (due to needing redshift measurements) with old method = 1347, new mtehod = 1053
GOODS-N: correctable with old method = 1074, new method = 904
GOODS-N: uncorrectable with old method = 271, new method = 384
GOODS-N: z < 0.3 with old method = 280, new method = 271
GOODS-N: NEI (due to not enough Ferengi galaxies in bin) with old method =  691, new method = 833
GOODS-N: NEI galaxies (due to needing redshift measurements) with old method = 235, new mtehod = 159
total correctable: old method: 27838, new method: 22714
total uncorrectable: old method: 24112, new method: 29995
total z less than .3: old method: 14377, new method: 14903
total nei: old method: 40792, new method: 43457
total nr: old method: 10964, new method: 7014
total: 118083

In [31]:
fname = 'hubble_category_table_hull_method_%i_%i_%i.fits'%(datetime.now().month,datetime.now().day,datetime.now().year)

if os.path.exists(fname):
    os.remove(fname)
new_category_table.write(fname,format='fits')

In [30]:
22714 - 22048


Out[30]:
666

In [ ]:
#

In [ ]:


In [ ]:


In [ ]: