In [1]:

    
import sys, os
import pandas as pd
from sklearn.model_selection import train_test_split

from lcc.db_tier.connectors import FileManager, OgleII, OgleII
from lcc.data_manager.package_reader import PackageReader
from lcc.stars_processing.systematic_search.stars_searcher import StarsSearcher, StarsSearcherRedis
from lcc.stars_processing.tools.visualization import plotProbabSpace
from lcc.stars_processing.tools.params_estim import ParamsEstimator
from lcc.utils.output_process_modules import saveIntoFile, loadFromFile
from lcc.utils.stars import saveStars, plotStarsPicture
from lcc.utils.helpers import get_combinations
from lcc.api.input_parse import parse_tun_query

%matplotlib inline

from matplotlib import pylab
pylab.rcParams['figure.figsize'] = (24.0, 8.0)









    



---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
RuntimeError: module compiled against API version 0xb but this version of numpy is 0xa

Define the task

In this example our task is to build a model that distinguishes quasars from Be stars, whose light curves are very similar.



In [2]:

    
# Every light curve is described by two features, its Abbe value and the slope
# of its variogram; the classifier on top of them is Gradient Boosting.
descr_name1, descr_name2 = "AbbeValueDescr", "VariogramSlopeDescr"
decid_name = "GradBoostDec"

# Tuned parameter of the first descriptor. The grid is built later with
# range(bin_from, bin_to, bin_step), so the upper bound is exclusive.
tun_param1 = "bins"
bin_from, bin_to, bin_step = 10, 150, 50

# Tuned parameter of the second descriptor, scanned the same way.
tun_param2 = "days_per_bin"
dpb_from, dpb_to, dpb_step = 30, 110, 10


# Load example stars which are included in the package
# NOTE(review): obt_method is not referenced by any later cell shown in this
# notebook - confirm whether it is still needed.
obt_method = "FileManager"
# Paths of the bundled samples; they are handed to FileManager when loading.
quasars_path = PackageReader.getSamplePath("qso")
be_stars_path = PackageReader.getSamplePath("be_stars")

Show available descriptors and classifiers

Note: You can use an arbitrary number of descriptors and classifiers.



In [3]:

    
# Name -> class dictionaries of everything the package offers.
all_descriptors = PackageReader().getClassesDict("descriptors")
all_deciders = PackageReader().getClassesDict("deciders")

# Print both catalogues as comma-separated lists of names.
descriptor_names = ", ".join(all_descriptors.keys())
decider_names = ", ".join(all_deciders.keys())
print("Descriptors: {}\n".format(descriptor_names))
print("Deciders: {}".format(decider_names))









    



Using TensorFlow backend.






    



Descriptors: AbbeValueDescr, VariogramSlopeDescr, CurvesShapeDescr, HistShapeDescr, VariogramShapeDescr, PropertyDescr, ColorIndexDescr, SkewnessDescr, KurtosisDescr, CurveDensityDescr, PositionDescriptor, CurveDescr

Deciders: CustomDecider, AdaBoostDec, ExtraTreesDec, GaussianNBDec, GradBoostDec, LDADec, QDADec, RandomForestDec, SVCDec, TreeDec, NeuronDecider

Note: all_descriptors and all_deciders are dictionaries of descriptor and decider classes which can be used directly. For example:

my_descriptor = all_descriptors["AbbeValueDescr"](bins=100)

is the same as:

from lcc.stars_processing.descriptors import AbbeValueDescr

my_descriptor = AbbeValueDescr(bins=100)

Get descriptors and a classifier



In [4]:

    
# Look the classes up by name. Indexing (instead of dict.get) raises a KeyError
# right here when a name is misspelled, rather than silently producing None
# that only fails somewhere downstream.
abbe_descr = all_descriptors[descr_name1]
vario_slope_descr = all_descriptors[descr_name2]
decider = all_deciders[decid_name]

Make a list of all parameter combinations



In [5]:

    
# Keys have the form "<descriptor name>:<parameter name>".
param_keys = [
    ":".join([descr_name1, tun_param1]),
    ":".join([descr_name2, tun_param2]),
]
# range() excludes its upper bound, so bin_to / dpb_to themselves are not tried.
bins_grid = range(bin_from, bin_to, bin_step)
days_per_bin_grid = range(dpb_from, dpb_to, dpb_step)

combinations = get_combinations(param_keys, bins_grid, days_per_bin_grid)
tun_params = parse_tun_query(combinations)



In [6]:

    
# Preview the first three parameter combinations (one nested dict per
# combination, keyed by descriptor name).
tun_params[:3]









    Out[6]:





[{'AbbeValueDescr': {'bins': 10}, 'VariogramSlopeDescr': {'days_per_bin': 30}},
 {'AbbeValueDescr': {'bins': 10}, 'VariogramSlopeDescr': {'days_per_bin': 40}},
 {'AbbeValueDescr': {'bins': 10}, 'VariogramSlopeDescr': {'days_per_bin': 50}}]

Load stars



In [7]:

    
# Read both samples; each connector is created from a query dict that carries
# the path of the sample.
quasars_connector = FileManager({"path": quasars_path})
quasars = quasars_connector.getStars()

be_stars_connector = FileManager({"path": be_stars_path})
be_stars = be_stars_connector.getStars()









    



Loading FITS files:: 100%|██████████| 137/137 [00:15<00:00,  8.90it/s]
Loading FITS files:: 100%|██████████| 221/221 [00:18<00:00, 12.26it/s]

Clean sample

Keep only stars whose light curves have at least bin_to points



In [8]:

    
def _has_enough_points(star):
    """Tell whether ``star`` has a light curve with at least ``bin_to`` magnitudes."""
    return bool(star.lightCurve) and len(star.lightCurve.mag) >= bin_to


# Drop stars without a light curve or with too few measurements.
quasars = [star for star in quasars if _has_enough_points(star)]
be_stars = [star for star in be_stars if _has_enough_points(star)]

Train - test split



In [9]:

    
# Fix the seed so that the same stars land in the training and test sets on
# every run (Restart & Run All gives the same split).
SPLIT_SEED = 42

# 80 % of each class is used for training, the rest is held out for testing.
quasars_train, quasars_test = train_test_split(
    quasars, train_size=0.8, random_state=SPLIT_SEED)
be_stars_train, be_stars_test = train_test_split(
    be_stars, train_size=0.8, random_state=SPLIT_SEED)

Show random quasars



In [10]:

    
import random

# Use a seeded generator so that the same "random" quasars are printed here and
# picked up by the later cells that slice quasars[:3] / quasars[:5].
# The shuffle happens in place: re-running only this cell reorders the list again.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(quasars)

for star in quasars[:3]:
    print("*" * 20)
    print(star)
    print()









    



********************
MACHO identifier:	name: 64.8092.454	


********************
xray identifier:	name: 	
mqs identifier:	name: MQS J053212.24-693130.9	
radio identifier:	name: 	
OgleII identifier:	name: LMC_SC2_407170	db_ident: {'field': 'LMC_SC2', 'starid': 407170}	
	Coordinate: 05h32m12.3684s -69d31m30.792s

********************
xray identifier:	name: 3XMM J050713.6-685952	
mqs identifier:	name: MQS J050713.52-685952.5	
radio identifier:	name: 	
OgleII identifier:	name: LMC_SC13_220502	db_ident: {'field': 'LMC_SC13', 'starid': 220502}	
	Coordinate: 05h07m13.6308s -68d59m52.512s

Plot distribution of magnitudes, variogram and light curve of random quasars



In [11]:

    
# Plot the first three stars of the shuffled quasar list.
plotStarsPicture(quasars[:3])



In [12]:

    
# Light curve description: metadata attached to the first quasar's light curve
# (the recorded output is a dict with the colour band, axis labels and units).
quasars[0].lightCurve.meta









    Out[12]:





{'color': 'V',
 'origin': '',
 'xlabel': 'Time',
 'xlabel_unit': 'days',
 'ylabel': 'Magnitudes',
 'ylabel_unit': 'mag'}

Find optimal parameters



In [13]:

    
# Evaluate every parameter combination on the training samples: quasars are
# the searched class, Be stars are the other class.
estimator_setup = {
    "searched": quasars_train,
    "others": be_stars_train,
    "descriptors": [abbe_descr, vario_slope_descr],
    "deciders": [decider],
    "tuned_params": tun_params,
}
es = ParamsEstimator(**estimator_setup)

# fit() hands back the best filter together with its statistics and parameters.
fit_result = es.fit()
star_filter, best_stats, best_params = fit_result









    



All 24 combinations have been evaluated/Users/martinvo/workspace/private2/LightCurvesClassifier/lcc/stars_processing/tools/params_estim.py:200: UserWarning: 
Error during saving outputs...:
	module 'types' has no attribute 'InstanceType'
  warnings.warn("\nError during saving outputs...:\n\t%s" % e)



In [14]:

    
# Report the combination selected by es.fit(). The previous code printed
# es.tuned_params[0], i.e. the first entry of the candidate list (in the
# recorded output it equals tun_params[0]), regardless of which one won.
print("Optimal parameters: {}".format(best_params))

# Statistics collected by the estimator, rendered as a table.
pd.DataFrame(es.stats)









    



Optimal parameters: {'AbbeValueDescr': {'bins': 10}, 'VariogramSlopeDescr': {'days_per_bin': 30}}






    Out[14]:

Show features for some quasars



In [15]:

    
# Feature values of the first five quasars as seen by the tuned filter
# (the recorded output is a table with one row per star).
star_filter.getSpaceCoordinates(quasars[:5])









    Out[15]:







  
    
      
      Abbe value
      Light curve's variogram slope
    
  
  
    
      64.8092.454
      0.234116
      0.678967
    
    
      LMC_SC2_407170
      0.018131
      1.625663
    
    
      LMC_SC13_220502
      0.371647
      0.258574
    
    
      SMC_SC3_130350
      0.159039
      1.620601
    
    
      LMC_SC4_21565
      0.682249
      0.197449

Evaluate model

Now we can use tuned filter to predict whether given stars are quasars or not. Note that we're evaluating on the test sample.



In [16]:

    
# Score the held-out test stars of both classes with the tuned filter.
prediction_quasars = star_filter.evaluateStars(quasars_test)
prediction_be_stars = star_filter.evaluateStars(be_stars_test)



In [17]:

    
# First few predictions for the test quasars (indexed by star name).
prediction_quasars.head()









    Out[17]:





2.5873.82          0.12
LMC_SC13_214352    0.98
SMC_SC9_87380      0.99
69.12549.21        0.02
SMC_SC1_45081      0.21
dtype: float64

Probability plot

Purple stars are quasars, black ones are Be stars. The background color represents the probability that an object is a quasar (see the colorbar on the right).



In [18]:

    
# Draw the probability space of the tuned filter.
plotProbabSpace(star_filter)

Find quasar candidates in OGLEII

NOTE: The OgleII query sometimes fails because of problems with their database, so if the query takes too long, that may be the cause.



In [19]:

    
# Connectivity check: this query should return a star from OGLEII.
single_star_query = {"starid": 1, "field_num": 2, "target": "lmc"}
ogle_star = OgleII(single_star_query).getStars()



In [20]:

    
# Settings of the systematic OgleII search. The *_to bounds are passed to
# range() when the queries are built, so they are exclusive.
db_name = "OgleII"
target = "lmc"
starid_from, starid_to = 1, 10
field_num_from, field_num_to = 1, 2



In [21]:

    
# Build one query per (starid, field_num, target) combination.
query_keys = ["starid", "field_num", "target"]
starid_grid = range(starid_from, starid_to)
field_num_grid = range(field_num_from, field_num_to)
queries = get_combinations(query_keys, starid_grid, field_num_grid, [target])

# Run the sequential systematic search using the tuned filter.
searcher = StarsSearcher(
    [star_filter],
    db_connector=db_name,
    stat_file_path="/tmp/lcc_status.csv",
    save_coords=True,
)
searcher.queryStars(queries)

passed_stars = searcher.getPassedStars()









    



/Users/martinvo/workspace/private2/LightCurvesClassifier/lcc/stars_processing/systematic_search/stars_searcher.py:181: UserWarning: Removing existing status file /tmp/lcc_status.csv
  warnings.warn("Removing existing status file {}".format(stat_file_path))
Loading FITS files:: 100%|██████████| 9/9 [00:00<00:00, 13.70it/s]



In [22]:

    
searcher.getStatus()









    Out[22]:







  
    
      
      Abbe value
      GradBoostDec
      Light curve's variogram slope
      found
      lc
      passed
      passed_GradBoostDec
    
    
      star_name
      
      
      
      
      
      
      
    
  
  
    
      LMC_SC1_1
      0.271766
      0.743301
      0.385947
      True
      True
      True
      True
    
    
      LMC_SC1_2
      0.401306
      0.993926
      -0.128386
      True
      True
      True
      True
    
    
      LMC_SC1_3
      0.507551
      0.993926
      -0.178724
      True
      True
      True
      True
    
    
      LMC_SC1_4
      0.302269
      0.953597
      0.637023
      True
      True
      True
      True
    
    
      LMC_SC1_5
      0.135854
      0.976475
      -0.153439
      True
      True
      True
      True
    
    
      LMC_SC1_6
      0.552128
      0.993926
      0.153158
      True
      True
      True
      True
    
    
      LMC_SC1_7
      0.050737
      0.061906
      -0.470107
      True
      True
      False
      False
    
    
      LMC_SC1_8
      0.140007
      0.957434
      0.560850
      True
      True
      True
      True
    
    
      LMC_SC1_9
      0.068299
      0.588846
      -0.189427
      True
      True
      True
      True



In [23]:

    
# Save the first five quasars into /tmp; the recorded output lists their names.
saveStars(quasars[:5], "/tmp")









    Out[23]:





['64.8092.454',
 'LMC_SC2_407170',
 'LMC_SC13_220502',
 'SMC_SC3_130350',
 'LMC_SC4_21565']

Redis searcher



In [24]:

    
# Run the same queries through the Redis-based searcher.
# NOTE(review): presumably needs a reachable Redis server - confirm.
redis_searcher = StarsSearcherRedis([star_filter], db_connector=db_name, save_coords=True)
redis_searcher.queryStars(queries)

# Rebinds passed_stars, replacing the result of the sequential search above.
passed_stars = redis_searcher.getPassedStars()









    



Remaining jobs: 5
Remaining jobs: 5
Remaining jobs: 5
Remaining jobs: 1
Remaining jobs: 1
Remaining jobs: 1






    



Loading FITS files::  11%|█         | 1/9 [00:00<00:00,  8.75it/s]





    



Remaining jobs: 0






    



Loading FITS files:: 100%|██████████| 9/9 [00:00<00:00, 10.11it/s]



In [26]:

    
# redis_searcher.getStatus()



In [ ]:

	Abbe value	Light curve's variogram slope
64.8092.454	0.234116	0.678967
LMC_SC2_407170	0.018131	1.625663
LMC_SC13_220502	0.371647	0.258574
SMC_SC3_130350	0.159039	1.620601
LMC_SC4_21565	0.682249	0.197449

	Abbe value	GradBoostDec	Light curve's variogram slope	found	lc	passed	passed_GradBoostDec
star_name
LMC_SC1_1	0.271766	0.743301	0.385947	True	True	True	True
LMC_SC1_2	0.401306	0.993926	-0.128386	True	True	True	True
LMC_SC1_3	0.507551	0.993926	-0.178724	True	True	True	True
LMC_SC1_4	0.302269	0.953597	0.637023	True	True	True	True
LMC_SC1_5	0.135854	0.976475	-0.153439	True	True	True	True
LMC_SC1_6	0.552128	0.993926	0.153158	True	True	True	True
LMC_SC1_7	0.050737	0.061906	-0.470107	True	True	False	False
LMC_SC1_8	0.140007	0.957434	0.560850	True	True	True	True
LMC_SC1_9	0.068299	0.588846	-0.189427	True	True	True	True