Based on this blog post.
In [1]:
import numpy as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import graphscheduler
%matplotlib inline
In [2]:
from datetime import datetime
In [3]:
conn = sqlite3.connect('aas_abstracts.sqlite')
In [4]:
engine = conn.cursor()
engine.execute("""SELECT DISTINCT(session.type) FROM session
WHERE session.meeting_code = 'aas227';""")
Out[4]:
In [5]:
engine.fetchall()
Out[5]:
In [6]:
query = """
SELECT session.title, session.start_date_time, session.end_date_time, session.so_id
FROM session
WHERE session.meeting_code = 'aas227'
AND session.type IN (
'Oral Session',
'Special Session',
'Splinter Meeting'
)
ORDER BY session.so_id;
"""
session_results = engine.execute(query).fetchall()
In [7]:
session_results[0]
Out[7]:
In [8]:
# load the session data into a Pandas DataFrame
sess_cols = ['title', 'start_date_time', 'end_date_time', 'so_id']
session_df = pd.DataFrame(session_results, columns=sess_cols)
In [9]:
# turn the timestamps into datetime
session_df['start_date_time'] = pd.to_datetime(session_df['start_date_time'])
session_df['end_date_time'] = pd.to_datetime(session_df['end_date_time'])
session_df = session_df[1:] # zero-th entry has a corrupt date
session_df.head()
Out[9]:
In [10]:
query = """
SELECT presentation.title, presentation.abstract, presentation.id, session.so_id
FROM session, presentation
WHERE session.meeting_code = 'aas227'
AND session.so_id = presentation.session_so_id
AND presentation.status IN ('Sessioned', '')
AND session.type IN (
'Oral Session',
'Special Session',
'Splinter Meeting'
)
ORDER BY presentation.id;
"""
presentation_results = engine.execute(query).fetchall()
In [11]:
# sort the presentations by session
presentation_results = sorted(presentation_results, key=lambda x: x[-1])  # x[-1] is so_id
pres_cols = ['title', 'abstract', 'id', 'so_id']
# load the presentation data into a Pandas DataFrame and clean out HTML tags
presentation_df = pd.DataFrame(presentation_results, columns=pres_cols)
presentation_df['abstract'] = presentation_df['abstract'].str.replace('<[^<]+?>', '', regex=True)
presentation_df['title'] = presentation_df['title'].str.replace('<[^<]+?>', '', regex=True)
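The regex above is enough for the simple tags in these abstracts. A more robust alternative (just a sketch; it assumes BeautifulSoup is installed) is to let an HTML parser do the stripping:

from bs4 import BeautifulSoup
presentation_df['abstract'] = presentation_df['abstract'].apply(
    lambda s: BeautifulSoup(s, 'html.parser').get_text())
presentation_df['title'] = presentation_df['title'].apply(
    lambda s: BeautifulSoup(s, 'html.parser').get_text())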
In [12]:
presentation_df.head()
Out[12]:
In [13]:
nsessions = len(session_df)
npresentations = len(presentation_df)
print(nsessions, npresentations)
In [14]:
import nltk
from nltk.stem.porter import PorterStemmer
import re
In [15]:
def tokenize(text, stemmer=PorterStemmer()):
    # remove non-letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem each token
    stems = [stemmer.stem(token) for token in tokens]
    return stems
In [16]:
tokenize("This image contains one foreground galaxy and many background galaxies")
Out[16]:
In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words='english',
)
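One caveat: because we pass a custom tokenizer, scikit-learn applies the English stop word list to the stemmed tokens, so stop words whose stems differ from their surface form slip through (e.g. 'this' stems to 'thi', which is not in the stop list). A quick demonstration with the vectorizer above:

vectorizer.fit_transform(["this is a galaxy"])
vectorizer.get_feature_names()  # ['galaxi', 'thi'] -- 'thi' survives the stop list

This is harmless for ranking common science words, but worth remembering when reading the counts below.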
In [18]:
example_titles = ["Exoplanets, exoplanets, exoplanets", "I found an exoplanet",
                  "Stuff that explodes in space"]
example_count_matrix = vectorizer.fit_transform(example_titles).toarray()
vectorizer.get_feature_names()
Out[18]:
In [19]:
example_count_matrix[0]
Out[19]:
In [20]:
title_count_matrix = vectorizer.fit_transform(presentation_df['title']).toarray()
title_count_matrix.shape
Out[20]:
In [21]:
title_counts = title_count_matrix.sum(axis=0)
sort_by_count_idx = title_counts.argsort()[::-1] # reverse sort
words = np.array(vectorizer.get_feature_names())
# print the word stem and the number of occurrences
for idx in sort_by_count_idx[:10]:
    print(words[idx], title_counts[idx])
In [22]:
abs_count_matrix = vectorizer.fit_transform(presentation_df['abstract']).toarray()
abs_count_matrix.shape
Out[22]:
In [23]:
abs_counts = abs_count_matrix.sum(axis=0)
sort_by_count_idx = abs_counts.argsort()[::-1] # reverse sort
words = np.array(vectorizer.get_feature_names())
# print the word stem and the number of occurrences
for idx in sort_by_count_idx[:10]:
    print(words[idx], abs_counts[idx])
In [24]:
def similarity(v1, v2):
    numer = v1.dot(v2)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if numer < 1:  # if no common words, the vectors are orthogonal
        return 0.
    else:
        return numer / denom
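This is just the cosine similarity of two count vectors:

\[
\mathrm{sim}(\mathbf{v}_1, \mathbf{v}_2) = \frac{\mathbf{v}_1 \cdot \mathbf{v}_2}{\lVert \mathbf{v}_1 \rVert \, \lVert \mathbf{v}_2 \rVert}
\]

It is 1 for two abstracts with identical word distributions and 0 for abstracts that share no word stems; the early return handles the latter case, since the dot product of non-negative count vectors is 0 exactly when no stem appears in both.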
In [25]:
example_count_matrix
Out[25]:
In [26]:
sim_01 = similarity(example_count_matrix[0], example_count_matrix[1])
sim_02 = similarity(example_count_matrix[0], example_count_matrix[2])
sim_12 = similarity(example_count_matrix[1], example_count_matrix[2])
sim_01, sim_02, sim_12
Out[26]:
In [27]:
similarity_matrix = np.zeros((npresentations, npresentations))
for ix1 in range(npresentations):
    for ix2 in range(npresentations):
        similarity_matrix[ix1, ix2] = similarity(abs_count_matrix[ix1], abs_count_matrix[ix2])
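The double loop is easy to read but makes n² Python-level calls. An equivalent vectorized version (a sketch; it assumes no abstract has an all-zero count vector, otherwise the division by the norm needs guarding):

# normalize each row to unit length, then one matrix product gives all cosines
norms = np.linalg.norm(abs_count_matrix, axis=1)
normalized = abs_count_matrix / norms[:, None]
similarity_matrix = normalized.dot(normalized.T)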
In [28]:
plt.figure(figsize=(8, 8))
# cap the color scale just below the self-similarity diagonal so off-diagonal structure is visible
vmax = similarity_matrix[similarity_matrix < 0.99].max()
plt.imshow(similarity_matrix, cmap='magma', interpolation='nearest', vmax=vmax)
Out[28]:
In [29]:
# take the upper triangle so each pair is counted once
triu_similarity_matrix = np.triu(similarity_matrix)
similarity_matrix_1d = triu_similarity_matrix.ravel()
# drop the self-similarity values (== 1) and sort descending
sorted_sim = np.sort(similarity_matrix_1d[~np.isclose(similarity_matrix_1d, 1.)])[::-1]
# indices of the 10 most similar pairs of abstracts
most_similar_idx = np.where((triu_similarity_matrix >= sorted_sim[9]) &
                            (~np.isclose(similarity_matrix, 1.)))
most_similar_idx = np.vstack(most_similar_idx).T
In [30]:
for ix1, ix2 in most_similar_idx:
    pres1 = presentation_df.iloc[ix1]
    pres2 = presentation_df.iloc[ix2]
    print(pres1['title'])
    print(pres2['title'])
    print()
In [31]:
from sklearn.utils.extmath import cartesian

def session_similarity(so_id1, so_id2):
    """
    Get the sub-matrix of the similarity matrix for all pairs
    of presentations between two session IDs.
    """
    presentations_session1 = presentation_df[presentation_df['so_id'] == so_id1]
    presentations_session2 = presentation_df[presentation_df['so_id'] == so_id2]
    if len(presentations_session1) == 0 or len(presentations_session2) == 0:
        # no presentations in one of the sessions
        return np.array([])
    index_pairs = cartesian((presentations_session1.index, presentations_session2.index)).T
    sub_matrix = similarity_matrix[(index_pairs[0], index_pairs[1])]
    shape = (len(presentations_session1), len(presentations_session2))
    sub_matrix = sub_matrix.reshape(shape)
    return sub_matrix
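As a sanity check on the indexing (assuming the first remaining session has sessioned presentations with non-empty abstracts), comparing a session against itself should give ones on the diagonal of its sub-matrix:

so_id = session_df['so_id'].iloc[0]
np.allclose(np.diag(session_similarity(so_id, so_id)), 1.)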
In [32]:
session_df = session_df[:-1]
session_df.head()
Out[32]:
In [ ]:
len(session_df)
In [33]:
empty_session = 0
titles = []
max_scores = []
for title1, so_id1 in zip(session_df['title'], session_df['so_id']):
    for title2, so_id2 in zip(session_df['title'], session_df['so_id']):
        if so_id1 >= so_id2:
            continue  # only fill the upper triangle
        # similarity between all pairs of presentations between the two sessions
        scores = session_similarity(so_id1, so_id2)
        if scores.size == 0:  # no presentations in one of the sessions
            empty_session += 1
            continue
        max_scores.append(scores.max())
        titles.append([title1, title2])

# report the most similar pair of sessions overall
max_idx = np.argmax(max_scores)
for t in titles[max_idx]:
    print(t)
print(max_scores[max_idx])
print()
In [34]:
print(len(max_scores))
In [35]:
empty_session
Out[35]:
In [36]:
session_df.count()
Out[36]:
In [37]:
139 * 138 / 2.  # number of unique pairs of the 139 sessions
Out[37]:
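If the bookkeeping above is right, the scored pairs plus the skipped (empty) pairs should account for every one of these session pairs:

len(max_scores) + empty_session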
In [38]:
import networkx as nx
In [54]:
G = nx.Graph()
empty_session = 0
for title1, so_id1 in zip(session_df['title'], session_df['so_id']):
    for title2, so_id2 in zip(session_df['title'], session_df['so_id']):
        if so_id1 >= so_id2:
            continue  # only fill the upper triangle
        # similarity between all pairs of presentations between the two sessions
        scores = session_similarity(so_id1, so_id2)
        if scores.size == 0:  # no presentations in one of the sessions
            empty_session += 1
            continue
        # more similar sessions get smaller edge weights
        G.add_node(title1)
        G.add_node(title2)
        G.add_edge(title1, title2, weight=1. - scores.max())
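Edge weights are 1 minus the maximum presentation-level similarity, so the most related sessions are joined by the lightest edges. One way to eyeball the graph before scheduling:

# the most similar session pairs are the lowest-weight edges
edges = sorted(G.edges(data=True), key=lambda e: e[2]['weight'])
for title1, title2, data in edges[:5]:
    print(data['weight'], title1, title2)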
In [55]:
nx.draw(G)
In [56]:
G.number_of_nodes()
Out[56]:
In [57]:
session_df.count()
Out[57]:
In [ ]:
n_elements = 3
results = graphscheduler.find_solution_numerical(G, n_elements)
In [ ]:
G.number_of_nodes() % n_elements  # does the number of sessions divide evenly into groups of n_elements?