Based on this blog post.
In [1]:
import numpy as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import graphscheduler
%matplotlib inline
In [2]:
from datetime import datetime
In [3]:
conn = sqlite3.connect('aas_abstracts.sqlite')
In [4]:
engine = conn.cursor()
engine.execute("""SELECT DISTINCT(session.type) FROM session
WHERE session.meeting_code = 'aas227';""")
Out[4]:
In [5]:
engine.fetchall()
Out[5]:
In [6]:
query = """
SELECT session.title, session.start_date_time, session.end_date_time, session.so_id
FROM session
WHERE session.meeting_code = 'aas227'
AND session.type IN (
'Oral Session',
'Special Session',
'Splinter Meeting'
)
ORDER BY session.so_id;
"""
session_results = engine.execute(query).fetchall()
In [7]:
session_results[0]
Out[7]:
In [8]:
# load the session data into a Pandas DataFrame
sess_cols = ['title', 'start_date_time', 'end_date_time', 'so_id']
session_df = pd.DataFrame(session_results, columns=sess_cols)
In [9]:
# turn the timestamps into datetime
session_df['start_date_time'] = pd.to_datetime(session_df['start_date_time'])
session_df['end_date_time'] = pd.to_datetime(session_df['end_date_time'])
session_df = session_df[1:] # zero-th entry has a corrupt date
session_df.head()
Out[9]:
In [10]:
query = """
SELECT presentation.title, presentation.abstract, presentation.id, session.so_id
FROM session, presentation
WHERE session.meeting_code = 'aas227'
AND session.so_id = presentation.session_so_id
AND presentation.status IN ('Sessioned', '')
AND session.type IN (
'Oral Session',
'Special Session',
'Splinter Meeting'
)
ORDER BY presentation.id;
"""
presentation_results = engine.execute(query).fetchall()
In [11]:
# sort the presentations by session
presentation_results = sorted(presentation_results, key=lambda x: x[-1])  # x[-1] is so_id
pres_cols = ['title', 'abstract', 'id', 'so_id']
# load the presentation data into a Pandas DataFrame and clean out HTML tags
presentation_df = pd.DataFrame(presentation_results, columns=pres_cols)
presentation_df['abstract'] = presentation_df['abstract'].str.replace('<[^<]+?>', '', regex=True)
presentation_df['title'] = presentation_df['title'].str.replace('<[^<]+?>', '', regex=True)
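The regex above is enough for the simple tags in these abstracts. A more robust alternative (just a sketch; it assumes BeautifulSoup is installed) is to let an HTML parser do the stripping:

from bs4 import BeautifulSoup
presentation_df['abstract'] = presentation_df['abstract'].apply(
    lambda s: BeautifulSoup(s, 'html.parser').get_text())
presentation_df['title'] = presentation_df['title'].apply(
    lambda s: BeautifulSoup(s, 'html.parser').get_text())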
In [12]:
presentation_df.head()
Out[12]:
In [13]:
nsessions = len(session_df)
npresentations = len(presentation_df)
print(nsessions, npresentations)
In [14]:
import nltk
from nltk.stem.porter import PorterStemmer
import re
In [15]:
def tokenize(text, stemmer=PorterStemmer()):
    # remove non-letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem each token
    stems = [stemmer.stem(token) for token in tokens]
    return stems
In [16]:
tokenize("This image contains one foreground galaxy and many background galaxies")
Out[16]:
In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words='english',
)
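One caveat: because we pass a custom tokenizer, scikit-learn applies the English stop word list to the stemmed tokens, so stop words whose stems differ from their surface form slip through (e.g. 'this' stems to 'thi', which is not in the stop list). A quick demonstration with the vectorizer above:

vectorizer.fit_transform(["this is a galaxy"])
vectorizer.get_feature_names()  # ['galaxi', 'thi'] -- 'thi' survives the stop list

This is harmless for ranking common science words, but worth remembering when reading the counts below.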
In [18]:
example_titles = ["Exoplanets, exoplanets, exoplanets", "I found an exoplanet",
                  "Stuff that explodes in space"]
example_count_matrix = vectorizer.fit_transform(example_titles).toarray()
vectorizer.get_feature_names()
Out[18]:
In [19]:
example_count_matrix[0]
Out[19]:
In [20]:
title_count_matrix = vectorizer.fit_transform(presentation_df['title']).toarray()
title_count_matrix.shape
Out[20]:
In [21]:
title_counts = title_count_matrix.sum(axis=0)
sort_by_count_idx = title_counts.argsort()[::-1] # reverse sort
words = np.array(vectorizer.get_feature_names())
# print the word stem and the number of occurrences
for idx in sort_by_count_idx[:10]:
    print(words[idx], title_counts[idx])
In [22]:
abs_count_matrix = vectorizer.fit_transform(presentation_df['abstract']).toarray()
abs_count_matrix.shape
Out[22]:
In [23]:
abs_counts = abs_count_matrix.sum(axis=0)
sort_by_count_idx = abs_counts.argsort()[::-1] # reverse sort
words = np.array(vectorizer.get_feature_names())
# print the word stem and the number of occurrences
for idx in sort_by_count_idx[:10]:
    print(words[idx], abs_counts[idx])
In [24]:
def similarity(v1, v2):
    numer = v1.dot(v2)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if numer < 1:  # if no common words, the vectors are orthogonal
        return 0.
    else:
        return numer / denom
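This is just the cosine similarity of two count vectors:

\[
\mathrm{sim}(\mathbf{v}_1, \mathbf{v}_2) = \frac{\mathbf{v}_1 \cdot \mathbf{v}_2}{\lVert \mathbf{v}_1 \rVert \, \lVert \mathbf{v}_2 \rVert}
\]

It is 1 for two abstracts with identical word distributions and 0 for abstracts that share no word stems; the early return handles the latter case, since the dot product of non-negative count vectors is 0 exactly when no stem appears in both.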
In [25]:
example_count_matrix
Out[25]:
In [26]:
sim_01 = similarity(example_count_matrix[0], example_count_matrix[1])
sim_02 = similarity(example_count_matrix[0], example_count_matrix[2])
sim_12 = similarity(example_count_matrix[1], example_count_matrix[2])
sim_01, sim_02, sim_12
Out[26]:
In [27]:
similarity_matrix = np.zeros((npresentations, npresentations))
for ix1 in range(npresentations):
    for ix2 in range(npresentations):
        similarity_matrix[ix1, ix2] = similarity(abs_count_matrix[ix1], abs_count_matrix[ix2])
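The double loop is easy to read but makes n² Python-level calls. An equivalent vectorized version (a sketch; it assumes no abstract has an all-zero count vector, otherwise the division by the norm needs guarding):

# normalize each row to unit length, then one matrix product gives all cosines
norms = np.linalg.norm(abs_count_matrix, axis=1)
normalized = abs_count_matrix / norms[:, None]
similarity_matrix = normalized.dot(normalized.T)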
In [28]:
plt.figure(figsize=(8, 8))
# cap the color scale just below the self-similarity diagonal so off-diagonal structure is visible
vmax = similarity_matrix[similarity_matrix < 0.99].max()
plt.imshow(similarity_matrix, cmap='magma', interpolation='nearest', vmax=vmax)
Out[28]:
In [29]:
# take the upper triangle so each pair is counted once
triu_similarity_matrix = np.triu(similarity_matrix)
similarity_matrix_1d = triu_similarity_matrix.ravel()
# drop the self-similarity values (== 1) and sort descending
sorted_sim = np.sort(similarity_matrix_1d[~np.isclose(similarity_matrix_1d, 1.)])[::-1]
# indices of the 10 most similar pairs of abstracts
most_similar_idx = np.where((triu_similarity_matrix >= sorted_sim[9]) &
                            (~np.isclose(similarity_matrix, 1.)))
most_similar_idx = np.vstack(most_similar_idx).T
In [30]:
for ix1, ix2 in most_similar_idx:
    pres1 = presentation_df.iloc[ix1]
    pres2 = presentation_df.iloc[ix2]
    print(pres1['title'])
    print(pres2['title'])
    print()
In [31]:
from sklearn.utils.extmath import cartesian

def session_similarity(so_id1, so_id2):
    """
    Get the sub-matrix of the similarity matrix for all pairs
    of presentations between two session IDs.
    """
    presentations_session1 = presentation_df[presentation_df['so_id'] == so_id1]
    presentations_session2 = presentation_df[presentation_df['so_id'] == so_id2]
    if len(presentations_session1) == 0 or len(presentations_session2) == 0:
        # no presentations in one of the sessions
        return np.array([])
    index_pairs = cartesian((presentations_session1.index, presentations_session2.index)).T
    sub_matrix = similarity_matrix[(index_pairs[0], index_pairs[1])]
    shape = (len(presentations_session1), len(presentations_session2))
    sub_matrix = sub_matrix.reshape(shape)
    return sub_matrix
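As a sanity check on the indexing (assuming the first remaining session has sessioned presentations with non-empty abstracts), comparing a session against itself should give ones on the diagonal of its sub-matrix:

so_id = session_df['so_id'].iloc[0]
np.allclose(np.diag(session_similarity(so_id, so_id)), 1.)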
In [32]:
session_df = session_df[:-1]
session_df.head()
Out[32]:
In [ ]:
len(session_df)
In [33]:
empty_session = 0
titles = []
max_scores = []
for title1, so_id1 in zip(session_df['title'], session_df['so_id']):
    for title2, so_id2 in zip(session_df['title'], session_df['so_id']):
        if so_id1 >= so_id2:
            continue  # only fill the upper triangle
        # similarity between all pairs of presentations between the two sessions
        scores = session_similarity(so_id1, so_id2)
        if scores.size == 0:  # no presentations in one of the sessions
            empty_session += 1
            continue
        max_scores.append(scores.max())
        titles.append([title1, title2])

# report the most similar pair of sessions overall
max_idx = np.argmax(max_scores)
for t in titles[max_idx]:
    print(t)
print(max_scores[max_idx])
print()
In [34]:
print(len(max_scores))
In [35]:
empty_session
Out[35]:
In [36]:
session_df.count()
Out[36]:
In [37]:
139 * 138 / 2.  # number of unique pairs of the 139 sessions
Out[37]:
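If the bookkeeping above is right, the scored pairs plus the skipped (empty) pairs should account for every one of these session pairs:

len(max_scores) + empty_session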
In [38]:
import networkx as nx
In [54]:
G = nx.Graph()
empty_session = 0
for title1, so_id1 in zip(session_df['title'], session_df['so_id']):
    for title2, so_id2 in zip(session_df['title'], session_df['so_id']):
        if so_id1 >= so_id2:
            continue  # only fill the upper triangle
        # similarity between all pairs of presentations between the two sessions
        scores = session_similarity(so_id1, so_id2)
        if scores.size == 0:  # no presentations in one of the sessions
            empty_session += 1
            continue
        # more similar sessions get smaller edge weights
        G.add_node(title1)
        G.add_node(title2)
        G.add_edge(title1, title2, weight=1. - scores.max())
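Edge weights are 1 minus the maximum presentation-level similarity, so the most related sessions are joined by the lightest edges. One way to eyeball the graph before scheduling:

# the most similar session pairs are the lowest-weight edges
edges = sorted(G.edges(data=True), key=lambda e: e[2]['weight'])
for title1, title2, data in edges[:5]:
    print(data['weight'], title1, title2)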
In [55]:
nx.draw(G)
In [56]:
G.number_of_nodes()
Out[56]:
In [57]:
session_df.count()
Out[57]:
In [ ]:
n_elements = 3
results = graphscheduler.find_solution_numerical(G, n_elements)
In [ ]:
G.number_of_nodes() % n_elements  # does the number of sessions divide evenly into groups of n_elements?