In [1]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from IPython import display

There is usually a catch-all topic made up of conference jargon. It is easy to spot, so we remove it manually.


In [2]:
# Topic-model outputs: doc_topic has one row per paper (columns are topics),
# topic_word has one row per topic (columns are vocabulary words).
topic_word = np.genfromtxt('topic_word.csv', delimiter=',')
doc_topic = np.genfromtxt('doc_topic.csv', delimiter=',')

# Vocabulary, one word per line of vocab.csv
with open('vocab.csv') as vocab_file:
    vocab = vocab_file.read().splitlines()

# Show document distributions across topics (transposed, so each column is a paper)
plt.imshow(np.transpose(doc_topic), interpolation='none')
plt.show()

# Optional: remove the catch-all prasa-robmech jargon topic and renormalise
# (if your stopwords are set up nicely don't bother).
# NOTE(review): the original note called this "topic 2" while the code below
# deletes index 3 -- confirm which index is the catch-all topic for your data.
#doc_topic = np.delete(doc_topic, (3), axis=1)
#doc_topic = (doc_topic.T/np.sum(doc_topic,axis=1)).T
#topic_word = np.delete(topic_word,(3),axis=0)
#topic_word = topic_word/np.sum(topic_word,axis=0)

#plt.imshow(doc_topic.T,interpolation='none')
#plt.show()



In [3]:
from scipy.spatial.distance import pdist, squareform

# Condensed vector of standardised-Euclidean distances between every pair of
# papers, computed on their topic distributions (rows of doc_topic)
Y = pdist(doc_topic, metric='seuclidean')

# Expand the condensed vector into the full square paper-by-paper matrix
D = squareform(Y)

In [4]:
# Heat map of the paper-to-paper distance matrix D
dist_fig, dist_ax = plt.subplots(figsize=(15, 8))
dist_ax.imshow(D, interpolation='none')
plt.show()


Set the conference schedule here: `session_papers` lists, for each session, the number of papers it can hold.


In [5]:
# Conference schedule: entry k is the number of papers session k can hold
session_papers = [4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 4, 3]

# Sanity check: total number of paper slots, then number of sessions.
# A single pre-formatted string prints identically on Python 2 and Python 3
# (the bare `print a, b` statement does not parse on Python 3).
print('%d %d' % (sum(session_papers), len(session_papers)))


55 16

In [ ]:
# Makes pretty spreadsheet, requires a csv file with paper details (title, authors, paper id)
def save_schedule(filename='example.xlsx'):
    """Write the current session assignment to a colour-coded Excel workbook.

    One row is written per paper, grouped by session, with the columns
    Session, Topic, Title, Authors and Paper ID. The Topic column holds the
    six highest-weighted vocabulary words of the session's averaged topic
    distribution. Every row of a session shares one background colour.

    Reads the notebook globals ``mfinal`` (session index per paper),
    ``doc_topic``, ``topic_word`` and ``session_papers``, and the files
    ``vocab.csv`` and ``paper_details.csv`` (columns ``title``, ``authors``
    and ``paper_id``).
    NOTE(review): rows of paper_details.csv are assumed to be in the same
    order as the rows of doc_topic -- confirm against how the csv is produced.

    Parameters
    ----------
    filename : str, optional
        Path of the workbook to create (default ``'example.xlsx'``).
    """
    import csv

    import xlsxwriter
    from matplotlib import colors

    with open('vocab.csv') as f:
        vocab = np.array(f.read().splitlines())

    with open('paper_details.csv') as csvfile:
        paper_details = list(csv.DictReader(csvfile))

    # Session indices run from 0 to max(mfinal). One extra colour is requested
    # because 'hsv' is cyclic: its last entry is almost the same red as its
    # first, and an index beyond the table is clamped to that last entry.
    n_sessions = int(np.max(mfinal)) + 1
    cmap = plt.get_cmap('hsv', n_sessions + 1)    # PiYG

    # Sort the papers by session once; the stable sort keeps papers of the
    # same session in their original order.
    order = np.argsort(mfinal, kind='mergesort')

    # Compute everything before the workbook is created so that a failure in
    # the numerical part never leaves a half-written file behind.
    session_summary = {}    # session index -> (keyword string, hex colour)
    rows = []
    for paper_idx in order:
        sess = mfinal[paper_idx]
        sess_id = int(sess)
        if sess_id not in session_summary:
            # Topic distribution of the session: summed over its papers and
            # divided by the session's capacity from session_papers
            Pt = 1.0/session_papers[sess_id]*np.sum(doc_topic[mfinal == sess, :], axis=0)
            # Word weights implied by that topic mixture
            Pw = np.sum(np.multiply(topic_word.T, Pt), axis=1)
            # Six highest-weighted words, in ascending order of weight
            bins = np.argsort(Pw)[-6:]
            sess_topic = ' '.join(vocab[bins].tolist())
            session_summary[sess_id] = (sess_topic,
                                        colors.rgb2hex(cmap(sess_id)[:3]))
        rows.append((sess, sess_id, paper_details[int(paper_idx)]))

    workbook = xlsxwriter.Workbook(filename)
    try:
        worksheet = workbook.add_worksheet()
        worksheet.set_column(0, 0, 10)
        worksheet.set_column(1, 1, 50)
        worksheet.set_column(2, 4, 80)

        for col, header in enumerate(['Session', 'Topic', 'Title', 'Authors', 'Paper ID']):
            worksheet.write(0, col, header)

        # One bordered, coloured cell format per session, created on first use
        formats = {}
        for row, (sess, sess_id, detail) in enumerate(rows, 1):
            sess_topic, colour = session_summary[sess_id]
            if sess_id not in formats:
                fmt = workbook.add_format()
                fmt.set_border(1)
                fmt.set_bg_color(colour)
                formats[sess_id] = fmt
            fmt = formats[sess_id]

            worksheet.write(row, 0, sess, fmt)
            worksheet.write(row, 1, sess_topic, fmt)
            worksheet.write(row, 2, detail['title'], fmt)
            worksheet.write(row, 3, detail['authors'], fmt)
            worksheet.write(row, 4, detail['paper_id'], fmt)
    finally:
        # Always close so the workbook's resources are released
        workbook.close()

Cluster the papers into sessions. This may take some time, so just interrupt the cell once you are happy with the output in example.xlsx.


In [ ]:
N = doc_topic.shape[0]            # number of papers
K = len(session_papers)           # number of sessions
Num_Iters = 2500                  # number of random restarts

# The random allocation below hands out sum(session_papers) slots. With fewer
# slots than papers the surplus papers would silently stay in session 0
# (the initial value of the assignment vector), so fail loudly instead.
if sum(session_papers) < N:
    raise ValueError('session_papers provides %d slots but there are %d papers'
                     % (sum(session_papers), N))


def _session_cost(assignment):
    """Return the clustering cost of a paper -> session assignment.

    For every session the pairwise distances in ``D`` between its papers are
    summed and divided by the squared size of ``D``; the result is averaged
    over the ``K`` sessions.
    """
    total = 0.0
    for k in range(K):
        members = np.where(assignment == k)[0]
        total = total + np.sum(D[np.ix_(members, members)])/float(D.shape[0]*D.shape[0])
    return total/K


# Greedy clustering with random restarts; EBest is the lowest cost seen so far
EBest = np.inf

plt.figure(figsize=(20,8))
for reseed_iter in range(Num_Iters):

    # Randomly allocate papers to sessions: paper p gets rank mp[p] in a random
    # permutation, and ranks in [Gcs[j-1], Gcs[j]) belong to session j-1
    mp = np.arange(N)
    np.random.shuffle(mp)
    Gcs = np.hstack((0,np.cumsum(np.array(session_papers))))
    m = np.zeros((N,))
    for j in range(1,Gcs.shape[0]):
        m[(mp<Gcs[j])&(mp >= Gcs[j-1])] = j-1

    # Calculate cost of session assignment
    E = _session_cost(m)

    # Sweep over all pairs of papers, keeping any swap that lowers the cost,
    # until a full sweep brings no improvement. t counts the improving sweeps.
    t = 0
    while True:
        # An explicit flag instead of comparing costs for equality: the
        # equality test never succeeds if the cost is NaN, which would loop forever.
        improved = False
        rp = np.arange(N)
        np.random.shuffle(rp)
        for a in rp:
            for b in set(range(N)) - set([a]):
                # Swapping two papers of the same session changes nothing
                if m[a] == m[b]:
                    continue
                m[a], m[b] = m[b], m[a]
                E_t = _session_cost(m)
                if E_t < E:
                    E = E_t
                    improved = True
                else:
                    # Not an improvement: undo the swap
                    m[a], m[b] = m[b], m[a]

        if not improved:
            break

        t = t + 1

    if E < EBest:
        EBest = E
        # Keep an independent copy of the best assignment
        mfinal = m.copy()
        save_schedule()
        # Show session distribution assignments: the papers of each session
        # stacked together, with a row of zeros as separator after each session
        Sess_mat = []
        for i in range(K):
            Sess_mat.append(doc_topic[mfinal==i,:])
            Sess_mat.append(np.zeros((1,doc_topic.shape[1])))
        # Clear the previous best so images do not pile up on the same axes
        plt.clf()
        plt.imshow(np.vstack(Sess_mat).T,interpolation='none')
        plt.ylabel('Topic distribution')
        display.clear_output(wait=True)
        display.display(plt.gcf())
        # Restart number, improving sweeps, last pair tried, cost, best cost.
        # Joined into one string so the output is the same on Python 2 and 3.
        print(' '.join(str(v) for v in ('Iter:', reseed_iter, t, a, b, E, EBest)))



In [ ]:
#Show session distribution assignments, one panel per session.
# Subplot indices are 1-based and the grid must have room for all K sessions,
# so the number of rows is derived from K (ceiling division) instead of
# being fixed at 3.
n_cols = 4
n_rows = (K + n_cols - 1) // n_cols
plt.figure(figsize=(15,5))
for i in range(K):
    plt.subplot(n_rows, n_cols, i + 1)
    # Rows are the papers assigned to session i, columns are topics
    plt.imshow(doc_topic[mfinal==i,:],interpolation='none')
    plt.title('Session %d' % i)
plt.show()

In [ ]:
# Save to csv instead of xlsx if you prefer
def save_csv(filename='scheduled_papers.csv'):
    """Write the current session assignment to a csv file.

    One row is written per paper, grouped by session. Each row holds the
    paper's fields from ``paper_details.csv`` preceded by two extra columns:
    ``session`` (the session index) and ``topic`` (the six highest-weighted
    vocabulary words of the session's averaged topic distribution). The first
    40 characters of each title and the session keywords are also printed.

    Reads the notebook globals ``mfinal``, ``doc_topic``, ``topic_word`` and
    ``session_papers``, and the files ``vocab.csv`` and ``paper_details.csv``.
    NOTE(review): rows of paper_details.csv are assumed to be in the same
    order as the rows of doc_topic -- confirm against how the csv is produced.

    Parameters
    ----------
    filename : str, optional
        Path of the csv file to create (default ``'scheduled_papers.csv'``).
    """
    import csv
    import sys

    with open('vocab.csv') as f:
        vocab = np.array(f.read().splitlines())

    with open('paper_details.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        paper_details = list(reader)
        # Header of the input file, in file order (None for an empty file)
        input_columns = list(reader.fieldnames or [])

    # 'session' and 'topic' come first, followed by the input columns in their
    # original order; existing columns with those names are not duplicated.
    keys = ['session', 'topic'] + [c for c in input_columns
                                   if c not in ('session', 'topic')]

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        output_file = open(filename, 'wb')
    else:
        output_file = open(filename, 'w', newline='')

    with output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()

        # Sort the papers by session once; the stable sort keeps papers of
        # the same session in their original order.
        order = np.argsort(mfinal, kind='mergesort')
        session_topics = {}    # session index -> keyword string
        for paper_idx in order:
            sess = mfinal[paper_idx]
            sess_id = int(sess)
            if sess_id not in session_topics:
                # Topic distribution of the session: summed over its papers
                # and divided by the session's capacity from session_papers
                Pt = 1.0/session_papers[sess_id]*np.sum(doc_topic[mfinal == sess, :], axis=0)
                # Word weights implied by that topic mixture
                Pw = np.sum(np.multiply(topic_word.T, Pt), axis=1)
                # Six highest-weighted words, in ascending order of weight
                bins = np.argsort(Pw)[-6:]
                session_topics[sess_id] = ' '.join(vocab[bins].tolist())
            sess_topic = session_topics[sess_id]

            detail = paper_details[int(paper_idx)]
            print('%s %s' % (detail['title'][0:40], sess_topic))
            detail['topic'] = sess_topic
            detail['session'] = sess
            dict_writer.writerow(detail)