In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%pylab inline

In [ ]:
# Read the scientist and the student tables from their CSV files.
SCIENTISTS_CSV = "scientists_1.csv"
STUDENTS_CSV = "students.csv"
sci_list_tot = pd.read_csv(SCIENTISTS_CSV)
std_list_tot = pd.read_csv(STUDENTS_CSV)

In [ ]:
# Append derived columns. The order in which they are created matters:
# later cells address columns by position, and these land at the end.
for people in (std_list_tot, sci_list_tot):
    people['Full Name'] = people['Name'] + " " + people['Last']

# Join the address parts with single spaces; a missing part becomes ''.
address_columns = [
    'Mailing Address (starting September 2017)',
    'Address Line 2',
    'City',
    'State / Province / Region',
    'Postal / Zip Code',
    'Country',
]
full_address = sci_list_tot[address_columns[0]].fillna('')
for column in address_columns[1:]:
    full_address = full_address + " " + sci_list_tot[column].fillna('')
sci_list_tot['Full Address'] = full_address

In [ ]:
# Scientists: keep only the first row seen for each 'Email' value.
sci_list_tot = sci_list_tot.drop_duplicates(subset='Email', keep='first')
# Students: the de-duplication key is 'Full Name' (not an email column).
std_list_tot = std_list_tot.drop_duplicates(subset='Full Name', keep='first')

In [ ]:
# Working copies as plain NumPy arrays; the cells below index them by position.
sci_list, std_list = (np.array(frame) for frame in (sci_list_tot, std_list_tot))

In [ ]:
# Disabled alternative input: the code below is wrapped in a string literal,
# so it is never executed. It would load the mock CSV files into sci_list and
# std_list instead of the arrays built from the real tables.
"""#mock files
sci_list = np.array(pd.read_csv("mock_sci_long.csv"))
std_list = np.array(pd.read_csv("mock_std_long.csv"))
"""

In [ ]:
#sort lists, student by enthusiasm and scientist by time_stamp
# Key columns for the descending sort. They must be the same columns that the
# later re-sort of std_list_tot / sci_list_tot uses (12 and 19), because
# `pairs` stores row positions from this ordering and applies them to the
# re-sorted full tables. The scientist key used to be column 20 here, which
# disagreed with that re-sort and with the column picked as time_stamp below.
STD_ENTHUSIASM_COL = 12
SCI_TIMESTAMP_COL = 19
# Keep the sorted(...)[::-1] form rather than reverse=True: the two order tied
# keys differently, and the later re-sort uses the [::-1] form.
std_list = np.array(sorted(std_list, key = lambda x: x[STD_ENTHUSIASM_COL])[:: - 1])
sci_list = np.array(sorted(sci_list, key = lambda x: x[SCI_TIMESTAMP_COL])[:: - 1])

In [ ]:
# Disabled variant of the sort: the code is inside a string literal and never
# runs. It sorts both lists on column 1, in descending order.
"""
#sort lists, student by enthusiasm and scientist by time_stamp
std_list = np.array(sorted(std_list, key = lambda x: x[1])[:: - 1])
sci_list = np.array(sorted(sci_list, key = lambda x: x[1])[:: - 1])
"""

In [ ]:
# Keep only the columns the matching step reads, in this order:
#   student:   column 1 (name), column 12 (enthusiasm), column 11 (field)
#   scientist: column 2 (name), column 19 (time_stamp), columns 12, 13 (field1, field2)
student_columns = [1, 12, 11]
scientist_columns = [2, 19, 12, 13]
std_list = std_list[:, student_columns]
sci_list = sci_list[:, scientist_columns]

In [ ]:
### data fix ###
# Each student row becomes a plain Python list with one extra trailing entry:
# a second field derived from the first one, to increase the chance of a
# successful match. Fields without a mapping get the placeholder "nothing".
second_field_for = {
    "machines": "engineering",
    "computers": "energy",
    "space": "rocks",
}
std_list = [
    list(row) + [second_field_for.get(row[2], "nothing")]
    for row in std_list
]

In [ ]:
# For every student, collect the row positions of scientists whose field1 or
# field2 equals the student's first field, followed by those that equal the
# student's second field. A trailing -1 marks the end of each candidate list.
sci_list_c = np.copy(sci_list)
matches = np.empty(len(std_list), dtype = object)
for student_id, student in enumerate(std_list):
    hits = [
        np.where(sci_list.T[sci_col] == student[std_col])[0]
        for std_col in (2, 3)
        for sci_col in (2, 3)
    ]
    candidates = hits[0]
    for extra in hits[1:]:
        candidates = np.append(candidates, extra)
    matches[student_id] = np.append(candidates, - 1)

In [ ]:
# Greedy assignment in student order: each student takes the first candidate
# scientist that has not been taken yet. `pairs` rows are
# (student id, scientist id); a negative scientist id means no field match:
#   -1 -> the student had no candidate at all
#   -2 -> every candidate had already been taken
sci_list_c = np.copy(sci_list)
matches_c = np.copy(matches)
pairs = []
for student_id in range(len(matches_c)):
    no_match_code = -1
    while True:
        candidates = matches_c[student_id]
        scientist_id = int(candidates[0])
        if len(candidates) == 1:
            # Only the trailing -1 sentinel is left.
            pairs.append((student_id, no_match_code))
            break
        if sci_list_c[scientist_id][0] is not None:
            # A taken scientist is flagged by blanking the first slot of the row.
            sci_list_c[scientist_id][0] = None
            pairs.append(np.array((student_id, scientist_id)))
            break
        # Candidate already taken: discard it and look at the next one.
        matches_c[student_id] = candidates[1:]
        no_match_code = -2
pairs = np.array(pairs)

In [ ]:
#random matching the students that did not get a succesful match
# A fixed seed makes the fallback pairing identical on every run of the
# notebook; change RANDOM_MATCH_SEED to draw a different assignment.
RANDOM_MATCH_SEED = 2017
rng = np.random.RandomState(RANDOM_MATCH_SEED)

# Scientists whose first slot was not blanked by the matching step are free.
res_id = [i1 for i1 in range(len(sci_list_c)) if sci_list_c[i1][0] is not None]
for i1 in range(len(pairs)):
    if pairs[i1][1] < 0:
        if not res_id:
            # Fail with an explicit message instead of numpy's generic one.
            # A negative id must not be left in place: it would later be
            # used as an index and silently pick a row from the end.
            raise ValueError(
                "no unassigned scientist left for student at position {}".format(i1)
            )
        pairs[i1][1] = rng.choice(res_id)
        # Each scientist is handed out at most once.
        res_id.remove(pairs[i1][1])

In [ ]:
# Convert the full tables (all columns) to NumPy arrays; nothing is re-read
# from disk here. The cells below address columns by position.
sci_list_tot, std_list_tot = map(np.array, (sci_list_tot, std_list_tot))

In [ ]:
# Number of columns in a scientist row (sanity check for the positional indices).
len(sci_list_tot[0])

In [ ]:
def _tidy_name(value):
    """Return `value` stripped, with '-' replaced by ' ' and title-cased.

    Non-string values are returned unchanged. pandas loads an empty CSV cell
    as a float NaN, and calling .strip() on it would raise AttributeError.
    """
    if not isinstance(value, str):
        return value
    return value.strip().replace("-", " ").title()


# Rows of a 2-D array are views, so assigning into `row` updates the table.
# Students: columns 2 and 1; scientists: columns 2 and 3.
for row in std_list_tot:
    row[2] = _tidy_name(row[2])
    row[1] = _tidy_name(row[1])
for row in sci_list_tot:
    row[2] = _tidy_name(row[2])
    row[3] = _tidy_name(row[3])

In [ ]:
# Order the full tables in descending order of column 12 (students) and
# column 19 (scientists). `pairs` is applied to these arrays by row position,
# so this ordering has to line up with the one used when `pairs` was computed.
# NOTE(review): confirm the earlier sort of sci_list uses the same key column.
std_list_tot = np.array(list(reversed(sorted(std_list_tot, key=lambda row: row[12]))))
sci_list_tot = np.array(list(reversed(sorted(sci_list_tot, key=lambda row: row[19]))))

In [ ]:
# The string literal below is disabled code (mock-data variant); it never runs.
"""
#opening again the files, to have all the columns
sci_list_tot = np.array(pd.read_csv("mock_sci_long.csv"))
std_list_tot = np.array(pd.read_csv("mock_std_long.csv"))

#sort them again in the same way as before
std_list_tot = np.array(sorted(std_list_tot, key = lambda x: x[1])[:: - 1])
sci_list_tot = np.array(sorted(sci_list_tot, key = lambda x: x[1])[:: - 1])
"""
# Resolve every (student id, scientist id) pair into the two full rows.
tot_pairs = [
    (std_list_tot[student_id], sci_list_tot[scientist_id])
    for student_id, scientist_id in pairs
]

In [ ]:
# Inspection: number of (student id, scientist id) pairs; the matching loop
# appends exactly one pair per student.
len(pairs)

In [ ]:
# For each student, take the first text value found in columns 5..10, or ""
# when none of those columns holds a string. `i4` counts the students that
# ended up with "".
# The rows are read straight from std_list_tot; the earlier version copied
# the whole table with np.array(...) on every access inside the inner loop.
grade = []
i4 = 0
for row in std_list_tot:
    first_text = next((cell for cell in row[5:11] if isinstance(cell, str)), None)
    if first_text is None:
        i4 += 1
        grade.append("")
    else:
        grade.append(first_text)

In [ ]:
# Inspection: number of resolved (student row, scientist row) pairs.
len(tot_pairs)

In [ ]:
# Inspection: number of entries in `grade`; the output cell indexes it with
# the same position as tot_pairs.
len(grade)

In [ ]:
# Build one 16-field tuple per pair for the export:
#   an empty first field,
#   scientist columns 1, 2, 3, 10, 11, 27,
#   student columns 3, 1, 2, 4,
#   the `grade` entry and student column 11 at the same position,
#   three empty trailing fields.
scientist_fields = (1, 2, 3, 10, 11, 27)
student_fields = (3, 1, 2, 4)
output = []
for position, (student, scientist) in enumerate(tot_pairs):
    scientist_part = tuple(str(scientist[col]) for col in scientist_fields)
    student_part = tuple(str(student[col]) for col in student_fields)
    tail = (grade[position], std_list_tot[position][11], "", "", "")
    aux = ("",) + scientist_part + student_part + tail
    output.append(aux)
    print(aux)

In [ ]:


In [ ]:
# Write the rows to output.csv with '^' as the field separator. The header
# titles follow the order of the fields in each `output` tuple.
header_fields = [
    "Notes", "email", "First Name", "Last Name", "Occupation", "Field",
    "Mailing address", "Teacher", "First Name", "Last Name", "Grade", "Class",
    "Interest", "Empty1", "Empty2", "Empty3",
]
np.savetxt(
    "output.csv",
    output,
    fmt="%s",
    delimiter="^",
    header="^".join(header_fields),
)

In [ ]:
# One line per pair, using scientist columns 0, 2, 3 and student columns 0, 2, 1:
# <scientist col 0> \t (<col 2>, <col 3>) ·> <student col 0> is interested in: <col 2> \t (<col 1>).
summary_line = "{!s}\t ({!s}, {!s})  \t ·> {!s} is interested in: {!s} \t ({!s})."
for student, scientist in tot_pairs:
    print(summary_line.format(
        scientist[0], scientist[2], scientist[3],
        student[0], student[2], student[1],
    ))

In [ ]:
# Notes on the output (left as comments so this cell does not fail when run):
#   - email scientist
#   - grade and class room
#   - full

In [ ]:


In [ ]:
# One line per pair:
# <scientist cols 2 3> \t (<col 12>, <col 13>) ·> <student cols 1 2> is interested in: <col 11> \t (<col 12>).
pair_line = "{!s} {!s}\t ({!s}, {!s})  \t ·> {!s} {!s} is interested in: {!s} \t ({!s})."
for student, scientist in tot_pairs:
    print(pair_line.format(
        scientist[2], scientist[3], scientist[12], scientist[13],
        student[1], student[2], student[11], student[12],
    ))

In [ ]:
# OLD: disabled exploratory code. Everything below is inside a string literal,
# so none of it runs. It plots how often each value occurs in selected columns
# and collects the rows left without a match. It refers to names such as
# mock_sci, mock_std, mock_sci_c and pair, which no other cell shown here defines.
"""plt.figure(figsize=(20,5))
my_dict = {i:(mock_sci_lef.T[2].tolist()).count(i) for i in mock_sci_lef.T[2]}

asdf1 = np.array(list(my_dict.values()))

plt.plot(np.array(list(my_dict.values())), linestyle='--', drawstyle='steps')
plt.xticks(np.arange(0,len(asdf1)),np.array(list(my_dict.keys())))
plt.xticks(rotation=90)
plt.ylim(1,60)
plt.show()

plt.figure(figsize=(20,5))
my_dict = {i:(mock_sci.T[2].tolist()).count(i) for i in mock_sci.T[2]}

asdf1 = np.array(list(my_dict.values()))

plt.plot(np.array(list(my_dict.values())), linestyle='--', drawstyle='steps')
plt.xticks(np.arange(0,len(asdf1)),np.array(list(my_dict.keys())))
plt.xticks(rotation=90)
plt.ylim(1,150)
plt.show()

plt.figure(figsize=(20,5))
my_dict = {i:(mock_sci.T[3].tolist()).count(i) for i in mock_sci.T[3]}

asdf1 = np.array(list(my_dict.values()))
plt.plot(np.array(list(my_dict.values())), linestyle='--', drawstyle='steps')
plt.xticks(np.arange(0,len(asdf1)),np.array(list(my_dict.keys())))
plt.xticks(rotation=90)
plt.ylim(1,150)
plt.show()

plt.figure(figsize=(20,5))
my_dict = {i:(mock_std.T[2].tolist()).count(i) for i in mock_std.T[2]}

asdf1 = np.array(list(my_dict.values()))
plt.plot(np.array(list(my_dict.values())), linestyle='--', drawstyle='steps')
plt.xticks(np.arange(0,len(asdf1)),np.array(list(my_dict.keys())))
plt.xticks(rotation=90)
plt.ylim(1,150)
plt.show()

mock_sci_lef = []
for i1 in range(len(mock_sci_c)):
    if mock_sci_c[i1][0] != None:
        mock_sci_lef.append(mock_sci_c[i1])
mock_sci_lef = np.array(mock_sci_lef)
mock_std_lef = []
for i1 in range(len(mock_std)):
    if pair[i1][1] == -2:
        mock_std_lef.append(mock_std[pair[i1][0]])
mock_std_lef = np.array(mock_std_lef)

plt.figure(figsize=(20,5))
my_dict = {i:(mock_std_lef.T[2].tolist()).count(i) for i in mock_std_lef.T[2]}

asdf1 = np.array(list(my_dict.values()))

plt.plot(np.array(list(my_dict.values())), linestyle='--', drawstyle='steps')
plt.xticks(np.arange(0,len(asdf1)),np.array(list(my_dict.keys())))
plt.xticks(rotation=90)
plt.ylim(1,60)
plt.show()

for i1 in range(len(mock_std_lef)):
    print(mock_std_lef.T[1][i1],mock_std_lef.T[2][i1])
"""