In [2]:
!pip install nxpd
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import numpy as np
from operator import truediv
from collections import Counter
import itertools
import random
import collaboratr
#from nxpd import draw
#import nxpd
#reload(collaboratr)
1. What is your name? [text entry]
2. What is your gender? [multiple choice]
3. What are your general science interests? [checkboxes]
I can ask for other information from the students (e.g., grade, school name) and scientists (email).
After receiving the responses, load up the CSV of responses from the Google Form by running the cell below (you'll have to change the path to your own CSV).
In [2]:
def format_name(data):
    """Build a "First Last" display name for every row of ``data``.

    Each entry of the ``'Name'`` and ``'Last'`` columns is split on spaces and
    hyphens, every piece is passed through ``str.capitalize`` and the pieces
    are re-joined with hyphens (``"mary ann"`` -> ``"Mary-Ann"``).  The two
    results are then joined with a single space.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with string columns ``'Name'`` and ``'Last'``.

    Returns
    -------
    pandas.Series
        The full names, carrying the same index as ``data`` so the result can
        be assigned straight back (``data['Full Name'] = format_name(data)``)
        even when rows were dropped from ``data`` and its index has gaps.
    """
    def _tidy(column):
        # One list of name pieces per row, e.g. "van der berg" -> ["van", "der", "berg"].
        pieces_per_row = data[column].str.replace(" ", "-").str.split('-')
        return ['-'.join(map(str.capitalize, pieces)) for pieces in pieces_per_row]

    full_names = [first + " " + last for first, last in zip(_tidy('Name'), _tidy('Last'))]
    # Reuse the caller's index: a default 0..n-1 index would be aligned by label
    # on assignment and silently shift/blank names in a frame with dropped rows.
    return pd.Series(full_names, index=data.index)
In [10]:
# Load the form responses (CSV exports of the Google Sheets) into pandas dataframes.
# Missing answers are replaced by a single space so every cell can be treated as a string.
student_data = pd.read_csv("students.csv")
student_data = student_data.replace(np.nan,' ', regex=True)

# Store student information in variables.
#
# Collaboratr divided people into "learners" and "teachers" based on what they wanted to "learn" and "teach."
# Here, students are always "learners" by default and the scientists are always "teachers."
# To maintain the structure of the pandas dataframe,
# I've created blank values for what students want to "teach" and what scientists want to "learn."
student_data['Full Name'] = format_name(student_data)  # formatted names, hyphenated parts included
student_names = student_data['Full Name']
nStudents = len(student_names)
student_learn = student_data['If I could be any type of scientist when I grow up, I would want to study:']
student_teach = pd.Series([""] * nStudents, index=range(nStudents))
student_email = pd.Series([""] * nStudents, index=range(nStudents))

# Store scientist information in variables.
scientist_data = pd.read_csv("scientists_1.csv")
scientist_data = scientist_data.replace(np.nan,' ', regex=True)

# Keep only the first response per email address, then renumber the rows 0..n-1.
# Without the renumbering the index has gaps where rows were dropped, and the
# 'Full Name' assignment below (aligned by index label) would pair names with
# the wrong rows; it would also disagree with the 0..n-1 index of scientist_learn.
scientist_data = scientist_data.drop_duplicates(subset='Email', keep='first').reset_index(drop=True)

scientist_data['Full Name'] = format_name(scientist_data)
scientist_names = scientist_data['Full Name']
nScientists = len(scientist_names)
scientist_learn = pd.Series([""] * nScientists, index=range(nScientists))
scientist_teach = scientist_data['We will match you with a pen pal who has expressed an interest in at least one of the following subjects. Which topic is most relevant to your work?']
scientist_email = scientist_data['Email']
In [181]:
# Drop scientists whose formatted full name appears more than once (the first
# response is kept) and renumber the remaining rows 0..n-1.
scientist_data = scientist_data.drop_duplicates(subset='Full Name', keep='first').reset_index(drop=True)

# Re-derive the per-scientist series from the de-duplicated frame; otherwise the
# copies taken before this cell would still contain the dropped rows.
scientist_names = scientist_data['Full Name']
nScientists = len(scientist_names)
scientist_learn = pd.Series([""] * nScientists, index=range(nScientists))
scientist_teach = scientist_data['We will match you with a pen pal who has expressed an interest in at least one of the following subjects. Which topic is most relevant to your work?']
scientist_email = scientist_data['Email']
In [8]:
# Stack the student rows first and the scientist rows second, renumbering from 0
# so that position i refers to the same participant in all four series.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
names = pd.concat([student_names, scientist_names], ignore_index=True)
learn = pd.concat([student_learn, scientist_learn], ignore_index=True)
teach = pd.concat([student_teach, scientist_teach], ignore_index=True)
emails = pd.concat([student_email, scientist_email], ignore_index=True)
In [9]:
# Empty directed graph; later cells pass it to collaboratr.insert_node (once per
# participant) and to collaboratr.assign_users.
G = nx.DiGraph()
I thought about several ways to do this. Each student has a "pool" of scientists to be assigned to based on their interests. This was a non-trivial problem. I try to have no more than 2 students assigned to each scientist, working with a limited dataset of roughly 20 scientists and 30 students. Most scientists come from astronomy/physics or psychology/neuroscience. Here are my attempts to do just that:
For each student, randomly draw from their "pool" of scientists with matching interests. This typically caused the more "underrepresented" scientists to get oversubscribed quickly, e.g., when there is only one biologist but many students are interested in biology. It also didn't help students who had limited interests. If I couldn't match everyone up, I'd try again with different random draws, but I couldn't find a solution that satisfied the conditions listed above. Maybe this would work better if we had nScientists > nStudents.
Start with the "least popular" topic, that is, the topic where the student-to-scientist ratio is smallest. Loop through the students with that interest and try to match each of them to a scientist. Then work our way up the list until we get to the most popular topic. This approach worked much better.
In [43]:
# Add every participant to the graph: one collaboratr.insert_node call per person,
# with the ';'-separated "learn" and "teach" answers split into lists of topics.
for person, address, wants, offers in zip(names, emails, learn, teach):
    collaboratr.insert_node(
        G,
        person,
        email=address,
        learn=wants.split(';'),
        teach=offers.split(';'),
    )
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [74]:
def sort_things(stu_data, sci_data,
                learn_col='If I could be any type of scientist when I grow up, I would want to study:',
                teach_col='We will match you with a pen pal who has expressed an interest in at least one of the following subjects. Which topic is most relevant to your work?'):
    """Order the topics and the students for the matching pass.

    Parameters
    ----------
    stu_data : pandas.DataFrame
        Student responses.  Needs ``learn_col`` and either a ``'Full Name'``
        column or the raw ``'Name'`` and ``'Last'`` columns.
    sci_data : pandas.DataFrame
        Scientist responses.  Needs ``teach_col``.
    learn_col, teach_col : str, optional
        Columns holding the ';'-separated topics students are interested in
        and scientists work on.  The defaults are the form questions used by
        this notebook.

    Returns
    -------
    interests_rel_sorted : list of str
        Topics offered by at least one scientist, ordered by increasing
        student-to-scientist ratio (least contested topic first).
    stu_names_sorted : list of str
        Student names ordered by increasing number of non-blank topics they
        listed; ties keep the row order of ``stu_data``.
    """
    # Prefer the already formatted names so they agree with the names used
    # elsewhere in the notebook; otherwise build "First Last" from the raw columns.
    if 'Full Name' in stu_data.columns:
        stu_names = list(stu_data['Full Name'])
    else:
        stu_names = [first.capitalize() + " " + last.capitalize()
                     for first, last in zip(stu_data['Name'], stu_data['Last'])]

    stu_topics = [entry.split(';') for entry in stu_data[learn_col]]
    sci_topics = [entry.split(';') for entry in sci_data[teach_col]]

    # One entry per student (the dict is filled, not rebound, on each row).
    # Blank answers are not counted as an interest.
    num_interests = {}
    for name, topics in zip(stu_names, stu_topics):
        num_interests[name] = sum(1 for topic in topics if topic.strip())
    stu_names_sorted = sorted(num_interests, key=num_interests.get)

    # Demand/supply per topic.  Only topics some scientist offers are ranked,
    # so the denominator is always at least 1.
    interests_stu = Counter(itertools.chain.from_iterable(stu_topics))
    interests_sci = Counter(itertools.chain.from_iterable(sci_topics))
    interests_rel = {key: interests_stu[key] / interests_sci[key] for key in interests_sci}
    interests_rel_sorted = sorted(interests_rel, key=interests_rel.get)
    return interests_rel_sorted, stu_names_sorted
def assigner(assign, stu_data, sci_data, max_students=2):
    """Pick one scientist for every student by random draws, topic by topic.

    Topics are visited in the order returned by ``sort_things`` and, within a
    topic, students in the order returned by ``sort_things``.  A student who
    lists the topic draws a scientist at random from their candidates for it.
    Once a scientist has ``max_students`` students they are removed from every
    candidate list.  Students still unmatched after all topics draw from the
    scientists that have capacity left.

    Parameters
    ----------
    assign : dict
        Indexed here as ``assign[student_name][topic] -> list of scientist
        names``; entries may be empty.  (In this notebook it is the result of
        ``collaboratr.assign_users``.)  It is not modified.
    stu_data, sci_data : pandas.DataFrame
        Student and scientist responses, passed on to ``sort_things``.
        Scientist names are read from ``sci_data['Full Name']`` when that
        column exists, otherwise from ``sci_data['What is your name?']``.
    max_students : int, optional
        Capacity of each scientist.

    Returns
    -------
    dict
        ``{student name: scientist name}``.

    Raises
    ------
    ValueError
        From ``np.random.choice`` when a draw is needed but no scientist has
        capacity left.
    """
    name_col = 'Full Name' if 'Full Name' in sci_data.columns else 'What is your name?'
    subscriptions = {n: 0 for n in sci_data[name_col]}

    # Work on a copy of the candidate lists: exhausted scientists are removed
    # from them below and the caller's structure must stay intact.
    pools = {who: {topic: list(candidates) for topic, candidates in topics.items()}
             for who, topics in assign.items() if topics}

    def _with_capacity():
        # Scientists that can still take at least one more student.
        return [k for k, v in subscriptions.items() if v < max_students]

    assign_one = {}
    interests_rel_sorted, stu_names_sorted = sort_things(stu_data, sci_data)
    for key in interests_rel_sorted:
        for name in stu_names_sorted:
            if name in assign_one:
                continue
            candidates = pools.get(name, {}).get(key)
            if candidates is None:
                # This student did not list the topic (or has no candidate pools).
                continue
            if candidates:
                scientist = str(np.random.choice(candidates))
            else:
                # Every matching scientist is already full: fall back to any
                # scientist with capacity, drawn from the sci_data argument
                # rather than from a notebook global.
                scientist = str(np.random.choice(_with_capacity()))
            assign_one[name] = scientist
            subscriptions[scientist] = subscriptions.get(scientist, 0) + 1
            if subscriptions[scientist] >= max_students:
                # Scientist is full: stop offering them to anyone.
                for topics in pools.values():
                    for remaining in topics.values():
                        if scientist in remaining:
                            remaining.remove(scientist)

    # Students matched by no topic take whoever still has room.  The tally is
    # not updated here, so the caller's per-scientist count check remains the
    # guard against over-subscription.
    for name in stu_names_sorted:
        if name not in assign_one:
            scientist = str(np.random.choice(_with_capacity()))
            assign_one[name] = scientist
    return assign_one
In [47]:
# Draw random assignments until one respects the per-scientist limit.
# The number of draws is capped so that an unsatisfiable setup (for example
# more students than max_students * number of scientists) ends with a clear
# error instead of looping forever.
max_students = 2
MAX_ATTEMPTS = 1000

assign_one = None
for attempt in range(MAX_ATTEMPTS):
    try:
        participants = G.nodes(data=True)
        assign = collaboratr.assign_users(G,participants)
        candidate = assigner(assign, student_data, scientist_data, max_students=max_students)
    except ValueError:
        # No scientist with capacity was left for some draw; try a fresh random draw.
        continue
    load = Counter(candidate.values())
    # Accept the draw when no scientist has more than max_students students
    # (an empty assignment has nobody over the limit).
    if not load or max(load.values()) <= max_students:
        assign_one = candidate
        break

if assign_one is None:
    raise RuntimeError(
        "No assignment with at most %d students per scientist was found in %d attempts"
        % (max_students, MAX_ATTEMPTS)
    )

print(assign_one)
print(Counter(assign_one.values()))
In [8]:
# Print one line per pairing, sorted by scientist:
#   "<scientist> -> <student> who is interested in <topics>"
# Students are looked up through the 'Full Name' column and their topics are read
# from the same question that student_learn is built from in the loading cell.
interest_col = 'If I could be any type of scientist when I grow up, I would want to study:'
# First response wins if a student name occurs more than once.
interests_by_student = student_data.drop_duplicates('Full Name').set_index('Full Name')[interest_col]

items = []
for k,v in assign_one.items():
    # A student that cannot be found gets a visible placeholder instead of raising IndexError.
    topics = str(interests_by_student.get(k, "(student not found)"))
    items.append(str(v).ljust(22) + "-> " + str(k).ljust(22) + "who is interested in " + topics)
for i in sorted(items):
    print(i)
In [75]:
# Show what sort_things returns for the loaded data: first the topic ordering,
# then the student ordering.
a, b = sort_things(stu_data=student_data, sci_data=scientist_data)
print(*(a, b))
In [ ]: