GEAR Network

Jiarui Xu - jxu57@illinois.edu


In [1]:
%matplotlib inline

In [2]:
import sys

# Python 2 hack: reload(sys) re-exposes setdefaultencoding, but it also
# resets stdout/stderr, so they are saved and restored around the reload.
default_stdout = sys.stdout
default_stderr = sys.stderr
reload(sys)
sys.stdout = default_stdout
sys.stderr = default_stderr
sys.setdefaultencoding('utf-8')  # MathSciNet records contain non-ASCII names

In [3]:
import scrapy
import requests
import urllib2
import BeautifulSoup
import pickle
import pyprind
import json
import collections
import matplotlib.pyplot as plt

In [ ]:
import os

In [5]:
sys.path.append("../")

In [6]:
import mathscinet

In [8]:
reload(mathscinet)


Out[8]:
<module 'mathscinet' from '../mathscinet.pyc'>

Load basic information


In [ ]:
with open("./profile/profile.json") as f:
    orig_profile = json.load(f)
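
For reference, each entry of `orig_profile['items']` is assumed to look roughly like the sketch below; the field names are inferred from how they are used in this notebook, and the values are made up.

In [ ]:
# hypothetical example record (values are illustrative only)
example_person = {
    "member_id": "24",       # gear id
    "mathsci_id": "123456",  # MathSciNet author id, or "NA" if unknown
    "name": "Steven",
    "surname": "Bradlow"
}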

In [ ]:
gear_mathsci_mapper = {}
mathsci_gear_mapper = {}

for person in orig_profile['items']:
    gear_mathsci_mapper[person['member_id']] = person['mathsci_id']
    mathsci_gear_mapper[person['mathsci_id']] = person['member_id']

Retrieve papers for each professor

Construct paper_set: mathsci_id -> paper_list mapping
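
Each paper record returned by `mathscinet.find_papers_by_author_id` is assumed to carry at least the fields used below; a minimal sketch with made-up values:

In [ ]:
# hypothetical shape of one paper record (fields inferred from usage below)
example_paper = {
    "id": "MR0000000",      # MathSciNet paper id (made-up value)
    "date": 2013,           # publication year
    "authors": ["123456"],  # MathSciNet author ids of the coauthors
    # a "citing" list is attached later by find_parent_citations
}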


In [ ]:
bar = pyprind.ProgBar(len(orig_profile['items']), bar_char='█', width=70)

paper_set = {}

for person in orig_profile['items']:
    
    bar.update()
    
    member_id = person['member_id']
    mathsci_id = person['mathsci_id']
    
    if mathsci_id != 'NA':
        paper_set[mathsci_id] = mathscinet.find_papers_by_author_id(mathsci_id)

In [ ]:
with open("mathscinet_paper_base.json", "wb") as f:
    json.dump(paper_set, f)

In [ ]:
with open("mathscinet_paper_base.json", "rb") as f:
    paper_set = json.load(f)

In [ ]:
paper_set

Profile Update

1. Retrieving papers


In [ ]:
paper_set_2011 = {}
count_2011 = 0

for mathsci_id in paper_set.keys():
    
    paper_set_2011[mathsci_id] = []
    
    for paper in paper_set[mathsci_id]:
        if paper['date'] >= 2011:
            count_2011 += 1
            paper_set_2011[mathsci_id].append(paper)

print "Completed! count:", count_2011

2. Retrieving citations


In [ ]:
bar = pyprind.ProgBar(count_2011, bar_char='█', width=70)

for mathsci_id in paper_set_2011.keys():
    
    for paper in paper_set_2011[mathsci_id]:
        bar.update()
        paper['citing'] = mathscinet.find_parent_citations( paper['id'] )

In [ ]:
with open("mathscinet_paper_base_with_citation.json", "wb") as f:
    json.dump(paper_set_2011, f)

In [ ]:
with open("mathscinet_paper_base_with_citation.json", "rb") as f:
    paper_set_2011 = json.load(f)

3. Updating the profile accordingly

3.1 Update collaborator detail


In [ ]:
def update_collaborators(gear_profile, gear_paper_set, starting_year, ending_year, converter):
    
    col_detail_key = "%s-%s collaborators details" % (str(starting_year), str(ending_year))
    col_size_key = "%s-%s collaborators sizes" % (str(starting_year), str(ending_year))
    
    for person in gear_profile['items']:
        details = {}
        sizes = {}
        
        mathsci_id = person['mathsci_id']

        # if mathsci_id does not exist, continue
        if mathsci_id == "NA":
            continue

        for paper in gear_paper_set[mathsci_id]:
            year = paper['date']

            if year < starting_year or year > ending_year:
                continue

            authors = paper['authors']

            for au in authors:
                if au in converter:
                    gear_id = converter[au]
                    
                    if gear_id == person['member_id']:
                        continue
                    
                    if gear_id in details:
                        details[gear_id].append( paper['id'] )
                    else:
                        details[gear_id] = [ paper['id'] ]
        
        for key in details.keys():
            sizes[key] = len(details[key])
            
        person[col_detail_key] = details
        person[col_size_key] = sizes

In [ ]:
for ending_year in range(2011, 2017):
    update_collaborators(orig_profile, paper_set_2011, 2011, ending_year, mathsci_gear_mapper)

3.2 Update citations


In [ ]:
def get_citation_keys(citations):
    keys = []
    for cite in citations:
        keys.append(cite['id'])
    return keys

In [ ]:
def retrieve_citations(person, selected_paper_set, starting_year, ending_year):
    
    mathsci_id = person['mathsci_id']
    
    citations = []
    
    if mathsci_id == "NA":
        return citations
    
    paper_list = selected_paper_set[mathsci_id]
    
    for paper in paper_list:
        year = paper['date']
        if year < starting_year or year > ending_year:
            continue
        citations.extend( get_citation_keys(paper['citing']) )
        
    return citations

In [ ]:
def list_overlap(list_a, list_b):

    a_multiset = collections.Counter(list_a)
    b_multiset = collections.Counter(list_b)

    overlap = list((a_multiset & b_multiset).elements())
    
    return overlap
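
A quick sanity check of `list_overlap` (the same multiset-intersection behavior is demonstrated again near the end of the notebook):

In [ ]:
# the intersection keeps the minimum multiplicity of each element:
# one 3, one 4, two 5s, one 6
list_overlap([3, 4, 5, 5, 5, 6], [1, 3, 4, 4, 5, 5, 6, 7])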

In [ ]:
def update_citations(profile, paper_set, starting_year, ending_year, converter):
    
    cite_details_key = "%s-%s citation details" % (str(starting_year), str(ending_year))
    cite_sizes_key = "%s-%s citation sizes" % (str(starting_year), str(ending_year))
    
    # build member_id -> person mapping
    temp_profile = {}
    for person in profile['items']:
        member_id = person['member_id']
        temp_profile[member_id] = person
        
    author_list = temp_profile.keys()

    for person in profile['items']:
        person[cite_details_key] = {}
        person[cite_sizes_key] = {}
        
        member_id = person['member_id']
        
        # compute this person's citation list once, not once per comparison
        this_person_citation = retrieve_citations(person, paper_set, starting_year, ending_year)
        
        for other_person_id in author_list:
            
            if other_person_id == member_id:
                continue
            
            other_person = temp_profile[other_person_id]
            other_person_citation = retrieve_citations(other_person, paper_set, starting_year, ending_year)
            
            # papers that cite both people, and how many there are
            overlap = list_overlap(other_person_citation, this_person_citation)
            if len(overlap) > 0:
                person[cite_details_key][other_person_id] = overlap
                person[cite_sizes_key][other_person_id] = len(overlap)

In [ ]:
for ending_year in range(2011, 2017):
    update_citations(orig_profile, paper_set_2011, 2011, ending_year, mathsci_gear_mapper)

In [ ]:
orig_profile

Matrix Builder


In [ ]:
def print_matrix(folder_name, file_name, matrix):
    path = os.path.join(folder_name, file_name)
    print path
    with open(path, "w") as f:
        f.write("Source;Target;Weight;Type\n")
        for key in matrix.keys():
            f.write(str(key[0]) + ";")
            f.write(str(key[1]) + ";")
            f.write(str(matrix[key]) + ";")
            f.write("undirected\n")

In [ ]:
def matrix_maker(gear_profile, starting_year, ending_year):
    collab_matrix = {}
    citation_matrix = {}
    
    cit_size_key = "%s-%s citation sizes" % (str(starting_year), str(ending_year))
    col_size_key = "%s-%s collaborators sizes" % (str(starting_year), str(ending_year))
    
    for person in gear_profile['items']:
        
        if person['mathsci_id'] == "NA":
            continue
        
        author_id = person['member_id']
        
        # keep each undirected pair only once, via a consistent ordering of ids
        for key in person[col_size_key].keys():
            if key > author_id:
                collab_matrix[(author_id, key)] = person[col_size_key][key]
        
        for key in person[cit_size_key].keys():
            if key > author_id:
                citation_matrix[(author_id, key)] = person[cit_size_key][key]
                
    print_matrix("output", str(starting_year)+"_"+str(ending_year)+"_citation", citation_matrix)
    print_matrix("output", str(starting_year)+"_"+str(ending_year)+"_coauthor", collab_matrix)

In [ ]:
for ending_year in range(2011, 2017):
    matrix_maker(orig_profile, 2011, ending_year)

Visualize


In [ ]:
#The package which handles the graph objects
import networkx as nx

# Matplotlib is the default package for
# rendering the graphs
import matplotlib.pyplot as plt

def simple_graph(profile):

    #create an empty graph
    G = nx.Graph()
    
    name_mapping = {}
    
    for person in profile['items']:
        aid = person['member_id']
        name = person['name']
        surname = person['surname']
        name_mapping[aid] = name + " " + surname
        
    for person in profile['items']:
        aid = person['member_id']
        cite = person['2011-2015 citation sizes']
        for au in cite.keys():
            edge = (aid, au)
            # name_edge = (name_mapping[aid], name_mapping[au])
            # G.add_edge(name_edge[0], name_edge[1])
            G.add_edge(str(edge[0]), str(edge[1]))
    
    # draw and show the graph
    nx.draw(G)
    # plt.savefig("graph.png", dpi=1000)
    plt.show()

    # Viewer is not defined in this notebook; import an interactive graph
    # viewer before uncommenting these lines.
    # app = Viewer(G)
    # app.mainloop()
simple_graph(orig_profile)

In [ ]:
def similarity_calculator(first, second):
    # expects unicode input; sys.setdefaultencoding('utf-8') above keeps
    # mixed str/unicode comparisons safe in Python 2
    # (fuzz, difflib, Levenshtein, distance are imported in the
    # "Combine databases" section below)

    # 1. fuzzy matcher
    fr = fuzz.ratio(first, second)
    pr = fuzz.partial_ratio(first, second)
    sor = fuzz.token_sort_ratio(first, second)
    ser = fuzz.token_set_ratio(first, second)
    
    # 2. sequence similarity
    s = difflib.SequenceMatcher(lambda x: x == " ", first, second)
    seq = round(s.ratio(), 3)
    
    # 3. edit distance
    # 3.1 absolute
    try:
        lv_ab = Levenshtein.distance(first, second)
    except Exception:
        print "ooops", first, second
        lv_ab = None
    
    # 3.2 jaro
    lv_ja = Levenshtein.jaro(first, second)
    
    # 3.3 jaro_winkler
    lv_jaw = Levenshtein.jaro_winkler(first, second)
    
    # 3.4 ratio
    lv_ra = Levenshtein.ratio(first, second)
    
    # 4. Sorensen and Jaccard
    sr = distance.sorensen(first, second)
    ja = distance.jaccard(first, second)
    print fr, pr, sor, ser, seq, lv_ab, lv_ja, lv_jaw, lv_ra, sr, ja

In [ ]:
fuzz.ratio('Andersen, J\xf8rgen E.; Chekhov, Leonid O.; Penner, R. C.; Reidys, Christian M.; Su\u0142kowski, Piotr Topological recursion for chord diagrams, RNA complexes, and cells in moduli spaces. Nuclear Phys. B 866 (2013), no. 3, 414\u2013443.', \
           'Ande J\xf8rgen E.; Chekhov, Leonid O Penner, R. C.; Reidys, Christian M.; Su\u0142kowski, Piotr Topological recursion for chord diagrams, RNAcomplexes,and cells in moduli spaces. Nuclear Phys. B 866 (2013), no. 3, 4142013443.')

Compute the number of papers two members wrote together between y1 and y2


In [ ]:
def count_coop(gid1, gid2, y1, y2):
    # assumes two globals built elsewhere:
    #   base:       gear member_id -> mathsci author id
    #   paper_base: mathsci author id -> list of paper records
    val = 0
    aid1 = base[gid1]
    aid2 = base[gid2]
    papers_1 = paper_base[aid1]
    
    for paper in papers_1:
        if paper['year'] >= y1 and paper['year'] <= y2:
            if aid2 in paper['authors']:
                val += 1
    return val

In [ ]:
# example gear member ids: "24", "12"

In [ ]:
count_coop("24","12",1000,2016)

In [ ]:
data = {}
for a1 in base.keys():
    data[a1] = {}
    for a2 in base.keys():
        if a1 != a2:
            data[a1][a2] = count_coop(a1, a2, 2011, 2011)

In [ ]:
with open("input.csv", "w") as f:
    f.write("Source;Target;Weight;Type\n")
    for a1 in data.keys():
        for a2 in data[a1].keys():
            if data[a1][a2] > 0:
                f.write(a1)
                f.write(";")
                f.write(a2)
                f.write(";")
                f.write(str(data[a1][a2]))
                f.write(";")
                f.write("undirected\n")

In [ ]:
def print_single_year(year):
    data = {}
    for a1 in base.keys():
        data[a1] = {}
        for a2 in base.keys():
            if a1 != a2:
                data[a1][a2] = count_coop(a1, a2, year, year)
                
    with open(str(year)+"_input.csv", "w") as f:
        f.write("Source;Target;Weight;Type\n")
        for a1 in data.keys():
            for a2 in data[a1].keys():
                if data[a1][a2] > 0:
                    f.write(a1)
                    f.write(";")
                    f.write(a2)
                    f.write(";")
                    f.write(str(data[a1][a2]))
                    f.write(";")
                    f.write("undirected\n")

In [ ]:
print_single_year(2011)
print_single_year(2012)
print_single_year(2013)
print_single_year(2014)
print_single_year(2015)
print_single_year(2016)

Combine databases


In [ ]:
orig = "Boileau, Michel; Boyer, Steven; Cebanu, Radu; Walsh, Genevieve S. Knot commensurability and the Berge conjecture. Geom. Topol. 16 (2012), no. 2, 625–664."
new = 'Knot commensurability and the Berge conjecture.'

In [ ]:
orig = "Steven B. Bradlow"
new = "Bradlow, S"

In [ ]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import math
import wikipedia
import Levenshtein
import difflib
import distance
import textblob
import numpy
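
As an illustrative check, `fuzz.partial_ratio` can now be applied to whichever test pair was assigned above; this is the same scorer that `title_compare` below uses with a cutoff of 95.

In [ ]:
# partial_ratio scores the best-matching substring alignment of the
# shorter string inside the longer one
fuzz.partial_ratio(orig, new)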

In [ ]:
import json
with open("papers.json") as f:
    old_base = json.loads(f.read())

In [ ]:
old_base.keys()

In [ ]:
paper_base

In [ ]:
# build the inverse mapping: mathsci author id -> gear member_id
mapping = {}
for key in base.keys():
    mapping[base[key]] = key
    
mapping

In [ ]:
# go through paper_base and keep only 2011+ papers with at least two gear
# members among the authors (check_if_coop and get_ids are defined below)

out = []
for key in paper_base.keys():
    paper_list = paper_base[key]
    for paper in paper_list:
        if paper['year'] >= 2011:
            if check_if_coop(paper['authors'], mapping):
                newone = {}
                newone['collaborator_ids'] = get_ids(paper['authors'], mapping)
                newone['date'] = str(paper['year'])
                newone['description'] = paper['article_title']
                out.append(newone)

In [ ]:
# drop exact-duplicate descriptions
out_base = []
for unit in out:
    seen = False
    for exi in out_base:
        if unit['description'] == exi['description']:
            seen = True
    if not seen:
        out_base.append(unit)

In [ ]:
len(out_base)

In [ ]:
out_base

In [ ]:
def check_if_coop(authors, mapping):
    # a paper counts as a collaboration if at least two gear members appear
    # among its authors
    count = 0
    for au in authors:
        if au in mapping:
            count += 1
    return count >= 2

def get_ids(authors, mapping):
    # gear member ids (as ints) of the authors that are gear members
    ret = []
    for au in authors:
        if au in mapping:
            ret.append(int(mapping[au]))
    return ret

In [ ]:
# out_base is the automatically scraped database
# old_base is the manually maintained database
# merge them, skipping fuzzy-duplicate titles (title_compare is defined below)

final = []

for a in old_base['papers']:
    is_new = True
    for b in final:
        if title_compare(a, b):
            is_new = False
    if is_new:
        final.append(a)
        
for a in out_base:
    is_new = True
    for b in final:
        if title_compare(a, b):
            is_new = False
    if is_new:
        final.append(a)

In [ ]:
len(old_base['papers'])

In [ ]:
len(final)

In [ ]:
def title_compare(a, b):
    return fuzz.partial_ratio(a['description'], b['description']) > 95
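
A `partial_ratio` above 95 means the shorter description is an almost-verbatim substring of the longer one, so lightly mangled titles (like the Andersen et al. strings tested earlier) are still treated as duplicates.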

In [ ]:
def ids_compare(a, b):
    return set(a['collaborator_ids']) == set(b['collaborator_ids'])

In [ ]:
def date_compare(a, b):
    return a['date'] == b['date']

In [ ]:
with open("prof.txt", "r") as f:
    profile = json.loads(f.read())

In [ ]:
for prof in profile['items']:
    member_id = prof['member_id']
    try:
        prof[u'mathsci_id'] = mapping[member_id]
    except KeyError:
        prof[u'mathsci_id'] = 0  # no MathSciNet id found for this member

In [ ]:
profile

In [ ]:
import json
with open('new_profile.json', 'w') as outfile:
    json.dump(profile, outfile)

In [ ]:
import json
with open('new_papers.json', 'w') as outfile:
    json.dump(final, outfile)


In [ ]:
import collections

In [ ]:
# the same multiset intersection used by list_overlap above:
# keeps each element with its minimum multiplicity in a and b
a = [3,4,5,5,5,6]
b = [1,3,4,4,5,5,6,7]

a_multiset = collections.Counter(a)
b_multiset = collections.Counter(b)

overlap = list((a_multiset & b_multiset).elements())

In [ ]:
overlap

In [ ]:
import community
import networkx as nx
import matplotlib.pyplot as plt

# better with karate_graph() as defined in the networkx examples;
# Erdos-Renyi graphs don't have true community structure
G = nx.erdos_renyi_graph(30, 0.05)

# first compute the best partition (Louvain method, python-louvain package)
partition = community.best_partition(G)

# drawing: one grayscale shade per community
size = float(len(set(partition.values())))
pos = nx.spring_layout(G)
count = 0.
for com in set(partition.values()) :
    count = count + 1.
    list_nodes = [nodes for nodes in partition.keys()
                                if partition[nodes] == com]
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size = 20,
                                node_color = str(count / size))


nx.draw_networkx_edges(G,pos, alpha=0.5)
plt.show()

In [ ]: