In [1]:
%matplotlib inline
In [2]:
import sys
# Work around Python 2's ASCII default encoding. reload(sys) restores
# setdefaultencoding(), but it also resets IPython's stdout/stderr,
# so save and restore them around the reload.
default_stdout = sys.stdout
default_stderr = sys.stderr
reload(sys)
sys.stdout = default_stdout
sys.stderr = default_stderr
sys.setdefaultencoding('utf-8')
In [3]:
import scrapy
import requests
import urllib2
import BeautifulSoup
import pickle
import pyprind
import json
import collections
import matplotlib.pyplot as plt
In [ ]:
import os
In [5]:
sys.path.append("../")
In [6]:
import mathscinet
In [8]:
reload(mathscinet)
In [ ]:
with open("./profile/profile.json") as f:
    orig_profile = json.load(f)
In [ ]:
gear_mathsci_mapper = {}
mathsci_gear_mapper = {}
for person in orig_profile['items']:
    gear_mathsci_mapper[person['member_id']] = person['mathsci_id']
    mathsci_gear_mapper[person['mathsci_id']] = person['member_id']
In [ ]:
bar = pyprind.ProgBar(len(orig_profile['items']), bar_char='█', width=70)
paper_set = {}
for person in orig_profile['items']:
    bar.update()
    member_id = person['member_id']
    mathsci_id = person['mathsci_id']
    if mathsci_id != 'NA':
        paper_set[mathsci_id] = mathscinet.find_papers_by_author_id(mathsci_id)
In [ ]:
with open("mathscinet_paper_base.json", "wb") as f:
    json.dump(paper_set, f)
In [ ]:
with open("mathscinet_paper_base.json", "rb") as f:
    paper_set = json.load(f)
In [ ]:
paper_set
In [ ]:
paper_set_2011 = {}
count_2011 = 0
for mathsci_id in paper_set.keys():
    paper_set_2011[mathsci_id] = []
    for paper in paper_set[mathsci_id]:
        if paper['date'] >= 2011:
            count_2011 += 1
            paper_set_2011[mathsci_id].append(paper)
print "Completed! count:", count_2011
In [ ]:
bar = pyprind.ProgBar(count_2011, bar_char='█', width=70)
for mathsci_id in paper_set_2011.keys():
    for paper in paper_set_2011[mathsci_id]:
        bar.update()
        paper['citing'] = mathscinet.find_parent_citations(paper['id'])
In [ ]:
with open("mathscinet_paper_base_with_citation.json", "wb") as f:
    json.dump(paper_set_2011, f)
In [ ]:
with open("mathscinet_paper_base_with_citation.json", "rb") as f:
    paper_set_2011 = json.load(f)
In [ ]:
def update_collaborators(gear_profile, gear_paper_set, starting_year, ending_year, converter):
    col_detail_key = "%s-%s collaborators details" % (str(starting_year), str(ending_year))
    col_size_key = "%s-%s collaborators sizes" % (str(starting_year), str(ending_year))
    for person in gear_profile['items']:
        details = {}
        sizes = {}
        mathsci_id = person['mathsci_id']
        # if mathsci_id does not exist, skip this person
        if mathsci_id == "NA":
            continue
        for paper in gear_paper_set[mathsci_id]:
            year = paper['date']
            if year < starting_year or year > ending_year:
                continue
            authors = paper['authors']
            for au in authors:
                if au in converter:
                    gear_id = converter[au]
                    if gear_id == person['member_id']:
                        continue
                    if gear_id in details:
                        details[gear_id].append(paper['id'])
                    else:
                        details[gear_id] = [paper['id']]
        for key in details.keys():
            sizes[key] = len(details[key])
        person[col_detail_key] = details
        person[col_size_key] = sizes
In [ ]:
for ending_year in range(2011, 2017):
    update_collaborators(orig_profile, paper_set_2011, 2011, ending_year, mathsci_gear_mapper)
In [ ]:
def get_citation_keys(citations):
    keys = []
    for cite in citations:
        keys.append(cite['id'])
    return keys
In [ ]:
def retrieve_citations(person, selected_paper_set, starting_year, ending_year):
    mathsci_id = person['mathsci_id']
    citations = []
    if mathsci_id == "NA":
        return citations
    paper_list = selected_paper_set[mathsci_id]
    for paper in paper_list:
        year = paper['date']
        if year < starting_year or year > ending_year:
            continue
        citations.extend(get_citation_keys(paper['citing']))
    return citations
In [ ]:
def list_overlap(list_a, list_b):
    a_multiset = collections.Counter(list_a)
    b_multiset = collections.Counter(list_b)
    overlap = list((a_multiset & b_multiset).elements())
    return overlap
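In [ ]:
# Quick sanity check on toy data (not project data): list_overlap is a
# multiset intersection, so duplicates survive up to the smaller
# multiplicity in either list.
list_overlap([3, 4, 5, 5, 5, 6], [1, 3, 4, 4, 5, 5, 6, 7])
# expected: [3, 4, 5, 5, 6] (element order may vary)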
In [ ]:
def update_citations(profile, paper_set, starting_year, ending_year, converter):
    cite_details_key = "%s-%s citation details" % (str(starting_year), str(ending_year))
    cite_sizes_key = "%s-%s citation sizes" % (str(starting_year), str(ending_year))
    # build member_id -> person mapping
    temp_profile = {}
    for person in profile['items']:
        member_id = person['member_id']
        temp_profile[member_id] = person
    author_list = temp_profile.keys()
    for person in profile['items']:
        person[cite_details_key] = {}
        person[cite_sizes_key] = {}
        member_id = person['member_id']
        # this person's citation list is the same for every pairing,
        # so compute it once outside the inner loop
        this_person_citation = retrieve_citations(person, paper_set, starting_year, ending_year)
        for other_person_id in author_list:
            if other_person_id == member_id:
                continue
            other_person = temp_profile[other_person_id]
            other_person_citation = retrieve_citations(other_person, paper_set, starting_year, ending_year)
            # store the shared citation list under the details key and
            # its length under the sizes key
            shared = list_overlap(other_person_citation, this_person_citation)
            if len(shared) > 0:
                person[cite_details_key][other_person_id] = shared
                person[cite_sizes_key][other_person_id] = len(shared)
In [ ]:
for ending_year in range(2011, 2017):
    update_citations(orig_profile, paper_set_2011, 2011, ending_year, mathsci_gear_mapper)
In [ ]:
orig_profile
In [ ]:
def print_matrix(folder_name, file_name, matrix):
    path = os.path.join(folder_name, file_name)
    print path
    with open(path, "w") as f:
        f.write("Source;Target;Weight;Type\n")
        for key in matrix.keys():
            f.write(str(key[0]) + ";")
            f.write(str(key[1]) + ";")
            f.write(str(matrix[key]) + ";")
            f.write("undirected\n")
In [ ]:
def matrix_maker(gear_profile, starting_year, ending_year):
    collab_matrix = {}
    citation_matrix = {}
    cit_size_key = "%s-%s citation sizes" % (str(starting_year), str(ending_year))
    # must match the key written by update_collaborators ("sizes", plural)
    col_size_key = "%s-%s collaborators sizes" % (str(starting_year), str(ending_year))
    for person in gear_profile['items']:
        if person['mathsci_id'] == "NA":
            continue
        author_id = person['member_id']
        # keep each undirected pair once, ordered (smaller id, larger id)
        for key in person[col_size_key].keys():
            if key > author_id:
                collab_matrix[(author_id, key)] = person[col_size_key][key]
        for key in person[cit_size_key].keys():
            if key > author_id:
                citation_matrix[(author_id, key)] = person[cit_size_key][key]
    print_matrix("output", str(starting_year) + "_" + str(ending_year) + "_citation", citation_matrix)
    print_matrix("output", str(starting_year) + "_" + str(ending_year) + "_coauthor", collab_matrix)
In [ ]:
for ending_year in range(2011, 2017):
    matrix_maker(orig_profile, 2011, ending_year)
In [ ]:
# The package which handles the graph objects
import networkx as nx
# Matplotlib is the default package for rendering the graphs
import matplotlib.pyplot as plt

def simple_graph(profile):
    # create an empty graph
    G = nx.Graph()
    name_mapping = {}
    for person in profile['items']:
        aid = person['member_id']
        name = person['name']
        surname = person['surname']
        name_mapping[aid] = name + " " + surname
    for person in profile['items']:
        aid = person['member_id']
        # key written by update_citations above
        cite = person['2011-2015 citation sizes']
        for au in cite.keys():
            edge = (aid, au)
            # name_edge = (name_mapping[aid], name_mapping[au])
            # G.add_edge(name_edge[0], name_edge[1])
            G.add_edge(str(edge[0]), str(edge[1]))
    # draw the graph
    nx.draw(G)
    # plt.savefig("graph.png", dpi=1000)
    # show
    plt.show()
    # Viewer is never defined in this notebook, so the Tk viewer
    # lines stay commented out:
    # app = Viewer(G)
    # app.mainloop()

simple_graph(orig_profile)
In [ ]:
def similarity_calculator(first, second):
    # if type(first) is not unicode:
    #     first = unicode(first, 'utf-8')
    # if type(second) is not unicode:
    #     second = unicode(second, 'utf-8')
    # 1. fuzzy matching
    fr = fuzz.ratio(first, second)
    pr = fuzz.partial_ratio(first, second)
    sor = fuzz.token_sort_ratio(first, second)
    ser = fuzz.token_set_ratio(first, second)
    # 2. sequence similarity
    s = difflib.SequenceMatcher(lambda x: x == " ", first, second)
    seq = round(s.ratio(), 3)
    # 3. edit distance
    # 3.1 absolute
    try:
        lv_ab = Levenshtein.distance(first, second)
    except Exception:
        print "ooops", first, second
        lv_ab = None
    # 3.2 jaro
    lv_ja = Levenshtein.jaro(first, second)
    # 3.3 jaro_winkler
    lv_jaw = Levenshtein.jaro_winkler(first, second)
    # 3.4 ratio
    lv_ra = Levenshtein.ratio(first, second)
    # 4. Sorensen and Jaccard set distances
    sr = distance.sorensen(first, second)
    ja = distance.jaccard(first, second)
    print fr, pr, sor, ser, seq, lv_ab, lv_ja, lv_jaw, lv_ra, sr, ja
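In [ ]:
# A small usage sketch on two name variants taken from the scratch
# cells below (assumes the fuzzywuzzy / Levenshtein / difflib /
# distance imports further down have already been run).
similarity_calculator(u"Bradlow, Steven B.", u"Bradlow, S")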
In [ ]:
fuzz.ratio(u'Andersen, J\xf8rgen E.; Chekhov, Leonid O.; Penner, R. C.; Reidys, Christian M.; Su\u0142kowski, Piotr Topological recursion for chord diagrams, RNA complexes, and cells in moduli spaces. Nuclear Phys. B 866 (2013), no. 3, 414\u2013443.', \
           u'Ande J\xf8rgen E.; Chekhov, Leonid O Penner, R. C.; Reidys, Christian M.; Su\u0142kowski, Piotr Topological recursion for chord diagrams, RNAcomplexes,and cells in moduli spaces. Nuclear Phys. B 866 (2013), no. 3, 4142013443.')
In [ ]:
def count_coop(gid1, gid2, y1, y2):
    # count papers co-authored by two members (gear ids) in [y1, y2];
    # `base` and `paper_base` are globals built elsewhere in this notebook
    val = 0
    aid1 = base[gid1]
    aid2 = base[gid2]
    # print aid1, aid2
    papers_1 = paper_base[aid1]
    # papers_2 = paper_base[aid2]
    for paper in papers_1:
        if paper['year'] >= y1 and paper['year'] <= y2:
            if aid2 in paper['authors']:
                val += 1
    return val
In [ ]:
# 24
# 12
In [ ]:
count_coop("24","12",1000,2016)
In [ ]:
data = {}
for a1 in base.keys():
    data[a1] = {}
    for a2 in base.keys():
        if a1 != a2:
            data[a1][a2] = count_coop(a1, a2, 2011, 2011)
In [ ]:
with open("input.csv", "w") as f:
    f.write("Source;Target;Weight;Type\n")
    for a1 in data.keys():
        for a2 in data[a1].keys():
            if data[a1][a2] > 0:
                f.write(a1)
                f.write(";")
                f.write(a2)
                f.write(";")
                f.write(str(data[a1][a2]))
                f.write(";")
                f.write("undirected\n")
In [ ]:
def print_single_year(year):
    data = {}
    for a1 in base.keys():
        data[a1] = {}
        for a2 in base.keys():
            if a1 != a2:
                data[a1][a2] = count_coop(a1, a2, year, year)
    with open(str(year) + "_input.csv", "w") as f:
        f.write("Source;Target;Weight;Type\n")
        for a1 in data.keys():
            for a2 in data[a1].keys():
                if data[a1][a2] > 0:
                    f.write(a1)
                    f.write(";")
                    f.write(a2)
                    f.write(";")
                    f.write(str(data[a1][a2]))
                    f.write(";")
                    f.write("undirected\n")
In [ ]:
print_single_year(2011)
print_single_year(2012)
print_single_year(2013)
print_single_year(2014)
print_single_year(2015)
print_single_year(2016)
In [ ]:
orig = "Boileau, Michel; Boyer, Steven; Cebanu, Radu; Walsh, Genevieve S. Knot commensurability and the Berge conjecture. Geom. Topol. 16 (2012), no. 2, 625–664."
new = 'Knot commensurability and the Berge conjecture.'
In [ ]:
orig = "Steven B. Bradlow"
new = "Bradlow, S"
In [ ]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import math
import wikipedia
import Levenshtein
import difflib
import distance
import textblob
import numpy
In [ ]:
import json
with open("papers.json") as f:
    old_base = json.loads(f.read())
In [ ]:
old_base.keys()
In [ ]:
paper_base
In [ ]:
# make a mapping from mathsci author id back to gear member id
mapping = {}
for key in base.keys():
    mapping[base[key]] = key
mapping
In [ ]:
# go through paper_base, keeping only 2011+ papers with at least two
# known co-authors (check_if_coop and get_ids are defined below)
out = []
for key in paper_base.keys():
    paper_list = paper_base[key]
    for paper in paper_list:
        if paper['year'] >= 2011:
            if check_if_coop(paper['authors'], mapping):
                newone = {}
                newone['collaborator_ids'] = get_ids(paper['authors'], mapping)
                newone['date'] = str(paper['year'])
                newone['description'] = paper['article_title']
                out.append(newone)
In [ ]:
# deduplicate by exact description match
out_base = []
for unit in out:
    f = True
    for exi in out_base:
        if unit['description'] == exi['description']:
            f = False
    if f:
        out_base.append(unit)
In [ ]:
len(out_base)
In [ ]:
out_base
In [ ]:
def check_if_coop(authors, mapping):
    # a paper counts as a collaboration if at least two of its
    # authors are known members
    val = 0
    for au in authors:
        if au in mapping:
            val += 1
    return val >= 2

def get_ids(authors, mapping):
    ret = []
    for au in authors:
        if au in mapping:
            ret.append(int(mapping[au]))
    return ret
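In [ ]:
# Toy check with a hypothetical author -> id mapping (the ids echo the
# scratch "24"/"12" cells above): a paper qualifies as a collaboration
# only when at least two of its authors are mapped.
toy_mapping = {'Andersen, J.': '24', 'Penner, R.': '12'}
print check_if_coop(['Andersen, J.', 'Penner, R.', 'Smith, X.'], toy_mapping)  # True
print get_ids(['Andersen, J.', 'Penner, R.', 'Smith, X.'], toy_mapping)        # [24, 12]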
In [ ]:
# out_base is the auto-scraped database, old_base is the old
# hand-maintained one; merge them, dropping near-duplicate titles
# (title_compare is defined below)
final = []
for a in old_base['papers']:
    f = True
    for b in final:
        if title_compare(a, b):
            f = False
    if f:
        final.append(a)
for a in out_base:
    f = True
    for b in final:
        if title_compare(a, b):
            f = False
    if f:
        final.append(a)
In [ ]:
len(old_base['papers'])
In [ ]:
len(final)
In [ ]:
def title_compare(a, b):
    return fuzz.partial_ratio(a['description'], b['description']) > 95
In [ ]:
def ids_compare(a, b):
    return set(a['collaborator_ids']) == set(b['collaborator_ids'])
In [ ]:
def date_compare(a, b):
    return a['date'] == b['date']
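In [ ]:
# Hedged toy records exercising the three comparators: partial_ratio
# scores an exact substring match 100, so title_compare treats the
# short and long forms of the same title (from the scratch cell above)
# as duplicates; the collaborator ids and dates here are made up.
a = {'description': 'Knot commensurability and the Berge conjecture.',
     'collaborator_ids': [1, 2], 'date': '2012'}
b = {'description': 'Boileau, Michel; Boyer, Steven; Cebanu, Radu; Walsh, Genevieve S. Knot commensurability and the Berge conjecture. Geom. Topol. 16 (2012), no. 2, 625–664.',
     'collaborator_ids': [2, 1], 'date': '2012'}
print title_compare(a, b), ids_compare(a, b), date_compare(a, b)  # True True True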
In [ ]:
with open("prof.txt", "r") as f:
    profile = json.loads(f.read())
In [ ]:
for prof in profile['items']:
    id = prof['member_id']
    try:
        prof[u'mathsci_id'] = mapping[id]
    except KeyError:
        prof[u'mathsci_id'] = 0
In [ ]:
profile
In [ ]:
import json
with open('new_profile.json', 'w') as outfile:
    json.dump(profile, outfile)
In [ ]:
import json
with open('new_papers.json', 'w') as outfile:
    json.dump(final, outfile)
In [ ]:
"ddd ".strip()
In [ ]:
import collections
In [ ]:
a = [3,4,5,5,5,6]
b = [1,3,4,4,5,5,6,7]
a_multiset = collections.Counter(a)
b_multiset = collections.Counter(b)
overlap = list((a_multiset & b_multiset).elements())
In [ ]:
overlap
In [ ]:
import community
import networkx as nx
import matplotlib.pyplot as plt
# better with karate_graph() as defined in the networkx examples;
# Erdos-Renyi graphs don't have true community structure
G = nx.erdos_renyi_graph(30, 0.05)
# first compute the best partition
partition = community.best_partition(G)
# drawing: one grayscale color per community
size = float(len(set(partition.values())))
pos = nx.spring_layout(G)
count = 0.
for com in set(partition.values()):
    count = count + 1.
    list_nodes = [nodes for nodes in partition.keys()
                  if partition[nodes] == com]
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size=20,
                           node_color=str(count / size))
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt