In this lab we will do the following:
You can download this notebook from here.
In [87]:
!pip install oauth2
!pip install unidecode
In [1]:
%matplotlib inline
from collections import defaultdict
import json
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl
# ColorBrewer2 "Dark2" qualitative colour table, as RGB tuples in [0, 1].
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

# Notebook-wide matplotlib defaults.
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was deprecated in matplotlib 1.5 and later removed in
# favour of 'axes.prop_cycle'; assigning to the old key raises KeyError on
# current releases. Use whichever key this matplotlib actually knows about.
if 'axes.prop_cycle' in rcParams:
    rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)
else:
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.

    axes: the Axes to modify; the current pyplot axes are used when omitted.
    top/right/left/bottom: whether the corresponding plot border (and the
    ticks along it) should be drawn.
    """
    target = axes or plt.gca()

    # Show or hide each of the four spines as requested.
    for side, shown in (('top', top), ('right', right),
                        ('left', left), ('bottom', bottom)):
        target.spines[side].set_visible(shown)

    # Start from a clean slate: no ticks on either axis.
    for axis in (target.yaxis, target.xaxis):
        axis.set_ticks_position('none')

    # Re-enable ticks only along the borders that stay visible.
    tick_enablers = (
        (top, target.xaxis.tick_top),
        (bottom, target.xaxis.tick_bottom),
        (left, target.yaxis.tick_left),
        (right, target.yaxis.tick_right),
    )
    for shown, enable_ticks in tick_enablers:
        if shown:
            enable_ticks()
# Let pandas print wide frames: up to 500 characters per line and up to
# 100 columns before the display is truncated.
for option_name, option_value in (('display.width', 500),
                                  ('display.max_columns', 100)):
    pd.set_option(option_name, option_value)
Get a LinkedIn API key at http://developer.linkedin.com/documents/authentication (when asked which permissions to grant, choose the r_network scope).
Save your authentication:
In [2]:
# LinkedIn credentials are read from the environment so that no secret is ever
# stored in (or shared with) the notebook. Before starting the kernel, export:
#   LINKEDIN_API_KEY, LINKEDIN_SECRET_KEY     -- the application's key pair
#   LINKEDIN_USER_TOKEN, LINKEDIN_USER_SECRET -- the OAuth user token pair
# NOTE: any key that was ever pasted into a shared copy of this notebook
# should be treated as compromised and revoked in the LinkedIn developer portal.
import os


def linkedin_credential(variable, environ=None):
    """Return the credential stored in environment variable `variable`.

    variable: name of the environment variable to read.
    environ:  mapping to read from; defaults to os.environ.

    Returns the value as a string, or None when the variable is unset or empty.
    """
    source = os.environ if environ is None else environ
    # An empty value is as useless as a missing one, so report both as None.
    return source.get(variable) or None


user_token = linkedin_credential('LINKEDIN_USER_TOKEN')
user_secret = linkedin_credential('LINKEDIN_USER_SECRET')
api_key = linkedin_credential('LINKEDIN_API_KEY')
secret_key = linkedin_credential('LINKEDIN_SECRET_KEY')
Next we scrape our connection data using the LinkedIn API. (The code for using the LinkedIn API is adapted from http://dataiku.com/blog/2012/12/07/visualizing-your-linkedin-graph-using-gephi-part-1.html.)
In [3]:
import oauth2 as oauth
import urlparse
def request_token(consumer):
    """Ask LinkedIn for an OAuth request token with the r_network scope.

    consumer: the oauth Consumer built from the application's key and secret.

    Returns the form-encoded response body parsed into a dict (the rest of
    the notebook reads its 'oauth_token' and 'oauth_token_secret' entries).
    Raises Exception when the HTTP status is anything other than '200'.
    """
    endpoint = 'https://api.linkedin.com/uas/oauth/requestToken?scope=r_network'
    response, body = oauth.Client(consumer).request(endpoint, "POST")
    status = response['status']
    if status != '200':
        raise Exception("Invalid response %s." % status)
    # The body is a query string such as "oauth_token=...&oauth_token_secret=...".
    return dict(urlparse.parse_qsl(body))
#consumer = oauth.Consumer(api_key, secret_key)
#r_token = request_token(consumer)
#print "Request Token: oauth_token: %s, oauth_token_secret: %s" % (r_token['oauth_token'], r_token['oauth_token_secret'])
In [4]:
def authorize(request_token):
    """Walk the user through LinkedIn's manual (PIN based) authorization step.

    Prints the authorization URL for the given request token, keeps asking
    until the user answers something other than "n", then asks for the PIN.

    request_token: dict with an 'oauth_token' entry, as returned by
                   request_token().

    Returns the PIN (OAuth verifier) typed by the user, without surrounding
    whitespace.
    """
    # Python 2 calls it raw_input; Python 3 renamed it to input.
    try:
        ask = raw_input
    except NameError:
        ask = input

    authorize_url = 'https://api.linkedin.com/uas/oauth/authorize'
    print("Go to the following link in your browser:")
    print("%s?oauth_token=%s" % (authorize_url, request_token['oauth_token']))
    print("")

    accepted = 'n'
    # Strip the answer so that "n " or " N" still counts as "not yet".
    while accepted.strip().lower() == 'n':
        accepted = ask('Have you authorized me? (y/n) ')

    # A copy-pasted PIN often carries stray whitespace, which would make the
    # verifier invalid when it is sent back to LinkedIn.
    oauth_verifier = ask('What is the PIN? ').strip()
    return oauth_verifier
#oauth_verifier = authorize(r_token)
In [5]:
def access(consumer, request_token, oauth_verifier):
    """Exchange an authorized request token for an OAuth access token.

    consumer:       the oauth Consumer built from the application's key pair.
    request_token:  dict with 'oauth_token' and 'oauth_token_secret' entries,
                    as returned by request_token().
    oauth_verifier: the PIN the user obtained while authorizing the token.

    Returns the form-encoded response body parsed into a dict.
    Raises Exception when the HTTP status is anything other than '200', the
    same way request_token() does, so that a rejected PIN is reported here
    instead of surfacing later as a missing 'oauth_token' key.
    """
    access_token_url = 'https://api.linkedin.com/uas/oauth/accessToken'
    token = oauth.Token(request_token['oauth_token'], request_token['oauth_token_secret'])
    token.set_verifier(oauth_verifier)
    client = oauth.Client(consumer, token)
    resp, content = client.request(access_token_url, "POST")
    if resp['status'] != '200':
        raise Exception("Invalid response %s." % resp['status'])
    access_token = dict(urlparse.parse_qsl(content))
    return access_token
#a_token = access(consumer, r_token, oauth_verifier)
#print a_token
#print "Access Token: oauth_token = %s, oauth_token_secret = %s" % (a_token['oauth_token'], a_token['oauth_token_secret'])
#print "You may now access protected resources using the access tokens above."
In [7]:
# Run the three-step OAuth handshake: request token -> user authorization
# (PIN) -> access token. The resulting a_token is what later cells use to
# sign API requests.
consumer = oauth.Consumer(api_key, secret_key)

r_token = request_token(consumer)
print("Request Token: oauth_token: %s, oauth_token_secret: %s"
      % (r_token['oauth_token'], r_token['oauth_token_secret']))

# Interactive step: the user opens the printed link and types in the PIN.
oauth_verifier = authorize(r_token)

a_token = access(consumer, r_token, oauth_verifier)
print(a_token)
print("Access Token: oauth_token = %s, oauth_token_secret = %s"
      % (a_token['oauth_token'], a_token['oauth_token_secret']))
print("You may now access protected resources using the access tokens above.")
In [8]:
import simplejson
import codecs
# Raw edge list written by linkedin_connections(), one "name,name" pair per line.
output_file = 'linkedIn_links.csv'
# Label used for your own node. Later cells look it up as 'your_name'
# (lower-cased, spaces replaced by underscores), so keep the two in sync.
my_name = 'Your Name'


def linkedin_connections():
    """Download your LinkedIn network and write it to `output_file` as edges.

    For every first-degree connection one "<my_name>,<connection>" line is
    written, followed by one "<connection>,<related connection>" line for each
    related connection the API reports for that person.

    Reads the module-level api_key, secret_key, a_token, output_file and
    my_name.

    Raises Exception when the initial connections request does not return
    HTTP status '200'. Per-connection lookups stay best-effort: a response
    without related connections, or an entry without a name, is skipped.
    """
    def full_name(person):
        # Commas would break the two-column CSV, so they are turned into spaces.
        return "%s %s" % (person["firstName"].replace(",", " "),
                          person["lastName"].replace(",", " "))

    # Use your credentials to build the oauth client
    consumer = oauth.Consumer(key=api_key, secret=secret_key)
    token = oauth.Token(key=a_token['oauth_token'], secret=a_token['oauth_token_secret'])
    client = oauth.Client(consumer, token)

    # Fetch first degree connections (over HTTPS, like every other endpoint
    # in this notebook, so the signed request is not sent in clear text).
    resp, content = client.request('https://api.linkedin.com/v1/people/~/connections?format=json')
    if resp['status'] != '200':
        raise Exception("Invalid response %s." % resp['status'])
    results = simplejson.loads(content)

    # The context manager guarantees the file is flushed and closed, even if
    # a request fails half-way through.
    with codecs.open(output_file, 'w', 'utf-8') as output:
        # Loop through the 1st degree connections and see how they connect to each other
        for result in results.get("values", []):
            con = full_name(result)
            output.write("%s,%s\n" % (my_name, con))
            # This is the trick, use the search API to get related connections
            u = "https://api.linkedin.com/v1/people/%s:(relation-to-viewer:(related-connections))?format=json" % result["id"]
            resp, content = client.request(u)
            rels = simplejson.loads(content)
            try:
                related = rels['relationToViewer']['relatedConnections']['values']
            except (KeyError, TypeError):
                # No related connections were reported for this person.
                continue
            for rel in related:
                try:
                    sec = full_name(rel)
                except (KeyError, AttributeError):
                    # Entry without a usable first/last name: skip just this one.
                    continue
                output.write("%s,%s\n" % (con, sec))
linkedin_connections()
In [12]:
from operator import itemgetter
from unidecode import unidecode
# De-duplicated, Gephi-friendly edge list produced by clean().
clean_output_file = 'linkedIn_links_clean.csv'


def stringify(chain):
    """Turn a person's name into a plain node label.

    The name is stripped, lower-cased, its spaces become underscores, it is
    transliterated with unidecode, and finally every character that is not
    a digit, a lowercase ASCII letter or an underscore is dropped.
    """
    permitted = frozenset('0123456789abcdefghijklmnopqrstuvwxyz_')
    label = chain.strip().lower().replace(' ', '_')
    label = unidecode(label)
    return ''.join(char for char in label if char in permitted)
def clean(f_input, f_output):
    """Normalise and de-duplicate an edge list.

    f_input:  path of a UTF-8 file with one "from_name,to_name" pair per line.
    f_output: path of the file to write; it receives one "label,label" line
              per distinct undirected edge, in sorted order.

    Both names of every pair are converted with stringify(). Blank lines are
    ignored; a non-blank line that does not hold exactly two comma-separated
    fields raises ValueError.
    """
    # Store the edges inside a set for dedup
    edges = set()
    with codecs.open(f_input, 'r', 'utf-8') as source:
        for line in source:
            line = line.strip()
            if not line:
                continue
            from_person, to_person = line.split(',')
            # Order the two labels by their full text so that (a, b) and
            # (b, a) collapse into one edge. Comparing only the first two
            # characters would leave pairs with a common prefix unordered
            # and would fail on one-character labels.
            edges.add(tuple(sorted((stringify(from_person), stringify(to_person)))))

    # Sorted output makes the file reproducible from run to run.
    with open(f_output, 'w') as output:
        for first, second in sorted(edges):
            output.write('%s,%s\n' % (first, second))
clean(output_file, clean_output_file)
Once you have run these cells, you will find a file named 'linkedIn_links_clean.csv' in the directory of your notebook; it is in a format that Gephi can import. If you don't have a LinkedIn account, or think your network is boring, you can use one of ours, which you can get here.
Gephi requires Java to run: you need a JRE, version 6 or later. To check whether Java is installed, open a console and run
$ java -version
java version "1.7.0_25"
OpenJDK Runtime Environment (IcedTea 2.3.12) (7u25-2.3.12-4ubuntu3)
OpenJDK 64-Bit Server VM (build 23.7-b01, mixed mode)
If you don't have Java, or only have an outdated version, go here to download it.
To install Gephi, download it and follow these installation instructions.
The analysis with a GUI based tool is hard to convey in an IPython Notebook ;). If you don't want to watch the video, here is the Gephi Quick Start guide.
Here are the things we are doing:
In [ ]:
In [12]:
import csv
from collections import defaultdict
# Three views of the cleaned edge list:
#   pairlist    -- every edge as a (node, node) tuple, in file order
#   connections -- adjacency lists: node -> list of neighbours
#   userset     -- the set of all node labels
pairlist = []
connections = defaultdict(list)
userset = set()

with open('linkedIn_links_clean.csv', 'rb') as csvfile:
    allrows = csv.reader(csvfile, delimiter=',')
    for row in allrows:
        first, second = row[0], row[1]
        # To leave yourself out of the graph, skip rows that mention you:
        # if first == 'your_name' or second == 'your_name': continue
        pairlist.append((first, second))
        # The graph is undirected, so record the edge in both directions.
        connections[first].append(second)
        connections[second].append(first)
        userset.update((first, second))
## Actual algorithm starts here
## display the pagerank
In [3]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib
import math
g = nx.Graph()
# Set to True to leave your own node (and anyone connected only to you) out
# of the graph.
remove_me = False
me = 'your_name'

for user in userset:
    if remove_me and user == me:
        continue
    g.add_node(user)

for user in userset:
    if remove_me and user == me:
        continue
    nconnec = 0
    for connection in connections[user]:
        if remove_me and connection == me:
            continue
        g.add_edge(user, connection, weight=1)
        nconnec += 1
    # Without your own node, people who were linked only to you are isolated.
    if remove_me and nconnec == 0:
        g.remove_node(user)

# nx.pagerank_scipy is no longer available in NetworkX 3; fall back to
# nx.pagerank when the old entry point is missing.
compute_pagerank = getattr(nx, 'pagerank_scipy', None) or nx.pagerank
pagerank_nx = compute_pagerank(g)

# Build the colours in the same node order that is handed to the drawing
# call; the iteration order of the pagerank dict need not match the graph's.
nodelist = list(g.nodes())
# Grey level = 30 x pagerank, clipped to 1 (white): brighter nodes rank higher.
color = [(min(pagerank_nx[n] * 30., 1),) * 3 for n in nodelist]

pos = nx.spring_layout(g, iterations=100)
nx.draw_networkx_edges(g, pos, width=1, alpha=0.4)
nx.draw_networkx_nodes(g, pos, nodelist=nodelist, node_color=color,
                       node_size=100, alpha=1, linewidths=0.5)
#lbls = nx.draw_networkx_labels(g, pos)
plt.show()
In [4]:
# Rank the nodes by PageRank, highest first, and show the top ten so we can
# check whether we get the same, or similar, pageranks.
sorted_pr = sorted(pagerank_nx.items(), key=lambda pair: pair[1], reverse=True)
print(sorted_pr[:10])
A few stats about your network:
In [11]:
me = 'your_name'
path_source, path_target = 'hanspeter_pfister', 'etienne_corteel'

# your number of connections
if me in g:
    print('my degree is:  %s \n' % (g.degree(me),))
else:
    print("'%s' is not a node of this graph \n" % (me,))

# diameter = maximum nb of edges between 2 nodes = always 2 while your own
# node (linked to everybody) is in the graph. Diameter and center are only
# defined for a connected graph; networkx raises otherwise.
if len(g) > 0 and nx.is_connected(g):
    print('the graph diameter is:  %s \n' % (nx.diameter(g),))
    #center : surprising ?
    print('the center is:  %s \n' % (nx.center(g),))
else:
    print('the graph is not connected, so it has no diameter or center \n')

# number of clique communities of 5 nodes
print('there are  %s clique communities\n' % (len(list(nx.k_clique_communities(g, 5))),))

# most influential ? sorted_pr holds (node, pagerank) pairs sorted from the
# highest rank down; degree() needs the node label only, not the whole pair.
# Index 2 is the entry with the third-highest PageRank.
if len(sorted_pr) > 2:
    print('degree:  %s \n' % (g.degree(sorted_pr[2][0]),))

# These two people exist in the example network only; with your own data,
# replace them by two labels from your graph.
if path_source in g and path_target in g and nx.has_path(g, path_source, path_target):
    print('shortest path between Hanspeter and a friend %s \n'
          % (nx.shortest_path(g, source=path_source, target=path_target),))
else:
    print('no path between %s and %s in this graph \n' % (path_source, path_target))
In [ ]: