In [53]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
from prettytable import PrettyTable
import csv as csv
%matplotlib inline
Data is from http://www.ieor.berkeley.edu/~goldberg/jester-data/ with the first column removed and converted from .xls to .csv
This data is 24983 users who reviewed of 100 jokes (with reviews from -10 to 10). A value of 99 means that the reviewer did not review the joke
In [2]:
nReviewers = 24983
nJokes = 100
G=nx.generators.empty_graph(nReviewers+nJokes)
G.clear()
G.name="Jester Dataset 1"
Jokes = []
for i in range(0,100):
Jokes.append('Joke' + str(i))
In [3]:
with open('jester-data-1.csv', 'rb') as csvfile:
book = csv.reader(csvfile)
i = 0
for row in book:
for j in range(0, 100):
if row[j] != '99':
G.add_edge(i,Jokes[j],weight=float(row[j]))
i+=1
In [4]:
G.number_of_nodes()
Out[4]:
In [5]:
G.number_of_edges()
Out[5]:
In [6]:
nx.is_bipartite(G)
Out[6]:
In [7]:
X, Y = bipartite.sets(G)
In [8]:
len(Y)
Out[8]:
In [9]:
len(X)
Out[9]:
So we see that the graph is bipartite with the Users and Jokes separate
Instead of the example in the book where the islands are made by number of tweets (number of neighbors), we will create the islands based on the ratings
In [12]:
#Taken from Social Network Analysis for Startups
def trim_edges(g, weight=1):
g2=nx.Graph()
for f, to, edata in g.edges(data=True):
if edata['weight'] > weight:
g2.add_edge(f,to,edata)
return g2
In [13]:
#Taken from Social Network Analysis for Startups
def island_method(g, iterations=5):
weights= [edata['weight'] for f,to,edata in g.edges(data=True)]
mn=int(min(weights))
mx=int(max(weights))
#compute the size of the step, so we get a reasonable step in iterations
step=int((mx-mn)/iterations)
return [[threshold, trim_edges(g, threshold)] for threshold in range(mn,mx,step)]
In [14]:
islands = island_method(G)
In [16]:
for i in islands:
print i[0], len(i[1])
I think we can interpret the islands as almost a cumlative histogram of the ratings. Thus almost everyone gave a -9 or higher rating, but not that many gave a 9 or higher
We can interest ourselves in the rating 9 island
In [22]:
HighestRated = islands[-1]
HighG = HighestRated[1]
In [32]:
HighG.number_of_nodes()
Out[32]:
In [33]:
HighG.number_of_edges()
Out[33]:
In [34]:
nx.is_bipartite(HighG)
Out[34]:
In [35]:
X, Y = bipartite.sets(HighG)
In [36]:
len(X)
Out[36]:
In [57]:
len(Y)
Out[57]:
In [ ]: