In [53]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
from prettytable import PrettyTable
import csv as csv
%matplotlib inline

Data is from http://www.ieor.berkeley.edu/~goldberg/jester-data/ with the first column removed and converted from .xls to .csv

This data is 24983 users who reviewed of 100 jokes (with reviews from -10 to 10). A value of 99 means that the reviewer did not review the joke


In [2]:
nReviewers = 24983
nJokes = 100
G=nx.generators.empty_graph(nReviewers+nJokes)
G.clear()
G.name="Jester Dataset 1"

Jokes = []
for i in range(0,100):
    Jokes.append('Joke' + str(i))

In [3]:
with open('jester-data-1.csv', 'rb') as csvfile:
    book = csv.reader(csvfile)
    i = 0 
    for row in book:
        for j in range(0, 100):
            if row[j] != '99':
                G.add_edge(i,Jokes[j],weight=float(row[j]))
        i+=1

In [4]:
G.number_of_nodes()


Out[4]:
25083

In [5]:
G.number_of_edges()


Out[5]:
1810455

In [6]:
nx.is_bipartite(G)


Out[6]:
True

In [7]:
X, Y = bipartite.sets(G)

In [8]:
len(Y)


Out[8]:
100

In [9]:
len(X)


Out[9]:
24983

So we see that the graph is bipartite with the Users and Jokes separate

Instead of the example in the book where the islands are made by number of tweets (number of neighbors), we will create the islands based on the ratings


In [12]:
#Taken from Social Network Analysis for Startups
def trim_edges(g, weight=1):
    g2=nx.Graph()
    for f, to, edata in g.edges(data=True):
        if edata['weight'] > weight:
            g2.add_edge(f,to,edata)
    return g2

In [13]:
#Taken from Social Network Analysis for Startups
def island_method(g, iterations=5):
    weights= [edata['weight'] for f,to,edata in g.edges(data=True)]
    mn=int(min(weights))
    mx=int(max(weights))
    #compute the size of the step, so we get a reasonable step in iterations
    step=int((mx-mn)/iterations)
    return [[threshold, trim_edges(g, threshold)] for threshold in range(mn,mx,step)]

In [14]:
islands = island_method(G)

In [16]:
for i in islands:
    print i[0], len(i[1])


-9 25077
-6 25074
-3 25060
0 25013
3 24673
6 21942
9 7489

I think we can interpret the islands as almost a cumlative histogram of the ratings. Thus almost everyone gave a -9 or higher rating, but not that many gave a 9 or higher

We can interest ourselves in the rating 9 island


In [22]:
HighestRated = islands[-1]
HighG = HighestRated[1]

In [32]:
HighG.number_of_nodes()


Out[32]:
7489

In [33]:
HighG.number_of_edges()


Out[33]:
38054

In [34]:
nx.is_bipartite(HighG)


Out[34]:
True

In [35]:
X, Y = bipartite.sets(HighG)

In [36]:
len(X)


Out[36]:
7389

In [57]:
len(Y)


Out[57]:
100

In [ ]: