Parse Wikipedia categories into network

Data is collected with Wikicrawl



In [ ]:

    
%pylab inline
plt.rc('figure', figsize=(8, 5))

import os
# from collections import Counter
from fnmatch import fnmatch
import yaml
import networkx as nx



In [3]:

    
def slugify(value):
    """
    Turn a string into a capitalized, ASCII-only, underscore-separated slug.

    Accented characters are decomposed (NFKD) and reduced to ASCII, every
    character that is not a word character, whitespace or a hyphen is
    removed, the text is lowercased, and each run of hyphens/whitespace is
    collapsed into a single underscore. The result is then capitalized.

    value -- byte string (decoded as UTF-8) or text string.
    Returns a text string (`unicode` on Python 2, `str` on Python 3).
    """
    import re
    import unicodedata

    # Byte strings (Python 2 `str`, Python 3 `bytes`) must become text before
    # they can be normalized.
    if isinstance(value, bytes):
        value = value.decode('utf-8')

    # Decompose accented characters, then drop whatever is not plain ASCII.
    ascii_text = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')

    # Raw strings keep `\w` / `\s` from being interpreted as string escapes.
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text).strip().lower()
    slug = re.sub(r'[-\s]+', '_', cleaned)

    return slug.capitalize()



In [4]:

    
# Corpus layout: <parent of the working directory>/data/wikipedia/en
corpus_path = os.path.join(os.path.dirname(os.getcwd()), 'data')
wk_path = os.path.join(corpus_path, 'wikipedia')
wk_en_path = os.path.join(wk_path, 'en')

# Parenthesized so the statement is valid under both Python 2 and Python 3.
print(wk_en_path)









    



/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en



In [57]:

    
# wikipedia
categories=nx.Graph()

for path, subdirs, files in os.walk(wk_en_path):
    
    here=os.path.split(path)[1]    
    parent=os.path.split(os.path.split(path)[0])[1]
    
    # print parent, " -> ", here
    categories.add_edge(parent, here)
    
    categories[parent]["path"]=path
    categories[here]["path"]=path
    
    for name in files:
        if fnmatch(name, "*.yaml"): # check if there is a text file

            category_name=name[0:-5]
            yaml_file_path = os.path.join(path, category_name+".yaml")
            
            # yaml
            yaml_file = open(yaml_file_path, "r")
            docs = yaml.load_all(yaml_file)
            
            # category_name
            for doc in docs:
                cat_parent=doc["CategoryPath"][0]
                
                categories.add_edge(slugify(cat_parent), slugify(category_name))
                
                categories[slugify(cat_parent)]["path"]=path
                categories[slugify(category_name)]["path"]=path
                
                # print slugify(cat_parent)," -> ", slugify(category_name)
                
                for cat in doc["Categories"][0]["en"]:
                    categories.add_edge(slugify(category_name), slugify(cat))
                    # print slugify(category_name), " -> ", slugify(cat)
                    categories[slugify(cat)]["path"]=path

    # print


print("The categories graph %s has %d nodes with %d edges"
          %(categories.name,nx.number_of_nodes(categories),nx.number_of_edges(categories)))









    



The categories graph  has 5384 nodes with 10540 edges

Visualization

With NetworkX



In [8]:

    
# Quick overview plot: small unlabeled nodes without outlines, yellow edges,
# everything drawn translucent.
nx.draw_networkx(
    categories,
    with_labels=False,
    node_size=15,
    linewidths=0,
    edge_color='y',
    alpha=.4
)

Export with Graphviz



In [20]:

    
def nx_to_gv_file(_nx_graph, _name, _dir_path):
    '''
    Write the edges of a graph to a Graphviz (.gv) file and render it to PNG.

    _nx_graph -- object whose edges() yields (source, target) pairs of strings
    _name     -- base file name, without extension, used for both output files
    _dir_path -- directory that receives <_name>.gv and <_name>.png

    The PNG is produced by running the external `sfdp` program on the .gv
    file. If `sfdp` cannot be started or exits with an error, a message is
    printed and the .gv file is left in place. Returns None.
    '''
    import subprocess

    gv_filepath = os.path.join(_dir_path, _name + ".gv")
    viz_filepath = os.path.join(_dir_path, _name + ".png")

    def quoted(node_name):
        # Inside a double-quoted DOT identifier a literal quote must be escaped.
        return '"' + node_name.replace('"', '\\"') + '"'

    with open(gv_filepath,'w') as f:
        f.write("digraph mentions {\n") # open .gv file
        for edge in _nx_graph.edges():
            f.write(quoted(edge[0]) + "->" + quoted(edge[1]) + "\n")
        f.write("}" + "\n") # close .gv file

    print(" graphiz file saved as %s" % gv_filepath)

    # draw with graphviz; the arguments are passed as a list (no shell), so
    # paths containing spaces or shell metacharacters are handled correctly.
    command = [
        "sfdp",
        "-Gbgcolor=black", "-Ncolor=white", "-Ecolor=white",
        "-Nwidth=0.05", "-Nheight=0.05", "-Nfixedsize=true", "-Nlabel=",
        "-Earrowsize=0.4", "-Gsize=75", "-Gratio=fill",
        "-Tpng", gv_filepath,
    ]

    try:
        # sfdp writes the image to stdout, which is redirected into the PNG.
        with open(viz_filepath, "wb") as png_file:
            status = subprocess.call(command, stdout=png_file)
    except OSError as err:
        print("could not run sfdp: %s" % err)
        return

    if status != 0:
        print("sfdp exited with status %d" % status)
        return

    print("viz graph saved as %s" % viz_filepath)

# Export the category graph next to the notebook (current working directory).
nx_to_gv_file(
    _nx_graph=categories,
    _name="wiki_cat",
    _dir_path=os.getcwd()
)









    



 graphiz file save as /home/clemsos/Dev/junkware/junkware-objects/tests/wiki_cat.gv
viz graph saved as /home/clemsos/Dev/junkware/junkware-objects/tests/wiki_cat.png

Export as CSV for import into Gephi



In [19]:

    
import csv 

def list_to_csv(_keys,_rows,_csv_filepath):
    """
    Write a header line followed by a sequence of rows to a CSV file.

    _keys         -- column names, written as the first line
    _rows         -- iterable of rows; each row is handed to the csv writer as
                     is, so it must itself be a sequence of cells (a bare
                     string would be split into one cell per character)
    _csv_filepath -- destination path; an existing file is overwritten
    """
    with open(_csv_filepath,'w') as f: # writes the final output to CSV
        csv_out=csv.writer(f)
        csv_out.writerow(_keys) # add header
        csv_out.writerows(_rows)

    # Parenthesized so the statement is valid under both Python 2 and Python 3.
    print(" csv has been stored as %s" % _csv_filepath)
    
def nx_to_gephi_csv(_nx_graph, _name, _dir_path):
    """
    Export a graph as two CSV files: a node table and an edge table.

    _nx_graph -- graph object providing nodes() and edges()
    _name     -- prefix of the output files
    _dir_path -- directory receiving <_name>_nodes.csv and <_name>_edges.csv

    The node table has the columns Id and Label (the node itself is used for
    both); the edge table has the columns Source and Target.
    """
    # nodes() yields bare node names; writing a string as a CSV row would
    # split it into one cell per character, so build explicit (Id, Label) rows.
    node_rows = [(node, node) for node in _nx_graph.nodes()]

    list_to_csv(["Id", "Label"], node_rows, os.path.join(_dir_path, _name + '_nodes.csv'))
    list_to_csv(["Source","Target"], _nx_graph.edges(), os.path.join(_dir_path, _name + '_edges.csv'))
    print("graph files (nodes+edges) saved at %s" % _dir_path)

# Write the node and edge tables next to the notebook (current working directory).
nx_to_gephi_csv(
    _nx_graph=categories,
    _name="wiki_cat",
    _dir_path=os.getcwd()
)









    



 csv has been stored as /home/clemsos/Dev/junkware/junkware-objects/tests/wiki_cat_nodes.csv
 csv has been stored as /home/clemsos/Dev/junkware/junkware-objects/tests/wiki_cat_edges.csv
graph files (nodes+edges) saved at /home/clemsos/Dev/junkware/junkware-objects/tests

Graph Walk

Select 2 random nodes



In [77]:

    
import os
from random import choice

# Pick a random starting node. nodes() is materialized with list() because
# random.choice needs an indexable sequence, whatever nodes() returns.
first_node = choice(list(categories.nodes()))

# The second node is drawn from every node that is neither the first node
# nor one of its neighbours.
neighbours = list(categories.neighbors(first_node)) + [first_node]
possible_nodes = set(categories.nodes())
possible_nodes.difference_update(neighbours)
second_node = choice(list(possible_nodes))

# Single formatted argument so the output is the same under Python 2 and 3.
print("%s %s" % (first_node, second_node))









    



Animal_rights Video_game_terminology



In [78]:

    
# print categories[first_node]["path"]
# print categories[second_node]["path"]

# Print the .txt files found in the directory recorded for each selected node
# (first node first, then the second one).
for node in (first_node, second_node):
    node_dir = categories[node]["path"]
    for filename in os.listdir(node_dir):
        if filename.endswith(".txt"):
            print(os.path.join(node_dir, filename))









    



/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Belief/Availability_cascade.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Belief/Anthropocentrism.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Belief/Basic_belief.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Belief/Alief_[belief].txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Belief/Attitude_polarization.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Belief/Affective_disposition_theory.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Belief/Belief_in_God.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Belief/Belief_in_luck.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Belief/Bad_faith.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Games/Arcade_games/Arcade_Game_Construction_Kit.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Games/Arcade_games/Amusement_arcade.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Games/Arcade_games/Arcade_Game_Cards.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Games/Arcade_games/Timeline_of_arcade_video_game_history.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Games/Arcade_games/List_of_trackball_arcade_games.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Games/Arcade_games/List_of_Amiga_arcade_conversions.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Games/Arcade_games/10-Yard_Fight.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Games/Arcade_games/List_of_arcade_video_games.txt
/home/clemsos/Dev/junkware/junkware-objects/data/wikipedia/en/Games/Arcade_games/Arcade_game.txt

Parse Wikipedia categories into network

Visualization

With NetworkX

Export with Graphviz

Export as CSV for import into Gephi

Graph Walk

Select 2 random nodes

Get the text