In [1]:
# preambles
import re
# import custom functions
import sys
# Windows path to the directory expected to contain the custom reddit_tools module.
# Every backslash is escaped explicitly: the previous '\D' only worked because
# Python leaves unrecognised escape sequences untouched.
tool_path = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit'
# Extend the module search path only once, so re-running this cell does not
# pile up duplicate entries in sys.path.
if tool_path not in sys.path:
    sys.path.append(tool_path)
import reddit_tools as rt
# Cat's machine data path (alternative location, currently disabled)
# data_dir = '/home/denys/Research/PowerRelations/Reddit'
# Felix's local dev dir
data_dir = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit\\devdata'
print(data_dir)

# Open the most recently modified subdirectory of data_dir
import os

# Only real directories are candidates: a plain file in data_dir that was
# modified more recently must not be mistaken for the latest data dump.
candidate_dirs = [os.path.join(data_dir, d) for d in os.listdir(data_dir)]
candidate_dirs = [p for p in candidate_dirs if os.path.isdir(p)]
if not candidate_dirs:
    # max() on an empty list would raise ValueError too, but without saying why
    raise ValueError("no subdirectories found in " + data_dir)
latest_subdir = max(candidate_dirs, key=os.path.getmtime)
print(latest_subdir)
In [2]:
# Build the network from the most recent data dump via the project helper.
# Presumably shaped {poster: {commenter: set(messages)}}, going by the
# deprecated inline version kept at the end of this notebook -- TODO confirm.
network = rt.extract_network(latest_subdir)
In [4]:
# Size of the network: one top-level key per user
print(len(network))
In [5]:
# Peek at the first three entries (in the dict's own iteration order)
for user in list(network)[:3]:
    print("%s %s" % (user, network[user]))
In [7]:
# Tiny hand-built network used to sanity-check symmetrization
test_network = {
    'user_A': {'user_A': set([5, 6]), 'user_B': set([1]), 'user_C': set([4])},
    'user_B': {'user_A': set([2])},
    'user_C': {'user_A': set([3])},
}
print(test_network)
sym_test_network = rt.symmetrize_network(test_network)
# Printed again after the call, presumably to check the input was left untouched
print(test_network)
print(sym_test_network)
In [8]:
sym_network = rt.symmetrize_network(network)
# Spot-check one user pair in both directions
for first, second in (('xanadu_reloaded', 'whiskeycommander'),
                      ('whiskeycommander', 'xanadu_reloaded')):
    print(sym_network[first][second])
In [9]:
# Count of users/nodes in the complete (symmetrized) network
print("%s %s" % ("number of users: ", len(sym_network)))
In [10]:
# Save the symmetrized network in pickled form
try:
    import cPickle as pickle  # Python 2: C-accelerated implementation
except ImportError:
    import pickle  # Python 3: the accelerator is built into pickle

# The file name is derived from the data dump's directory name; the target
# folder "Pickled Data" sits two levels above the dump directory.
pickle_name = os.path.basename(latest_subdir) + "_network.pckl"
pickle_dir = os.path.join(os.path.dirname(os.path.dirname(latest_subdir)), "Pickled Data")
pickle_path = os.path.join(pickle_dir, pickle_name)

# The context manager guarantees the handle is flushed and closed even if
# pickling fails part-way; a bare open() inside the call leaked it.
with open(pickle_path, "wb") as pickle_file:
    pickle.dump(sym_network, pickle_file)
# Report the exact path that was written (no hard-coded '\\' separator)
print("%s %s" % ("saved network in ", pickle_path))
In [11]:
# Reload the pickled file for testing.
# NOTE: unpickling can execute arbitrary code -- only load files this notebook wrote.
pickle_path = pickle_dir + os.sep + pickle_name
# Close the handle deterministically instead of leaking it from a bare open()
with open(pickle_path, "rb") as pickle_file:
    new_sym_network = pickle.load(pickle_file)
print("%s %s" % ("loaded network from ", pickle_path))
In [12]:
# Count of users/nodes in the network reloaded from disk
print("%s %s" % ("number of users: ", len(new_sym_network)))
In [ ]:
In [3]:
# ### Deprecated (moved to reddit_tools.py)
# # user oriented data structure
# # network = {user_name:{username:set(messages) }}
# network = dict()
# # testing variables
# post_count = 0
# deleted_post_count = 0
# # import XML parser
# import xml.etree.ElementTree as ET
# # walk directory
# for dir_name, subdir_list, file_list in os.walk(latest_subdir):
# print "working in " + dir_name
# for file_name in file_list:
# # open file
# if ("Thread" in file_name) and (".xml" in file_name):
# # print "working on " + dir_name + '/' + file_name
# # parse the file
# tree = ET.parse(dir_name + '/' + file_name)
# root = tree.getroot()
# # depth first traversal
# for elem in root.iter():
# # is this a node with comments?
# comments = elem.find('Comments')
# if comments != None:
# # if it is it should have a poster
# poster = elem.find('user').text.strip()
# # find the name of users that replied
# for comment in comments.findall('Comment'):
# post_count += 1
# commenter = comment.find('user').text.strip()
# comment_text = comment.find('body').text
# if comment_text == None:
# #if somehow the comment has no body, exit this loop
# deleted_post_count += 1
# break
# #save these in the dictionary
# if poster not in network:
# # first time seeing this user as poster. add him
# network[poster] = {commenter: set([comment_text.strip()]) }
# else:
# if commenter not in network[poster]:
# # first time this commenter is commenting on this user
# network[poster][commenter] = set([comment_text.strip()])
# else:
# # add text to existing list
# network[poster][commenter] = network[poster][commenter] | set([comment_text.strip()])
In [6]:
### Deprecated (moved to reddit_tools.py)
# #symmetrize network
# def symmetrize_network(network):
# # given a network of the form {user_name1 : {user_name2 : {relationship features} } }
# # returns a symmetrical version of that network
# import copy
# sym_network = copy.deepcopy(network)
# #go through network
# for user_A in network.keys():
# for user_B in network[user_A].keys():
# msgs = network[user_A][user_B]
# if user_B not in sym_network:
# sym_network[user_B] = {user_A:msgs}
# elif user_A not in sym_network[user_B]:
# sym_network[user_B][user_A] = msgs
# else:
# sym_network[user_B][user_A] = sym_network[user_B][user_A] | msgs
# return sym_network