Network architecture: {user_name1 : {user_name2 : set(messages between) } }. Current relationship features: messages from 2->1

In [1]:
# preambles
import re

# import custom functions
import sys
# windows path to tools
tool_path = 'C:\\Users\\FG\Desktop\\PhD\\Research\\reddit'
if tool_path not in sys.path:
    sys.path.append(tool_path)
import reddit_tools as rt


#Cats machine Data path
# data_dir = '/home/denys/Research/PowerRelations/Reddit'

#felix's local dev dir
data_dir = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit\\devdata'
print data_dir

#open latest directory
import os
latest_subdir = max([os.path.join(data_dir,d) for d in os.listdir(data_dir)], key=os.path.getmtime)
print latest_subdir


C:\Users\FG\Desktop\PhD\Research\reddit\devdata
C:\Users\FG\Desktop\PhD\Research\reddit\devdata\Data_world8

In [2]:
# Build the network from the most recently modified data directory.
# extract_network is defined in reddit_tools (not visible in this notebook);
# judging by the notebook header and the preview output further down, the
# result is presumably {user_name1: {user_name2: set(messages)}} -- TODO confirm.
network = rt.extract_network(latest_subdir)


working in C:\Users\FG\Desktop\PhD\Research\reddit\devdata\Data_world8

In [4]:
#number of users in the network
print len(network)


18275

In [5]:
#look at first few
for user in network.keys()[:3]:
    print user, network[user]


FieelChannel {'Vandbg': set(["Probably suicide I'd guess. It's happened before (see SilkAir 185 and Pacific 773)"]), 'Immortal_Wombat': set(["When the Abu Ghraib abuse scandal hit the news a number of pictures weren't released to the public.\n\n[Lynndie England](http://upload.wikimedia.org/wikipedia/commons/9/93/AG-10B.JPG) apparently appears in various explicit sex acts."])}
lynzee {'KuriTokyo': set(['The new slogan is "Yeah, It\'s alright."\n\nThe Alright Barrier Reef.'])}
Metaphoricalsimile {'orru': set(['How the fuck is not wanting your country to go to war anti- your country?'])}

In [7]:
# small test network
test_network = {'user_A':{'user_A':set([5,6]), 'user_B':set([1]), 'user_C':set([4])}, 'user_B':{'user_A':set([2])}, 'user_C':{'user_A':set([3])} }
print test_network
sym_test_network = rt.symmetrize_network(test_network)
print test_network
print sym_test_network


{'user_B': {'user_A': set([2])}, 'user_C': {'user_A': set([3])}, 'user_A': {'user_B': set([1]), 'user_C': set([4]), 'user_A': set([5, 6])}}
{'user_B': {'user_A': set([2])}, 'user_C': {'user_A': set([3])}, 'user_A': {'user_B': set([1]), 'user_C': set([4]), 'user_A': set([5, 6])}}
{'user_B': {'user_A': set([1, 2])}, 'user_C': {'user_A': set([3, 4])}, 'user_A': {'user_B': set([1, 2]), 'user_C': set([3, 4]), 'user_A': set([5, 6])}}

In [8]:
sym_network = rt.symmetrize_network(network)
print sym_network['xanadu_reloaded']['whiskeycommander']
print sym_network['whiskeycommander']['xanadu_reloaded']


set(["Partition Syria. That's the only way to solve this crisis. One half goes to Assad, the other goes to whoever the opposition is. The fucked up part about it? The side that Assad keeps will probably be the more stable of the two."])
set(["Partition Syria. That's the only way to solve this crisis. One half goes to Assad, the other goes to whoever the opposition is. The fucked up part about it? The side that Assad keeps will probably be the more stable of the two."])

In [9]:
# number of user/node in complete network
print "number of users: ", len(sym_network)


number of users:  31655

In [10]:
#save network in pickled form
import cPickle as pickle

# name and location to save in
pickle_name = latest_subdir.split(os.sep)[-1]+ "_network.pckl"
pickle_dir = os.sep.join(latest_subdir.split(os.sep)[:-2]) + os.sep + "Pickled Data"
pickle.dump( sym_network, open( pickle_dir + os.sep + pickle_name, "wb") )
print "saved network in ", pickle_dir + '\\' + pickle_name


saved network in  C:\Users\FG\Desktop\PhD\Research\reddit\Pickled Data\Data_world8_network.pckl

In [11]:
# reload pickled file for testing
new_sym_network = pickle.load( open(pickle_dir + os.sep + pickle_name, "rb") )
print "loaded network from ", pickle_dir + os.sep + pickle_name


loaded network from  C:\Users\FG\Desktop\PhD\Research\reddit\Pickled Data\Data_world8_network.pckl

In [12]:
# number of user/node in complete network
print "number of users: ", len(new_sym_network)


number of users:  31655

In [ ]:


In [3]:
# ### Deprecated (moved to reddit_tools.py)
# # user oriented data structure
# # network  = {user_name:{username:set(messages) }}
# network = dict()

# # testing variables
# post_count = 0
# deleted_post_count = 0

# # import XML parser
# import xml.etree.ElementTree as ET

# # walk directory
# for dir_name, subdir_list, file_list in os.walk(latest_subdir):
#     print "working in " + dir_name
    
#     for file_name in file_list:
#         # open file
#         if ("Thread" in file_name) and (".xml" in file_name):
# #             print "working on " + dir_name + '/' + file_name
            
#             # parse the file
#             tree = ET.parse(dir_name + '/' + file_name)
#             root = tree.getroot()
            
#             # depth first traversal 
#             for elem in root.iter():
#                 # is this a node with comments?
#                 comments = elem.find('Comments')
#                 if comments != None:
#                     # if it is it should have a poster
#                     poster = elem.find('user').text.strip()
#                     # find the name of users that replied
#                     for comment in comments.findall('Comment'):
#                         post_count += 1
#                         commenter = comment.find('user').text.strip()
#                         comment_text = comment.find('body').text
#                         if comment_text == None:
#                             #if somehow the comment has no body, exit this loop
#                             deleted_post_count += 1
#                             break
                        
                                
#                         #save these in the dictionary
#                         if poster not in network:
#                             # first time seeing this user as poster. add him
#                             network[poster] = {commenter: set([comment_text.strip()]) }
#                         else:
#                             if commenter not in network[poster]:
#                                 # first time this commenter is commenting on this user 
#                                 network[poster][commenter] = set([comment_text.strip()])
#                             else:
#                                 # add text to existing list
#                                 network[poster][commenter] = network[poster][commenter] | set([comment_text.strip()])

In [6]:
### Deprecated (moved to reddit_tools.py)

# #symmetrize network
# def symmetrize_network(network):
#     # given a network of the form {user_name1 : {user_name2 : {relationship features} } }
#     # returns a symmetrical version of that network
    
#     import copy
#     sym_network = copy.deepcopy(network)
    
#     #go through network
#     for user_A in network.keys():
#         for user_B in network[user_A].keys():
#             msgs = network[user_A][user_B]
            
#             if user_B not in sym_network:
#                 sym_network[user_B] = {user_A:msgs}
#             elif user_A not in sym_network[user_B]:
#                 sym_network[user_B][user_A] =  msgs
#             else:
#                 sym_network[user_B][user_A] = sym_network[user_B][user_A] | msgs
    
    
#     return sym_network