In [3]:
# Network architecture:
# {user_name1 : {user_name2 : set(messages between) } }
# Current relationship features: messages from 2->1

In [4]:
# preamble: standard-library imports, grouped at the top of the cell
import os
import re
import sys

# Make the project-local helper module importable.
# Windows path to the tools; every backslash is escaped explicitly (the
# previous literal relied on Python leaving the unknown escape '\D' untouched).
tool_path = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit'
if tool_path not in sys.path:
    sys.path.append(tool_path)
import reddit_tools as rt  # must stay below the sys.path tweak above


#Cats machine Data path
# data_dir = '/home/denys/Research/PowerRelations/Reddit'

#felix's local dev dir
data_dir = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit\\Snapshot_data'
print(data_dir)

# Pick the most recently modified snapshot sub-directory. Plain files that
# happen to live in data_dir are skipped so they can never be mistaken for
# the latest snapshot.
snapshot_dirs = [
    path
    for path in (os.path.join(data_dir, entry) for entry in os.listdir(data_dir))
    if os.path.isdir(path)
]
latest_subdir = max(snapshot_dirs, key=os.path.getmtime)
print(latest_subdir)


C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data
C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_world2

In [5]:
# Build the reply network {user_name1: {user_name2: set(messages)}} with the
# project helper. NOTE: it is handed data_dir (the parent folder), not
# latest_subdir; the log printed below shows it working through several
# snapshot sub-directories of data_dir.
network = rt.extract_network(data_dir)


working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Datatest
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_new
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_new2
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_pol6
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_pol7
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_pol8
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_pol_3
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_pol_4
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_world
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_world2
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_world3
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_world4
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_world5
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_world6
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_world7
working in C:\Users\FG\Desktop\PhD\Research\reddit\Snapshot_data\Data_world8

In [6]:
# size of the extracted network: one top-level key per user
print(len(network))


85495

In [7]:
#look at first few
for user in network.keys()[:3]:
    print user, network[user]


HirsutePursuit46 {'FrugalNinja': set(['If someone is breathing on their own with various other proofs they possess brain functionality they should be kept alive via feeding tube. If someone shows utter brain death i.e not able to breathe on their own, no brain wave activity - they should be removed from the machine keeping them falsely alive.\n\nAnd no its not okay to sit back and watch someone suffer but killing them is not the answer.'])}
grad97 {'herticalt': set(["They're not concerned about people's eating habits it's about pushing their religious beliefs onto others. The ruling party BJP are Hindu nationalists, have connections to Hindu extremist organizations, and have been implicated in connection to religious violence on a number of occasions. To include the destruction of [the famous Babri Mosque](http://en.wikipedia.org/wiki/Ayodhya_dispute#Demolition_of_the_Babri_Mosque)."])}
xa_6ja {'Skellum': set(['Time for some dominant possessive fucking. Shit...does this mean Atlas Shrugged is a better S&M book than 50 shades of grey?'])}

In [8]:
# small test network
test_network = {'user_A':{'user_A':set([5,6]), 'user_B':set([1]), 'user_C':set([4])}, 'user_B':{'user_A':set([2])}, 'user_C':{'user_A':set([3])} }
print test_network
sym_test_network = rt.symmetrize_network(test_network)
print test_network
print sym_test_network


{'user_B': {'user_A': set([2])}, 'user_C': {'user_A': set([3])}, 'user_A': {'user_B': set([1]), 'user_C': set([4]), 'user_A': set([5, 6])}}
{'user_B': {'user_A': set([2])}, 'user_C': {'user_A': set([3])}, 'user_A': {'user_B': set([1]), 'user_C': set([4]), 'user_A': set([5, 6])}}
{'user_B': {'user_A': set([1, 2])}, 'user_C': {'user_A': set([3, 4])}, 'user_A': {'user_B': set([1, 2]), 'user_C': set([3, 4]), 'user_A': set([5, 6])}}

In [9]:
# symmetrize the full network, then spot-check one known pair of users:
# the same message set should come back from either end of the edge
sym_network = rt.symmetrize_network(network)
for src, dst in [('xanadu_reloaded', 'whiskeycommander'),
                 ('whiskeycommander', 'xanadu_reloaded')]:
    print(sym_network[src][dst])


set(["Partition Syria. That's the only way to solve this crisis. One half goes to Assad, the other goes to whoever the opposition is. The fucked up part about it? The side that Assad keeps will probably be the more stable of the two."])
set(["Partition Syria. That's the only way to solve this crisis. One half goes to Assad, the other goes to whoever the opposition is. The fucked up part about it? The side that Assad keeps will probably be the more stable of the two."])

In [10]:
# number of user/node in complete network
print "number of users: ", len(sym_network)


number of users:  136173

In [11]:
#save network in pickled form
import cPickle as pickle

# name and location to save in
pickle_name = data_dir.split(os.sep)[-1]+ "_network.pckl"
pickle_dir = os.sep.join(latest_subdir.split(os.sep)[:-2]) + os.sep + "Pickled Data"
pickle.dump( sym_network, open( pickle_dir + os.sep + pickle_name, "wb") )
print "saved network in ", pickle_dir + '\\' + pickle_name


saved network in  C:\Users\FG\Desktop\PhD\Research\reddit\Pickled Data\Snapshot_data_network.pckl

In [12]:
# reload pickled file for testing
new_sym_network = pickle.load( open(pickle_dir + os.sep + pickle_name, "rb") )
print "loaded network from ", pickle_dir + os.sep + pickle_name


loaded network from  C:\Users\FG\Desktop\PhD\Research\reddit\Pickled Data\Snapshot_data_network.pckl

In [13]:
# number of user/node in complete network
print "number of users: ", len(new_sym_network)


number of users:  136173

In [13]:


In [14]:
# ### Deprecated (moved to reddit_tools.py)
# # user oriented data structure
# # network  = {user_name:{username:set(messages) }}
# network = dict()

# # testing variables
# post_count = 0
# deleted_post_count = 0

# # import XML parser
# import xml.etree.ElementTree as ET

# # walk directory
# for dir_name, subdir_list, file_list in os.walk(latest_subdir):
#     print "working in " + dir_name
    
#     for file_name in file_list:
#         # open file
#         if ("Thread" in file_name) and (".xml" in file_name):
# #             print "working on " + dir_name + '/' + file_name
            
#             # parse the file
#             tree = ET.parse(dir_name + '/' + file_name)
#             root = tree.getroot()
            
#             # depth first traversal 
#             for elem in root.iter():
#                 # is this a node with comments?
#                 comments = elem.find('Comments')
#                 if comments != None:
#                     # if it is it should have a poster
#                     poster = elem.find('user').text.strip()
#                     # find the name of users that replied
#                     for comment in comments.findall('Comment'):
#                         post_count += 1
#                         commenter = comment.find('user').text.strip()
#                         comment_text = comment.find('body').text
#                         if comment_text == None:
#                             #if somehow the comment has no body, exit this loop
#                             deleted_post_count += 1
#                             break
                        
                                
#                         #save these in the dictionary
#                         if poster not in network:
#                             # first time seeing this user as poster. add him
#                             network[poster] = {commenter: set([comment_text.strip()]) }
#                         else:
#                             if commenter not in network[poster]:
#                                 # first time this commenter is commenting on this user 
#                                 network[poster][commenter] = set([comment_text.strip()])
#                             else:
#                                 # add text to existing list
#                                 network[poster][commenter] = network[poster][commenter] | set([comment_text.strip()])

In [15]:
### Deprecated (moved to reddit_tools.py)

# #symmetrize network
# def symmetrize_network(network):
#     # given a network of the form {user_name1 : {user_name2 : {relationship features} } }
#     # returns a symmetrical version of that network
    
#     import copy
#     sym_network = copy.deepcopy(network)
    
#     #go through network
#     for user_A in network.keys():
#         for user_B in network[user_A].keys():
#             msgs = network[user_A][user_B]
            
#             if user_B not in sym_network:
#                 sym_network[user_B] = {user_A:msgs}
#             elif user_A not in sym_network[user_B]:
#                 sym_network[user_B][user_A] =  msgs
#             else:
#                 sym_network[user_B][user_A] = sym_network[user_B][user_A] | msgs
    
    
#     return sym_network