Reddit Data Exploration
In [3]:
# preambles
import re
#Cats machine Data path
data_dir = '/home/denys/Research/PowerRelations/Reddit'
#felix's local dev dir
data_dir = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit\\devdata'
print data_dir
#open latest directory
import os
latest_subdir = max([os.path.join(data_dir,d) for d in os.listdir(data_dir)], key=os.path.getmtime)
print latest_subdir
In [4]:
# Per-user view of the posting counts:
#   user_list = {user_name: {thread_name: msg_count}}
user_list = {}
# Per-thread view of the posting counts:
#   thread_list = {thread_name: {user_name: msg_count}}
thread_list = {}
In [5]:
# walk directory
for dir_name, subdir_list, file_list in os.walk(latest_subdir):
print "working in " + dir_name
for file_name in file_list:
# open file
if ("Thread" in file_name) and (".xml" in file_name):
# print "working on " + dir_name + '/' + file_name
#parse file
with open(dir_name + '/' + file_name, 'r') as open_file:
for line in open_file:
#look at lines with user names using regexp
match_obj = re.match(r'\s*<user>(.*)</user>\n', line)
if match_obj:
# user tag found
# work on user oriented data
if match_obj.group(1) not in user_list:
# first time seeing this user
user_list[match_obj.group(1)] = {file_name:1}
else:
if file_name not in user_list[match_obj.group(1)]:
# first time seeing this user in this thread
user_list[match_obj.group(1)][file_name] = 1
else:
#user has multiple posts in this thread
user_list[match_obj.group(1)][file_name] += 1
#work on thread oriented data
if file_name not in thread_list:
# first time in thread
thread_list[file_name] = {match_obj.group(1):1}
else:
if match_obj.group(1) not in thread_list[file_name]:
#first time seeing user in this thread
thread_list[file_name][match_obj.group(1)] = 1
else:
#user has multiple posts in this thread
thread_list[file_name][match_obj.group(1)] += 1
In [6]:
print "number of users: ", len(user_list)
print "number of thread: ", len(thread_list)
In [7]:
for user in user_list.keys()[:10]:
print user, user_list[user]
print
for thread in thread_list.keys()[:2]:
print thread, thread_list[thread]
In [8]:
# sanity check
# total number of posts
sum = 0
for user in user_list.keys():
for thread in user_list[user].keys():
sum += user_list[user][thread]
print sum
sum = 0
for thread in thread_list.keys():
for user in thread_list[thread].keys():
sum += thread_list[thread][user]
print sum
In [9]:
import matplotlib.pyplot as plt
%matplotlib inline
In [10]:
# plot the number of people posting in n threads
#find max length
max_len = 0
for user in user_list.keys():
max_len = max(len(user_list[user]), max_len, 0)
print "the largest number of threads a user has posted in is: ", max_len
# set initial counts to 0
counts_1 = [0] * (max_len+1)
# aggregate counts
for user in user_list.keys():
# increment count
counts_1[len(user_list[user])] += 1
# plot relevant results
max_x = 20
plt.bar(range(max_len+1)[:max_x], counts_1[:max_x], align='center')
plt.xlim(0,max_x)
plt.show()
In [11]:
#count of how many posts per user per thread
# find max posts by a single user in a thread
max_count = 0
for user in user_list.keys():
for thread in user_list[user].keys():
max_count = max(max_count, user_list[user][thread])
print "max posts by a single user in a thread: ", max_count
# initialize
counts_2 = [0]*(max_count+1)
for user in user_list.keys():
for thread in user_list[user].keys():
counts_2[user_list[user][thread]] += 1
# plot relevant results
max_x = 20
plt.bar(range(max_len+1)[:max_x], counts_2[:max_x], align='center')
plt.xlim(0,max_x)
plt.show()
print "first 20 values: ",counts_2[:20]
In [12]:
# number of messages per thread, accumulated from the user oriented structure
# counts_3 = {thread_name: total_msg_count}
counts_3 = {}
for user in user_list:
    for thread, n_posts in user_list[user].items():
        counts_3[thread] = counts_3.get(thread, 0) + n_posts

# plot relevant results: one bar per thread, busiest thread first
counts_3_list = sorted(counts_3.values(), reverse=True)
n_bars = len(counts_3_list)
plt.bar(range(n_bars), counts_3_list, align='center')
plt.xlim(-0.5, n_bars + 1)
plt.show()
In [13]:
# how many users per thread
max_users = 0
for thread in thread_list.keys():
max_users = max(max_users, len(thread_list[thread]))
print "Max number of users in a thread: ", max_users
counts_4 = sorted([len(thread_list[x]) for x in thread_list.keys()], reverse=True)
# plot relevant results
plt.bar(range(len(counts_4)), counts_4, align='center')
plt.xlim(-0.5,len(counts_4)+1)
plt.show()
In [14]:
#XML Version of the parsing
import xml.etree.ElementTree as ET
# user oriented data structure
# user_list = {user_name:{thread_name:msg_count}}
user_list2 = dict()
# walk directory
for dir_name, subdir_list, file_list in os.walk(latest_subdir):
print "working in " + dir_name
for file_name in file_list:
# open file
if ("Thread" in file_name) and (".xml" in file_name):
# print "working on " + dir_name + '/' + file_name
# parse the file
tree = ET.parse(dir_name + '/' + file_name)
root = tree.getroot()
# depth first traversal
for elem in root.iter():
if elem.tag == 'user':
#user found
if elem.text not in user_list2:
# first time seeing this user
user_list2[elem.text] = 1
print len(user_list2)
In [15]:
# number of distinct user names collected by the XML-based parse
print len(user_list2)
There is a discrepancy in the number of users. The following cells explore the cause.
In [16]:
# import custom functions
import sys

# windows path to tools
# Every backslash is escaped. The literal used to contain a bare "\D", which
# produced the intended path only because "\D" is not a recognised escape
# sequence (newer Python versions warn about such literals).
tool_path = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit'
# make the project helpers importable, without adding the path twice on re-run
if tool_path not in sys.path:
    sys.path.append(tool_path)
import reddit_tools as rt
In [17]:
# build the network from the latest dump with the project helper
# NOTE(review): the structure returned by rt.extract_network is defined in reddit_tools, not visible here — confirm
network = rt.extract_network(latest_subdir)
In [19]:
# NOTE(review): presumably makes the network's relations symmetric; semantics live in reddit_tools — confirm
sym_network = rt.symmetrize_network(network)
In [20]:
# size of the symmetrized network; its keys are compared with user_list2 in the following cells
print len(sym_network)
In [24]:
cnt = 0
for user in user_list2:
if user not in sym_network.keys():
cnt += 1
print cnt, user
In [23]:
for user in sym_network.keys():
if user not in user_list2:
print user
Looking at the files, these are users that appear only in threads with no replies.
In [ ]: