Reddit Data Exploration


In [3]:
# preambles
import os
import re

# Candidate data locations, in order of preference.  The first one that exists
# on the current machine is used, so the notebook runs unchanged on either
# workstation.
candidate_data_dirs = [
    # felix's local dev dir
    'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit\\devdata',
    # Cats machine Data path
    '/home/denys/Research/PowerRelations/Reddit',
]
existing_data_dirs = [path for path in candidate_data_dirs if os.path.isdir(path)]
# Fall back to the first candidate so a missing directory still surfaces as an
# explicit error from os.listdir below.
data_dir = existing_data_dirs[0] if existing_data_dirs else candidate_data_dirs[0]
print(data_dir)

# open latest directory
# Only directories are considered: os.listdir also returns plain files, and a
# recently modified file must not be picked as the "latest sub-directory".
subdir_paths = [os.path.join(data_dir, entry) for entry in os.listdir(data_dir)]
subdir_paths = [path for path in subdir_paths if os.path.isdir(path)]
if not subdir_paths:
    raise ValueError("no sub-directory found in " + data_dir)
latest_subdir = max(subdir_paths, key=os.path.getmtime)
print(latest_subdir)


C:\Users\FG\Desktop\PhD\Research\reddit\devdata
C:\Users\FG\Desktop\PhD\Research\reddit\devdata\Data_world8

In [4]:
# Two views of the same counts, both starting empty.

# per-user view:   user_list[user_name][thread_name]   -> msg_count
user_list = {}

# per-thread view: thread_list[thread_name][user_name] -> msg_count
thread_list = {}

In [5]:
# Walk the latest data directory and, for every thread file, count the
# <user> lines per author.  Each hit is recorded twice: once in the
# user-oriented structure and once in the thread-oriented one.
for dir_name, subdir_list, file_list in os.walk(latest_subdir):
    print("working in " + dir_name)
    for file_name in file_list:
        # skip anything that is not a thread dump
        if "Thread" not in file_name or ".xml" not in file_name:
            continue

        with open(dir_name + '/' + file_name, 'r') as open_file:
            for line in open_file:
                # a user line holds nothing but the <user> element
                match_obj = re.match(r'\s*<user>(.*)</user>\n', line)
                if not match_obj:
                    continue
                author = match_obj.group(1)

                # user oriented data: one more post by this author in this thread
                threads_of_author = user_list.setdefault(author, {})
                threads_of_author[file_name] = threads_of_author.get(file_name, 0) + 1

                # thread oriented data: the same post seen from the thread side
                authors_of_thread = thread_list.setdefault(file_name, {})
                authors_of_thread[author] = authors_of_thread.get(author, 0) + 1


working in C:\Users\FG\Desktop\PhD\Research\reddit\devdata\Data_world8

In [6]:
print "number of users: ", len(user_list)
print "number of thread: ", len(thread_list)


number of users:  31717
number of thread:  2434

In [7]:
for user in user_list.keys()[:10]:
    print user, user_list[user]

print 

for thread in thread_list.keys()[:2]:
    print thread, thread_list[thread]


grad97 {'Thread_world858.xml': 1}
Computer_Name {'Thread_world2338.xml': 1, 'Thread_world1811.xml': 3, 'Thread_world1308.xml': 1, 'Thread_world2225.xml': 1, 'Thread_world1171.xml': 1, 'Thread_world2176.xml': 1, 'Thread_world872.xml': 6, 'Thread_world1110.xml': 2, 'Thread_world1662.xml': 2, 'Thread_world2401.xml': 1, 'Thread_world1500.xml': 1, 'Thread_world1356.xml': 3, 'Thread_world384.xml': 5, 'Thread_world64.xml': 5, 'Thread_world1021.xml': 1, 'Thread_world546.xml': 1, 'Thread_world1496.xml': 1, 'Thread_world423.xml': 1, 'Thread_world1438.xml': 1, 'Thread_world2267.xml': 1, 'Thread_world664.xml': 1, 'Thread_world408.xml': 1, 'Thread_world935.xml': 1, 'Thread_world986.xml': 3, 'Thread_world11.xml': 1, 'Thread_world1381.xml': 1, 'Thread_world449.xml': 1, 'Thread_world1044.xml': 1, 'Thread_world1002.xml': 2, 'Thread_world2195.xml': 4, 'Thread_world964.xml': 2, 'Thread_world2364.xml': 1, 'Thread_world458.xml': 1}
PointAndClick {'Thread_world1799.xml': 6}
simkessy {'Thread_world2189.xml': 1}
xrmrct45 {'Thread_world1777.xml': 1}
Bedebao {'Thread_world1821.xml': 1, 'Thread_world762.xml': 2, 'Thread_world767.xml': 1}
no1skaman {'Thread_world182.xml': 1}
RedSquirrelFtw {'Thread_world2181.xml': 1}
Workwhereucan {'Thread_world393.xml': 1}
Stemnin {'Thread_world583.xml': 1}

Thread_world966.xml {'CollumMcJingleballs': 1, 'rppkn': 1, 'Bushbone': 1}
Thread_world2227.xml {'HitlersFleshlight': 1, 'Brosepherr': 1, 'repeat-': 2, 'tottenhamspurs': 2, 'brahmyn': 1, 'Bejewerly': 2, 'lambright': 1, 'lordderplythethird': 1, 'KamehamehaSockpuppet': 1, 'HitlersGynecologist': 1, '50ShadesOfPatriotic': 1, 'perogies': 1, 'DracoOculus': 1, 'TheLastOfYou': 1, 'FoxReagan': 1, 'NyupDeddyXMTN': 2, 'ilicho': 1, 'Zenarchist': 1}

In [8]:
# sanity check
# The total number of posts must be the same whether it is accumulated from
# the user-oriented or from the thread-oriented structure.
# The totals get their own names so that the builtin `sum` is not shadowed
# for the rest of the kernel session.
total_posts_by_user = sum(
    sum(per_thread.values()) for per_thread in user_list.values()
)
print(total_posts_by_user)

total_posts_by_thread = sum(
    sum(per_user.values()) for per_user in thread_list.values()
)
print(total_posts_by_thread)

# make the check explicit instead of relying on comparing the two printouts
assert total_posts_by_user == total_posts_by_thread, \
    "user and thread views disagree on the total number of posts"


97015
97015

In [9]:
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
# plot the number of people posting in n threads

#find max length
max_len = 0
for user in user_list.keys():
    max_len = max(len(user_list[user]), max_len, 0)
print "the largest number of threads a user has posted in is: ", max_len

# set initial counts to 0
counts_1 = [0] * (max_len+1)

# aggregate counts
for user in user_list.keys():
    # increment count
    counts_1[len(user_list[user])] += 1

# plot relevant results
max_x = 20
plt.bar(range(max_len+1)[:max_x], counts_1[:max_x], align='center')
plt.xlim(0,max_x)
plt.show()


the largest number of threads a user has posted in is:  477

In [11]:
#count of how many posts per user per thread

# find max posts by a single user in a thread
max_count = 0
for user in user_list.keys():
    for thread in user_list[user].keys():
        max_count = max(max_count, user_list[user][thread])
print "max posts by a single user in a thread: ", max_count

# initialize
counts_2 = [0]*(max_count+1)

for user in user_list.keys():
    for thread in user_list[user].keys():
        counts_2[user_list[user][thread]] += 1

# plot relevant results
max_x = 20
plt.bar(range(max_len+1)[:max_x], counts_2[:max_x], align='center')
plt.xlim(0,max_x)
plt.show()

print "first 20 values: ",counts_2[:20]


max posts by a single user in a thread:  30
first 20 values:  [0, 54986, 8870, 2920, 1297, 635, 338, 186, 117, 75, 55, 29, 27, 17, 15, 5, 7, 3, 2, 4]

In [12]:
# Total number of messages in each thread, accumulated from the per-user view.
counts_3 = {}
for threads_of_user in user_list.values():
    for thread, n_posts in threads_of_user.items():
        counts_3[thread] = counts_3.get(thread, 0) + n_posts

# draw one bar per thread, busiest thread first
counts_3_list = list(counts_3.values())
counts_3_list.sort(reverse=True)
n_bars = len(counts_3_list)
plt.bar(range(n_bars), counts_3_list, align='center')
plt.xlim(-0.5, n_bars + 1)
plt.show()



In [13]:
# how many users per thread
max_users = 0
for thread in thread_list.keys():
    max_users = max(max_users, len(thread_list[thread]))
print "Max number of users in a thread: ", max_users

counts_4 = sorted([len(thread_list[x]) for x in thread_list.keys()], reverse=True)
#  plot relevant results
plt.bar(range(len(counts_4)), counts_4, align='center')
plt.xlim(-0.5,len(counts_4)+1)
plt.show()


Max number of users in a thread:  186

In [14]:
# XML version of the parsing: the same directory walk as the regular
# expression pass, but every thread file is read with ElementTree.
import xml.etree.ElementTree as ET

# user_list2 = {user_name: 1}
# Only the keys carry information; the dict serves as the set of user names.
user_list2 = dict()

for dir_name, subdir_list, file_list in os.walk(latest_subdir):
    print("working in " + dir_name)

    for file_name in file_list:
        # skip anything that is not a thread dump
        if "Thread" not in file_name or ".xml" not in file_name:
            continue

        # parse the file
        tree = ET.parse(dir_name + '/' + file_name)
        root = tree.getroot()

        # visit every <user> element of the tree, the root included
        for elem in root.iter('user'):
            # record the name the first time it is seen
            user_list2.setdefault(elem.text, 1)

print(len(user_list2))


working in C:\Users\FG\Desktop\PhD\Research\reddit\devdata\Data_world8
31717

In [15]:
# number of distinct user names found by the XML parse
print(len(user_list2))


31717

There is a discrepancy between the number of users found by parsing the thread files and the number of users in the extracted network. The following code explores the cause.


In [16]:
# import custom functions
import sys
# windows path to tools
# Every backslash is escaped so the literal does not rely on Python leaving
# the unrecognized escape sequence `\D` untouched.
tool_path = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit'
# guard against adding the same entry again when the cell is re-run
if tool_path not in sys.path:
    sys.path.append(tool_path)
import reddit_tools as rt

In [17]:
# Extract the network from the latest data directory using the project-local
# reddit_tools module.
# NOTE(review): the structure of `network` is defined inside reddit_tools and
# is not visible in this notebook - confirm before relying on it.
network = rt.extract_network(latest_subdir)


working in C:\Users\FG\Desktop\PhD\Research\reddit\devdata\Data_world8

In [19]:
# Symmetrize the extracted network with the project-local helper.
# NOTE(review): later cells call len() and .keys() on the result and compare
# its keys with user names, so it is presumably a dict keyed by user name -
# confirm against reddit_tools.
sym_network = rt.symmetrize_network(network)

In [20]:
# number of entries in the symmetrized network
print(len(sym_network))


31655

In [24]:
cnt = 0
for user in user_list2:
    if user not in sym_network.keys():
        cnt += 1 
        print cnt, user


1 SebasTheBass
2 drkatherine1
3 ghfdfrdf
4 thatshirtman
5 TheRedKIller
6 wefdfs
7 JediMasterMaceDindu
8 hotyaznboi
9 asker007
10 itsapartyinthensa
11 occer
12 nt337
13 TechLovinGeek
14 WinterContent
15 kim_attree
16 slicky803
17 TheHorrorFreak
18 TheMrGhost
19 headyasphuck
20 fluffyduckyp
21 httgf
22 AirDeParis
23 Ninma
24 Ducreux4U
25 autumn_motive
26 game_taker101
27 wumao
28 caiyixian
29 kash_if
30 digitaldevncs
31 KharakIsBurning
32 Promethean1
33 shadow-banned
34 littledinobug12
35 Adminisitrator
36 h1478971
37 TheRoseOfSolidarity
38 companoo
39 afowles
40 coldfurify
41 The_CT_Kid
42 aguamineral
43 Ambarsariya
44 upslupe
45 Vereorx
46 CaptainOnBoard
47 barneyskywalker
48 timair
49 Gnurx
50 antiduehring
51 cameronj
52 FeedMeBlood
53 TIFUbyredditting
54 Procrastinating_Emu
55 rhktuhin
56 st_gerasimos
57 marketing-art
58 zerode
59 shizzler
60 olasaustralia
61 bobby3eb
62 Reagan409

In [23]:
# Reverse check: network members that the XML parse did not find.
unparsed_users = [name for name in sym_network.keys() if name not in user_list2]
for user in unparsed_users:
    print(user)

Looking at the files, the 62 users that are missing from the network are users that appear only in threads with no replies.


In [ ]: