In [474]:
from mailbox import mbox
import pandas as pd
import os, sys
import pytz
import time

In [516]:
# http://engineroom.trackmaven.com/blog/monthly-challenge-natural-language-processing/

def get_contents(message, body=None):
    if not body:
        body = message.get_payload(decode=True)
    if len(message):
        
        contents = {
            "fid" : message['GGE-Forum-Id'],
            "tid" : message['GGE-Topic-Id'],
            "mid" : message['GGE-Message-Id'],
            "date": message['date'],
            "from": utils.parseaddr(message['from'])[1],#.replace('<', '').replace('>', '').split(' ')[-1], # GRABS EMAIL ONLY
            "to": message['to'],
            "cc" : message['Cc'] or "None",
            "bcc" : message['Bcc'] or "None",
            "subject": message['subject'] or "None",
            "body": body,
        }
        return contents
#         return df.append(contents, ignore_index=True)
    
# GGE-Forum-Id plots-dev
# GGE-Topic-Id KWhkJE0YFh4
# GGE-Message-Id RVvisTryAAAJ
# Date Mon, 10 Jul 2017 18:02:44 -0500
# From Stevie Lewis <stevie@publiclab.org>
# To undisclosed-recipients:;
# Subject OpenHour tonight!
# Body
# Cc
# Bcc everyone@publiclab.org
# Sender steviepubliclab@gmail.com

# {'bcc': 'everyone@publiclab.org',
#  'body': None,
#  'cc': 'None',
#  'date': 'Mon, 10 Jul 2017 18:02:44 -0500',
#  'fid': 'plots-dev',
#  'from': 'stevie@publiclab.org',
#  'mid': 'RVvisTryAAAJ',
#  'subject': 'OpenHour tonight!',
#  'tid': 'KWhkJE0YFh4',
#  'to': 'undisclosed-recipients:;'}

# contents = get_contents(box[0])
# contents

Parse text/plain content


In [486]:
googlegroups = [group.split('.')[0] for group in os.listdir('../data/mbox/')]

for group in googlegroups:
    
    try:
        start = time.time()
    
        box = mbox('../data/mbox/' + group + '.mbox')

        cols = ["fid", "tid", "mid", "date", "from", "to", "cc", "bcc", "subject", "body"]
        messages = pd.DataFrame(columns=cols)

        for message in box:
            for part in email_iters.typed_subpart_iterator(message, 'text', 'plain'):
                content = get_contents(message, part.get_payload(decode=True))
                messages = messages.append(content, ignore_index=True)

        messages['date'] = pd.to_datetime(messages['date'], utc=True)

        messages.to_csv('../data/temp/' + group + '.csv', index=False)
        
        end = time.time()

        print 'COMPLETED PROCESSING', messages.shape[0], 'POSTS FROM', group.upper(), 'IN', round(end - start, 2), 'SEC'
        
    except:
        
        print 'ERROR PROCESSING', group.upper()


COMPLETED PROCESSING 3099 POSTS FROM GRASSROOTSMAPPING IN 49.97 SEC
COMPLETED PROCESSING 214 POSTS FROM LABORATORIOPUBLICO IN 5.3 SEC
COMPLETED PROCESSING 226 POSTS FROM PLOTS-AIRQUALITY IN 4.05 SEC
COMPLETED PROCESSING 214 POSTS FROM PLOTS-ALPHA IN 3.91 SEC
COMPLETED PROCESSING 279 POSTS FROM PLOTS-AMSTERDAM IN 6.53 SEC
COMPLETED PROCESSING 330 POSTS FROM PLOTS-BALTIMORE-DC IN 7.61 SEC
COMPLETED PROCESSING 1107 POSTS FROM PLOTS-BARNRAISING IN 23.23 SEC
COMPLETED PROCESSING 1395 POSTS FROM PLOTS-BOSTON IN 28.81 SEC
COMPLETED PROCESSING 161 POSTS FROM PLOTS-BUTTE IN 5.34 SEC
COMPLETED PROCESSING 1998 POSTS FROM PLOTS-DEV IN 30.27 SEC
COMPLETED PROCESSING 339 POSTS FROM PLOTS-EDUCATION IN 8.01 SEC
COMPLETED PROCESSING 1054 POSTS FROM PLOTS-GSOC IN 17.56 SEC
COMPLETED PROCESSING 1794 POSTS FROM PLOTS-GULFCOAST IN 35.23 SEC
COMPLETED PROCESSING 2276 POSTS FROM PLOTS-INFRARED IN 42.37 SEC
COMPLETED PROCESSING 504 POSTS FROM PLOTS-KICKSTARTER IN 12.73 SEC
COMPLETED PROCESSING 404 POSTS FROM PLOTS-NORCAL IN 8.63 SEC
COMPLETED PROCESSING 1598 POSTS FROM PLOTS-NYC IN 48.9 SEC
COMPLETED PROCESSING 5572 POSTS FROM PLOTS-ORGANIZERS IN 120.88 SEC
COMPLETED PROCESSING 398 POSTS FROM PLOTS-PHILADELPHIA IN 8.99 SEC
COMPLETED PROCESSING 196 POSTS FROM PLOTS-POTENTIOSTAT IN 6.42 SEC
COMPLETED PROCESSING 322 POSTS FROM PLOTS-PROVIDENCE IN 7.92 SEC
COMPLETED PROCESSING 213 POSTS FROM PLOTS-SKANE IN 6.19 SEC
COMPLETED PROCESSING 297 POSTS FROM PLOTS-SOUTHEAST IN 6.64 SEC
COMPLETED PROCESSING 3620 POSTS FROM PLOTS-SPECTROMETRY IN 62.27 SEC
COMPLETED PROCESSING 1220 POSTS FROM PLOTS-WATERQUALITY IN 23.32 SEC
COMPLETED PROCESSING 493 POSTS FROM PUBLIC-LAB-CHICAGO IN 10.37 SEC
COMPLETED PROCESSING 11 POSTS FROM PUBLICLAB-JERUSALEM IN 1.27 SEC
COMPLETED PROCESSING 465 POSTS FROM PUBLICLAB-LA IN 10.32 SEC
COMPLETED PROCESSING 248 POSTS FROM PUBLICLAB-MIDWEST IN 5.87 SEC
COMPLETED PROCESSING 2 POSTS FROM PUBLICLAB-MOUNTAINWEST IN 0.02 SEC
COMPLETED PROCESSING 89 POSTS FROM PUBLICLAB-NORTHEAST IN 1.63 SEC
COMPLETED PROCESSING 42 POSTS FROM PUBLIC-LAB-NORTHWEST IN 0.85 SEC
COMPLETED PROCESSING 5314 POSTS FROM PUBLICLABORATORY IN 84.48 SEC
COMPLETED PROCESSING 12 POSTS FROM PUBLIC-LAB-VANCOUVER IN 0.22 SEC
COMPLETED PROCESSING 242 POSTS FROM PUBLIC-LAB-WRITING-GROUP IN 7.18 SEC
COMPLETED PROCESSING 86 POSTS FROM PUBLICLAB-PORTUGUES IN 1.97 SEC

Add thread position


In [498]:
googlegroups = [group.split('.')[0] for group in os.listdir('../data/temp/')]

for group in googlegroups:
    
    try:
        start = time.time()
        
        messages = pd.read_csv('../data/temp/' + group + '.csv', parse_dates=['date'], infer_datetime_format=True)
       
        cols = ["fid", "tid", "mid", "pos", "date", "from", "to", "cc", "bcc", "subject", "body"]
        with_pos = pd.DataFrame(columns=cols)

        threads = messages.groupby(['tid'])

        for index, thread in threads:

            posts = thread.sort_values(by='date')
            posts['pos'] = range(len(posts))

            with_pos = with_pos.append(posts, ignore_index=True)
            
        with_pos = with_pos[cols]
        with_pos['pos'] = with_pos['pos'].astype(int)
        
        with_pos.to_csv('../data/posts/' + group + '-posts.csv', index=False)
        
        end = time.time()

        print 'COMPLETED PROCESSING', messages.shape[0], 'POSTS FROM', group.upper(), 'IN', round(end - start, 2), 'SEC'
        
    except:
        
        print 'ERROR PROCESSING', group.upper()


COMPLETED PROCESSING 226 POSTS FROM PLOTS-AIRQUALITY IN 0.74 SEC
COMPLETED PROCESSING 214 POSTS FROM PLOTS-ALPHA IN 0.45 SEC
COMPLETED PROCESSING 3099 POSTS FROM GRASSROOTSMAPPING IN 8.54 SEC
COMPLETED PROCESSING 214 POSTS FROM LABORATORIOPUBLICO IN 0.56 SEC
COMPLETED PROCESSING 279 POSTS FROM PLOTS-AMSTERDAM IN 1.1 SEC
COMPLETED PROCESSING 330 POSTS FROM PLOTS-BALTIMORE-DC IN 1.26 SEC
COMPLETED PROCESSING 1107 POSTS FROM PLOTS-BARNRAISING IN 1.81 SEC
COMPLETED PROCESSING 1395 POSTS FROM PLOTS-BOSTON IN 3.55 SEC
COMPLETED PROCESSING 161 POSTS FROM PLOTS-BUTTE IN 0.86 SEC
COMPLETED PROCESSING 1998 POSTS FROM PLOTS-DEV IN 3.91 SEC
COMPLETED PROCESSING 339 POSTS FROM PLOTS-EDUCATION IN 1.33 SEC
COMPLETED PROCESSING 1054 POSTS FROM PLOTS-GSOC IN 1.48 SEC
COMPLETED PROCESSING 1794 POSTS FROM PLOTS-GULFCOAST IN 4.88 SEC
COMPLETED PROCESSING 2276 POSTS FROM PLOTS-INFRARED IN 5.27 SEC
COMPLETED PROCESSING 504 POSTS FROM PLOTS-KICKSTARTER IN 1.34 SEC
COMPLETED PROCESSING 404 POSTS FROM PLOTS-NORCAL IN 1.84 SEC
COMPLETED PROCESSING 1598 POSTS FROM PLOTS-NYC IN 5.07 SEC
COMPLETED PROCESSING 5572 POSTS FROM PLOTS-ORGANIZERS IN 12.69 SEC
COMPLETED PROCESSING 398 POSTS FROM PLOTS-PHILADELPHIA IN 1.97 SEC
COMPLETED PROCESSING 196 POSTS FROM PLOTS-POTENTIOSTAT IN 1.17 SEC
COMPLETED PROCESSING 322 POSTS FROM PLOTS-PROVIDENCE IN 1.71 SEC
COMPLETED PROCESSING 213 POSTS FROM PLOTS-SKANE IN 1.11 SEC
COMPLETED PROCESSING 297 POSTS FROM PLOTS-SOUTHEAST IN 1.46 SEC
COMPLETED PROCESSING 3620 POSTS FROM PLOTS-SPECTROMETRY IN 9.53 SEC
COMPLETED PROCESSING 1220 POSTS FROM PLOTS-WATERQUALITY IN 3.56 SEC
COMPLETED PROCESSING 493 POSTS FROM PUBLIC-LAB-CHICAGO IN 1.51 SEC
COMPLETED PROCESSING 11 POSTS FROM PUBLICLAB-JERUSALEM IN 0.12 SEC
COMPLETED PROCESSING 465 POSTS FROM PUBLICLAB-LA IN 1.49 SEC
COMPLETED PROCESSING 248 POSTS FROM PUBLICLAB-MIDWEST IN 0.74 SEC
COMPLETED PROCESSING 2 POSTS FROM PUBLICLAB-MOUNTAINWEST IN 0.11 SEC
COMPLETED PROCESSING 89 POSTS FROM PUBLICLAB-NORTHEAST IN 0.29 SEC
COMPLETED PROCESSING 42 POSTS FROM PUBLIC-LAB-NORTHWEST IN 0.26 SEC
COMPLETED PROCESSING 5314 POSTS FROM PUBLICLABORATORY IN 13.23 SEC
COMPLETED PROCESSING 12 POSTS FROM PUBLIC-LAB-VANCOUVER IN 0.16 SEC
COMPLETED PROCESSING 242 POSTS FROM PUBLIC-LAB-WRITING-GROUP IN 0.97 SEC
COMPLETED PROCESSING 86 POSTS FROM PUBLICLAB-PORTUGUES IN 0.27 SEC

Create edgelist


In [515]:
googlegroups = [group.split('.')[0] for group in os.listdir('../data/temp/')]

for group in googlegroups:
    
    try:
        start = time.time()
        
        messages = pd.read_csv('../data/posts/' + group + '-posts.csv', parse_dates=['date'], infer_datetime_format=True)
        messages.drop('pos', 1, inplace=True)
        
        cols = ["fid", "tid", "mid", "date", "source", "target"]messages = pd.read_csv('../data/temp/' + group + '.csv', parse_dates=['date'], infer_datetime_format=True)
       
        cols = ["fid", "tid", "mid", "pos", "date", "from", "to", "cc", "bcc", "subject", "body"]
        with_pos = pd.DataFrame(columns=cols)

        threads = messages.groupby(['tid'])

        for index, thread in threads:

            posts = thread.sort_values(by='date')
            posts['pos'] = range(len(posts))

            with_pos = with_pos.append(posts, ignore_index=True)
            
        with_pos = with_pos[cols]
        with_pos['pos'] = with_pos['pos'].astype(int)
        edge_list = pd.DataFrame(columns=cols)
        
        singletons = 0
        threads = messages.groupby(['tid'])
        
        for index, thread in threads:
    
            if len(thread) > 1:

                posts = thread.sort_values(by='date')

                op = posts.iloc[0]
                responses = posts.iloc[1:]

                for response in responses.iterrows():

                    edge = response[1][:4] # we still want columns fid, tid, mid, and date
                    edge['source'] = response[1]['from']
                    edge['target'] = op['from']

                    edge_list = edge_list.append(edge, ignore_index=True)
            else:

                singletons += 1
                
        edge_list.to_csv('../data/edges/' + group + '-edges.csv', index=False)
        
        end = time.time()
        
        print 'COMPLETED PROCESSING', edge_list.shape[0], 'EDGES EXCLUDING', singletons, 'SINGLETONS FROM', group.upper(), 'IN', round(end - start, 2), 'SEC'
        
    except:
        
        print 'ERROR PROCESSING', group.upper()


COMPLETED PROCESSING 101 EDGES EXCLUDING 76 SINGLETONS FROM PLOTS-AIRQUALITY IN 0.79 SEC
COMPLETED PROCESSING 168 EDGES EXCLUDING 18 SINGLETONS FROM PLOTS-ALPHA IN 1.26 SEC
COMPLETED PROCESSING 1754 EDGES EXCLUDING 785 SINGLETONS FROM GRASSROOTSMAPPING IN 13.02 SEC
COMPLETED PROCESSING 136 EDGES EXCLUDING 38 SINGLETONS FROM LABORATORIOPUBLICO IN 1.04 SEC
COMPLETED PROCESSING 59 EDGES EXCLUDING 186 SINGLETONS FROM PLOTS-AMSTERDAM IN 0.49 SEC
COMPLETED PROCESSING 88 EDGES EXCLUDING 197 SINGLETONS FROM PLOTS-BALTIMORE-DC IN 0.74 SEC
COMPLETED PROCESSING 828 EDGES EXCLUDING 112 SINGLETONS FROM PLOTS-BARNRAISING IN 6.31 SEC
COMPLETED PROCESSING 796 EDGES EXCLUDING 382 SINGLETONS FROM PLOTS-BOSTON IN 6.04 SEC
COMPLETED PROCESSING 22 EDGES EXCLUDING 120 SINGLETONS FROM PLOTS-BUTTE IN 0.21 SEC
COMPLETED PROCESSING 1351 EDGES EXCLUDING 321 SINGLETONS FROM PLOTS-DEV IN 9.99 SEC
COMPLETED PROCESSING 98 EDGES EXCLUDING 202 SINGLETONS FROM PLOTS-EDUCATION IN 0.74 SEC
COMPLETED PROCESSING 867 EDGES EXCLUDING 71 SINGLETONS FROM PLOTS-GSOC IN 6.19 SEC
COMPLETED PROCESSING 909 EDGES EXCLUDING 593 SINGLETONS FROM PLOTS-GULFCOAST IN 6.92 SEC
COMPLETED PROCESSING 1580 EDGES EXCLUDING 358 SINGLETONS FROM PLOTS-INFRARED IN 12.36 SEC
COMPLETED PROCESSING 391 EDGES EXCLUDING 31 SINGLETONS FROM PLOTS-KICKSTARTER IN 2.88 SEC
COMPLETED PROCESSING 130 EDGES EXCLUDING 226 SINGLETONS FROM PLOTS-NORCAL IN 1.0 SEC
COMPLETED PROCESSING 727 EDGES EXCLUDING 620 SINGLETONS FROM PLOTS-NYC IN 6.93 SEC
COMPLETED PROCESSING 3843 EDGES EXCLUDING 804 SINGLETONS FROM PLOTS-ORGANIZERS IN 31.06 SEC
COMPLETED PROCESSING 135 EDGES EXCLUDING 207 SINGLETONS FROM PLOTS-PHILADELPHIA IN 1.11 SEC
COMPLETED PROCESSING 34 EDGES EXCLUDING 139 SINGLETONS FROM PLOTS-POTENTIOSTAT IN 0.32 SEC
COMPLETED PROCESSING 87 EDGES EXCLUDING 194 SINGLETONS FROM PLOTS-PROVIDENCE IN 0.74 SEC
COMPLETED PROCESSING 21 EDGES EXCLUDING 175 SINGLETONS FROM PLOTS-SKANE IN 0.24 SEC
COMPLETED PROCESSING 84 EDGES EXCLUDING 177 SINGLETONS FROM PLOTS-SOUTHEAST IN 0.74 SEC
COMPLETED PROCESSING 2491 EDGES EXCLUDING 566 SINGLETONS FROM PLOTS-SPECTROMETRY IN 20.49 SEC
COMPLETED PROCESSING 751 EDGES EXCLUDING 281 SINGLETONS FROM PLOTS-WATERQUALITY IN 5.97 SEC
COMPLETED PROCESSING 245 EDGES EXCLUDING 180 SINGLETONS FROM PUBLIC-LAB-CHICAGO IN 1.78 SEC
COMPLETED PROCESSING 8 EDGES EXCLUDING 1 SINGLETONS FROM PUBLICLAB-JERUSALEM IN 0.07 SEC
COMPLETED PROCESSING 193 EDGES EXCLUDING 207 SINGLETONS FROM PUBLICLAB-LA IN 1.42 SEC
COMPLETED PROCESSING 157 EDGES EXCLUDING 50 SINGLETONS FROM PUBLICLAB-MIDWEST IN 1.14 SEC
COMPLETED PROCESSING 1 EDGES EXCLUDING 0 SINGLETONS FROM PUBLICLAB-MOUNTAINWEST IN 0.02 SEC
COMPLETED PROCESSING 57 EDGES EXCLUDING 16 SINGLETONS FROM PUBLICLAB-NORTHEAST IN 0.41 SEC
COMPLETED PROCESSING 18 EDGES EXCLUDING 19 SINGLETONS FROM PUBLIC-LAB-NORTHWEST IN 0.13 SEC
COMPLETED PROCESSING 3346 EDGES EXCLUDING 1043 SINGLETONS FROM PUBLICLABORATORY IN 27.11 SEC
COMPLETED PROCESSING 1 EDGES EXCLUDING 10 SINGLETONS FROM PUBLIC-LAB-VANCOUVER IN 0.02 SEC
COMPLETED PROCESSING 56 EDGES EXCLUDING 157 SINGLETONS FROM PUBLIC-LAB-WRITING-GROUP IN 0.48 SEC
COMPLETED PROCESSING 53 EDGES EXCLUDING 16 SINGLETONS FROM PUBLICLAB-PORTUGUES IN 0.45 SEC