notebook.community

Edit and run



In [5]:

    
# Show which Python interpreter executed this notebook, so the run can be
# reproduced with a matching version.
import sys

print('{0}'.format(sys.version))









    



3.6.2 |Continuum Analytics, Inc.| (default, Jul 20 2017, 13:51:32) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]



In [53]:

    
from googlegroupexporter.cli import arguments, verbosity, export_with_progress
from googlegroupexporter.exporters import CsvExporter, MailExporter
from googlegroupexporter.session import session_factory

from types import SimpleNamespace



In [61]:

    
# Directory (with trailing slash) holding the two plain-text input files below.
data_dir = 'data/'

# You only need the cookie for private groups. It is taken from the first line
# of ggcreds.txt and looks like:
#   'SID=6QQE1nPtLxNmQwlIoZVO...WIYhFIRcQMWt0CqVl'
with open(data_dir + 'ggcreds.txt') as f:
    first_line = f.readline()  # only the first line is needed; '' means the file is empty

if not first_line:
    # Fail with an explicit message instead of an opaque IndexError.
    raise ValueError('{0}ggcreds.txt is empty; expected the cookie on its first line'.format(data_dir))

cookie = first_line.strip()

# One group name per line, e.g.
#   ['publiclaboratory', 'plots-spectrometry', ..., 'grassrootsmapping', 'plots-infrared']
# Blank lines and surrounding whitespace are dropped so that they are never
# passed on as (empty or padded) group names.
with open(data_dir + 'lists.txt') as f:
    lists = [line.strip() for line in f if line.strip()]

# Many of these are probably not necessary because they are already defaults,
# but checking that is an experiment for another day.
kwargs = {'verbose' : 1, 'cookies' : cookie, 'workers' : 10,
           'cache_dir' : 'webcache', 'cache_days' : 7,
           'cache_forever' : False, 'mode' : 'mbox-or-csv', 'group' : 'your-group-here'}

# Since a list of groups is used here, options.group is filled in later, per
# group. To follow the original code exactly you would put your group where it
# says 'your-group-here'; the same goes for 'mode', which is also chosen
# directly in the export cells.

# Attribute-style access (options.cookies, options.workers, ...) to the settings above.
options = SimpleNamespace(**kwargs)

# Session shared by every export below.
session = session_factory(
        options.cookies, options.workers,
        options.cache_dir, options.cache_days, options.cache_forever)



In [57]:

    
# # MBOX
# Select the exporter class by key: 'mbox' -> MailExporter, 'csv' -> CsvExporter.
# options.mode could be used as the key instead, but only once it holds
# 'mbox' or 'csv' rather than a placeholder value.
Exporter = dict(csv=CsvExporter, mbox=MailExporter)['mbox']

for group in lists:

    try:

        options.group = group

        export_with_progress(Exporter(session), options.group)

    except Exception as err:

        # Best effort: a failing group must not stop the remaining exports.
        # Report which group failed and the full exception (type included),
        # since str(err) alone can be empty or ambiguous.
        print('SAD EMOTICON FACE ({0}): {1!r}'.format(group, err))
        
# # For as many groups as you have...

# # [04:25] 8389 downloads from 8389 requests
# # 0it [00:00, ?it/s]

# # 22 index pages listing 2152 topics with 6079 messages.



In [63]:

    
# # CSV
# # Per the original author, this is much quicker than the MBOX export because
# # only the index pages are scraped.

# Look the exporter class up by key; options.mode could serve as the key instead.
Exporter = {'csv': CsvExporter, 'mbox': MailExporter}['csv']

for group in lists:
    try:
        # Announce the group first (options.group could be used here as well).
        print('Retreiving entries for {0}'.format(group))
        export_with_progress(Exporter(session), group)
        print()  # blank separator line after a successful export
    except Exception as err:
        # Keep going with the next group when one export fails.
        print('SAD EMOTICON FACE: {0}'.format(err))
        
# # For as many groups as you have...

# # Retreiving entries for publiclaboratory

# # [00:10] 2175 downloads from 2175 requests
# # [00:00] 2189 downloads from 2277 requests

# # 2150 topics listed.



In [ ]: