In [1]:
import sqlite3 as sql
import pandas as pd
import codecs, re

In [2]:
db = sql.connect('database.sqlite')
data = pd.read_sql('SELECT parent_id, body, id FROM May2015\
                    WHERE LENGTH(body) > 10 AND LENGTH(body) < 150', db, chunksize=100000)

In [8]:
REMOVE = re.compile(r'[\*\r\n]*')

In [34]:
def getParent(parent_id):
    return pd.read_sql("SELECT parent_id, body, id FROM May2015\
                        WHERE id='" + comment.parent_id[3:] + "'", db)
    
def validComment(comment):
    
    # ignore deleted comments
    if '[deleted]' in comment:
        return False
    
    # allow only ascii characters
    try:
        comment.decode('ascii')
    except:
        return False
    
    # no emoticons allowed!
#     if re.match('\[\]\(/[a-z]+\)', emoticon) is None:
    if '[]' in comment[:2]:
        return False
    
    return True
        
    
minconvlen = 8
numchunks = 0
filenum = 0


with open('conv/redditComments.txt', 'w+') as f:
    # for each chunk of data
    for chunk in data:

        # for each comment in the chunk
        for idx, comment in chunk.iterrows():
            parent = getParent(comment.parent_id)

            if not validComment(comment.body):
                continue

            # trace parents
            conv = [comment.body]
            while not parent.empty:

                if not validComment(parent.body[0]):
                    break

                conv.append(parent.body[0])
                parent = getParent(parent.parent_id[0])

            # write long conversations to file
            if len(conv) > minconvlen:
                for comm in reversed(conv):
                    f.write(re.sub(REMOVE, ' ', comm))
                    f.write('\n')

                filenum += 1

        numchunks +=1
        print "Chunks processed:\t" + str(numchunks), "Files written:\t" + str(filenum)


Chunks processed:	1 Files written:	406
Chunks processed:	2 Files written:	1442
Chunks processed:	3 Files written:	2812
Chunks processed:	4 Files written:	4060
Chunks processed:	5 Files written:	5757
Chunks processed:	6 Files written:	7416
Chunks processed:	7 Files written:	9229
Chunks processed:	8 Files written:	11544
Chunks processed:	9 Files written:	14197
Chunks processed:	10 Files written:	17187
Chunks processed:	11 Files written:	20368
Chunks processed:	12 Files written:	22808
Chunks processed:	13 Files written:	25150
Chunks processed:	14 Files written:	27655
Chunks processed:	15 Files written:	29726
Chunks processed:	16 Files written:	32203
Chunks processed:	17 Files written:	34599
Chunks processed:	18 Files written:	37177
Chunks processed:	19 Files written:	40021
Chunks processed:	20 Files written:	43468
Chunks processed:	21 Files written:	46783
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-34-5065093f1072> in <module>()
     32     # for each comment in the chunk
     33     for idx, comment in chunk.iterrows():
---> 34         parent = getParent(comment.parent_id)
     35 
     36         if not validComment(comment.body):

<ipython-input-34-5065093f1072> in getParent(parent_id)
      1 def getParent(parent_id):
      2     lvl, pid = parent_id.split('_')
----> 3     return pd.read_sql("SELECT parent_id, body, id FROM May2015                        WHERE id='" + pid + "'", db)
      4 
      5 def validComment(comment):

/usr/local/lib/python2.7/site-packages/pandas/io/sql.pyc in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize)
    497             sql, index_col=index_col, params=params,
    498             coerce_float=coerce_float, parse_dates=parse_dates,
--> 499             chunksize=chunksize)
    500 
    501     try:

/usr/local/lib/python2.7/site-packages/pandas/io/sql.pyc in read_query(self, sql, index_col, coerce_float, params, parse_dates, chunksize)
   1607             frame = _wrap_result(data, columns, index_col=index_col,
   1608                                  coerce_float=coerce_float,
-> 1609                                  parse_dates=parse_dates)
   1610             return frame
   1611 

/usr/local/lib/python2.7/site-packages/pandas/io/sql.pyc in _wrap_result(data, columns, index_col, coerce_float, parse_dates)
    137                                    coerce_float=coerce_float)
    138 
--> 139     _parse_date_columns(frame, parse_dates)
    140 
    141     if index_col is not None:

/usr/local/lib/python2.7/site-packages/pandas/io/sql.pyc in _parse_date_columns(data_frame, parse_dates)
    123     # we could in theory do a 'nice' conversion from a FixedOffset tz
    124     # GH11216
--> 125     for col_name, df_col in data_frame.iteritems():
    126         if com.is_datetime64tz_dtype(df_col):
    127             data_frame[col_name] = _handle_date_column(df_col)

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in iteritems(self)
    604         if self.columns.is_unique and hasattr(self, '_item_cache'):
    605             for k in self.columns:
--> 606                 yield k, self._get_item_cache(k)
    607         else:
    608             for i, k in enumerate(self.columns):

/usr/local/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1346             res = self._box_item_values(item, values)
   1347             cache[item] = res
-> 1348             res._set_as_cached(item, self)
   1349 
   1350             # for a chain

/usr/local/lib/python2.7/site-packages/pandas/core/generic.pyc in _set_as_cached(self, item, cacher)
   1356         cacher.
   1357         """
-> 1358         self._cacher = (item, weakref.ref(cacher))
   1359 
   1360     def _reset_cacher(self):

/usr/local/lib/python2.7/site-packages/pandas/core/generic.pyc in __setattr__(self, name, value)
   2679 
   2680         try:
-> 2681             object.__getattribute__(self, name)
   2682             return object.__setattr__(self, name, value)
   2683         except AttributeError:

KeyboardInterrupt: 

In [28]:
thing = "hello, \' \n World."
thing2 = re.sub(r'[^A-Za-z0-9]+[\r\n]*', ' ', thing)
print thing
print thing2


hello, ' 
 World.
hello World 

In [ ]:
df = pd.read_sql('SELECT id, parent_id FROM May2015\
                WHERE LENGTH(body) > 10 AND LENGTH(body) < 150', db)

In [ ]: