In [171]:
    
# Read the raw RTF export as a list of lines. Nothing is decoded here; the
# gb18030 decoding is applied per line further down.
# NOTE(review): this directory is spelled "ocuppy/central" while the batch
# step later uses "occupycentral" -- confirm which one is intended.
with open("/Users/chengjun/github/cjc2016/data/ocuppy/central/zz-hk-2014-9.rtf") as f:
    news = list(f)
    
In [172]:
    
# Number of lines read from the RTF file.
len(news)
    
    Out[172]:
In [173]:
    
# Peek at the first 500 characters of one raw line after gb18030 decoding.
print(news[17].decode('gb18030')[:500])
    
    
In [174]:
    
def stringclean(s):
    """Re-encode one raw gb18030 line as UTF-8 and strip known RTF markup.

    Parameters
    ----------
    s : bytes (plain ``str`` on Python 2)
        A single undecoded line of the RTF export.

    Returns
    -------
    bytes (plain ``str`` on Python 2)
        The line encoded as UTF-8 with the two font/colour control runs,
        every ``\\par`` token and every newline character removed.

    Raises
    ------
    UnicodeDecodeError
        If ``s`` is not valid gb18030.
    """
    s = s.decode('gb18030').encode('utf8')
    # Byte literals keep every operand the same type as ``s``: on Python 2
    # they are ordinary ``str`` (unchanged behaviour), on Python 3 they avoid
    # mixing bytes with text in ``replace``.
    s = s.replace(br'\loch\af0\hich\af0\dbch\f15 \b\cf6 ', b'')
    s = s.replace(br'\loch\af0\hich\af0\dbch\f15 \b0\cf0 ', b'')
    # The backslash is written explicitly: '\p' is not a recognised escape, so
    # the bare '\par' literal only worked because unknown escapes are left
    # untouched (newer Python 3 releases warn about it).
    # Note: this is a plain substring removal, so it also strips the "\par"
    # prefix of longer control words such as "\pard".
    s = s.replace(b'\\par', b'').replace(b'\n', b'')
    return s
    
In [175]:
    
# Show the same sample line after RTF markup has been stripped.
print(stringclean(news[17]))
    
    
In [176]:
    
# Apply the cleaner to every raw line; the count should match len(news).
news_clean = list(map(stringclean, news))
len(news_clean)
    
    Out[176]:
In [177]:
    
# First 100 bytes of the cleaned sample line.
print(news_clean[17][:100])
    
    
In [181]:
    
from collections import defaultdict
def deletetab(s):
    """Return ``s`` with every tab character removed."""
    pieces = s.split('\t')
    return ''.join(pieces)
import sys
def flushPrint(s):
    """Write ``s`` to stdout after a carriage return and flush immediately.

    The leading carriage return moves the cursor back to the start of the
    line, so successive calls overwrite each other (used as a progress
    counter).
    """
    out = sys.stdout
    out.write('\r')
    out.write('%s' % s)
    out.flush()
    
In [183]:
    
def readblocks(data):
    """Group cleaned news lines into one tab-separated record per article.

    An article starts at a line containing the ``~~~ ... #`` separator and
    ends at the line containing the article-id label. The lines collected in
    between are read by position: line 0 holds the source (text before the
    first ``|``), line 1 the info field, line 3 the title, and lines 6 onward
    the body.

    Parameters
    ----------
    data : iterable of str
        Cleaned lines, in file order.

    Returns
    -------
    collections.defaultdict
        Maps article id to the record ``id<TAB>source<TAB>info<TAB>title<TAB>"body"``.
        (Looking up a missing key yields an empty list, as before.)

    Notes
    -----
    Errors are reported on stdout and the offending line/article is skipped.
    Unlike the previous version, the collected lines are always discarded
    once an id line has been seen, even when building the record fails, so a
    malformed article can no longer leak its lines into the next one.
    NOTE(review): double quotes inside the body are not escaped although the
    body is wrapped in double quotes.
    """
    copy = False   # True while inside an article (between separator and id line)
    n = 0          # number of records built so far, for progress output
    block = []     # lines of the article currently being collected
    chunk = defaultdict(list)
    for i in data:
        try:
            if "~~~~~~~~~~~~~~~~~~~~~~~~~~  #" in i:
                copy = True
            elif "文章编号:" in i:
                try:
                    doc_id = i.replace('文章编号: ', '')
                    source = block[0].split('|')[0]
                    info = block[1]
                    title = deletetab(block[3])
                    # Drop blank lines. Testing for '\n' alone never matched
                    # once newlines had already been stripped upstream, so
                    # whitespace-only lines are filtered instead.
                    body = ' '.join(j for j in block[6:] if j.strip())
                    body = '"' + deletetab(body) + '"'
                    chunk[doc_id] = '\t'.join([doc_id, source, info, title, body])
                    n += 1
                    if n % 10 == 0:
                        flushPrint(n)
                finally:
                    # Reset on success *and* failure so the next article
                    # starts from a clean slate.
                    block = []
                    copy = False
            elif copy:
                block.append(i)
        except Exception as e:
            # Best-effort parsing: report the problem and carry on.
            print('%s %s' % (i, e))
    return chunk
    
In [186]:
    
# Parse the cleaned lines into a mapping of article id -> tab-separated record.
news_result = readblocks(news_clean)
    
    
In [187]:
    
# Number of distinct article ids collected.
len(news_result)
    
    Out[187]:
In [190]:
    
# Preview the first five article ids.
list(news_result)[:5]
    
    Out[190]:
In [192]:
    
# Write one record per line. The file is opened in 'w' mode so that
# re-running this cell regenerates it; append mode added a second copy of
# every record on each re-run, which then showed up as duplicate rows when
# the file was read back.
with open('/Users/chengjun/github/cjc2016/data/zz-hk-2014-9-clean.txt', 'w') as p:
    for record in news_result.values():
        p.write(record + "\n")
    
In [194]:
    
import pandas as pd

# Read the tab-separated records back (the file has no header row) and show
# the first ten rows as a sanity check.
df = pd.read_csv(
    '/Users/chengjun/github/cjc2016/data/zz-hk-2014-9-clean.txt',
    sep="\t",
    header=None,
)
df.head(10)
    
    Out[194]:
In [197]:
    
import os
# Switch the working directory so the glob pattern below (and the later
# open() calls on its results) can use bare file names.
os.chdir('/Users/chengjun/github/cjc2016/data/occupycentral/')
import glob
# All RTF exports in that directory, as names relative to it.
filenames = glob.glob('*.rtf')
filenames
    
    Out[197]:
In [198]:
    
# Clean every RTF file listed in `filenames` and collect all records in a
# single tab-separated file. The output is opened once, in 'w' mode, so
# re-running this cell rebuilds the file; reopening it in append mode for
# every input duplicated the whole dataset on each re-run.
with open('/Users/chengjun/github/cjc2016/data/zz-hk-all-clean.txt', 'w') as p:
    for i in filenames:
        print(i)
        # Read and clean the lines, then close the input before parsing.
        with open(i) as f:
            news = [stringclean(n) for n in f]
        news_result = readblocks(news)
        for record in news_result.values():
            p.write(record + "\n")