In [171]:
# Read the raw RTF export as bytes: every line is GB18030-encoded and is
# decoded explicitly by the cells below, so binary mode states that intent
# and avoids any text-mode translation of the data.
# NOTE(review): this path uses 'ocuppy/central' while the batch cell further
# down uses 'occupycentral' -- confirm which directory name is correct.
with open("/Users/chengjun/github/cjc2016/data/ocuppy/central/zz-hk-2014-9.rtf", 'rb') as f:
    news = f.readlines()
In [172]:
# Number of raw lines read from the RTF export.
len(news)
Out[172]:
In [173]:
# Preview the first 500 characters of the line at index 17 after decoding it
# from GB18030.
print news[17].decode('gb18030')[:500]
In [174]:
def stringclean(s):
    """Transcode one GB18030 RTF line to UTF-8 and strip RTF markup from it.

    Parameters
    ----------
    s : bytes (plain ``str`` under Python 2)
        One raw line of the RTF export, encoded as GB18030.

    Returns
    -------
    bytes (plain ``str`` under Python 2)
        The line re-encoded as UTF-8 with the two font/colour control
        sequences, every RTF paragraph control word and every newline
        removed.

    Raises
    ------
    UnicodeDecodeError
        If ``s`` is not valid GB18030.
    """
    # Transcode first so the markers below are matched against UTF-8 bytes.
    s = s.decode('gb18030').encode('utf8')
    # Bytes literals: on Python 2 they are ordinary str (behaviour unchanged),
    # and on Python 3 they keep replace() from mixing text with bytes.
    s = s.replace(br'\loch\af0\hich\af0\dbch\f15 \b\cf6 ', b'')
    s = s.replace(br'\loch\af0\hich\af0\dbch\f15 \b0\cf0 ', b'')
    # Raw literal for the paragraph control word: a non-raw '\p' is not a
    # valid escape sequence and only worked because Python left it untouched.
    s = s.replace(br'\par', b'').replace(b'\n', b'')
    return s
In [175]:
# Show the line at index 17 after it has been cleaned by stringclean.
print stringclean(news[17])
In [176]:
# Run every raw line through stringclean, then report how many cleaned
# lines were produced.
news_clean = list(map(stringclean, news))
len(news_clean)
Out[176]:
In [177]:
# Preview the first 100 elements of the cleaned line at index 17.
# stringclean returns UTF-8 encoded text, so this slice counts bytes and may
# cut a multi-byte character in half.
print news_clean[17][:100]
In [181]:
from collections import defaultdict
def deletetab(s):
    """Return ``s`` with every tab character removed."""
    # Splitting on tabs and gluing the pieces back together drops each tab.
    return ''.join(s.split('\t'))
import sys
def flushPrint(s):
    """Overwrite the current console line with ``s``.

    Writes a carriage return, then ``s`` formatted with ``%s``, to standard
    output and flushes the stream so the text shows up immediately.
    """
    stream = sys.stdout
    stream.write('\r')        # jump back to the start of the line
    stream.write('%s' % s)    # draw the new value over the old one
    stream.flush()
In [183]:
def readblocks(data):
    """Group export lines into one tab-separated record per article.

    Lines are collected from a separator line (one containing the run of
    tildes followed by ``#``) up to the article-number marker line. The
    collected lines are then read positionally: the text before the first
    ``|`` of line 0 is the source, line 1 is the info field, line 3 is the
    title, and lines 6 onwards are the body.

    Parameters
    ----------
    data : iterable of str
        Export lines, in file order.

    Returns
    -------
    collections.defaultdict
        Maps each article id to the record string made of id, source, info,
        title and the double-quoted body, joined by tab characters.

    Notes
    -----
    A line that cannot be processed is printed together with the error and
    skipped. If assembling a record fails, the lines collected so far are
    discarded so they cannot leak into the next article. A running count is
    shown through ``flushPrint`` after every tenth record.
    """
    copy = False      # True while inside an article, i.e. after a separator
    n = 0             # number of records assembled so far
    block = []        # lines collected for the current article
    chunk = defaultdict(list)
    for i in data:
        try:
            if "~~~~~~~~~~~~~~~~~~~~~~~~~~ #" in i:
                copy = True
            elif "文章编号:" in i:
                try:
                    article_id = i.replace('文章编号: ', '')
                    source = block[0].split('|')[0]
                    info = block[1]
                    title = deletetab(block[3])
                    # NOTE(review): lines cleaned by stringclean no longer
                    # end in a newline, so this filter only drops blank
                    # lines from uncleaned input -- confirm the intent.
                    body = ' '.join(j for j in block[6:] if j != '\n')
                    # Tabs are the field separator, so none may stay inside.
                    body = '"' + deletetab(body) + '"'
                    chunk[article_id] = '\t'.join(
                        [article_id, source, info, title, body])
                    n += 1
                    if n % 10 == 0:
                        flushPrint(n)
                finally:
                    # Always start the next article from a clean state, even
                    # when this one was malformed; otherwise its lines would
                    # be read as the next article's source/info/title.
                    block = []
                    copy = False
            elif copy:
                block.append(i)
        except Exception as e:
            # Best effort: report the offending line and carry on.
            print('%s %s' % (i, e))
    return chunk
In [186]:
# Parse the cleaned lines; the result maps each article id to its
# tab-separated record.
news_result = readblocks(news_clean)
In [187]:
# Number of distinct article ids collected.
len(news_result)
Out[187]:
In [190]:
# Peek at the first five article ids held in the result mapping.
list(news_result)[:5]
Out[190]:
In [192]:
# Write one record per line. Mode 'w' (instead of 'a') keeps this cell
# idempotent: running it again rewrites the file rather than appending a
# second copy of every record.
with open('/Users/chengjun/github/cjc2016/data/zz-hk-2014-9-clean.txt', 'w') as p:
    for record in news_result.values():
        p.write(record + "\n")
In [194]:
import pandas as pd
# Load the tab-separated records (the file has no header row) and preview
# the first ten rows.
df = pd.read_csv(
    '/Users/chengjun/github/cjc2016/data/zz-hk-2014-9-clean.txt',
    sep='\t',
    header=None
)
df.head(10)
Out[194]:
In [197]:
import os
import glob

# Work from the directory holding the RTF exports so that the relative
# '*.rtf' pattern, and the relative opens in the loop below, resolve there.
os.chdir('/Users/chengjun/github/cjc2016/data/occupycentral/')
# glob returns names in arbitrary, filesystem-dependent order; sorting makes
# the processing order reproducible between runs.
filenames = sorted(glob.glob('*.rtf'))
filenames
Out[197]:
In [198]:
# Clean and parse every RTF export, collecting all records in a single file.
# The output is opened once in 'w' mode, so re-running this cell rebuilds the
# file instead of appending a duplicate copy of every record to it.
with open('/Users/chengjun/github/cjc2016/data/zz-hk-all-clean.txt', 'w') as p:
    for filename in filenames:
        print(filename)
        # Binary mode: stringclean decodes each line from GB18030 itself.
        with open(filename, 'rb') as f:
            news = [stringclean(n) for n in f.readlines()]
        news_result = readblocks(news)
        for record in news_result.values():
            p.write(record + "\n")