In [1]:
import os
import re
import pandas as pd
from collections import defaultdict

In [2]:
'''
用于过滤异常编码字符的类
'''

class char_filter():
    def __init__(self):
        self.filter_pattern = re.compile(r"\\x[a-z0-9]{2}|\W|^b'|\{[0-9]+\}")
    def filter(self, text):
        foo = self.filter_pattern.sub(' ', text)
        return foo

In [3]:
'''
根据输入路径读取txt文件,返回一个列表
'''

def load_txt(file_path):
    article = []
    illegal_filter = char_filter()
    with open(file_path, 'rb') as f:
        for line in f:
            foo = str(line)
            article.append(illegal_filter.filter(foo))
    return article

In [4]:
'''
定义输入文件路径并读取文件
'''

filePath = '/home/da/task.txt'
article = load_txt(filePath)
article


Out[4]:
[' The windows in the Wradisley drawing rooms were large  one of them  a vast  shallow 10  bow  which seemed to admit the outside into the interior  rather than to enlighten the interior with the view of what was outside  Mrs  Wradisley sat within reach of  but not too near  a large  very red fire   a fire which was like the turf outside  the growth of generations  or at least had not at all the air of having been lighted to day or any recent day  It did not flame  but glowed steadily  adding something to the color of the room  but not much to the light  Later in the season  when larger parties assembled  there was tea in the hall for the sportsmen and the ladies who waited for them  but Mrs  Wradisley thought the hall draughty  and much preferred the drawing room  which was over furnished after the present mode of drawing rooms  but at least warm  and free from draughts  She was working   knitting with white pins  or else making mysterious chains and bridges in white wool with a crochet hook  her eyes being supposed to be not very strong  and this kind of industry the best adapted for them  As to what Lucy was doing  that defies description  She was doing everything 11  and nothing  She had something of a modern young lady   s contempt for every kind of needlework  and  then  along with that  a great admiration for it as something still more superior than the superiority of idleness  A needle is one of the things that has this double effect  It is the scorn of a great number of highly advanced  very cultured and superior feminine people  but yet here and there will arise one  still more advanced and cultured  who loves the old fashioned weapon  and speaks of it as a sacred implement of life  Lucy followed first one opinion and then another  She had half a dozen pieces of work about  begun under the influence of one class of her friends  abandoned under that of another  She had a little studio  too  where she painted and carved  and executed various of the humbler decorative arts  which  perhaps  to tell the truth  she enjoyed more than art proper  but these details of the young lady   s life may be left to show themselves where there is no need of such vanities  Lucy was  at all events  whatever her other qualities might be  a most enthusiastic friend  12  ']

In [5]:
'''
按空格把句子分割为单词
'''

regex = re.compile('\s+')
words = regex.split(article[0])

In [6]:
'''
对单词进行计数
'''

counts = defaultdict(lambda: 0)  # 使用lambda来定义简单的函数

for word in words:
    counts[word] += 1

In [7]:
'''
生成结果数据框
'''

result = pd.DataFrame({'word':list(counts.keys()), 'value':list(counts.values())})
result.sort_values('value', ascending=False)


Out[7]:
value word
190 27 the
162 23 of
55 14 and
67 11 a
31 9 to
214 8 was
6 7 but
34 5 not
24 5 for
83 5 one
43 5 She
220 4 at
47 4 which
221 4 that
84 4 had
58 4 with
70 4 in
46 3 or
157 3 something
2 3 drawing
167 3 than
108 3 is
222 3 outside
21 3 there
120 3 very
92 3 Lucy
117 3 more
54 3 be
197 3 them
199 3 her
... ... ...
38 1 vast
134 1 opinion
37 1 tea
136 1 loves
137 1 wool
138 1 class
139 1 highly
140 1 will
141 1 no
124 1 vanities
123 1 number
41 1 perhaps
1 1 friends
105 1 qualities
106 1 weapon
107 1 old
48 1 themselves
109 1 studio
110 1 warm
111 1 everything
113 1 has
121 1 arts
114 1 strong
115 1 need
45 1 waited
44 1 modern
118 1 influence
119 1 feminine
42 1 sacred
224 1 were

225 rows × 2 columns