模块测试


In [2]:
import string

# Demo: str.strip(string.punctuation) removes punctuation characters only
# from the two ENDS of a word; punctuation inside the word (hyphens,
# slashes, ...) is preserved.  Note 'cj!/n' is unchanged because its first
# and last characters are letters.
# (Renamed the original variable `list`, which shadowed the builtin.)
words = [',,a,', 'b!', 'cj!/n']
item = []
for word in words:
    # Strip any leading/trailing punctuation from this word.
    item.append(word.strip(string.punctuation))
print(item)


['a', 'b', 'cj!/n']

In [11]:
import operator
dict_ = {'name1':'2','name2':'1','name3':'2'}
print(sorted(dict_.items(),key=operator.itemgetter(0,1),reverse=False))#排序


[('name1', '2'), ('name2', '1'), ('name3', '2')]

n-gram


In [143]:
from urllib.request import urlopen

import re
import string
import operator

def isCommon(ngram):
    """Return True if any word in `ngram` (a list of words) is a common
    English stop word, False otherwise.

    Used by ngrams() to drop n-grams containing filler words from the
    frequency counts.  Returns False for an empty list.
    """
    # Frequent English function words.  The original list contained
    # duplicates ("that", "as", "more") — removed here; a set also gives
    # O(1) membership tests instead of scanning a ~100-element list.
    commonWords = {
        "the", "be", "and", "of", "a", "in", "to", "have", "it",
        "i", "that", "for", "you", "he", "with", "on", "do", "say", "this",
        "they", "is", "an", "at", "but", "we", "his", "from", "not",
        "by", "she", "or", "as", "what", "go", "their", "can", "who", "get",
        "if", "would", "her", "all", "my", "make", "about", "know", "will",
        "up", "one", "time", "has", "been", "there", "year", "so",
        "think", "when", "which", "them", "some", "me", "people", "take",
        "out", "into", "just", "see", "him", "your", "come", "could", "now",
        "than", "like", "other", "how", "then", "its", "our", "two", "more",
        "these", "want", "way", "look", "first", "also", "new", "because",
        "day", "use", "no", "man", "find", "here", "thing", "give",
        # "laughter", "applause",
        "many", "well", "said", "was", "are", "were", "had",
    }
    return any(word in commonWords for word in ngram)




def cleanInput(input):
    """Normalize raw text into a list of cleaned, lower-case words.

    Steps: collapse newlines and runs of spaces, remove footnote markers
    like "[12]", drop non-ASCII characters, lower-case, split on spaces,
    and strip punctuation from both ends of each word.  One-character
    tokens are discarded unless they are the words 'a' or 'i'.

    Note: the parameter shadows the builtin `input`; the name is kept so
    existing callers (including keyword calls) keep working.
    """
    # Raw strings for the regex patterns — '\[' in a plain string is an
    # invalid escape sequence (DeprecationWarning in modern Python).
    input = re.sub(r'\n+', " ", input).lower()
    # Remove Wikipedia-style citation markers such as "[3]".
    input = re.sub(r'\[[0-9]*\]', "", input)
    input = re.sub(r' +', " ", input)
    # Round-trip through bytes to silently drop non-ASCII characters.
    input = bytes(input, "UTF-8")
    input = input.decode("ascii", "ignore")

    # Renamed from `cleanInput`, which shadowed the function itself.
    cleaned = []
    for item in input.split(' '):
        item = item.strip(string.punctuation)
        # Keep words of 2+ characters plus the one-letter words 'a'/'i'
        # (the text is already lower-cased above).
        if len(item) > 1 or item in ('a', 'i'):
            cleaned.append(item)
    return cleaned

def ngrams(input, n):
    """Count the n-grams of `input` text.

    Returns a tuple (counts, total_words): `counts` maps each n-gram
    (its words joined by single spaces) to its frequency, and
    `total_words` is the number of cleaned words.  N-grams containing
    any common stop word (per isCommon) are skipped.  Also prints the
    total word count as a side effect.
    """
    words = cleanInput(input)
    print("Total words: %d" % len(words))
    output = {}
    for i in range(len(words) - n + 1):
        ngramTemp = " ".join(words[i:i + n])
        # Original used `if isCommon(...): pass / else: count` —
        # inverted here to the direct form.
        if not isCommon(ngramTemp.split()):
            output[ngramTemp] = output.get(ngramTemp, 0) + 1
    return (output, len(words))

#content = str(
#    urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(),
#    'utf-8')

In [149]:
# Top-2 unigram counts for each year's dinner speech (2013-2016).
for i in range(4):
    year = 2013 + i
    print(year)
    # Context manager closes the file handle promptly; the original
    # open(...).read() relied on garbage collection to close it.
    with open("Obama's dinner speech %d.txt" % year) as f:
        content = f.read()
    (ngram, length) = ngrams(content, 1)
    sortedNGrams = sorted(ngram.items(), key=operator.itemgetter(1), reverse=True)
    ngram.clear()
    for top2 in range(2):
        # word, its count, and roughly one occurrence per `length // count` words
        print(sortedNGrams[top2][0], sortedNGrams[top2][1], length // sortedNGrams[top2][1])


2013
Total words: 2232
laughter 61 36
applause 26 85
2014
Total words: 2096
laughter 60 34
applause 22 95
2015
Total words: 2302
laughter 79 29
applause 32 71
2016
Total words: 2809
laughter 99 28
applause 52 54

In [136]:
# Top bigrams (count > 2) for each year's dinner speech (2013-2016).
for i in range(4):
    year = 2013 + i
    print(year)
    # Context manager closes the file handle promptly.
    with open("Obama's dinner speech %d.txt" % year) as f:
        content = f.read()
    # BUG FIX: ngrams() returns a (counts, total_words) tuple, not a dict.
    # The original assigned the tuple to `ngram` and then called
    # ngram.items(), which raises AttributeError; the rendered output
    # came from a stale kernel state with an older ngrams() definition.
    (ngram, length) = ngrams(content, 2)
    sortedNGrams = sorted(ngram.items(), key=operator.itemgetter(1), reverse=True)
    ngram.clear()
    # Print up to the 20 most frequent bigrams occurring more than twice.
    # Slicing (rather than range(20) indexing) avoids an IndexError when
    # fewer than 20 distinct bigrams exist.
    for entry in sortedNGrams[:20]:
        if entry[1] > 2:
            print(entry)


2013
Total words: 2232
('white house', 4)
('house correspondents', 3)
('charm offensive', 3)
('groucho marx', 3)
2014
Total words: 2096
('white house', 4)
('house correspondents', 4)
('correspondents association', 4)
2015
Total words: 2302
('white house', 5)
('ted cruz', 5)
('house correspondents', 4)
('jeb bush', 3)
('weve got', 3)
('anger translator', 3)
2016
Total words: 2809
('white house', 7)
('free press', 5)
('little bit', 4)
('house correspondents', 3)
('correspondents dinner', 3)
('press corps', 3)