Mining Ulysses

In [ ]:

In [18]:
# Swap every punctuation character for a space, then break the text into
# words on whitespace.
clean_s = removeDelimiter(s, " ", list('.,;_-:!?")('))
wordlist = clean_s.split()

In [19]:
# Map each word in wordlist to the number of times it occurs.
dictionary = {}
for word in wordlist:
	# dict.get supplies 0 the first time a word is seen.
	dictionary[word] = dictionary.get(word, 0) + 1

In [26]:
import operator

# Order the (word, count) pairs by count, ascending. Despite its name,
# sorted_dict is a list of tuples, not a dict.
sorted_dict = list(dictionary.items())
sorted_dict.sort(key=operator.itemgetter(1))

An alternative in Python 3, which needs no extra import, is to use a lambda as the sort key:

In [ ]:
# Sort the (word, count) pairs by their second element, the count.
sorted(dictionary.items(), key=lambda pair: pair[1])

In [67]:
#Much more interesting are the uncommon words

# Collect the words that occur exactly once in the text.
infreq_metric = []
for ordered_words,value_words in sorted_dict:
    if value_words == 1:
        infreq_metric.append(ordered_words)
for word in infreq_metric:
    print(word)


In [64]:
#the very common words
# Collect (word, count) pairs for the words that occur more than 944 times.
# sorted_dict is ordered by count, so the report runs from least to most common.
freq_metric = []
for ordered_words,value_words in sorted_dict:
    if value_words > 944:
        freq_metric.append((ordered_words, value_words))
for pair in freq_metric:
    print('The word "{}" appears {} times'.format(pair[0],pair[1]))

The word "they" appears 1022 times
The word "or" appears 1036 times
The word "from" appears 1103 times
The word "she" appears 1134 times
The word "as" appears 1197 times
The word "said" appears 1208 times
The word "by" appears 1291 times
The word "at" appears 1305 times
The word "all" appears 1339 times
The word "is" appears 1461 times
The word "him" appears 1526 times
The word "her" appears 1786 times
The word "you" appears 1962 times
The word "for" appears 1963 times
The word "on" appears 2124 times
The word "was" appears 2134 times
The word "it" appears 2372 times
The word "with" appears 2562 times
The word "that" appears 2621 times
The word "i" appears 2708 times
The word "his" appears 3333 times
The word "he" appears 4034 times
The word "in" appears 5003 times
The word "to" appears 5043 times
The word "a" appears 6581 times
The word "and" appears 7285 times
The word "of" appears 8260 times
The word "the" appears 15128 times

In [62]:


These words appear often (more than 200 times each); the very common words listed above are excluded.

In [51]:
# Collect the counts of the words in the middle band: more than 200
# occurrences, up to and including 944. The upper bound is inclusive so that a
# word with exactly 944 occurrences is not dropped from both this band and the
# "> 944" band of very common words.
freq_metric = []
for ordered_words,value_words in sorted_dict:
    if 200 < value_words <= 944:
        freq_metric.append(value_words)
        print(ordered_words, value_words)

let 201
j 202
mrs 203
been 205
put 211
can 215
god 215
name 218
before 219
sir 219
because 220
head 222
himself 222
very 227
any 228
under 230
must 231
night 232
go 235
face 235
long 237
round 239
right 240
again 245
just 249
through 250
day 250
never 253
us 253
only 255
get 256
say 265
come 271
well 275
how 277
father 277
first 278
way 278
here 280
little 290
has 291
street 293
our 295
its 305
hand 308
could 312
where 313
more 315
some 316
those 317
good 321
other 326
know 328
eyes 329
into 330
will 355
yes 360
back 362
off 371
o 371
time 377
would 385
two 389
did 403
man 415
after 428
we 434
see 435
too 441
now 441
over 443
down 452
do 454
says 473
who 488
old 492
this 493
your 496
stephen 505
were 510
which 525
about 542
are 552
when 555
if 564
then 579
so 618
an 659
them 672
no 691
have 699
but 702
there 706
mr 719
their 720
like 731
one 740
had 814
up 830
my 838
what 898
be 898
out 899
not 914
bloom 933

In [52]:
%matplotlib inline
# Coerce every entry collected in freq_metric to an int.
# NOTE(review): the recorded output of this cell is a matplotlib Line2D, so a
# plotting call (and the matplotlib import) appears to be missing here.
numbs = [int(x) for x in freq_metric]

[<matplotlib.lines.Line2D at 0x10c97ed30>]

In [ ]:
# One line per distinct word: zero-padded count, " # ", then the word.
l = [makeSortable(str(dictionary[k])) + " # " + k for k in dictionary.keys()]

# Thanks to the zero padding, plain string sorting lists the words from the
# least to the most frequent (for counts of up to five digits).
for w in sorted(l):
	print(w)

# count maps an occurrence count to the number of distinct words having it.
count = {}
for k in dictionary.keys():
	count[dictionary[k]] = count.get(dictionary[k], 0) + 1
for k in sorted(count.keys()):
	print(str(count[k]) + " words appear " + str(k) + " times")

In [ ]:
# %load

# this code is licenced under creative commons licence as long as you 
# cite the author: Rene Pickhardt / 

# adds leading zeros to a string so all result strings can be ordered
def makeSortable(w, width=5):
	"""Left-pad the string w with "0" characters up to width characters.

	Numbers rendered as strings and padded this way sort lexicographically in
	numeric order, as long as none of them has more than width digits.

	Parameters:
		w: the string to pad (typically str() of a count).
		width: minimum length of the result; defaults to 5.

	Returns:
		w prefixed with enough zeros to reach width characters, or w
		unchanged if it is already at least that long.
	"""
	# rjust (not zfill) keeps the padding strictly in front of the whole
	# string, even when w starts with a sign character.
	return w.rjust(width, "0")

#replaces all kind of structures passed in l in a text s with the 2nd argument
def removeDelimiter(s,new,l):
	"""Return s with every occurrence of each string in l replaced by new.

	The replacements are applied one after another, in the order given by l.
	"""
	cleaned = s
	for delimiter in l:
		cleaned = cleaned.replace(delimiter, new)
	return cleaned

def analyzeWords(s):
	"""Print word-frequency statistics for the text s.

	Punctuation characters are replaced by spaces and the text is split on
	whitespace. Two reports are then printed:

	1. one line per distinct word, formatted "<zero-padded count> # <word>",
	   in ascending string order;
	2. for every occurrence count, how many distinct words have that count.

	Returns None.
	"""
	s = removeDelimiter(s," ",[".",",",";","_","-",":","!","?","\"",")","("])
	wordlist = s.split()

	# Tally how often each word occurs.
	dictionary = {}
	for word in wordlist:
		dictionary[word] = dictionary.get(word, 0) + 1

	# The zero padding lets a plain string sort order the lines by count.
	l = [makeSortable(str(dictionary[k])) + " # " + k for k in dictionary.keys()]
	for w in sorted(l):
		print(w)

	# count maps an occurrence count to the number of distinct words having it.
	count = {}
	for k in dictionary.keys():
		count[dictionary[k]] = count.get(dictionary[k], 0) + 1
	for k in sorted(count.keys()):
		print(str(count[k]) + " words appear " + str(k) + " times")

def differentWords(s):
	"""Print and return the number of distinct words in the text s.

	Punctuation characters are replaced by spaces and the text is split on
	whitespace. The average number of uses per distinct word is printed as
	well, unless the text contains no words at all.

	Returns:
		The number of distinct words (0 for a text without words).
	"""
	s = removeDelimiter(s," ",[".",",",";","_","-",":","!","?","\"",")","("])
	wordlist = s.split()
	count = len(set(wordlist))
	print(str(count) + " different words")
	# With no words there is no average to report (and nothing to divide by).
	if count:
		print("every word was used " + str(float(len(wordlist))/float(count)) + " times on average")
	return count

def analyzeSentences(s):
	"""Print sentence statistics for the text s.

	The characters . ; : ! ? are all treated as sentence terminators. Prints
	the number of words, the number of sentences and the words-per-sentence
	ratio, followed by the longest sentence and its length in characters.

	Returns None.
	"""
	s = removeDelimiter(s,".",[".",";",":","!","?"])
	# Splitting on "." leaves empty or whitespace-only pieces after a trailing
	# terminator and between consecutive terminators; those are not sentences
	# and must not inflate the sentence count.
	sentenceList = [piece for piece in s.split(".") if piece.strip()]
	wordList = s.split()
	wordCount = len(wordList)
	sentenceCount = len(sentenceList)
	if sentenceCount == 0:
		# Nothing to average over and no longest sentence to show.
		print(str(wordCount) + " words in 0 sentences")
		return
	print(str(wordCount) + " words in " + str(sentenceCount) + " sentences ==> " + str(float(wordCount)/float(sentenceCount)) + " words per sentence")

	# max() with key=len returns the first of several equally long sentences.
	satz = max(sentenceList, key=len)
	print(satz + "laenge " + str(len(satz)))

texts = ["ulysses.txt"]
for text in texts:
	datei = open(text,'r')
	s =