Sebastian Raschka
last updated: 05/09/2014
I would be happy to hear your comments and suggestions.
Please feel free to drop me a note via
twitter, email, or google+.
In [1]:
from collections import defaultdict
from collections import Counter
def add_element_check1(elements):
"""if ele not in dict (v1)"""
d = dict()
for e in elements:
if e not in d:
d[e] = 1
else:
d[e] += 1
return d
def add_element_check2(elements):
"""if ele not in dict (v2)"""
d = dict()
for e in elements:
if e not in d:
d[e] = 0
d[e] += 1
return d
def add_element_except(elements):
"""try-except"""
d = dict()
for e in elements:
try:
d[e] += 1
except KeyError:
d[e] = 1
return d
def add_element_defaultdict(elements):
"""defaultdict"""
d = defaultdict(int)
for e in elements:
d[e] += 1
return d
def add_element_get(elements):
""".get() method"""
d = dict()
for e in elements:
d[e] = d.get(e, 1) + 1
return d
def counter_object(elements):
"""Counter object"""
return Counter(elements)
The first data set will be Martin Luther King's popular "I have a dream" speech for which we want to count the occurences of the individual words. The data file can be downloaded from here.
In [2]:
def read_speech(datafile):
"""
Reads in a text file and returns individual words
in lowercase and stripped from digits and punctuation
"""
def strip_word(word):
out = []
for char in word:
if char.isalpha():
out.append(char.lower())
return "".join(out)
with open(datafile,'r') as infile:
for line in infile:
line = line.strip()
if line:
for word in line.split():
yield strip_word(word)
speech_file = '../data/day3_dictionary_counting/i_have_a_dream_speech.txt'
speech = list(read_speech(speech_file))
Secondly, we will cound the individual nucleotides ("A", "G", "C", or "T") in a randomly generated DNA string. The file can be found here.
In [3]:
def read_dna(datafile):
with open(datafile,'r') as infile:
return list(infile.read())
dna_file = '../data/day3_dictionary_counting/random_dna.txt'
dna = read_dna(dna_file)
In [4]:
import timeit
from functools import reduce
funcs = ['add_element_check1', 'add_element_check2',
'add_element_except', 'add_element_defaultdict',
'add_element_get', 'counter_object']
times_n = {f:[] for f in funcs}
for d in [speech, dna]:
for f in funcs:
times_n[f].append(min(timeit.Timer('%s(d)' %f,
'from __main__ import %s, d' %f)
.repeat(repeat=3, number=1000)))
In [18]:
import platform
import multiprocessing
def print_sysinfo():
print('\nPython version:', platform.python_version())
print('compiler:', platform.python_compiler())
print('\nsystem :', platform.system())
print('release :', platform.release())
print('machine :', platform.machine())
print('processor :', platform.processor())
print('interpreter:', platform.architecture()[0])
print('CPU count :', multiprocessing.cpu_count())
print('\n\n')
In [19]:
%matplotlib inline
In [20]:
labels = [('add_element_check1', 'if-else statements (v1)'),
('add_element_check2', 'if-else statements (v2)'),
('add_element_except', 'try-except blocks'),
('add_element_defaultdict', 'using collections.defaultdict'),
('add_element_get', 'using the .get() method'),
('counter_object', 'using collections.Counter'),
]
In [28]:
from numpy import arange
import matplotlib.pyplot as plt
def plot_timings():
plt.rcParams.update({'font.size': 12})
ind = arange(2) # the x locations for the groups
width = 0.15
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
colors = [(0,'b'), (1,'c'), (2,'g'), (3,'r'), (4,'y'), (5, 'm')]
for l,c in zip(labels,colors):
ax.bar(ind + c[0]*width,
times_n[l[0]],
width,
alpha=0.5,
color=c[1],
label=l[1])
ax.set_ylabel('time in milliseconds')
ax.set_title('Methods for counting elements in a dataset using a dictionary')
ax.set_xticks(ind + width)
ax.set_xticklabels(['Luther speech', 'DNA string'])
ax.set_xlim(-0.1,2)
ax.set_ylim(0,1.5)
plt.legend(loc='upper left')
plt.show()
In [29]:
import prettytable
def count_unique(input_data):
unique = 0
count_dict = add_element_check1(input_data)
for count in count_dict.values():
if count == 1:
unique += 1
different_ele = len(count_dict)
unique_ele = round(unique / different_ele * 100, 2)
return (different_ele, unique_ele)
def summary_table():
speech_different, speech_unique = count_unique(speech)
dna_different, dna_unique = count_unique(dna)
fit_table = prettytable.PrettyTable(["", "total elements" ,
"different elements",
"unique elements (%)"])
fit_table.add_row(["DNA string", len(dna), dna_different, dna_unique])
fit_table.add_row(["Luther speech", len(speech), speech_different, speech_unique])
print(fit_table)
In [30]:
print_sysinfo()
summary_table()
plot_timings()
The bar plot indicates that the performance of the different approaches really depends on how many duplicates we have in our data set. But this can be easily explained:
Here, expect that the try-except loop would not perform very well, since an error is raised (and eventually the except-block executed) if we encounter an element that is not already in the dictionary.
The try-except method performs much better if we have a high number of duplicates, because the sumation in the try-block succeeds if we are encountering elements that already exist as keys in the dictionary - the except-block can be skipped in this case.
Anyway, for a real application, I would use the Counter
function from the collection
module, it is the fastest approach in any case.
In [ ]: