In [51]:
import os
def go_for_a_walk(d):
    """Recursively print the full path of every file under directory `d`.

    Walks the tree top-down with os.walk; only files are printed,
    directories themselves are not.
    """
    for root, dirs, files in os.walk(d, topdown=True):
        for name in files:
            # Single-argument print(...) behaves identically on Python 2
            # and 3; the original `print expr` statement is a Python 3
            # syntax error.
            print(os.path.join(root, name))
# Demo: list every file under the current working directory.
go_for_a_walk(os.getcwd())
In [57]:
# Show the exercise script's source before running it below.
!cat exercise14-2.py
In [58]:
# Create a small input file, run the script with args
# (pattern 'may', replacement 'does', input file_in, output file_out),
# then display the result.
!echo 'This is a testing string that may work' > file_in
!python exercise14-2.py may does file_in file_out
!cat file_out
In [43]:
import anagram_sets
import shelve
def store_anagrams(filename):
    """Build the anagram mapping from words.txt and persist it in a shelf.

    filename: path of the shelve database ('c' flag creates it if missing).
    """
    shelf = shelve.open(filename, 'c')
    try:
        d = anagram_sets.all_anagrams('words.txt')
        # .items() works on both Python 2 and 3; the original
        # .iteritems() raises AttributeError on Python 3.
        for word, word_list in d.items():
            shelf[word] = word_list
    finally:
        # Close the shelf even if all_anagrams or a write fails,
        # so the database is flushed/released.
        shelf.close()
def read_anagrams(filename, word):
    """Look up the anagram list of `word` in a shelf built by store_anagrams.

    Returns the stored list, or [] when no entry exists for the word's
    signature.
    """
    shelf = shelve.open(filename)
    try:
        # Bug fix: bare `signature` is an undefined name here -- only
        # `import anagram_sets` is in scope, so the helper must be
        # qualified with the module name.
        sig = anagram_sets.signature(word)
        try:
            return shelf[sig]
        except KeyError:
            return []
    finally:
        # The original never closed the shelf; always release the handle.
        shelf.close()
In [44]:
# One-time build of the anagram database from words.txt.
store_anagrams('anagrams.db')
In [50]:
# Query the database for anagrams of 'cat'.
read_anagrams('anagrams.db', 'cat')
Out[50]:
In [61]:
import os
def walk(workdir):
    """Return a list of the full paths of all files under `workdir`.

    Walks the tree top-down with os.walk; directories are traversed
    but not included in the returned list.
    """
    list_all_files = []
    for root, dirs, files in os.walk(workdir, topdown=True):
        for name in files:
            list_all_files.append(os.path.join(root, name))
    return list_all_files
def make_md5sum(list_files, suffix):
    """Map md5 hex digest -> list of the files in `list_files` ending in `suffix`.

    Files that do not end with `suffix` are skipped.  The original shelled
    out to the macOS-only `md5` command via os.popen with an unquoted
    filename, which breaks on paths containing spaces or shell
    metacharacters and silently produces a bogus key when the command
    fails; hashlib computes the same digest portably and safely.
    I/O errors on unreadable files propagate to the caller.
    """
    import hashlib

    d_md5sum = {}
    for each_file in list_files:
        if not each_file.endswith(suffix):
            continue
        digest = hashlib.md5()
        # Read in fixed-size chunks so large files need not fit in memory.
        with open(each_file, 'rb') as fp:
            while True:
                chunk = fp.read(65536)
                if not chunk:
                    break
                digest.update(chunk)
        d_md5sum.setdefault(digest.hexdigest(), []).append(each_file)
    return d_md5sum
def find_duplicates(dict_md5sum):
    """Print every md5sum in `dict_md5sum` that maps to more than one file.

    dict_md5sum: mapping of md5 hex digest -> list of file paths,
    as produced by make_md5sum.  Entries with a single file are silent.
    """
    for each_md5sum in dict_md5sum:
        files = dict_md5sum[each_md5sum]
        if len(files) > 1:
            # Single-argument print(...) behaves identically on Python 2
            # and 3; the original `print expr` statement is a Python 3
            # syntax error.
            print('Dups found with md5sum "' + each_md5sum + '":')
            for each_file in files:
                print('\t' + each_file)
In [66]:
# End-to-end check: create a file and an identical copy, then verify the
# walk -> md5 -> duplicate-report pipeline flags the pair.
!echo testing > testing.txt
!cp testing.txt testing_dup.txt
all_files = walk(os.getcwd())
dict_md5sum = make_md5sum(all_files, '.txt')
find_duplicates(dict_md5sum)