Walking the directory tree


In [17]:
import os

def dir_structure (path=None, decorated=False):
    """ read out the full recursive directory structure of `path` and return it as a tuple
    
    path - the path where to start the walk (default: `.`); trailing "/" characters are
            stripped, except that a path made up only of "/" is kept unchanged
    decorated - if True, every entry of `files` also carries the directory name,
            ie it becomes (filename, dir_ix, dirname)
    
    RETURNS
        (dirs, files, tree)
        
        dirs - tuple of directory paths, in the order traversed by os.walk
        files - tuple of tuples (filename, dir_ix) where dir_ix is an index into dirs
        tree - tuple of tuples (dir_ix1, dir_ix2, ...), one per entry of dirs, holding
                the indices of the immediate subdirectories of that directory
    
    NOTE
        os.walk reports symlinks to directories among the subdirectory names but does
        not descend into them by default, so they never appear in `dirs`; such entries
        are left out of `tree` rather than raising an error.
    """

    if path is None: path = "."
    # keep e.g. "/" intact: stripping it to "" would make os.walk yield nothing
    path = path.rstrip("/") or path

    walked = list(os.walk(path))

    dirs = [dirpath for dirpath, _, _ in walked]
    files = [(fname, ix) for ix, (_, _, fnames) in enumerate(walked) for fname in fnames]

    # path -> position in dirs; replaces a linear dirs.index() search per subdirectory
    dir_index = {dirpath: ix for ix, dirpath in enumerate(dirs)}

    tree = []
    for dirpath, subdirs, _ in walked:
        # os.walk builds child paths with os.path.join, so the same call reproduces them
        children = (os.path.join(dirpath, sub) for sub in subdirs)
        # children that were never walked (symlinked directories) have no index
        tree.append(tuple(dir_index[child] for child in children if child in dir_index))
    
    if decorated:
        files = [(fname, ix, dirs[ix]) for fname, ix in files]
    
    return (tuple(dirs), tuple(files), tuple(tree))

In [26]:
"123/".rstrip("/")


Out[26]:
'123'

In [22]:
dd = dir_structure (decorated=True)
##dd

In [25]:
1

In [182]:
import os
import pandas as pd
import functools
import re
import types

class DirStructure ():
    """ read out the full recursive directory structure of `path` into the object
    
    path - the pathname given to os.walk() (default: `.`); it can also be a tuple
            `(dirs, files, tree)` in which case this tuple is used to initialise the object
    
    PROPERTIES
    
        dirs - tuple of dirs, in order traversed by walk
        files - tuple of tuples (filename, dir_ix) where dir_ix is index in dirs
        tree - tuple of tuples (dir_ix1, dir_ix2, ...) where dir_ix1... are subdirs
        
        dirs_t, files_t - the corresponding pandas dataframe tables
        
    METHODS
        
        subdirs - get all subdirs
        files_bydir - get all files in set of dirs
        files_decorate - add the full directory names to a files table
        files_byre - filter files table by regular expression
        add_col - adds a column to the files_t table (values either explicit or as function)
        re_func - factory function for use in relation to add_col
    
    DEPENDENCIES
    
        os
        pandas
        re
        
    VERSION AND COPYRIGHT
        
        version 0.1a
        (c) 2014 Stefan LOESCH / oditorium
    
    """
    
    __version__ = "0.1a"
    
    def __init__(self, path=None):
        
        if isinstance(path, tuple):
            # pre-computed structure: unpack it instead of walking the file system
            dirs, files, tree = path
        else:
            dirs, files, tree = self._walk(path)
        
        self.dirs = tuple(dirs)
        self.files = tuple(files)
        self.tree = tuple(tree)
        
        self.dirs_t = pd.DataFrame(list(self.dirs), columns = ['dir'])
        self.files_t = pd.DataFrame(list(self.files), columns = ['file', 'dir'])

    @staticmethod
    def _walk (path):
        """ walks `path` with os.walk and returns the lists (dirs, files, tree) (private)
        
        path - the start of the walk (None means `.`); trailing "/" are stripped
        """
        
        if path is None: path = "."
        # keep e.g. "/" intact: stripping it to "" would make os.walk yield nothing
        path = path.rstrip("/") or path
        
        walked = list(os.walk(path))
        dirs = [dirpath for dirpath, _, _ in walked]
        files = [(fname, ix) for ix, (_, _, fnames) in enumerate(walked) for fname in fnames]
        
        # path -> position in dirs; replaces a linear dirs.index() search per subdirectory
        dir_index = {dirpath: ix for ix, dirpath in enumerate(dirs)}
        
        tree = []
        for dirpath, subdirs, _ in walked:
            # os.walk builds child paths with os.path.join, so the same call reproduces them
            children = (os.path.join(dirpath, sub) for sub in subdirs)
            # symlinked directories are listed by os.walk but not walked, hence have no index
            tree.append(tuple(dir_index[child] for child in children if child in dir_index))
        
        return dirs, files, tree

    def files_bydir (self, dirs=None, files_t=None):
        """filters files_t with respect to all directories in dirs
        
        dirs - iterable of directory indices, or a single index (default: 0, ie the root)
        files_t - a pandas table with column `dir` (default: self.files_t)
        
        returns a new table holding the rows whose `dir` value is in dirs
        """
    
        if files_t is None: files_t = self.files_t
        if dirs is None: dirs = (0,)
        if isinstance(dirs, int): dirs = (dirs,)
        
        # .copy() so that callers can add columns to the result without pandas warnings
        return files_t[files_t['dir'].isin(list(dirs))].copy()
            
    def _subdirs (self, dir_ix):
        """ returns list of all subdirectories (private)
        
        use subdirs() to access this function
        
        the direct children come first, followed by the descendants of each child in turn
        """
        
        children = list(self.tree[dir_ix])
        descendants = []
        for child_ix in children:
            descendants.extend(self._subdirs(child_ix))
        return children + descendants
    

    def subdirs(self, dir_ix, as_str=False, get_files=False):
        """get all subdirs of the given directory (as index or name), or the files therein
        
        dir_ix - the directory index of the root
        as_str - if True, directory indices are expanded into names
        get_files - if True, return the table of files in the directory and all its
                        subdirectories rather than the list of subdirectories
        """
        
        sd = self._subdirs(dir_ix)
        
        if get_files:
            return self.files_bydir([dir_ix] + sd)
        
        if as_str:
            sd = [self.dirs[ix] for ix in sd]
        return sd
   
    def files_decorate(self, files_t=None):
        """add a column `dirn` to a files table (based on column `dir`)
        
        files_t - must have a column `dir` containing directory indices (default: self.files_t)
        
        the table is modified in place and also returned
        """
        
        if files_t is None: files_t = self.files_t
        files_t['dirn'] = [self.dirs[ix] for ix in files_t['dir']]
        return files_t
        
    def files_byre(self, regex, files_t=None):
        """filter the files table by regex
        
        regex - pattern handed to re.match(), ie it is anchored at the start of the filename
        files_t - must have a column `file` containing the filename (default: self.files_t)
        
        returns a new table holding the rows whose filename matches
        """
        
        if files_t is None: files_t = self.files_t
        hits = [re.match(regex, fn) is not None for fn in files_t['file']]
        # an index-aligned boolean Series also works for a table without rows
        mask = pd.Series(hits, index=files_t.index, dtype=bool)
        return files_t[mask].copy()
    
    def add_col(self, heading, values):
        """adds a column to the files_t table (note: the files property is _not_ kept in sync)
        
        heading - the col heading
        values - the col values (must have the right number of entries); can also be any
                    callable f(filename), in which case it is applied to every entry of
                    the `file` column
        
        returns the (modified) files_t table
        """
        
        if callable(values):
            values = [values(fn) for fn in self.files_t['file']]
        self.files_t[heading] = values
        return self.files_t
    
    def re_func(self, regex, none_val=None):
        r"""factory function: returns a function that evaluates regex on its argument and returns first group or none_val
        
        regex - pattern handed to re.match(); it must contain at least one group
        none_val - the value returned when the pattern does not match
        
        EXAMPLE
        
            f = ds.re_func("(.*)\.jpg$", "0")
            f("test.jpg") -> "test"
            f("test.JPG") -> "0"
        """
        
        def f(s):
            
            m = re.match(regex, s)
            if m is None: return none_val
            return m.groups()[0]
        
        return f

read the directory structure


In [203]:
ds = DirStructure('./delme')

the properties

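Three tuples: the directories, the files (each paired with the index of its directory), and the tree (for every directory, the indices of its immediate subdirectories).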


In [204]:
ds.dirs


Out[204]:
('./delme',
 './delme/delme1',
 './delme/delme1/delme12',
 './delme/delme1/delme11',
 './delme/delme2')

In [205]:
ds.files


Out[205]:
(('f.jpg', 0),
 ('f1a.jpg', 1),
 ('f1b.jpg', 1),
 ('f12a.jpg', 2),
 ('f11a.jpg', 3),
 ('f2b.jpg', 4),
 ('f2a.jpg', 4))

In [206]:
ds.tree


Out[206]:
((1, 4), (2, 3), (), (), ())

directories and files as pandas dataframes


In [207]:
ds.dirs_t


Out[207]:
dir
0 ./delme
1 ./delme/delme1
2 ./delme/delme1/delme12
3 ./delme/delme1/delme11
4 ./delme/delme2

5 rows × 1 columns


In [208]:
ds.files_t


Out[208]:
file dir
0 f.jpg 0
1 f1a.jpg 1
2 f1b.jpg 1
3 f12a.jpg 2
4 f11a.jpg 3
5 f2b.jpg 4
6 f2a.jpg 4

7 rows × 2 columns


In [209]:
ds.add_col('f2', list(ds.files_t['file']))


Out[209]:
file dir f2
0 f.jpg 0 f.jpg
1 f1a.jpg 1 f1a.jpg
2 f1b.jpg 1 f1b.jpg
3 f12a.jpg 2 f12a.jpg
4 f11a.jpg 3 f11a.jpg
5 f2b.jpg 4 f2b.jpg
6 f2a.jpg 4 f2a.jpg

7 rows × 3 columns


In [210]:
ds.add_col('f3', lambda f: "prefix-"+f)


Out[210]:
file dir f2 f3
0 f.jpg 0 f.jpg prefix-f.jpg
1 f1a.jpg 1 f1a.jpg prefix-f1a.jpg
2 f1b.jpg 1 f1b.jpg prefix-f1b.jpg
3 f12a.jpg 2 f12a.jpg prefix-f12a.jpg
4 f11a.jpg 3 f11a.jpg prefix-f11a.jpg
5 f2b.jpg 4 f2b.jpg prefix-f2b.jpg
6 f2a.jpg 4 f2a.jpg prefix-f2a.jpg

7 rows × 4 columns


In [211]:
def ab(fn):
    """Return the first "a" or "b" occurring in `fn`, or "0" if it contains neither."""
    found = re.match("[^ab]*(a|b)[^ab]*", fn)
    return "0" if found is None else found.groups()[0]
ds.add_col('ab', ab)


Out[211]:
file dir f2 f3 ab
0 f.jpg 0 f.jpg prefix-f.jpg 0
1 f1a.jpg 1 f1a.jpg prefix-f1a.jpg a
2 f1b.jpg 1 f1b.jpg prefix-f1b.jpg b
3 f12a.jpg 2 f12a.jpg prefix-f12a.jpg a
4 f11a.jpg 3 f11a.jpg prefix-f11a.jpg a
5 f2b.jpg 4 f2b.jpg prefix-f2b.jpg b
6 f2a.jpg 4 f2a.jpg prefix-f2a.jpg a

7 rows × 5 columns


In [212]:
f = ds.re_func("[^ab]*(a|b)[^ab]*", "-")
f("xxxx")


Out[212]:
'-'

In [213]:
f("xxaxx")


Out[213]:
'a'

In [214]:
ds.add_col('ab2', ds.re_func("[^ab]*(a|b)[^ab]*", "-"))


Out[214]:
file dir f2 f3 ab ab2
0 f.jpg 0 f.jpg prefix-f.jpg 0 -
1 f1a.jpg 1 f1a.jpg prefix-f1a.jpg a a
2 f1b.jpg 1 f1b.jpg prefix-f1b.jpg b b
3 f12a.jpg 2 f12a.jpg prefix-f12a.jpg a a
4 f11a.jpg 3 f11a.jpg prefix-f11a.jpg a a
5 f2b.jpg 4 f2b.jpg prefix-f2b.jpg b b
6 f2a.jpg 4 f2a.jpg prefix-f2a.jpg a a

7 rows × 6 columns

the methods

all files in a set of directories, and all subdirs of a directory (as indices or names) or the files therein


In [215]:
ds.files_bydir([0,1,3])


Out[215]:
file dir f2 f3 ab ab2
0 f.jpg 0 f.jpg prefix-f.jpg 0 -
1 f1a.jpg 1 f1a.jpg prefix-f1a.jpg a a
2 f1b.jpg 1 f1b.jpg prefix-f1b.jpg b b
4 f11a.jpg 3 f11a.jpg prefix-f11a.jpg a a

4 rows × 6 columns


In [216]:
ds.subdirs(1, as_str = False)


Out[216]:
[2, 3]

In [217]:
ds.subdirs(1, as_str = True)


Out[217]:
['./delme/delme1/delme12', './delme/delme1/delme11']

In [218]:
ds.subdirs(1, get_files = True)


Out[218]:
file dir f2 f3 ab ab2
1 f1a.jpg 1 f1a.jpg prefix-f1a.jpg a a
2 f1b.jpg 1 f1b.jpg prefix-f1b.jpg b b
3 f12a.jpg 2 f12a.jpg prefix-f12a.jpg a a
4 f11a.jpg 3 f11a.jpg prefix-f11a.jpg a a

4 rows × 6 columns


In [219]:
ds.files_decorate(ds.files_bydir([0,1,3]))


Out[219]:
file dir f2 f3 ab ab2 dirn
0 f.jpg 0 f.jpg prefix-f.jpg 0 - ./delme
1 f1a.jpg 1 f1a.jpg prefix-f1a.jpg a a ./delme/delme1
2 f1b.jpg 1 f1b.jpg prefix-f1b.jpg b b ./delme/delme1
4 f11a.jpg 3 f11a.jpg prefix-f11a.jpg a a ./delme/delme1/delme11

4 rows × 7 columns


In [220]:
ds.files_byre("f2.*\.jpg$")


Out[220]:
file dir f2 f3 ab ab2
5 f2b.jpg 4 f2b.jpg prefix-f2b.jpg b b
6 f2a.jpg 4 f2a.jpg prefix-f2a.jpg a a

2 rows × 6 columns


In [221]:
# words to drop from a folder title; defined in this cell so that it runs on a fresh kernel
out = {"is", "a", "in", "with", "for", "incl", "and", "the"}
# [1:] skips the leading date token of the title
set("2012_02 Weekend With Parents in Paris".lower().split()[1:]) - out


Out[221]:
{'parents', 'paris', 'weekend'}

In [222]:
out = {"is", "a", "in", "with", "for", "incl", "for", "and", "the"}

In [233]:
def dates(s):
    """Extract the two leading digit groups of `s`.

    `s` must start with exactly 8 digits, an underscore, and then 4 to 6 digits;
    anything after that is ignored.

    Returns the tuple (first_group, second_group) of strings on a match,
    and an empty list otherwise.
    """
    found = re.match("^([0-9]{8,8})_([0-9]{4,6}).*", s)
    if found is not None:
        return found.groups()
    return []

In [234]:
dates("20121224_120000")


Out[234]:
('20121224', '120000')

In [235]:
dates("aa_20121224_120000")


Out[235]:
[]

In [236]:
dates("20121224_120000_aa")


Out[236]:
('20121224', '120000')

In [237]:
dates("20121224_1200")


Out[237]:
('20121224', '1200')

In [238]:
dates("20121224_120")


Out[238]:
[]

In [28]:
list("abcdef")


Out[28]:
['a', 'b', 'c', 'd', 'e', 'f']

In [ ]: