In [17]:
import os
def dir_structure(path=None, decorated=False):
    """Read the full recursive directory structure of `path` and return it as a tuple.

    path      - the path where to start the walk (default: `.`); a str or
                os.PathLike object
    decorated - if True, every file entry also carries the directory name,
                i.e. (filename, dir_ix, dirname) instead of (filename, dir_ix)

    RETURNS
        (dirs, files, tree)
            dirs  - tuple of dirs, in the order traversed by os.walk
            files - tuple of tuples (filename, dir_ix) where dir_ix is the
                    index of the containing directory in dirs
            tree  - tuple of tuples (dir_ix1, dir_ix2, ...), one per entry of
                    dirs, holding the indices of its direct subdirectories

    Subdirectories that os.walk lists but does not descend into (symlinks to
    directories, unreadable directories) have no entry in dirs and are
    therefore left out of tree.
    """
    if path is None:
        path = "."
    path = os.fspath(path)
    # Drop trailing slashes so names are reported without them, but keep a
    # bare root such as "/" intact: stripping it would leave "", and
    # os.walk("") yields nothing.
    path = path.rstrip("/") or path

    walked = list(os.walk(path))
    dirs = [dirpath for dirpath, _, _ in walked]
    # One dict lookup per subdirectory instead of a linear dirs.index() scan.
    dir_index = {dirpath: ix for ix, dirpath in enumerate(dirs)}

    files = [
        (name, ix)
        for ix, (_, _, filenames) in enumerate(walked)
        for name in filenames
    ]

    tree = []
    for dirpath, subdirnames, _ in walked:
        # os.walk builds child paths with os.path.join, so the same call
        # reproduces the exact key on every platform.
        children = [os.path.join(dirpath, name) for name in subdirnames]
        tree.append(tuple(dir_index[c] for c in children if c in dir_index))

    if decorated:
        files = [(name, ix, dirs[ix]) for name, ix in files]

    return (tuple(dirs), tuple(files), tuple(tree))
In [26]:
# Scratch check of the trailing-slash stripping used by dir_structure:
# evaluates to "123".
"123/".rstrip("/")
Out[26]:
In [22]:
# Walk the current directory (the default path) with decorated file entries,
# i.e. (filename, dir_ix, dirname) triples.
dd = dir_structure (decorated=True)
# Displaying the result is disabled.
##dd
In [25]:
# Scratch cell: a bare literal with no effect.
1
In [182]:
import os
import pandas as pd
import functools
import re
import types
class DirStructure():
    """Read out the full recursive directory structure of `path` into the object.

    path - the pathname given to os.walk() (default: `.`); it can also be a
           tuple `(dirs, files, tree)`, in which case this tuple is used to
           initialise the object without touching the file system

    PROPERTIES
        dirs            - tuple of dirs, in order traversed by walk
        files           - tuple of tuples (filename, dir_ix) where dir_ix is index in dirs
        tree            - tuple of tuples (dir_ix1, dir_ix2, ...) where dir_ix1... are subdirs
        dirs_t, files_t - the corresponding pandas dataframe tables

    METHODS
        subdirs         - get all subdirs
        files_bydir     - get all files in set of dirs
        files_decorate  - add the full directory names to a files table
        files_byre      - filter files table by regular expression
        add_col         - adds a column to the files_t table (values either explicit or as function)
        re_func         - factory function for use in relation to add_col

    DEPENDENCIES
        os
        pandas
        re

    VERSION AND COPYRIGHT
        version 0.1a
        (c) 2014 Stefan LOESCH / oditorium
    """

    __version__ = "0.1a"

    def __init__(self, path=None):
        """Build the structure from a walk of `path` or from a ready-made tuple.

        path - None (walk `.`), a str / os.PathLike to walk, or a 3-tuple
               `(dirs, files, tree)` in the layout described on the class

        Raises ValueError if a tuple is given that does not have exactly
        three elements.
        """
        if isinstance(path, tuple):
            dirs, files, tree = path
        else:
            dirs, files, tree = self._walk(path)

        self.dirs = tuple(dirs)
        self.files = tuple(files)
        self.tree = tuple(tree)
        self.dirs_t = pd.DataFrame(list(self.dirs), columns=['dir'])
        self.files_t = pd.DataFrame(list(self.files), columns=['file', 'dir'])

    @staticmethod
    def _walk(path):
        """Walk `path` with os.walk and return lists (dirs, files, tree) (private)."""
        if path is None:
            path = "."
        path = os.fspath(path)
        # Drop trailing slashes, but keep a bare root such as "/" intact:
        # stripping it would leave "", and os.walk("") yields nothing.
        path = path.rstrip("/") or path

        walked = list(os.walk(path))
        dirs = [dirpath for dirpath, _, _ in walked]
        # One dict lookup per subdirectory instead of a linear dirs.index() scan.
        dir_index = {dirpath: ix for ix, dirpath in enumerate(dirs)}

        files = [
            (name, ix)
            for ix, (_, _, filenames) in enumerate(walked)
            for name in filenames
        ]

        tree = []
        for dirpath, subdirnames, _ in walked:
            # os.walk builds child paths with os.path.join, so the same call
            # reproduces the exact key.  Entries os.walk lists but does not
            # descend into (directory symlinks, unreadable directories) have
            # no index and are skipped.
            children = [os.path.join(dirpath, name) for name in subdirnames]
            tree.append(tuple(dir_index[c] for c in children if c in dir_index))

        return dirs, files, tree

    def files_bydir(self, dirs=None, files_t=None):
        """Filter files_t down to the files located in any of the directories in `dirs`.

        dirs    - iterable of directory indices, or a single int
                  (default: directory 0, the root of the walk)
        files_t - a pandas table with column `dir` (default: self.files_t)

        Returns the matching rows; the result keeps all columns even when no
        row matches.
        """
        if files_t is None:
            files_t = self.files_t
        if dirs is None:
            dirs = (0,)
        elif isinstance(dirs, int):
            dirs = (dirs,)
        # isin() always yields a boolean mask aligned with files_t, including
        # for an empty table or an empty `dirs`.
        return files_t[files_t['dir'].isin(list(dirs))]

    def _subdirs(self, dir_ix):
        """Return the list of all subdirectory indices below `dir_ix` (private).

        Use subdirs() to access this function.  Direct children come first,
        followed by the descendants of each child in turn.
        """
        children = list(self.tree[dir_ix])
        descendants = [d for child in children for d in self._subdirs(child)]
        return children + descendants

    def subdirs(self, dir_ix, as_str=False, get_files=False):
        """Get all subdirs of the given directory, or the files therein.

        dir_ix    - the directory index of the root
        as_str    - if True, directory indices are expanded into names
        get_files - if True, return the table of files in `dir_ix` and all
                    its subdirectories rather than the list of subdirectories
                    (takes precedence over as_str)
        """
        sd = self._subdirs(dir_ix)
        if get_files:
            return self.files_bydir([dir_ix] + sd)
        if as_str:
            return [self.dirs[ix] for ix in sd]
        return sd

    def files_decorate(self, files_t=None):
        """Add a column `dirn` with the directory name to a files table (based on column `dir`).

        files_t - must have a column `dir` containing directory indices
                  (default: self.files_t)

        The table is modified in place and also returned.
        """
        if files_t is None:
            files_t = self.files_t
        files_t['dirn'] = [self.dirs[ix] for ix in files_t['dir']]
        return files_t

    def files_byre(self, regex, files_t=None):
        """Filter the files table by regex.

        regex   - pattern applied with re.match, i.e. anchored at the start
                  of the file name
        files_t - must have a column `file` containing the filename
                  (default: self.files_t)
        """
        if files_t is None:
            files_t = self.files_t
        pattern = re.compile(regex)
        # An explicitly boolean, index-aligned mask keeps row selection
        # working for an empty table (an empty list would select columns).
        mask = pd.Series(
            [pattern.match(fn) is not None for fn in files_t['file']],
            index=files_t.index,
            dtype=bool,
        )
        return files_t[mask]

    def add_col(self, heading, values):
        """Add a column to the files_t table (note: the files property is _not_ kept in sync).

        heading - the col heading
        values  - the col values (must have the right number of entries);
                  can also be any callable f(filename), which is then
                  evaluated once per file name

        Returns self.files_t.
        """
        if callable(values):
            values = [values(fn) for fn in self.files_t['file']]
        self.files_t[heading] = values
        return self.files_t

    def re_func(self, regex, none_val=None):
        r"""Factory function: return a function that evaluates regex on its argument.

        The returned function gives back the first group of the match, or
        none_val if the regex does not match (re.match semantics).  The regex
        must contain at least one group.

        EXAMPLE
            f = re_func(r"(.*)\.jpg$", "0")
            f("test.jpg") -> "test"
            f("test.JPG") -> "0"
        """
        pattern = re.compile(regex)

        def f(s):
            m = pattern.match(s)
            if m is None:
                return none_val
            return m.group(1)

        return f
In [203]:
# Build a DirStructure by walking `./delme` (relative to the current working directory).
ds = DirStructure('./delme')
In [204]:
# Directory names, in the order os.walk visited them.
ds.dirs
Out[204]:
In [205]:
# (filename, dir_ix) pairs; dir_ix indexes into ds.dirs.
ds.files
Out[205]:
In [206]:
# One tuple per directory holding the indices of its direct subdirectories.
ds.tree
Out[206]:
Directories and files as pandas DataFrames
In [207]:
# Directory names as a one-column DataFrame (column `dir`).
ds.dirs_t
Out[207]:
In [208]:
# Files as a DataFrame with columns `file` (name) and `dir` (directory index).
ds.files_t
Out[208]:
In [209]:
# Add column `f2` from an explicit list of values (here a copy of the file names).
ds.add_col('f2', list(ds.files_t['file']))
Out[209]:
In [210]:
# Add column `f3` computed from each file name by a function.
ds.add_col('f3', lambda f: "prefix-"+f)
Out[210]:
In [211]:
def ab(fn):
    """Return the first "a" or "b" occurring in `fn`, or "0" if it contains neither."""
    m = re.match("[^ab]*(a|b)[^ab]*", fn)
    if m is None:
        return "0"
    return m.group(1)
# Add column `ab` by applying ab() to every file name.
ds.add_col('ab', ab)
Out[211]:
In [212]:
# Build the same first-"a"-or-"b" extractor via the re_func factory, with "-"
# as the no-match value.
f = ds.re_func("[^ab]*(a|b)[^ab]*", "-")
# No "a" or "b" in the input, so this gives the no-match value "-".
f("xxxx")
Out[212]:
In [213]:
# The regex matches; the first group is "a".
f("xxaxx")
Out[213]:
In [214]:
# Add column `ab2` using a function produced by re_func.
ds.add_col('ab2', ds.re_func("[^ab]*(a|b)[^ab]*", "-"))
Out[214]:
In [215]:
# Rows of files_t whose directory index is 0, 1 or 3.
ds.files_bydir([0,1,3])
Out[215]:
In [216]:
# Indices of all subdirectories (recursively) below directory 1.
ds.subdirs(1, as_str = False)
Out[216]:
In [217]:
# The same subdirectories, expanded into directory names.
ds.subdirs(1, as_str = True)
Out[217]:
In [218]:
# Files located in directory 1 or in any of its subdirectories.
ds.subdirs(1, get_files = True)
Out[218]:
In [219]:
# Filter by directory, then add the `dirn` column (directory name) to the filtered table.
ds.files_decorate(ds.files_bydir([0,1,3]))
Out[219]:
In [220]:
# File names starting with "f2" and ending in ".jpg"; the pattern is a raw
# string so that `\.` is not treated as an (invalid) string escape.
ds.files_byre(r"f2.*\.jpg$")
Out[220]:
In [221]:
# Keywords of a folder title: lower-case it, drop the leading date token, then
# remove the stop words held in `out`.
# NOTE(review): `out` is only assigned in the following cell (In [222]); run
# top to bottom, this line raises NameError unless that cell was executed first.
set("2012_02 Weekend With Parents in Paris".lower().split()[1:]) - out
Out[221]:
In [222]:
# Stop words removed from folder titles when extracting keywords
# (the literal previously listed "for" twice).
out = {"is", "a", "in", "with", "for", "incl", "and", "the"}
In [233]:
def dates(s):
    """Split a leading `YYYYMMDD_HHMM[SS]`-style stamp off `s`.

    Returns a tuple (date_digits, time_digits) when `s` starts with exactly
    8 digits, an underscore and then 4 to 6 digits (anything may follow);
    returns an empty list otherwise.
    """
    # re.match anchors at the start of the string, so no "^" is needed, and
    # whatever trails the time digits does not affect the captured groups.
    m = re.match(r"([0-9]{8})_([0-9]{4,6})", s)
    if m is None:
        return []
    return m.groups()
In [234]:
# Full stamp: gives ('20121224', '120000').
dates("20121224_120000")
Out[234]:
In [235]:
# The stamp is not at the start of the string: no match, gives [].
dates("aa_20121224_120000")
Out[235]:
In [236]:
# Trailing text after the stamp is ignored: gives ('20121224', '120000').
dates("20121224_120000_aa")
Out[236]:
In [237]:
# Four time digits are enough: gives ('20121224', '1200').
dates("20121224_1200")
Out[237]:
In [238]:
# Only three time digits: no match, gives [].
dates("20121224_120")
Out[238]:
In [28]:
# Scratch check: a string splits into its characters, ['a', 'b', 'c', 'd', 'e', 'f'].
list("abcdef")
Out[28]:
In [ ]: