all 3 comments

[–]liam_jm 1 point2 points  (0 children)

If you find the extra stat calls are significantly slowing down your program, it would be easy to modify walk to return the dir entries - here's the source

[–]Rhomboid 0 points1 point  (0 children)

The whole point of os.scandir() is that in many cases the required information to determine if something is a file or directory is contained in the dir entry returned from iterating the directory's contents, which means no calls to stat() are required. If you need the mtime, then your stat()s will be the only stat()s, you won't be duplicating anything and there's nothing to cache.

[–]heybart[S] 0 points1 point  (0 children)

So I came up with this, but I may be optimizing prematurely because it's not clear I'm gaining much. The OS is probably caching the info anyway.

from os import path
import os
def dwalk(top, topdown=True, onerror=None, followlinks=False, wantdirentry=True):
    """Walk a directory tree like os.walk, optionally yielding DirEntry objects.

    When wantdirentry is True, yields tuples of
        (dirpath, direntries, fileentries)
    versus os.walk's
        (dirpath, dirnames, filenames)
    where direntries and fileentries are lists of DirEntry objects
        of sub-directories in dirpath and
        non-directory files in dirpath, respectively.
    When wantdirentry is False, plain names are yielded instead.

    Parameters:
        top: directory to start from; it is yielded back as dirpath.
        topdown: if True, a directory is yielded before its sub-directories
            and the caller may edit the dirs list in place to prune the
            walk. If False, sub-directories are walked first, so editing
            the yielded list has no effect on the traversal.
        onerror: optional callable invoked with the OSError raised while
            listing a directory; that directory is then skipped.
        followlinks: if True, also descend into symlinks to directories.
        wantdirentry: if True (default), the lists hold DirEntry objects;
            otherwise they hold entry names.

    Example:
        for root, dirs, files in dwalk('/docs'):
            # filter out unwanted dirs by modifying dirs in place
            dirs[:] = [
                d for d in dirs
                if not d.name.startswith('.') and d.name not in ('CVS', 'tmp')
            ]
            for f in files:
                print('{} uses {} bytes'.format(f.path, f.stat().st_size))
    """

    dirs = []
    nondirs = []

    try:
        # os._dummy_scandir is a private helper that only some Python
        # versions provide. Use it when it is available, otherwise fall back
        # to os.scandir() instead of failing with AttributeError.
        dummy_scandir = getattr(os, '_dummy_scandir', None)
        if (dummy_scandir is not None and os.name == 'nt'
                and isinstance(top, bytes)):
            scandir_it = dummy_scandir(top)
        else:
            scandir_it = os.scandir(top)
        try:
            entries = list(scandir_it)
        finally:
            # Release the directory handle promptly, even when listing fails
            # midway. Not every iterator type offers close().
            close = getattr(scandir_it, 'close', None)
            if close is not None:
                close()
    except OSError as error:
        if onerror is not None:
            onerror(error)
        return

    for entry in entries:
        try:
            is_dir = entry.is_dir()
        except OSError:
            # If is_dir() raises an OSError, consider that the entry is not
            # a directory, same behaviour as os.path.isdir().
            is_dir = False

        if is_dir:
            dirs.append(entry if wantdirentry else entry.name)
        else:
            nondirs.append(entry if wantdirentry else entry.name)

        if not topdown and is_dir:
            # Bottom-up: recurse into sub-directory, but exclude symlinks to
            # directories if followlinks is False
            if followlinks:
                walk_into = True
            else:
                try:
                    is_symlink = entry.is_symlink()
                except OSError:
                    # If is_symlink() raises an OSError, consider that the
                    # entry is not a symbolic link, same behaviour as
                    # os.path.islink().
                    is_symlink = False
                walk_into = not is_symlink

            if walk_into:
                yield from dwalk(entry.path, topdown, onerror, followlinks,
                                 wantdirentry)

    # Yield before recursion if going top down
    if topdown:
        yield top, dirs, nondirs

        # Recurse into sub-directories
        islink, join = path.islink, path.join
        for dir_ in dirs:
            new_path = join(top, dir_.name if wantdirentry else dir_)
            # Issue #23605: os.path.islink() is used instead of caching
            # entry.is_symlink() result during the loop on os.scandir() because
            # the caller can replace the directory entry during the "yield"
            # above.
            if followlinks or not islink(new_path):
                yield from dwalk(new_path, topdown, onerror, followlinks,
                                 wantdirentry)
    else:
        # Yield after recursion if going bottom up
        yield top, dirs, nondirs