Download raw body.
ffs_readdir
On Tue, Apr 07, 2026 at 09:18:11AM +0200, Helg wrote:
> On Mon, Apr 06, 2026 at 02:57:21PM +0200, Kirill A. Korinsky wrote:
> > tech@,
> >
> > I'd like to share with you a draft of ffs_readdir()
> >
> > Idea is using lower and cheaper internal API to make listing of
> > directories via ffs_readdir() faster / cheaper than ufs_readdir()
> >
> > This code may be wrong or it can contain bugs. Anyway, it boots, and I
> > already use it without any noticeble issue so far.
> >
> > Performance impact?
> >
> > ufs_readdir()
> >
> > ~ $ time find /home/cvs >/dev/null
> > 0m54.19s real 0m01.90s user 0m22.82s system
> > ~ $ time find /home/cvs >/dev/null
> > 0m05.91s real 0m00.44s user 0m05.51s system
> > ~ $ time find /home/cvs >/dev/null
> > 0m05.91s real 0m00.58s user 0m05.30s system
> > ~ $
> >
> > ffs_readdir()
> >
> > ~ $ time find /home/cvs >/dev/null
> > 0m42.88s real 0m01.84s user 0m11.29s system
> > ~ $ time find /home/cvs >/dev/null
> > 0m02.45s real 0m00.40s user 0m02.07s system
> > ~ $ time find /home/cvs >/dev/null
> > 0m02.34s real 0m00.52s user 0m01.81s system
> > ~ $
> >
> > Anyone willing to test it?
>
> I would recommend that you keep some of the original comments from
> ufs that explain all the block and dirent calculations.
>
> Are ffs and ufs so fundamentally different that ufs_readdir can't
> also benefit from this optimisation?
>
> What's the purpose of the outer loop. Is this to support the readdir
> buffer being larger than a block and enabling readdir to read more
> than one block per call to readdir? ufs_readdir could benefit from
> this too.
>
> I suggest to return EIO instead of EINVAL if a name contains '/'.
> I know this is what ufs does but reading getdents(2) this is not a
> documented reason for returning EINVAL.
I would stay away from EIO as an error for this. EIO is for I/O
errors and normally indicate that the disk is dying.
This is an invalid directory structure and should use some other error,
this is why I picked EINVAL. If we decide to change this errno here then
we should go over all other readdir functions and adjust the code there
too.
> > Index: sys/ufs/ffs/ffs_extern.h
> > ===================================================================
> > RCS file: /home/cvs/src/sys/ufs/ffs/ffs_extern.h,v
> > diff -u -p -r1.51 ffs_extern.h
> > --- sys/ufs/ffs/ffs_extern.h 8 Oct 2024 02:58:26 -0000 1.51
> > +++ sys/ufs/ffs/ffs_extern.h 6 Apr 2026 12:49:45 -0000
> > @@ -150,6 +150,7 @@ int ffs_sysctl(int *, u_int, void *, siz
> > int ffs_sbupdate(struct ufsmount *, int);
> >
> > /* ffs_vnops.c */
> > +int ffs_readdir(void *);
> > int ffs_read(void *);
> > int ffs_write(void *);
> > int ffs_fsync(void *);
> > Index: sys/ufs/ffs/ffs_vnops.c
> > ===================================================================
> > RCS file: /home/cvs/src/sys/ufs/ffs/ffs_vnops.c,v
> > diff -u -p -r1.103 ffs_vnops.c
> > --- sys/ufs/ffs/ffs_vnops.c 27 Mar 2025 23:30:54 -0000 1.103
> > +++ sys/ufs/ffs/ffs_vnops.c 6 Apr 2026 12:49:45 -0000
> > @@ -38,6 +38,7 @@
> > #include <sys/kernel.h>
> > #include <sys/stat.h>
> > #include <sys/buf.h>
> > +#include <sys/dirent.h>
> > #include <sys/mount.h>
> > #include <sys/vnode.h>
> > #include <sys/malloc.h>
> > @@ -78,7 +79,7 @@ const struct vops ffs_vops = {
> > .vop_mkdir = ufs_mkdir,
> > .vop_rmdir = ufs_rmdir,
> > .vop_symlink = ufs_symlink,
> > - .vop_readdir = ufs_readdir,
> > + .vop_readdir = ffs_readdir,
> > .vop_readlink = ufs_readlink,
> > .vop_abortop = vop_generic_abortop,
> > .vop_inactive = ufs_inactive,
> > @@ -173,6 +174,134 @@ const struct vops ffs_fifovops = {
> > .vop_advlock = fifo_advlock
> > };
> > #endif /* FIFO */
> > +
> > +/*
> > + * Vnode op for reading directories.
> > + */
> > +int
> > +ffs_readdir(void *v)
> > +{
> > + struct vop_readdir_args *ap = v;
> > + struct uio *uio = ap->a_uio;
> > + struct vnode *vp = ap->a_vp;
> > + struct inode *ip = VTOI(vp);
> > + struct fs *fs = ip->i_fs;
> > + union {
> > + struct dirent dn;
> > + char __pad[roundup(sizeof(struct dirent), 8)];
> > + } u;
> > + struct direct *dp;
> > + struct buf *bp;
> > + char *edp;
> > + off_t bytesinfile, off;
> > + daddr_t lbn, nextlbn;
> > + size_t count, entries;
> > + int blkoffset, bsize, done, error, readcnt, size, xfersize;
> > +
> > + if (uio->uio_rw != UIO_READ)
> > + return (EINVAL);
> > +
> > + count = uio->uio_resid;
> > + entries = (uio->uio_offset + count) & (DIRBLKSIZ - 1);
> > +
> > + if (count <= entries)
> > + return (EINVAL);
> > +
> > + readcnt = (int)(min(count, (size_t)64 * 1024) - entries);
> > + off = uio->uio_offset;
> > + done = 0;
> > + error = 0;
> > + bp = NULL;
> > +
> > + memset(&u, 0, sizeof(u));
> > +
> > + while (error == 0 && done == 0 && readcnt > 0) {
> > + bp = NULL;
> > + if ((bytesinfile = DIP(ip, size) - off) <= 0)
> > + break;
> > + lbn = lblkno(fs, off);
> > + nextlbn = lbn + 1;
> > + bsize = blksize(fs, ip, lbn);
> > + blkoffset = blkoff(fs, off);
> > + xfersize = bsize - blkoffset;
> > + if (readcnt < xfersize)
> > + xfersize = readcnt;
> > + if (bytesinfile < xfersize)
> > + xfersize = bytesinfile;
> > +
> > + if (lblktosize(fs, nextlbn) >= DIP(ip, size))
> > + error = bread(vp, lbn, fs->fs_bsize, &bp);
> > + else if (lbn - 1 == ip->i_ci.ci_lastr || readcnt > xfersize)
> > + error = bread_cluster(vp, lbn, fs->fs_bsize, &bp);
> > + else
> > + error = bread(vp, lbn, fs->fs_bsize, &bp);
> > + if (error)
> > + break;
> > + ip->i_ci.ci_lastr = lbn;
> > +
> > + buf_adjcnt(bp, bsize);
> > + size = bsize - bp->b_resid;
> > + if (size <= blkoffset) {
> > + brelse(bp);
> > + bp = NULL;
> > + break;
> > + }
> > + if (size < blkoffset + xfersize)
> > + xfersize = size - blkoffset;
> > +
> > + dp = (struct direct *)(bp->b_data + blkoffset);
> > + edp = (char *)dp + xfersize;
> > +
> > + while (error == 0 &&
> > + (char *)dp + offsetof(struct direct, d_name) < edp &&
> > + dp->d_reclen > offsetof(struct direct, d_name) &&
> > + (char *)dp + dp->d_reclen <= edp) {
> > + u.dn.d_reclen = roundup(dp->d_namlen + 1 +
> > + offsetof(struct dirent, d_name), 8);
> > + if (u.dn.d_reclen > uio->uio_resid) {
> > + done = 1;
> > + break;
> > + }
> > + off += dp->d_reclen;
> > + u.dn.d_off = off;
> > + u.dn.d_fileno = dp->d_ino;
> > + u.dn.d_type = dp->d_type;
> > + u.dn.d_namlen = dp->d_namlen;
> > + memcpy(u.dn.d_name, dp->d_name, u.dn.d_namlen);
> > + memset(u.dn.d_name + u.dn.d_namlen, 0,
> > + u.dn.d_reclen - u.dn.d_namlen -
> > + offsetof(struct dirent, d_name));
> > +
> > + if (memchr(u.dn.d_name, '/', u.dn.d_namlen) != NULL) {
> > + error = EINVAL;
> > + break;
> > + }
> > +
> > + error = uiomove(&u.dn, u.dn.d_reclen, uio);
> > + dp = (struct direct *)((char *)dp + dp->d_reclen);
> > + }
> > +
> > + if (error == 0 &&
> > + (char *)dp + offsetof(struct direct, d_name) < edp &&
> > + dp->d_reclen <= offsetof(struct direct, d_name))
> > + error = EIO;
> > +
> > + brelse(bp);
> > + bp = NULL;
> > + readcnt -= xfersize;
> > + }
> > + if (bp != NULL)
> > + brelse(bp);
> > +
> > + uio->uio_offset = off;
> > + *ap->a_eofflag = DIP(ip, size) <= off;
> > +
> > + if (!(vp->v_mount->mnt_flag & MNT_NOATIME) ||
> > + (ip->i_flag & (IN_CHANGE | IN_UPDATE)))
> > + ip->i_flag |= IN_ACCESS;
> > +
> > + return (error);
> > +}
> >
> > /*
> > * Vnode op for reading.
> >
> >
> > --
> > wbr, Kirill
> >
>
--
:wq Claudio
ffs_readdir