From: Claudio Jeker Subject: Re: ffs_readdir To: Helg Cc: "Kirill A. Korinsky" , OpenBSD tech Date: Tue, 7 Apr 2026 10:32:29 +0200 On Tue, Apr 07, 2026 at 09:18:11AM +0200, Helg wrote: > On Mon, Apr 06, 2026 at 02:57:21PM +0200, Kirill A. Korinsky wrote: > > tech@, > > > > I'd like to share with you a draft of ffs_readdir() > > > > Idea is using lower and cheaper internal API to make listing of > > directories via ffs_readdir() faster / cheaper than ufs_readdir() > > > > This code may be wrong or it can contain bugs. Anyway, it boots, and I > > already use it without any noticeble issue so far. > > > > Performance impact? > > > > ufs_readdir() > > > > ~ $ time find /home/cvs >/dev/null > > 0m54.19s real 0m01.90s user 0m22.82s system > > ~ $ time find /home/cvs >/dev/null > > 0m05.91s real 0m00.44s user 0m05.51s system > > ~ $ time find /home/cvs >/dev/null > > 0m05.91s real 0m00.58s user 0m05.30s system > > ~ $ > > > > ffs_readdir() > > > > ~ $ time find /home/cvs >/dev/null > > 0m42.88s real 0m01.84s user 0m11.29s system > > ~ $ time find /home/cvs >/dev/null > > 0m02.45s real 0m00.40s user 0m02.07s system > > ~ $ time find /home/cvs >/dev/null > > 0m02.34s real 0m00.52s user 0m01.81s system > > ~ $ > > > > Anyone willing to test it? > > I would recommend that you keep some of the original comments from > ufs that explain all the block and dirent calculations. > > Are ffs and ufs so fundamentally different that ufs_readdir can't > also benefit from this optimisation? > > What's the purpose of the outer loop. Is this to support the readdir > buffer being larger than a block and enabling readdir to read more > than one block per call to readdir? ufs_readdir could benefit from > this too. > > I suggest to return EIO instead of EINVAL if a name contains '/'. > I know this is what ufs does but reading getdents(2) this is not a > documented reason for returning EINVAL. I would stay away from EIO as an error for this. EIO is for I/O errors and normally indicate that the disk is dying. This is an invalid directory structure and should use some other error, this is why I picked EINVAL. If we decide to change this errno here then we should go over all other readdir functions and adjust the code there too. > > Index: sys/ufs/ffs/ffs_extern.h > > =================================================================== > > RCS file: /home/cvs/src/sys/ufs/ffs/ffs_extern.h,v > > diff -u -p -r1.51 ffs_extern.h > > --- sys/ufs/ffs/ffs_extern.h 8 Oct 2024 02:58:26 -0000 1.51 > > +++ sys/ufs/ffs/ffs_extern.h 6 Apr 2026 12:49:45 -0000 > > @@ -150,6 +150,7 @@ int ffs_sysctl(int *, u_int, void *, siz > > int ffs_sbupdate(struct ufsmount *, int); > > > > /* ffs_vnops.c */ > > +int ffs_readdir(void *); > > int ffs_read(void *); > > int ffs_write(void *); > > int ffs_fsync(void *); > > Index: sys/ufs/ffs/ffs_vnops.c > > =================================================================== > > RCS file: /home/cvs/src/sys/ufs/ffs/ffs_vnops.c,v > > diff -u -p -r1.103 ffs_vnops.c > > --- sys/ufs/ffs/ffs_vnops.c 27 Mar 2025 23:30:54 -0000 1.103 > > +++ sys/ufs/ffs/ffs_vnops.c 6 Apr 2026 12:49:45 -0000 > > @@ -38,6 +38,7 @@ > > #include > > #include > > #include > > +#include > > #include > > #include > > #include > > @@ -78,7 +79,7 @@ const struct vops ffs_vops = { > > .vop_mkdir = ufs_mkdir, > > .vop_rmdir = ufs_rmdir, > > .vop_symlink = ufs_symlink, > > - .vop_readdir = ufs_readdir, > > + .vop_readdir = ffs_readdir, > > .vop_readlink = ufs_readlink, > > .vop_abortop = vop_generic_abortop, > > .vop_inactive = ufs_inactive, > > @@ -173,6 +174,134 @@ const struct vops ffs_fifovops = { > > .vop_advlock = fifo_advlock > > }; > > #endif /* FIFO */ > > + > > +/* > > + * Vnode op for reading directories. > > + */ > > +int > > +ffs_readdir(void *v) > > +{ > > + struct vop_readdir_args *ap = v; > > + struct uio *uio = ap->a_uio; > > + struct vnode *vp = ap->a_vp; > > + struct inode *ip = VTOI(vp); > > + struct fs *fs = ip->i_fs; > > + union { > > + struct dirent dn; > > + char __pad[roundup(sizeof(struct dirent), 8)]; > > + } u; > > + struct direct *dp; > > + struct buf *bp; > > + char *edp; > > + off_t bytesinfile, off; > > + daddr_t lbn, nextlbn; > > + size_t count, entries; > > + int blkoffset, bsize, done, error, readcnt, size, xfersize; > > + > > + if (uio->uio_rw != UIO_READ) > > + return (EINVAL); > > + > > + count = uio->uio_resid; > > + entries = (uio->uio_offset + count) & (DIRBLKSIZ - 1); > > + > > + if (count <= entries) > > + return (EINVAL); > > + > > + readcnt = (int)(min(count, (size_t)64 * 1024) - entries); > > + off = uio->uio_offset; > > + done = 0; > > + error = 0; > > + bp = NULL; > > + > > + memset(&u, 0, sizeof(u)); > > + > > + while (error == 0 && done == 0 && readcnt > 0) { > > + bp = NULL; > > + if ((bytesinfile = DIP(ip, size) - off) <= 0) > > + break; > > + lbn = lblkno(fs, off); > > + nextlbn = lbn + 1; > > + bsize = blksize(fs, ip, lbn); > > + blkoffset = blkoff(fs, off); > > + xfersize = bsize - blkoffset; > > + if (readcnt < xfersize) > > + xfersize = readcnt; > > + if (bytesinfile < xfersize) > > + xfersize = bytesinfile; > > + > > + if (lblktosize(fs, nextlbn) >= DIP(ip, size)) > > + error = bread(vp, lbn, fs->fs_bsize, &bp); > > + else if (lbn - 1 == ip->i_ci.ci_lastr || readcnt > xfersize) > > + error = bread_cluster(vp, lbn, fs->fs_bsize, &bp); > > + else > > + error = bread(vp, lbn, fs->fs_bsize, &bp); > > + if (error) > > + break; > > + ip->i_ci.ci_lastr = lbn; > > + > > + buf_adjcnt(bp, bsize); > > + size = bsize - bp->b_resid; > > + if (size <= blkoffset) { > > + brelse(bp); > > + bp = NULL; > > + break; > > + } > > + if (size < blkoffset + xfersize) > > + xfersize = size - blkoffset; > > + > > + dp = (struct direct *)(bp->b_data + blkoffset); > > + edp = (char *)dp + xfersize; > > + > > + while (error == 0 && > > + (char *)dp + offsetof(struct direct, d_name) < edp && > > + dp->d_reclen > offsetof(struct direct, d_name) && > > + (char *)dp + dp->d_reclen <= edp) { > > + u.dn.d_reclen = roundup(dp->d_namlen + 1 + > > + offsetof(struct dirent, d_name), 8); > > + if (u.dn.d_reclen > uio->uio_resid) { > > + done = 1; > > + break; > > + } > > + off += dp->d_reclen; > > + u.dn.d_off = off; > > + u.dn.d_fileno = dp->d_ino; > > + u.dn.d_type = dp->d_type; > > + u.dn.d_namlen = dp->d_namlen; > > + memcpy(u.dn.d_name, dp->d_name, u.dn.d_namlen); > > + memset(u.dn.d_name + u.dn.d_namlen, 0, > > + u.dn.d_reclen - u.dn.d_namlen - > > + offsetof(struct dirent, d_name)); > > + > > + if (memchr(u.dn.d_name, '/', u.dn.d_namlen) != NULL) { > > + error = EINVAL; > > + break; > > + } > > + > > + error = uiomove(&u.dn, u.dn.d_reclen, uio); > > + dp = (struct direct *)((char *)dp + dp->d_reclen); > > + } > > + > > + if (error == 0 && > > + (char *)dp + offsetof(struct direct, d_name) < edp && > > + dp->d_reclen <= offsetof(struct direct, d_name)) > > + error = EIO; > > + > > + brelse(bp); > > + bp = NULL; > > + readcnt -= xfersize; > > + } > > + if (bp != NULL) > > + brelse(bp); > > + > > + uio->uio_offset = off; > > + *ap->a_eofflag = DIP(ip, size) <= off; > > + > > + if (!(vp->v_mount->mnt_flag & MNT_NOATIME) || > > + (ip->i_flag & (IN_CHANGE | IN_UPDATE))) > > + ip->i_flag |= IN_ACCESS; > > + > > + return (error); > > +} > > > > /* > > * Vnode op for reading. > > > > > > -- > > wbr, Kirill > > > -- :wq Claudio