Download raw body.
ffs_readdir
On Sat, May 16, 2026 at 09:27:44PM +0200, Kirill A. Korinsky wrote:
> On Thu, 07 May 2026 18:25:32 +0200,
> Kirill A. Korinsky <kirill@korins.ky> wrote:
> >
> > On Wed, 08 Apr 2026 02:37:27 +0200,
> > "Kirill A. Korinsky" <kirill@korins.ky> wrote:
> > >
> > > On Tue, 07 Apr 2026 09:18:11 +0200,
> > > Helg <helg-openbsd@gmx.de> wrote:
> > > >
> > > > Are ffs and ufs so fundamentally different that ufs_readdir can't
> > > > also benefit from this optimisation?
> > > >
> > >
> > > Thanks for challenge it.
> > >
> > > I, actually implemented it and with UFS_BUFATOFF() it looks cleaner.
> > >
> > > Benchamrks, the base line:
> > >
> > > ~ $ time find /home/cvs | md5
> > > e9bcc64014befa712a396051d2696703
> > > 0m53.46s real 0m01.62s user 0m22.16s system
> > > ~ $ time find /home/cvs | md5
> > > e9bcc64014befa712a396051d2696703
> > > 0m06.03s real 0m00.65s user 0m05.52s system
> > > ~ $ time find /home/cvs | md5
> > > e9bcc64014befa712a396051d2696703
> > > 0m06.67s real 0m00.77s user 0m06.16s system
> > > ~ $
> > >
> > > new ufs_readdir()
> > >
> > > ~ $ time find /home/cvs | md5
> > > e9bcc64014befa712a396051d2696703
> > > 0m44.67s real 0m01.94s user 0m11.99s system
> > > ~ $ time find /home/cvs | md5
> > > e9bcc64014befa712a396051d2696703
> > > 0m02.29s real 0m00.55s user 0m01.81s system
> > > ~ $ time find /home/cvs | md5
> > > e9bcc64014befa712a396051d2696703
> > > 0m02.50s real 0m00.58s user 0m01.99s system
> > > ~ $
> > >
> > > ffs_readdir():
> > >
> > > ~ $ time find /home/cvs | md5
> > > e9bcc64014befa712a396051d2696703
> > > 0m45.20s real 0m02.13s user 0m12.84s system
> > > ~ $ time find /home/cvs | md5
> > > e9bcc64014befa712a396051d2696703
> > > 0m02.27s real 0m00.47s user 0m01.89s system
> > > ~ $ time find /home/cvs | md5
> > > e9bcc64014befa712a396051d2696703
> > > 0m02.27s real 0m00.53s user 0m01.85s system
> > > ~ $
> > >
> > > I think it is clear that ffs_readdir() is a wrong way and the right
> > > direction is ufs_readdir().
> > >
> >
> > I use that diff on few machines for last month. No regression observed.
> >
> > I think it worth to go in in that early stage of development that buys
> > plenty of time to backout if it brokes something.
> >
> > Thouhgs? Ok?
> >
>
> Anyone?
See below... just a few minor questions.
> Index: sys/ufs/ufs/ufs_vnops.c
> ===================================================================
> RCS file: /home/cvs/src/sys/ufs/ufs/ufs_vnops.c,v
> diff -u -p -r1.164 ufs_vnops.c
> --- sys/ufs/ufs/ufs_vnops.c 18 Oct 2024 05:52:33 -0000 1.164
> +++ sys/ufs/ufs/ufs_vnops.c 8 Apr 2026 00:04:04 -0000
> @@ -1323,18 +1323,22 @@ int
> ufs_readdir(void *v)
> {
> struct vop_readdir_args *ap = v;
> - struct uio auio, *uio = ap->a_uio;
> - struct iovec aiov;
> + struct uio *uio = ap->a_uio;
> + struct vnode *vp = ap->a_vp;
> + struct inode *ip = VTOI(vp);
> union {
> struct dirent dn;
> char __pad[roundup(sizeof(struct dirent), 8)];
> } u;
> off_t off = uio->uio_offset;
> + off_t bytesinfile;
> struct direct *dp;
> + struct buf *bp;
> char *edp;
> - caddr_t diskbuf;
> + char *dirbuf;
> size_t count, entries;
> - int bufsize, readcnt, error;
> + int done, error, readcnt, xfersize;
> + long size;
>
> if (uio->uio_rw != UIO_READ)
> return (EINVAL);
> @@ -1346,75 +1350,102 @@ ufs_readdir(void *v)
> if (count <= entries)
> return (EINVAL);
>
> - /*
> - * Convert and copy back the on-disk struct direct format to
> - * the user-space struct dirent format, one entry at a time
> - */
> -
> /* read from disk, stopping on a block boundary, max 64kB */
> readcnt = min(count, 64*1024) - entries;
>
> - auio = *uio;
> - auio.uio_iov = &aiov;
> - auio.uio_iovcnt = 1;
> - auio.uio_resid = readcnt;
> - auio.uio_segflg = UIO_SYSSPACE;
> - aiov.iov_len = readcnt;
> - bufsize = readcnt;
> - diskbuf = malloc(bufsize, M_TEMP, M_WAITOK);
> - aiov.iov_base = diskbuf;
> - error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred);
> - readcnt -= auio.uio_resid;
> - dp = (struct direct *)diskbuf;
> - edp = &diskbuf[readcnt];
> + off = uio->uio_offset;
> + done = 0;
> + error = 0;
> + bp = NULL;
>
> memset(&u, 0, sizeof(u));
>
> - /*
> - * While
> - * - we haven't failed to VOP_READ or uiomove()
> - * - there's space in the read buf for the head of an entry
> - * - that entry has a valid d_reclen, and
> - * - there's space for the *entire* entry
> - * then we're good to process this one.
> - */
> - while (error == 0 &&
> - (char *)dp + offsetof(struct direct, d_name) < edp &&
> - dp->d_reclen > offsetof(struct direct, d_name) &&
> - (char *)dp + dp->d_reclen <= edp) {
> - u.dn.d_reclen = roundup(dp->d_namlen+1 +
> - offsetof(struct dirent, d_name), 8);
> - if (u.dn.d_reclen > uio->uio_resid)
> + while (error == 0 && done == 0 && readcnt > 0) {
> + bp = NULL;
> + if ((bytesinfile = DIP(ip, size) - off) <= 0)
> break;
> - off += dp->d_reclen;
> - u.dn.d_off = off;
> - u.dn.d_fileno = dp->d_ino;
> - u.dn.d_type = dp->d_type;
> - u.dn.d_namlen = dp->d_namlen;
> - memcpy(u.dn.d_name, dp->d_name, u.dn.d_namlen);
> - memset(u.dn.d_name + u.dn.d_namlen, 0, u.dn.d_reclen
> - - u.dn.d_namlen - offsetof(struct dirent, d_name));
> -
> - if (memchr(u.dn.d_name, '/', u.dn.d_namlen) != NULL) {
> - error = EINVAL;
> + if ((error = UFS_BUFATOFF(ip, off, &dirbuf, &bp)) != 0)
> + break;
> + size = bp->b_bcount - bp->b_resid -
> + (dirbuf - (char *)bp->b_data);
Can you explain why you calculate the size like this?
Is there no other way?
> + if (size <= 0) {
> + brelse(bp);
> + bp = NULL;
No need to brelse() and do the bp = NULL here. If you break let the
if (bp != NULL)
brelse(bp);
at the end of the loop do its job.
> break;
> }
>
> - error = uiomove(&u.dn, u.dn.d_reclen, uio);
> - dp = (struct direct *)((char *)dp + dp->d_reclen);
> - }
> + xfersize = readcnt;
> + if (size < xfersize)
> + xfersize = size;
> + if (bytesinfile < xfersize)
> + xfersize = bytesinfile;
> +
> + /*
> + * Convert and copy back the on-disk struct direct format to
> + * the user-space struct dirent format, one entry at a time
> + */
> + dp = (struct direct *)dirbuf;
> + edp = dirbuf + xfersize;
> +
> + /*
> + * While
> + * - we haven't failed to read or uiomove()
> + * - there's space in the read buf for the head of an entry
> + * - that entry has a valid d_reclen, and
> + * - there's space for the *entire* entry
> + * then we're good to process this one.
> + */
> + while (error == 0 &&
> + (char *)dp + offsetof(struct direct, d_name) < edp &&
> + dp->d_reclen > offsetof(struct direct, d_name) &&
> + (char *)dp + dp->d_reclen <= edp) {
> + u.dn.d_reclen = roundup(dp->d_namlen + 1 +
> + offsetof(struct dirent, d_name), 8);
> + if (u.dn.d_reclen > uio->uio_resid) {
> + done = 1;
> + break;
> + }
> + off += dp->d_reclen;
> + u.dn.d_off = off;
> + u.dn.d_fileno = dp->d_ino;
> + u.dn.d_type = dp->d_type;
> + u.dn.d_namlen = dp->d_namlen;
> + memcpy(u.dn.d_name, dp->d_name, u.dn.d_namlen);
> + memset(u.dn.d_name + u.dn.d_namlen, 0,
> + u.dn.d_reclen - u.dn.d_namlen -
> + offsetof(struct dirent, d_name));
> +
> + if (memchr(u.dn.d_name, '/', u.dn.d_namlen) != NULL) {
> + error = EINVAL;
> + break;
> + }
>
> - /*
> - * If there was room for an entry in what we read but its
> - * d_reclen is bogus, fail
> - */
> - if ((char *)dp + offsetof(struct direct, d_name) < edp &&
> - dp->d_reclen <= offsetof(struct direct, d_name))
> - error = EIO;
> - free(diskbuf, M_TEMP, bufsize);
> + error = uiomove(&u.dn, u.dn.d_reclen, uio);
> + dp = (struct direct *)((char *)dp + dp->d_reclen);
> + }
> +
> + /*
> + * If there was room for an entry in what we read but its
> + * d_reclen is bogus, fail
> + */
> + if (error == 0 &&
> + (char *)dp + offsetof(struct direct, d_name) < edp &&
> + dp->d_reclen <= offsetof(struct direct, d_name))
> + error = EIO;
Why did you add the error == 0 case here?
In the old code it would do this check in all cases and it should not
matter. Now it can clobber the EINVAL return error.
> +
> + brelse(bp);
> + bp = NULL;
> + readcnt -= xfersize;
> + }
> + if (bp != NULL)
> + brelse(bp);
>
> uio->uio_offset = off;
> - *ap->a_eofflag = DIP(VTOI(ap->a_vp), size) <= off;
> + *ap->a_eofflag = DIP(ip, size) <= off;
> +
> + if (!(vp->v_mount->mnt_flag & MNT_NOATIME) ||
> + (ip->i_flag & (IN_CHANGE | IN_UPDATE)))
> + ip->i_flag |= IN_ACCESS;
>
> return (error);
> }
>
>
> --
> wbr, Kirill
>
--
:wq Claudio
ffs_readdir