From: Claudio Jeker Subject: Re: ffs_readdir To: helg-openbsd@gmx.de, tech@openbsd.org Date: Wed, 20 May 2026 15:22:03 +0200 On Sat, May 16, 2026 at 09:27:44PM +0200, Kirill A. Korinsky wrote: > On Thu, 07 May 2026 18:25:32 +0200, > Kirill A. Korinsky wrote: > > > > On Wed, 08 Apr 2026 02:37:27 +0200, > > "Kirill A. Korinsky" wrote: > > > > > > On Tue, 07 Apr 2026 09:18:11 +0200, > > > Helg wrote: > > > > > > > > Are ffs and ufs so fundamentally different that ufs_readdir can't > > > > also benefit from this optimisation? > > > > > > > > > > Thanks for challenge it. > > > > > > I, actually implemented it and with UFS_BUFATOFF() it looks cleaner. > > > > > > Benchamrks, the base line: > > > > > > ~ $ time find /home/cvs | md5 > > > e9bcc64014befa712a396051d2696703 > > > 0m53.46s real 0m01.62s user 0m22.16s system > > > ~ $ time find /home/cvs | md5 > > > e9bcc64014befa712a396051d2696703 > > > 0m06.03s real 0m00.65s user 0m05.52s system > > > ~ $ time find /home/cvs | md5 > > > e9bcc64014befa712a396051d2696703 > > > 0m06.67s real 0m00.77s user 0m06.16s system > > > ~ $ > > > > > > new ufs_readdir() > > > > > > ~ $ time find /home/cvs | md5 > > > e9bcc64014befa712a396051d2696703 > > > 0m44.67s real 0m01.94s user 0m11.99s system > > > ~ $ time find /home/cvs | md5 > > > e9bcc64014befa712a396051d2696703 > > > 0m02.29s real 0m00.55s user 0m01.81s system > > > ~ $ time find /home/cvs | md5 > > > e9bcc64014befa712a396051d2696703 > > > 0m02.50s real 0m00.58s user 0m01.99s system > > > ~ $ > > > > > > ffs_readdir(): > > > > > > ~ $ time find /home/cvs | md5 > > > e9bcc64014befa712a396051d2696703 > > > 0m45.20s real 0m02.13s user 0m12.84s system > > > ~ $ time find /home/cvs | md5 > > > e9bcc64014befa712a396051d2696703 > > > 0m02.27s real 0m00.47s user 0m01.89s system > > > ~ $ time find /home/cvs | md5 > > > e9bcc64014befa712a396051d2696703 > > > 0m02.27s real 0m00.53s user 0m01.85s system > > > ~ $ > > > > > > I think it is clear that ffs_readdir() is a wrong way and the right > > > direction is ufs_readdir(). > > > > > > > I use that diff on few machines for last month. No regression observed. > > > > I think it worth to go in in that early stage of development that buys > > plenty of time to backout if it brokes something. > > > > Thouhgs? Ok? > > > > Anyone? See below... just a few minor questions. > Index: sys/ufs/ufs/ufs_vnops.c > =================================================================== > RCS file: /home/cvs/src/sys/ufs/ufs/ufs_vnops.c,v > diff -u -p -r1.164 ufs_vnops.c > --- sys/ufs/ufs/ufs_vnops.c 18 Oct 2024 05:52:33 -0000 1.164 > +++ sys/ufs/ufs/ufs_vnops.c 8 Apr 2026 00:04:04 -0000 > @@ -1323,18 +1323,22 @@ int > ufs_readdir(void *v) > { > struct vop_readdir_args *ap = v; > - struct uio auio, *uio = ap->a_uio; > - struct iovec aiov; > + struct uio *uio = ap->a_uio; > + struct vnode *vp = ap->a_vp; > + struct inode *ip = VTOI(vp); > union { > struct dirent dn; > char __pad[roundup(sizeof(struct dirent), 8)]; > } u; > off_t off = uio->uio_offset; > + off_t bytesinfile; > struct direct *dp; > + struct buf *bp; > char *edp; > - caddr_t diskbuf; > + char *dirbuf; > size_t count, entries; > - int bufsize, readcnt, error; > + int done, error, readcnt, xfersize; > + long size; > > if (uio->uio_rw != UIO_READ) > return (EINVAL); > @@ -1346,75 +1350,102 @@ ufs_readdir(void *v) > if (count <= entries) > return (EINVAL); > > - /* > - * Convert and copy back the on-disk struct direct format to > - * the user-space struct dirent format, one entry at a time > - */ > - > /* read from disk, stopping on a block boundary, max 64kB */ > readcnt = min(count, 64*1024) - entries; > > - auio = *uio; > - auio.uio_iov = &aiov; > - auio.uio_iovcnt = 1; > - auio.uio_resid = readcnt; > - auio.uio_segflg = UIO_SYSSPACE; > - aiov.iov_len = readcnt; > - bufsize = readcnt; > - diskbuf = malloc(bufsize, M_TEMP, M_WAITOK); > - aiov.iov_base = diskbuf; > - error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); > - readcnt -= auio.uio_resid; > - dp = (struct direct *)diskbuf; > - edp = &diskbuf[readcnt]; > + off = uio->uio_offset; > + done = 0; > + error = 0; > + bp = NULL; > > memset(&u, 0, sizeof(u)); > > - /* > - * While > - * - we haven't failed to VOP_READ or uiomove() > - * - there's space in the read buf for the head of an entry > - * - that entry has a valid d_reclen, and > - * - there's space for the *entire* entry > - * then we're good to process this one. > - */ > - while (error == 0 && > - (char *)dp + offsetof(struct direct, d_name) < edp && > - dp->d_reclen > offsetof(struct direct, d_name) && > - (char *)dp + dp->d_reclen <= edp) { > - u.dn.d_reclen = roundup(dp->d_namlen+1 + > - offsetof(struct dirent, d_name), 8); > - if (u.dn.d_reclen > uio->uio_resid) > + while (error == 0 && done == 0 && readcnt > 0) { > + bp = NULL; > + if ((bytesinfile = DIP(ip, size) - off) <= 0) > break; > - off += dp->d_reclen; > - u.dn.d_off = off; > - u.dn.d_fileno = dp->d_ino; > - u.dn.d_type = dp->d_type; > - u.dn.d_namlen = dp->d_namlen; > - memcpy(u.dn.d_name, dp->d_name, u.dn.d_namlen); > - memset(u.dn.d_name + u.dn.d_namlen, 0, u.dn.d_reclen > - - u.dn.d_namlen - offsetof(struct dirent, d_name)); > - > - if (memchr(u.dn.d_name, '/', u.dn.d_namlen) != NULL) { > - error = EINVAL; > + if ((error = UFS_BUFATOFF(ip, off, &dirbuf, &bp)) != 0) > + break; > + size = bp->b_bcount - bp->b_resid - > + (dirbuf - (char *)bp->b_data); Can you explain why you calculate the size like this? Is there no other way? > + if (size <= 0) { > + brelse(bp); > + bp = NULL; No need to brelse() and do the bp = NULL here. If you break let the if (bp != NULL) brelse(bp); at the end of the loop do its job. > break; > } > > - error = uiomove(&u.dn, u.dn.d_reclen, uio); > - dp = (struct direct *)((char *)dp + dp->d_reclen); > - } > + xfersize = readcnt; > + if (size < xfersize) > + xfersize = size; > + if (bytesinfile < xfersize) > + xfersize = bytesinfile; > + > + /* > + * Convert and copy back the on-disk struct direct format to > + * the user-space struct dirent format, one entry at a time > + */ > + dp = (struct direct *)dirbuf; > + edp = dirbuf + xfersize; > + > + /* > + * While > + * - we haven't failed to read or uiomove() > + * - there's space in the read buf for the head of an entry > + * - that entry has a valid d_reclen, and > + * - there's space for the *entire* entry > + * then we're good to process this one. > + */ > + while (error == 0 && > + (char *)dp + offsetof(struct direct, d_name) < edp && > + dp->d_reclen > offsetof(struct direct, d_name) && > + (char *)dp + dp->d_reclen <= edp) { > + u.dn.d_reclen = roundup(dp->d_namlen + 1 + > + offsetof(struct dirent, d_name), 8); > + if (u.dn.d_reclen > uio->uio_resid) { > + done = 1; > + break; > + } > + off += dp->d_reclen; > + u.dn.d_off = off; > + u.dn.d_fileno = dp->d_ino; > + u.dn.d_type = dp->d_type; > + u.dn.d_namlen = dp->d_namlen; > + memcpy(u.dn.d_name, dp->d_name, u.dn.d_namlen); > + memset(u.dn.d_name + u.dn.d_namlen, 0, > + u.dn.d_reclen - u.dn.d_namlen - > + offsetof(struct dirent, d_name)); > + > + if (memchr(u.dn.d_name, '/', u.dn.d_namlen) != NULL) { > + error = EINVAL; > + break; > + } > > - /* > - * If there was room for an entry in what we read but its > - * d_reclen is bogus, fail > - */ > - if ((char *)dp + offsetof(struct direct, d_name) < edp && > - dp->d_reclen <= offsetof(struct direct, d_name)) > - error = EIO; > - free(diskbuf, M_TEMP, bufsize); > + error = uiomove(&u.dn, u.dn.d_reclen, uio); > + dp = (struct direct *)((char *)dp + dp->d_reclen); > + } > + > + /* > + * If there was room for an entry in what we read but its > + * d_reclen is bogus, fail > + */ > + if (error == 0 && > + (char *)dp + offsetof(struct direct, d_name) < edp && > + dp->d_reclen <= offsetof(struct direct, d_name)) > + error = EIO; Why did you add the error == 0 case here? In the old code it would do this check in all cases and it should not matter. Now it can clobber the EINVAL return error. > + > + brelse(bp); > + bp = NULL; > + readcnt -= xfersize; > + } > + if (bp != NULL) > + brelse(bp); > > uio->uio_offset = off; > - *ap->a_eofflag = DIP(VTOI(ap->a_vp), size) <= off; > + *ap->a_eofflag = DIP(ip, size) <= off; > + > + if (!(vp->v_mount->mnt_flag & MNT_NOATIME) || > + (ip->i_flag & (IN_CHANGE | IN_UPDATE))) > + ip->i_flag |= IN_ACCESS; > > return (error); > } > > > -- > wbr, Kirill > -- :wq Claudio