Download raw body.
[patch] ext4fs rw
Hi tech@,
Here is a patch to attach ext4fs drives to OpenBSD with full ext4
support (compatible with recent Linux).
Performance is 610MB/s read/write vs 830MB/s for FFS2 on an NVMe.
No journalling. Recovery at mount time not tested.
All tests pass e2fstools / e2fsck without trouble. Please test and
reply without too many flames, as I'm rather new to the kernel side of
development.
I hope this helps,
Cheers,
diff --git a/sbin/mount_ext4fs/Makefile b/sbin/mount_ext4fs/Makefile
new file mode 100644
index 000000000..939f3e8a5
--- /dev/null
+++ b/sbin/mount_ext4fs/Makefile
@@ -0,0 +1,11 @@
+# $OpenBSD$
+
+PROG= mount_ext4fs
+SRCS= mount_ext4fs.c getmntopts.c
+MAN= mount_ext4fs.8
+
+MOUNT= ${.CURDIR}/../mount
+CFLAGS+= -I${MOUNT}
+.PATH: ${MOUNT}
+
+.include <bsd.prog.mk>
diff --git a/sbin/mount_ext4fs/mount_ext4fs.8 b/sbin/mount_ext4fs/mount_ext4fs.8
new file mode 100644
index 000000000..95aef0db2
--- /dev/null
+++ b/sbin/mount_ext4fs/mount_ext4fs.8
@@ -0,0 +1,88 @@
+.\" $OpenBSD: $
+.\"
+.\" Copyright (c) 1993, 1994
+.\" The Regents of the University of California. All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. Neither the name of the University nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software
+.\" without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\"
+.Dd $Mdocdate: April 23 2020 $
+.Dt MOUNT_EXT4FS 8
+.Os
+.Sh NAME
+.Nm mount_ext4fs
+.Nd mount an ext4fs file system
+.Sh SYNOPSIS
+.Nm mount_ext4fs
+.Op Fl o Ar options
+.Ar special
+.Ar node
+.Sh DESCRIPTION
+The
+.Nm
+command attaches an ext4fs file system
+.Ar special
+device on to the file system tree at the point
+.Ar node .
+This command is invoked by
+.Xr mount 8
+when using the syntax
+.Bd -ragged -offset 4n
+.Nm mount Op options
+-t ext4fs
+.Ar special node
+.Ed
+.Pp
+The
+.Ar special
+device must correspond to a partition registered in the
+.Xr disklabel 5 .
+.Pp
+This command is normally executed by
+.Xr mount 8
+at boot time.
+.Pp
+The options are as follows:
+.Bl -tag -width Ds
+.It Fl o Ar options
+Options are specified with a
+.Fl o
+flag followed by a comma separated string of options.
+See the
+.Xr mount 8
+man page for possible options and their meanings.
+.El
+.Sh SEE ALSO
+.Xr mount 2 ,
+.Xr disklabel 5 ,
+.Xr fstab 5 ,
+.Xr disklabel 8 ,
+.Xr mount 8 ,
+.Xr umount 8
+.Sh HISTORY
+The
+.Nm
+utility first appeared in
+.Fx 2.2 .
diff --git a/sbin/mount_ext4fs/mount_ext4fs.c b/sbin/mount_ext4fs/mount_ext4fs.c
new file mode 100644
index 000000000..1148a9cab
--- /dev/null
+++ b/sbin/mount_ext4fs/mount_ext4fs.c
@@ -0,0 +1,113 @@
+/* $OpenBSD: $ */
+/* $NetBSD: mount_ffs.c,v 1.3 1996/04/13 01:31:19 jtc Exp $ */
+
+/*-
+ * Copyright (c) 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/types.h>
+#include <sys/mount.h>
+
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "mntopts.h"
+
+void ext4fs_usage(void);
+
+static const struct mntopt mopts[] = { /* option table handed to getmntopts() for -o */
+ MOPT_STDOPTS,
+ MOPT_UPDATE,
+ { NULL } /* presumably the end-of-table marker; confirm in mntopts.h */
+};
+
+/* Parse -o options, then mount the ext4fs device argv[0] on node argv[1]. */
+int
+main(int argc, char *argv[])
+{
+	struct ufs_args args;		/* XXX ffs_args */
+	int ch, mntflags = 0;
+	char fs_name[PATH_MAX], *errcause;
+
+	optind = optreset = 1;		/* Reset for parse of new argv. */
+	while ((ch = getopt(argc, argv, "o:")) != -1)
+		switch (ch) {
+		case 'o':
+			getmntopts(optarg, mopts, &mntflags);
+			break;
+		default:
+			ext4fs_usage();
+		}
+	argc -= optind;
+	argv += optind;
+
+	if (argc != 2)
+		ext4fs_usage();
+
+	/* Zero every field, so no stack garbage is handed to mount(2). */
+	memset(&args, 0, sizeof(args));
+	args.fspec = argv[0];		/* The name of the device file. */
+	if (realpath(argv[1], fs_name) == NULL)	/* The mount point. */
+		err(1, "realpath %s", argv[1]);
+
+#define DEFAULT_ROOTUID	-2
+	args.export_info.ex_root = DEFAULT_ROOTUID;
+	/* Mirror a read-only mount in the export flags. */
+	args.export_info.ex_flags =
+	    (mntflags & MNT_RDONLY) ? MNT_EXRDONLY : 0;
+	if (mount(MOUNT_EXT4FS, fs_name, mntflags, &args) == -1) {
+		switch (errno) {
+		case EMFILE:
+			errcause = "mount table full";
+			break;
+		case EINVAL:
+			errcause =
+			    "specified device does not match mounted device";
+			break;
+		case EOPNOTSUPP:
+			errcause = "filesystem not supported by kernel";
+			break;
+		default:
+			errcause = strerror(errno);
+			break;
+		}
+		errx(1, "%s on %s: %s", args.fspec, fs_name, errcause);
+	}
+	exit(0);
+}
+
+/* Print the command synopsis on stderr and exit with status 1. */
+void
+ext4fs_usage(void)
+{
+	fputs("usage: mount_ext4fs [-o options] special node\n", stderr);
+	exit(1);
+}
diff --git a/sys/conf/GENERIC b/sys/conf/GENERIC
index f9f615912..86e3e597c 100644
--- a/sys/conf/GENERIC
+++ b/sys/conf/GENERIC
@@ -32,6 +32,7 @@ option FFS2 # UFS2
option UFS_DIRHASH # hash large directories
option QUOTA # UFS quotas
option EXT2FS # Second Extended Filesystem
+option EXT4FS # Fourth Extended Filesystem
option MFS # memory file system
option NFSCLIENT # Network File System client
option NFSSERVER # Network File System server
diff --git a/sys/conf/files b/sys/conf/files
index e5e66dca6..873653e75 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -994,15 +994,15 @@ file ufs/ffs/ffs_vfsops.c ffs | mfs
file ufs/ffs/ffs_vnops.c ffs | mfs
file ufs/mfs/mfs_vfsops.c mfs
file ufs/mfs/mfs_vnops.c mfs
-file ufs/ufs/ufs_bmap.c ffs | mfs | ext2fs
+file ufs/ufs/ufs_bmap.c ffs | mfs | ext2fs | ext4fs
file ufs/ufs/ufs_dirhash.c ufs_dirhash & (ffs | mfs)
-file ufs/ufs/ufs_ihash.c ffs | mfs | ext2fs | fuse
-file ufs/ufs/ufs_inode.c ffs | mfs | ext2fs
-file ufs/ufs/ufs_lookup.c ffs | mfs | ext2fs
-file ufs/ufs/ufs_quota.c quota & ( ffs | mfs | ext2fs )
+file ufs/ufs/ufs_ihash.c ffs | mfs | ext2fs | ext4fs | fuse
+file ufs/ufs/ufs_inode.c ffs | mfs | ext2fs | ext4fs
+file ufs/ufs/ufs_lookup.c ffs | mfs | ext2fs | ext4fs
+file ufs/ufs/ufs_quota.c quota & ( ffs | mfs | ext2fs | ext4fs )
file ufs/ufs/ufs_quota_stub.c ffs | mfs
-file ufs/ufs/ufs_vfsops.c ffs | mfs | ext2fs
-file ufs/ufs/ufs_vnops.c ffs | mfs | ext2fs
+file ufs/ufs/ufs_vfsops.c ffs | mfs | ext2fs | ext4fs
+file ufs/ufs/ufs_vnops.c ffs | mfs | ext2fs | ext4fs
file ufs/ext2fs/ext2fs_alloc.c ext2fs
file ufs/ext2fs/ext2fs_balloc.c ext2fs
file ufs/ext2fs/ext2fs_bmap.c ext2fs
@@ -1014,6 +1014,10 @@ file ufs/ext2fs/ext2fs_readwrite.c ext2fs
file ufs/ext2fs/ext2fs_subr.c ext2fs
file ufs/ext2fs/ext2fs_vfsops.c ext2fs
file ufs/ext2fs/ext2fs_vnops.c ext2fs
+file ufs/ext4fs/ext4fs_crc32c.c ext4fs
+file ufs/ext4fs/ext4fs_journal.c ext4fs
+file ufs/ext4fs/ext4fs_vfsops.c ext4fs
+file ufs/ext4fs/ext4fs_vnops.c ext4fs
file uvm/uvm_addr.c
file uvm/uvm_amap.c
file uvm/uvm_anon.c
diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c
index 8d31047ac..4045a0bd5 100644
--- a/sys/kern/vfs_init.c
+++ b/sys/kern/vfs_init.c
@@ -100,6 +100,11 @@ static struct vfsconf vfsconflist[] = {
{ &tmpfs_vfsops, MOUNT_TMPFS, 19, 0, MNT_LOCAL,
sizeof(struct tmpfs_args) },
#endif
+
+#ifdef EXT4FS
+ { &ext4fs_vfsops, MOUNT_EXT4FS, 22, 0, MNT_LOCAL,
+ sizeof(struct ufs_args) },
+#endif
};
diff --git a/sys/sys/disklabel.h b/sys/sys/disklabel.h
index 3d938666e..86a0fb267 100644
--- a/sys/sys/disklabel.h
+++ b/sys/sys/disklabel.h
@@ -275,6 +275,7 @@ static const char * const dktypenames[] = {
#define FS_RAID 19 /* RAIDframe or softraid */
#define FS_NTFS 20 /* Windows/NT file system */
#define FS_UDF 21 /* UDF (DVD) filesystem */
+#define FS_EXT4FS 22 /* ext4fs */
#ifdef DKTYPENAMES
static const char * const fstypenames[] = {
@@ -300,6 +301,7 @@ static const char * const fstypenames[] = {
"RAID",
"NTFS",
"UDF",
+ "ext4fs",
NULL
};
@@ -327,6 +329,7 @@ static char *fstypesnames[] = {
"", /* 19 */
"ntfs", /* 20 */
"udf", /* 21 */
+ "ext4fs", /* 22 */
NULL
};
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index a0010c55f..9cbe4607e 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -320,6 +320,7 @@ struct statfs {
#define MOUNT_AFS "afs" /* Andrew Filesystem */
#define MOUNT_CD9660 "cd9660" /* ISO9660 (aka CDROM) Filesystem */
#define MOUNT_EXT2FS "ext2fs" /* Second Extended Filesystem */
+#define MOUNT_EXT4FS "ext4fs" /* Fourth Extended Filesystem */
#define MOUNT_NCPFS "ncpfs" /* NetWare Network File System */
#define MOUNT_NTFS "ntfs" /* NTFS */
#define MOUNT_UDF "udf" /* UDF */
@@ -556,6 +557,7 @@ extern const struct vfsops msdosfs_vfsops;
extern const struct vfsops nfs_vfsops;
extern const struct vfsops cd9660_vfsops;
extern const struct vfsops ext2fs_vfsops;
+extern const struct vfsops ext4fs_vfsops;
extern const struct vfsops ntfs_vfsops;
extern const struct vfsops udf_vfsops;
extern const struct vfsops fusefs_vfsops;
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 6f1664ce1..c9970f6a9 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -67,12 +67,14 @@ enum vtagtype {
VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_MSDOSFS,
VT_PORTAL, VT_PROCFS, VT_AFS, VT_ISOFS, VT_ADOSFS,
VT_EXT2FS, VT_VFS, VT_NTFS, VT_UDF, VT_FUSEFS, VT_TMPFS,
+ VT_EXT4FS,
};
#define VTAG_NAMES \
"NON", "UFS", "NFS", "MFS", "MSDOSFS", \
"unused", "unused", "unused", "ISOFS", "unused", \
- "EXT2FS", "VFS", "NTFS", "UDF", "FUSEFS", "TMPFS"
+ "EXT2FS", "VFS", "NTFS", "UDF", "FUSEFS", "TMPFS", \
+ "EXT4FS"
/*
* Each underlying filesystem allocates its own private area and hangs
diff --git a/sys/ufs/ext4fs/ext4fs.h b/sys/ufs/ext4fs/ext4fs.h
new file mode 100644
index 000000000..e95191eff
--- /dev/null
+++ b/sys/ufs/ext4fs/ext4fs.h
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2025 kmx.io.
+ * Copyright (c) 1997 Manuel Bouyer.
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Modified for ext4fs by kmx.io.
+ */
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/ucred.h>
+
+#include <ufs/ufs/dinode.h>
+#include <ufs/ext4fs/ext4fs_crc32c.h>
+
+struct fid;
+struct inode;
+struct nameidata;
+struct statfs;
+struct vfsconf;
+
+#define EXT4FS_EXTENT_DEPTH_MAX 5
+#define EXT4FS_FUNCTION_MAX 32
+#define EXT4FS_REV_EXT2 0
+#define EXT4FS_REV_DYNAMIC 1
+#define EXT4FS_REV_MINOR 0
+#define EXT4FS_LAST_MOUNTED_MAX 64
+#define EXT4FS_LOG_MIN_BLOCK_SIZE 10
+#define EXT4FS_MAGIC 0xEF53
+#define EXT4FS_MOUNT_OPTS_MAX 64
+#define EXT4FS_LINK_MAX 65000
+#define EXT4FS_MAXNAMLEN 255
+#define EXT4FS_SUPER_BLOCK_OFFSET 1024
+#define EXT4FS_SUPER_BLOCK_SIZE 1024
+#define EXT4FS_VOLUME_NAME_MAX 16
+
+#define EXT4FS_DIRECT_ADDR_IN_INODE 12
+#define EXT4FS_INDIRECT_ADDR_IN_INODE 3
+#define EXT4FS_SYMLINK_LEN_MAX \
+ ((EXT4FS_DIRECT_ADDR_IN_INODE + \
+ EXT4FS_INDIRECT_ADDR_IN_INODE) * sizeof(u_int32_t))
+
+#define EXT4FS_NINDIR(fs) ((fs)->m_block_size / sizeof(u_int32_t))
+
+#define EXT4FS_LBLKNO(fs, offset) ((offset) >> (fs)->m_block_size_shift)
+#define EXT4FS_BLKOFF(fs, offset) ((offset) & ((fs)->m_block_size - 1))
+#define EXT4FS_FSBTODB(fs, b) ((b) << (fs)->m_fs_block_to_disk_block)
+
+#define EXT4FS_CHECKSUM_TYPE_NONE 0x0000
+#define EXT4FS_CHECKSUM_TYPE_CRC32C 0x0001
+
+#define EXT4FS_ENCODING_NONE 0x0000 // legacy behavior
+#define EXT4FS_ENCODING_UTF8 0x0001 // UTF-8, Unicode 12.1.0
+
+#define EXT4FS_ENCODING_FLAG_NONE 0x0000
+#define EXT4FS_ENCODING_FLAG_STRICT_MODE 0x0001 // Reject invalid encoding
+
+#define EXT4FS_ERRORS_CONTINUE 1 // Log and keep going
+#define EXT4FS_ERRORS_RO 2 // Remount read-only
+#define EXT4FS_ERRORS_PANIC 3 // Kernel panic
+
+#define EXT4FS_FEATURE_COMPAT_DIR_PREALLOC 0x0001
+#define EXT4FS_FEATURE_COMPAT_IMAGIC_INODES 0x0002
+#define EXT4FS_FEATURE_COMPAT_HAS_JOURNAL 0x0004
+#define EXT4FS_FEATURE_COMPAT_EXT_ATTR 0x0008
+#define EXT4FS_FEATURE_COMPAT_RESIZE_INODE 0x0010
+#define EXT4FS_FEATURE_COMPAT_DIR_INDEX 0x0020
+
+#define EXT4FS_FEATURE_INCOMPAT_COMPRESSION 0x00001
+#define EXT4FS_FEATURE_INCOMPAT_FILETYPE 0x00002
+#define EXT4FS_FEATURE_INCOMPAT_RECOVER 0x00004
+#define EXT4FS_FEATURE_INCOMPAT_JOURNAL_DEV 0x00008
+#define EXT4FS_FEATURE_INCOMPAT_META_BG 0x00010
+#define EXT4FS_FEATURE_INCOMPAT_EXTENTS 0x00040
+#define EXT4FS_FEATURE_INCOMPAT_64BIT 0x00080
+#define EXT4FS_FEATURE_INCOMPAT_MMP 0x00100
+#define EXT4FS_FEATURE_INCOMPAT_FLEX_BG 0x00200
+#define EXT4FS_FEATURE_INCOMPAT_EA_INODE 0x00400
+#define EXT4FS_FEATURE_INCOMPAT_DIRDATA 0x01000
+#define EXT4FS_FEATURE_INCOMPAT_CSUM_SEED 0x02000
+#define EXT4FS_FEATURE_INCOMPAT_LARGEDIR 0x04000
+#define EXT4FS_FEATURE_INCOMPAT_INLINE_DATA 0x08000
+#define EXT4FS_FEATURE_INCOMPAT_ENCRYPT 0x10000
+
+#define EXT4FS_FEATURE_INCOMPAT_SUPPORTED /* incompat bits treated as supported; presumably checked at mount time */ \
+ (EXT4FS_FEATURE_INCOMPAT_FILETYPE | \
+ EXT4FS_FEATURE_INCOMPAT_RECOVER | /* NOTE(review): ext4 sets this when the journal needs replay; confirm accepting it without replay is intended */ \
+ EXT4FS_FEATURE_INCOMPAT_EXTENTS | \
+ EXT4FS_FEATURE_INCOMPAT_64BIT | \
+ EXT4FS_FEATURE_INCOMPAT_FLEX_BG | \
+ EXT4FS_FEATURE_INCOMPAT_CSUM_SEED)
+
+#define EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
+#define EXT4FS_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
+#define EXT4FS_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
+#define EXT4FS_FEATURE_RO_COMPAT_HUGE_FILE 0x0008
+#define EXT4FS_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
+#define EXT4FS_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
+#define EXT4FS_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
+#define EXT4FS_FEATURE_RO_COMPAT_HAS_SNAPSHOT 0x0080
+#define EXT4FS_FEATURE_RO_COMPAT_QUOTA 0x0100
+#define EXT4FS_FEATURE_RO_COMPAT_BIGALLOC 0x0200
+#define EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
+#define EXT4FS_FEATURE_RO_COMPAT_REPLICA 0x0800
+#define EXT4FS_FEATURE_RO_COMPAT_READONLY 0x1000
+#define EXT4FS_FEATURE_RO_COMPAT_PROJECT 0x2000
+
+#define EXT4FS_FEATURE_RO_COMPAT_SUPPORTED \
+ (EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER | \
+ EXT4FS_FEATURE_RO_COMPAT_LARGE_FILE | \
+ EXT4FS_FEATURE_RO_COMPAT_HUGE_FILE | \
+ EXT4FS_FEATURE_RO_COMPAT_DIR_NLINK | \
+ EXT4FS_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
+ EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM)
+
+#define EXT4FS_FLAG_SIGNED_HASH 0x0001
+#define EXT4FS_FLAG_UNSIGNED_HASH 0x0002
+#define EXT4FS_FLAG_TEST_FILESYS 0x0004
+#define EXT4FS_FLAG_64BIT 0x0008
+#define EXT4FS_FLAG_MOUNT_OPT_CHECK 0x0010
+
+#define EXT4FS_INODE_BAD_BLOCKS 1
+#define EXT4FS_INODE_ROOT_DIR 2
+#define EXT4FS_INODE_USER_QUOTA 3
+#define EXT4FS_INODE_GROUP_QUOTA 4
+#define EXT4FS_INODE_BOOT_LOADER 5
+#define EXT4FS_INODE_JOURNAL 8
+#define EXT4FS_INODE_FIRST 11
+
+#define EXTFS_INODE_FLAG_SECURE_RM 0x00000001 // NOTE(review): this group is prefixed EXTFS_, every other constant in this header EXT4FS_ - confirm intended
+#define EXTFS_INODE_FLAG_UN_RM 0x00000002
+#define EXTFS_INODE_FLAG_COMPRESSION 0x00000004
+#define EXTFS_INODE_FLAG_SYNC 0x00000008
+#define EXTFS_INODE_FLAG_IMMUTABLE 0x00000010
+#define EXTFS_INODE_FLAG_APPEND 0x00000020
+#define EXTFS_INODE_FLAG_NO_DUMP 0x00000040
+#define EXTFS_INODE_FLAG_NO_ATIME 0x00000080
+#define EXTFS_INODE_FLAG_DIRTY 0x00000100
+#define EXTFS_INODE_FLAG_COMPRESSED_BLOCKS 0x00000200
+#define EXTFS_INODE_FLAG_NO_COMPRESSION 0x00000400
+#define EXTFS_INODE_FLAG_ENCRYPTED 0x00000800
+#define EXTFS_INODE_FLAG_INDEX 0x00001000
+#define EXTFS_INODE_FLAG_IMAGIC 0x00002000
+#define EXTFS_INODE_FLAG_JOURNAL_DATA 0x00004000
+#define EXTFS_INODE_FLAG_NO_TAIL 0x00008000
+#define EXTFS_INODE_FLAG_DIR_SYNC 0x00010000
+#define EXTFS_INODE_FLAG_TOP_DIR 0x00020000
+#define EXTFS_INODE_FLAG_HUGE_FILE 0x00040000
+#define EXTFS_INODE_FLAG_EXTENTS 0x00080000
+#define EXTFS_INODE_FLAG_EXTENDED_ATTRIBUTES_INODE 0x00200000
+#define EXTFS_INODE_FLAG_EOF_BLOCKS 0x00400000
+#define EXTFS_INODE_FLAG_INLINE_DATA 0x10000000
+#define EXTFS_INODE_FLAG_PROJECT_ID_INHERITANCE 0x20000000
+#define EXTFS_INODE_FLAG_CASEFOLD 0x40000000
+
+#define EXT4FS_MOUNT_READONLY 0x0001
+#define EXT4FS_MOUNT_NO_ATIME 0x0002
+#define EXT4FS_MOUNT_DIRSYNC 0x0004
+#define EXT4FS_MOUNT_DATA_JOURNAL 0x0008
+#define EXT4FS_MOUNT_DATA_ORDERED 0x0010
+#define EXT4FS_MOUNT_DATA_WRITEBACK 0x0020
+#define EXT4FS_MOUNT_ERRORS_CONTINUE 0x0040
+#define EXT4FS_MOUNT_ERRORS_REMOUNT_RO 0x0080
+#define EXT4FS_MOUNT_ERRORS_PANIC 0x0100
+#define EXT4FS_MOUNT_DISCARD 0x0200
+#define EXT4FS_MOUNT_NO_BUFFER_HEADS 0x0400
+#define EXT4FS_MOUNT_SKIP_JOURNAL 0x0800
+#define EXT4FS_MOUNT_NOAUTO_DELAYED_ALLOC 0x1000
+
+#define EXT4FS_OS_LINUX 0
+#define EXT4FS_OS_HURD 1
+#define EXT4FS_OS_MASIX 2
+#define EXT4FS_OS_FREEBSD 3
+#define EXT4FS_OS_LITES 4
+#define EXT4FS_OS_OPENBSD 5
+
+#define EXT4FS_STATE_VALID 0x0001 // Clean unmount
+#define EXT4FS_STATE_ERROR 0x0002 // Errors detected (fsck needed)
+
+#define EXT4FS_BGD_FLAG_INODE_UNINIT 0x0001
+#define EXT4FS_BGD_FLAG_BLOCK_UNINIT 0x0002
+#define EXT4FS_BGD_FLAG_INODE_ZEROED 0x0004
+#define EXT4FS_BGD_FLAG_DIRTY 0x0008
+#define EXT4FS_BGD_FLAG_BLOCK_ZEROED 0x0010
+#define EXT4FS_BGD_FLAG_READ_ONLY 0x0020
+
+struct ext4fs { // on-disk superblock image (packed); the declared fields add up to 0x400 bytes
+ u_int32_t sb_inodes_count;
+ u_int32_t sb_blocks_count_lo;
+ u_int32_t sb_reserved_blocks_count_lo;
+ u_int32_t sb_free_blocks_count_lo;
+ // 0x10
+ u_int32_t sb_free_inodes_count;
+ u_int32_t sb_first_data_block;
+ u_int32_t sb_log_block_size; // log2(block size) - 10
+ u_int32_t sb_log_cluster_size; // log2(cluster size) - 10
+ // 0x20
+ u_int32_t sb_blocks_per_group;
+ u_int32_t sb_clusters_per_group;
+ u_int32_t sb_inodes_per_group;
+ u_int32_t sb_mount_time_lo;
+ // 0x30
+ u_int32_t sb_write_time_lo;
+ u_int16_t sb_mount_count;
+ int16_t sb_max_mount_count_before_fsck;
+ u_int16_t sb_magic;
+ u_int16_t sb_state; // EXT4FS_STATE_*
+ u_int16_t sb_errors; // EXT4FS_ERRORS_*
+ u_int16_t sb_revision_level_minor;
+ // 0x40
+ u_int32_t sb_check_time_lo;
+ u_int32_t sb_check_interval;
+ u_int32_t sb_creator_os; // EXT4FS_OS_*
+ u_int32_t sb_revision_level;
+ // 0x50
+ u_int16_t sb_default_reserved_uid;
+ u_int16_t sb_default_reserved_gid;
+ u_int32_t sb_first_non_reserved_inode;
+ u_int16_t sb_inode_size;
+ u_int16_t sb_block_group_id;
+ u_int32_t sb_feature_compat;
+ // 0x60
+ u_int32_t sb_feature_incompat;
+ u_int32_t sb_feature_ro_compat;
+ u_int8_t sb_uuid[16];
+ char sb_volume_name[EXT4FS_VOLUME_NAME_MAX];
+ char sb_last_mounted[EXT4FS_LAST_MOUNTED_MAX];
+ u_int32_t sb_algorithm_usage_bitmap;
+ u_int8_t sb_preallocate_blocks;
+ u_int8_t sb_preallocate_dir_blocks;
+ u_int16_t sb_reserved_bgdt_blocks;
+ // 0xD0
+ u_int8_t sb_journal_uuid[16]; // UUID of journal superblock
+ // 0xE0
+ u_int32_t sb_journal_inode_number;
+ u_int32_t sb_journal_device_number;
+ u_int32_t sb_last_orphan;
+ u_int32_t sb_hash_seed[4];
+ u_int8_t sb_default_hash_version;
+ u_int8_t sb_journal_backup_type;
+ u_int16_t sb_block_group_descriptor_size;
+ // 0x100
+ u_int32_t sb_default_mount_opts;
+ u_int32_t sb_first_meta_block_group;
+ u_int32_t sb_newfs_time_lo;
+ u_int32_t sb_jnl_blocks[17]; // Backup of journal inode
+ // 0x150
+ u_int32_t sb_blocks_count_hi;
+ u_int32_t sb_reserved_blocks_count_hi;
+ u_int32_t sb_free_blocks_count_hi;
+ u_int16_t sb_inode_size_extra_min;
+ u_int16_t sb_inode_size_extra_want;
+ // 0x160
+ u_int32_t sb_flags;
+ u_int16_t sb_raid_stride_block_count;
+ u_int16_t sb_mmp_interval;
+ u_int64_t sb_mmp_block;
+ // 0x170
+ u_int32_t sb_raid_stripe_width_block_count;
+ u_int8_t sb_log_groups_per_flex;
+ u_int8_t sb_checksum_type;
+ u_int16_t sb_reserved_176;
+ u_int64_t sb_kilobytes_written;
+ // 0x180
+ u_int32_t sb_ext3_snapshot_inode;
+ u_int32_t sb_ext3_snapshot_id;
+ u_int64_t sb_ext3_snapshot_reserved_blocks_count;
+ // 0x190
+ u_int32_t sb_ext3_snapshot_list;
+ u_int32_t sb_error_count;
+ u_int32_t sb_first_error_time_lo;
+ u_int32_t sb_first_error_inode;
+ // 0x1A0
+ u_int64_t sb_first_error_block;
+ char sb_first_error_function[EXT4FS_FUNCTION_MAX];
+ u_int32_t sb_first_error_line;
+ u_int32_t sb_last_error_time_lo;
+ // 0x1D0
+ u_int32_t sb_last_error_inode;
+ u_int32_t sb_last_error_line;
+ u_int64_t sb_last_error_block;
+ // 0x1E0
+ char sb_last_error_function[EXT4FS_FUNCTION_MAX];
+ // 0x200
+ char sb_mount_opts[EXT4FS_MOUNT_OPTS_MAX];
+ // 0x240
+ u_int32_t sb_user_quota_inode;
+ u_int32_t sb_group_quota_inode;
+ u_int32_t sb_overhead_clusters;
+ u_int32_t sb_backup_block_groups[2];
+ u_int8_t sb_encrypt_algos[4];
+ u_int8_t sb_encrypt_pw_salt[16];
+ u_int32_t sb_lost_and_found_inode;
+ u_int32_t sb_project_quota_inode;
+ // 0x270
+ u_int32_t sb_checksum_seed;
+ u_int8_t sb_write_time_hi;
+ u_int8_t sb_mount_time_hi;
+ u_int8_t sb_newfs_time_hi;
+ u_int8_t sb_check_time_hi;
+ u_int8_t sb_first_error_time_hi;
+ u_int8_t sb_last_error_time_hi;
+ u_int8_t sb_first_error_code;
+ u_int8_t sb_last_error_code;
+ u_int16_t sb_encoding;
+ u_int16_t sb_encoding_flags;
+ // 0x280
+ u_int16_t sb_orphan_file_inode; // NOTE(review): Linux declares s_orphan_file_inum as a 32-bit field at 0x280; this covers only its low half - confirm
+ u_int16_t sb_reserved_284; // as declared this sits at 0x282, not 0x284
+ u_int32_t sb_reserved_288[94]; // as declared this starts at 0x284 and ends at 0x3FC
+ u_int32_t sb_checksum; // 0x3FC, last four bytes of the superblock
+} __attribute__((packed));
+
+struct m_ext4fs { // in-memory superblock: the raw on-disk copy plus fields computed from it
+ /* little-endian super-block */
+ struct ext4fs m_sble;
+ /* computed from little-endian super-block */
+ u_int32_t m_inodes_count;
+ u_int64_t m_blocks_count;
+ u_int64_t m_reserved_blocks_count;
+ u_int64_t m_free_blocks_count;
+ u_int32_t m_free_inodes_count;
+ u_int32_t m_first_data_block;
+ u_int32_t m_log_block_size; // log2(block size) - 10
+ u_int32_t m_log_cluster_size; // log2(cluster size) - 10
+ u_int32_t m_blocks_per_group;
+ u_int32_t m_clusters_per_group;
+ u_int32_t m_inodes_per_group;
+ u_int64_t m_mount_time;
+ u_int32_t m_write_time; // NOTE(review): 32-bit, unlike m_mount_time/m_check_time/m_newfs_time, although sb_write_time_hi exists - confirm
+ u_int16_t m_mount_count;
+ int16_t m_max_mount_count_before_fsck;
+ u_int16_t m_state; // EXT4FS_STATE_*
+ u_int16_t m_errors; // EXT4FS_ERRORS_*
+ u_int16_t m_revision_level_minor;
+ u_int64_t m_check_time;
+ u_int32_t m_check_interval;
+ u_int32_t m_creator_os; // EXT4FS_OS_*
+ u_int32_t m_revision_level;
+ u_int16_t m_default_reserved_uid;
+ u_int16_t m_default_reserved_gid;
+ u_int32_t m_first_non_reserved_inode;
+ u_int16_t m_inode_size;
+ u_int16_t m_block_group_id;
+ u_int32_t m_feature_compat;
+ u_int32_t m_feature_incompat;
+ u_int32_t m_feature_ro_compat;
+ u_int32_t m_algorithm_usage_bitmap;
+ u_int16_t m_reserved_bgdt_blocks;
+ u_int32_t m_journal_inode_number;
+ u_int32_t m_journal_device_number;
+ u_int32_t m_last_orphan;
+ u_int16_t m_block_group_descriptor_size;
+ u_int32_t m_default_mount_opts;
+ u_int32_t m_first_meta_block_group;
+ u_int64_t m_newfs_time;
+ u_int16_t m_inode_size_extra_min;
+ u_int16_t m_inode_size_extra_want;
+ u_int32_t m_flags;
+ u_int16_t m_raid_stride_block_count;
+ u_int16_t m_mmp_interval;
+ u_int64_t m_mmp_block;
+ u_int32_t m_raid_stripe_width_block_count;
+ u_int64_t m_kilobytes_written;
+ u_int32_t m_error_count;
+ u_int64_t m_first_error_time;
+ u_int32_t m_first_error_inode;
+ u_int64_t m_first_error_block;
+ u_int32_t m_first_error_line;
+ u_int64_t m_last_error_time;
+ u_int32_t m_last_error_inode;
+ u_int32_t m_last_error_line;
+ u_int64_t m_last_error_block;
+ u_int32_t m_user_quota_inode;
+ u_int32_t m_group_quota_inode;
+ u_int32_t m_overhead_clusters;
+ u_int32_t m_backup_block_groups[2];
+ u_int32_t m_lost_and_found_inode;
+ u_int32_t m_project_quota_inode;
+ u_int32_t m_checksum_seed;
+ u_int16_t m_encoding;
+ u_int16_t m_encoding_flags;
+ u_int16_t m_orphan_file_inode; // NOTE(review): 16-bit like sb_orphan_file_inode; Linux's s_orphan_file_inum is 32-bit - confirm
+ int m_read_only;
+ int m_fs_was_modified;
+ /* computed by ext4fs_sbfill */
+ u_int64_t m_block_group_descriptor_blocks_count;
+ u_int64_t m_block_group_count;
+ u_int64_t m_block_size;
+ u_int64_t m_block_size_shift; // shift count used by EXT4FS_LBLKNO()
+ u_int32_t m_fs_block_to_disk_block; // shift count used by EXT4FS_FSBTODB()
+ u_int32_t m_inodes_per_block;
+ u_int32_t m_inode_table_blocks_per_group;
+ u_int32_t m_resize_dind_block;
+ struct ext4fs_block_group_descriptor *m_gd;
+};
+
+struct ext4fs_block_group_descriptor { // packed block group descriptor, 0x40 bytes as declared; NOTE(review): ext4 keeps the fields from 0x20 on only when the descriptor size exceeds 32 bytes - confirm callers honour sb_block_group_descriptor_size
+ u_int32_t bgd_block_bitmap_block_lo;
+ u_int32_t bgd_inode_bitmap_block_lo;
+ u_int32_t bgd_inode_table_block_lo;
+ u_int16_t bgd_free_blocks_count_lo;
+ u_int16_t bgd_free_inodes_count_lo;
+ // 0x10
+ u_int16_t bgd_used_dirs_count_lo;
+ u_int16_t bgd_flags; // EXT4FS_BGD_FLAG_*
+ u_int32_t bgd_exclude_bitmap_block_lo;
+ u_int16_t bgd_block_bitmap_checksum_lo;
+ u_int16_t bgd_inode_bitmap_checksum_lo;
+ u_int16_t bgd_inode_table_unused_lo;
+ u_int16_t bgd_checksum;
+ // 0x20
+ u_int32_t bgd_block_bitmap_block_hi;
+ u_int32_t bgd_inode_bitmap_block_hi;
+ u_int32_t bgd_inode_table_block_hi;
+ u_int16_t bgd_free_blocks_count_hi;
+ u_int16_t bgd_free_inodes_count_hi;
+ // 0x30
+ u_int16_t bgd_used_dirs_count_hi;
+ u_int16_t bgd_inode_table_unused_hi;
+ u_int32_t bgd_exclude_bitmap_block_hi;
+ u_int16_t bgd_block_bitmap_checksum_hi;
+ u_int16_t bgd_inode_bitmap_checksum_hi;
+ u_int32_t bgd_reserved_3c;
+ // 0x40
+} __attribute__((packed));
+
+
+/* Directory entry file types */
+#define EXT4FS_FT_UNKNOWN 0
+#define EXT4FS_FT_REG_FILE 1
+#define EXT4FS_FT_DIR 2
+#define EXT4FS_FT_CHRDEV 3
+#define EXT4FS_FT_BLKDEV 4
+#define EXT4FS_FT_FIFO 5
+#define EXT4FS_FT_SOCK 6
+#define EXT4FS_FT_SYMLINK 7
+#define EXT4FS_FT_MAX 8
+
+struct ext4fs_directory { // packed directory entry: 8-byte header followed by the name
+ u_int32_t e4d_ino;
+ u_int16_t e4d_reclen;
+ u_int8_t e4d_namlen;
+ u_int8_t e4d_type; // EXT4FS_FT_*
+ char e4d_name[EXT4FS_MAXNAMLEN]; // declared at maximum length; EXT4FS_DIRSIZ() sizes an entry from its actual name length
+} __attribute__((packed));
+
+/* Directory block checksum tail (last 12 bytes of block when metadata_csum) */
+#define EXT4FS_DIR_TAIL_FT 0xDE
+#define EXT4FS_DIR_TAIL_SIZE 12
+
+struct ext4fs_directory_tail {
+ u_int32_t det_reserved_zero1; /* must be 0 (fake inode = 0) */
+ u_int16_t det_rec_len; /* always EXT4FS_DIR_TAIL_SIZE */
+ u_int8_t det_reserved_zero2; /* must be 0 (namlen = 0) */
+ u_int8_t det_reserved_ft; /* EXT4FS_DIR_TAIL_FT */
+ u_int32_t det_checksum;
+} __attribute__((packed));
+
+struct ext4fs_feature { // one row of the feature-name tables in this header
+ int f_mask; // a single EXT4FS_FEATURE_* bit
+ const char * f_name; // name string paired with that bit
+};
+
+static const struct ext4fs_feature ext4fs_feature_incompat[] = { // incompat bit -> name; static in a header, so every includer gets its own copy
+ {EXT4FS_FEATURE_INCOMPAT_COMPRESSION, "compression"},
+ {EXT4FS_FEATURE_INCOMPAT_FILETYPE, "filetype"},
+ {EXT4FS_FEATURE_INCOMPAT_RECOVER, "recover"},
+ {EXT4FS_FEATURE_INCOMPAT_JOURNAL_DEV, "journal_dev"}, // NOTE(review): '_' separators here, '-' in ext4fs_feature_ro_compat - confirm intended
+ {EXT4FS_FEATURE_INCOMPAT_META_BG, "meta_bg"},
+ {EXT4FS_FEATURE_INCOMPAT_EXTENTS, "extents"},
+ {EXT4FS_FEATURE_INCOMPAT_64BIT, "64bit"},
+ {EXT4FS_FEATURE_INCOMPAT_MMP, "mmp"},
+ {EXT4FS_FEATURE_INCOMPAT_FLEX_BG, "flex_bg"},
+ {EXT4FS_FEATURE_INCOMPAT_EA_INODE, "ea_inode"},
+ {EXT4FS_FEATURE_INCOMPAT_DIRDATA, "dirdata"},
+ {EXT4FS_FEATURE_INCOMPAT_CSUM_SEED, "csum_seed"},
+ {EXT4FS_FEATURE_INCOMPAT_LARGEDIR, "largedir"},
+ {EXT4FS_FEATURE_INCOMPAT_INLINE_DATA, "inline_data"},
+ {EXT4FS_FEATURE_INCOMPAT_ENCRYPT, "encrypt"},
+};
+
+static const struct ext4fs_feature ext4fs_feature_ro_compat[] = { // ro_compat bit -> name; static in a header, so every includer gets its own copy
+ {EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER, "sparse-super"}, // NOTE(review): '-' separators here, '_' in ext4fs_feature_incompat - confirm intended
+ {EXT4FS_FEATURE_RO_COMPAT_LARGE_FILE, "large-file"},
+ {EXT4FS_FEATURE_RO_COMPAT_BTREE_DIR, "btree-dir"},
+ {EXT4FS_FEATURE_RO_COMPAT_HUGE_FILE, "huge-file"},
+ {EXT4FS_FEATURE_RO_COMPAT_GDT_CSUM, "gdt-csum"},
+ {EXT4FS_FEATURE_RO_COMPAT_DIR_NLINK, "dir-nlink"},
+ {EXT4FS_FEATURE_RO_COMPAT_EXTRA_ISIZE, "extra-isize"},
+ {EXT4FS_FEATURE_RO_COMPAT_HAS_SNAPSHOT, "has-snapshot"},
+ {EXT4FS_FEATURE_RO_COMPAT_QUOTA, "quota"},
+ {EXT4FS_FEATURE_RO_COMPAT_BIGALLOC, "bigalloc"},
+ {EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM, "metadata-csum"},
+ {EXT4FS_FEATURE_RO_COMPAT_REPLICA, "replica"},
+ {EXT4FS_FEATURE_RO_COMPAT_READONLY, "readonly"},
+ {EXT4FS_FEATURE_RO_COMPAT_PROJECT, "project"},
+};
+
+#define EXT4FS_ITIMES(ip) do { /* turn pending IN_ACCESS/IN_UPDATE/IN_CHANGE marks into inode timestamps */ \
+ if ((ip)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) { \
+ struct timespec _ts; \
+ (ip)->i_flag |= IN_MODIFIED; \
+ getnanotime(&_ts); /* one reading shared by all three timestamps */ \
+ if ((ip)->i_flag & IN_ACCESS) { \
+ (ip)->i_e4din->dinode.i_atime = \
+ htole32((u_int32_t)_ts.tv_sec); /* keeps only the low 32 bits of tv_sec */ \
+ (ip)->i_e4din->dinode.i_atime_extra = \
+ htole32(_ts.tv_nsec << 2); /* nsec goes in bits 2..31, bits 0-1 stay zero; NOTE(review): ext4 uses those two bits to extend the seconds - confirm */ \
+ } \
+ if ((ip)->i_flag & IN_UPDATE) { \
+ (ip)->i_e4din->dinode.i_mtime = \
+ htole32((u_int32_t)_ts.tv_sec); \
+ (ip)->i_e4din->dinode.i_mtime_extra = \
+ htole32(_ts.tv_nsec << 2); \
+ } \
+ if ((ip)->i_flag & IN_CHANGE) { \
+ (ip)->i_e4din->dinode.i_ctime = \
+ htole32((u_int32_t)_ts.tv_sec); \
+ (ip)->i_e4din->dinode.i_ctime_extra = \
+ htole32(_ts.tv_nsec << 2); \
+ (ip)->i_modrev++; /* only the IN_CHANGE branch bumps i_modrev */ \
+ } \
+ (ip)->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); /* the three marks are consumed */ \
+ } \
+} while (0)
+
+struct ext4fs_sync_args { // NOTE(review): not referenced elsewhere in this header; presumably per-vnode context for ext4fs_sync() - confirm
+ int allerror;
+ int waitfor;
+ struct proc *p;
+ struct ucred *cred;
+};
+
+extern struct pool ext4fs_inode_pool;
+extern struct pool ext4fs_dinode_pool;
+
+/* VFS operations */
+int ext4fs_fhtovp(struct mount *, struct fid *, struct vnode **);
+int ext4fs_init(struct vfsconf *);
+int ext4fs_mount(struct mount *, const char *, void *,
+ struct nameidata *, struct proc *);
+int ext4fs_statfs(struct mount *, struct statfs *, struct proc *);
+int ext4fs_sync(struct mount *, int, int, struct ucred *,
+ struct proc *);
+int ext4fs_sysctl(int *, u_int, void *, size_t *, void *, size_t,
+ struct proc *);
+int ext4fs_unmount(struct mount *, int, struct proc *);
+int ext4fs_vget(struct mount *, ino_t, struct vnode **);
+int ext4fs_vptofh(struct vnode *, struct fid *);
+
+/* VNode operations */
+
+int ext4fs_lookup(void *);
+int ext4fs_create(void *);
+int ext4fs_mknod(void *);
+int ext4fs_open(void *);
+int ext4fs_access(void *);
+int ext4fs_getattr(void *);
+int ext4fs_setattr(void *);
+int ext4fs_read(void *);
+int ext4fs_write(void *);
+int ext4fs_fsync(void *);
+int ext4fs_remove(void *);
+int ext4fs_link(void *);
+int ext4fs_rename(void *);
+int ext4fs_mkdir(void *);
+int ext4fs_rmdir(void *);
+int ext4fs_symlink(void *);
+int ext4fs_readdir(void *);
+int ext4fs_readlink(void *);
+int ext4fs_inactive(void *);
+int ext4fs_reclaim(void *);
+int ext4fs_bmap(void *);
+int ext4fs_strategy(void *);
+int ext4fs_print(void *);
+int ext4fs_pathconf(void *);
+int ext4fs_advlock(void *);
+
+int ext4fs_update(struct inode *, int);
+
+u_int32_t ext4fs_sb_csum(struct ext4fs *);
+int ext4fs_sb_csum_verify(struct ext4fs *);
+u_int32_t ext4fs_csum_seed(struct m_ext4fs *);
+u_int32_t ext4fs_bitmap_csum(struct m_ext4fs *, u_int32_t, void *, size_t);
+u_int16_t ext4fs_bgd_csum(struct m_ext4fs *,
+ struct ext4fs_block_group_descriptor *, u_int32_t);
+int ext4fs_bgd_csum_verify(struct m_ext4fs *,
+ struct ext4fs_block_group_descriptor *, u_int32_t);
+u_int32_t ext4fs_inode_csum(struct m_ext4fs *,
+ struct ext4fs_dinode_256 *, u_int32_t);
+int ext4fs_inode_csum_verify(struct m_ext4fs *,
+ struct ext4fs_dinode_256 *, u_int32_t);
+
+/* Directory entry size: 8 bytes header + name, rounded up to 4 */
+#define EXT4FS_DIRSIZ(namlen) (((8 + (namlen)) + 3) & ~3)
+
+/* Map the S_IFMT bits of an inode mode to an EXT4FS_FT_* directory type. */
+static inline u_int8_t
+ext4fs_mode_to_ft(u_int16_t mode)
+{
+	const u_int16_t fmt = mode & S_IFMT;
+
+	if (fmt == S_IFREG) return EXT4FS_FT_REG_FILE;
+	if (fmt == S_IFDIR) return EXT4FS_FT_DIR;
+	if (fmt == S_IFCHR) return EXT4FS_FT_CHRDEV;
+	if (fmt == S_IFBLK) return EXT4FS_FT_BLKDEV;
+	if (fmt == S_IFIFO) return EXT4FS_FT_FIFO;
+	if (fmt == S_IFSOCK) return EXT4FS_FT_SOCK;
+	if (fmt == S_IFLNK) return EXT4FS_FT_SYMLINK;
+	return EXT4FS_FT_UNKNOWN;	/* any other format value */
+}
+
+/* Block allocation / free */
+int ext4fs_blkalloc(struct inode *, u_int64_t, u_int32_t, u_int64_t *,
+ u_int32_t *);
+void ext4fs_blkfree(struct inode *, u_int64_t);
+
+/* Inode allocation / free */
+int ext4fs_inode_alloc(struct inode *, mode_t, struct ucred *,
+ struct vnode **);
+void ext4fs_inode_free(struct inode *, ufsino_t, mode_t);
+
+/* Directory operations */
+int ext4fs_direnter(struct inode *, struct vnode *,
+ struct componentname *);
+int ext4fs_dirremove(struct vnode *, struct componentname *);
+int ext4fs_dirempty(struct inode *, ufsino_t, struct ucred *);
+int ext4fs_dirrewrite(struct inode *, struct inode *,
+ struct componentname *);
+
+/* Truncation */
+int ext4fs_truncate(struct inode *, off_t, int, struct ucred *);
+
+/* Size update */
+void ext4fs_setsize(struct inode *, u_int64_t);
+
+/* Superblock / BGD write-back */
+int ext4fs_bgd_write(struct m_ext4fs *, struct vnode *, u_int32_t);
+int ext4fs_sbwrite(struct mount *);
diff --git a/sys/ufs/ext4fs/ext4fs_crc32c.c b/sys/ufs/ext4fs/ext4fs_crc32c.c
new file mode 100644
index 000000000..a28e8a05c
--- /dev/null
+++ b/sys/ufs/ext4fs/ext4fs_crc32c.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2025 kmx.io.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+
+#include <ufs/ext4fs/ext4fs_dinode.h>
+#include <ufs/ext4fs/ext4fs.h>
+
+/*
+ * CRC32C lookup table, generated using the Castagnoli polynomial
+ * 0x1EDC6F41 (bit-reversed: 0x82F63B78).
+ *
+ * The entries are for the reflected (least-significant-bit-first),
+ * one-byte-at-a-time form of the algorithm used by ext4fs_crc32c().
+ */
+static const u_int32_t crc32c_table[256] = {
+ 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
+ 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+ 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
+ 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+ 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
+ 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+ 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
+ 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+ 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
+ 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+ 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
+ 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+ 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
+ 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+ 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
+ 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+ 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
+ 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+ 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
+ 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+ 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
+ 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+ 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
+ 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+ 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
+ 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+ 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
+ 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+ 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
+ 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+ 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
+ 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+ 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
+ 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+ 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
+ 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+ 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
+ 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+ 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
+ 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+ 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
+ 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+ 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
+ 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+ 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
+ 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+ 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
+ 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+ 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
+ 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+ 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
+ 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+ 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
+ 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+ 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
+ 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+ 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
+ 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+ 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
+ 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+ 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
+ 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+ 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
+ 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351
+};
+
+/*
+ * Table-driven CRC32C over a byte buffer.
+ *
+ * The running value is complemented on entry and again on exit, so this
+ * function takes and returns the finalized form of the CRC: pass 0 to
+ * begin a new computation (it becomes the all-ones register internally),
+ * or pass the value returned by a previous call to continue over more data.
+ *
+ * crc: starting value as described above
+ * buf: data to checksum
+ * len: number of bytes at buf
+ *
+ * Returns the updated CRC32C value.
+ */
+u_int32_t
+ext4fs_crc32c(u_int32_t crc, const void *buf, size_t len)
+{
+	const u_int8_t *bytes = buf;
+	u_int32_t reg = ~crc;
+	size_t i;
+
+	for (i = 0; i < len; i++)
+		reg = crc32c_table[(reg ^ bytes[i]) & 0xff] ^ (reg >> 8);
+
+	return ~reg;
+}
+
+/*
+ * Alias for ext4fs_crc32c().
+ *
+ * The arguments are handed to ext4fs_crc32c() unchanged and its result is
+ * returned unchanged, so the same conventions apply: the value is
+ * complemented on entry and on exit, 0 starts a fresh computation, and a
+ * previous return value continues one.
+ *
+ * crc: starting value
+ * buf: pointer to data buffer
+ * len: length of data in bytes
+ */
+u_int32_t
+ext4fs_crc32c_le(u_int32_t crc, const void *buf, size_t len)
+{
+ return ext4fs_crc32c(crc, buf, len);
+}
+
+/*
+ * Return the per-filesystem checksum seed, in the form ext4fs_crc32c()
+ * takes as its starting value.
+ *
+ * With the CSUM_SEED incompat feature the stored m_checksum_seed is used,
+ * complemented; without it the seed is the CRC32C of the superblock UUID.
+ */
+u_int32_t
+ext4fs_csum_seed(struct m_ext4fs *fs)
+{
+	u_int32_t seed;
+
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_CSUM_SEED)
+		seed = ~fs->m_checksum_seed;
+	else
+		seed = ext4fs_crc32c(0, fs->m_sble.sb_uuid,
+		    sizeof(fs->m_sble.sb_uuid));
+
+	return seed;
+}
+
+/*
+ * Compute the checksum of an ext4 superblock.
+ *
+ * The CRC32C runs from the start of the superblock up to, but not
+ * including, the sb_checksum field.  The complement of the value returned
+ * by ext4fs_crc32c() is what this function returns.
+ */
+u_int32_t
+ext4fs_sb_csum(struct ext4fs *sb)
+{
+	const size_t covered = offsetof(struct ext4fs, sb_checksum);
+
+	return ~ext4fs_crc32c(0, sb, covered);
+}
+
+/*
+ * Compute the 16-bit checksum of a block group descriptor.
+ *
+ * The CRC is chained over the filesystem seed (ext4fs_csum_seed()), the
+ * little-endian group number, and then a copy of the descriptor whose
+ * bgd_checksum field has been zeroed.  The descriptor length is
+ * m_block_group_descriptor_size when the 64BIT feature is set and 32
+ * bytes otherwise, never more than sizeof(struct
+ * ext4fs_block_group_descriptor).
+ *
+ * Returns the low 16 bits of the complemented CRC, or 0 when
+ * METADATA_CSUM is not enabled.
+ */
+u_int16_t
+ext4fs_bgd_csum(struct m_ext4fs *fs,
+    struct ext4fs_block_group_descriptor *bgd, u_int32_t block_group_id)
+{
+	struct ext4fs_block_group_descriptor scratch;
+	u_int32_t group_le, running;
+	size_t desc_len;
+
+	if ((fs->m_feature_ro_compat &
+	    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) == 0)
+		return 0;
+
+	/* Seed first, then the group number in on-disk byte order */
+	running = ext4fs_csum_seed(fs);
+	group_le = htole32(block_group_id);
+	running = ext4fs_crc32c(running, &group_le, sizeof(group_le));
+
+	desc_len = 32;
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		desc_len = fs->m_block_group_descriptor_size;
+	if (desc_len > sizeof(scratch))
+		desc_len = sizeof(scratch);
+
+	/* Work on a copy so the caller's checksum field is left alone */
+	memcpy(&scratch, bgd, desc_len);
+	scratch.bgd_checksum = 0;
+	running = ext4fs_crc32c(running, &scratch, desc_len);
+
+	return (~running) & 0xFFFF;
+}
+
+/*
+ * Check the stored checksum of a block group descriptor against the one
+ * computed by ext4fs_bgd_csum().
+ *
+ * Returns 0 when they agree or when METADATA_CSUM is not enabled;
+ * otherwise logs both values and returns EINVAL.
+ */
+int
+ext4fs_bgd_csum_verify(struct m_ext4fs *fs,
+    struct ext4fs_block_group_descriptor *bgd, u_int32_t block_group_id)
+{
+	u_int16_t stored, computed;
+
+	if ((fs->m_feature_ro_compat &
+	    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) == 0)
+		return 0;
+
+	stored = letoh16(bgd->bgd_checksum);
+	computed = ext4fs_bgd_csum(fs, bgd, block_group_id);
+	if (stored == computed)
+		return 0;
+
+	printf("ext4fs: bgd %u checksum mismatch: "
+	    "stored=0x%04x calculated=0x%04x\n",
+	    block_group_id, stored, computed);
+	return EINVAL;
+}
+
+/*
+ * Compute the checksum of an inode.
+ *
+ * The CRC is chained over the filesystem seed, the little-endian inode
+ * number, the on-disk i_nfs_generation field, and finally a copy of the
+ * whole struct ext4fs_dinode_256 with i_checksum_lo and i_checksum_hi
+ * zeroed.  Returns the complemented CRC, or 0 when METADATA_CSUM is not
+ * enabled.
+ *
+ * NOTE(review): the covered length is always sizeof(struct
+ * ext4fs_dinode_256) and i_checksum_hi is always zeroed, regardless of the
+ * filesystem's inode size or the inode's i_extra_isize -- confirm this is
+ * only used on filesystems with 256-byte inodes.
+ */
+u_int32_t
+ext4fs_inode_csum(struct m_ext4fs *fs,
+    struct ext4fs_dinode_256 *dp, u_int32_t ino)
+{
+	struct ext4fs_dinode_256 scratch;
+	u_int32_t ino_le, running;
+
+	if ((fs->m_feature_ro_compat &
+	    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) == 0)
+		return 0;
+
+	running = ext4fs_csum_seed(fs);
+
+	ino_le = htole32(ino);
+	running = ext4fs_crc32c(running, &ino_le, sizeof(ino_le));
+	running = ext4fs_crc32c(running, &dp->dinode.i_nfs_generation,
+	    sizeof(dp->dinode.i_nfs_generation));
+
+	/* Checksum a copy so the caller's checksum fields stay intact */
+	scratch = *dp;
+	scratch.dinode.i_checksum_lo = 0;
+	scratch.dinode.i_checksum_hi = 0;
+
+	return ~ext4fs_crc32c(running, &scratch, sizeof(scratch));
+}
+
+/*
+ * Verify an inode checksum.
+ *
+ * The stored checksum is split in two 16-bit halves.  i_checksum_lo is
+ * always present; i_checksum_hi lives in the extra area of the inode and
+ * is only valid when i_extra_isize reaches past it.  Whether the high half
+ * exists is a property of the inode itself, not of the 64BIT filesystem
+ * feature.  When the high half is absent, only the low 16 bits of the
+ * computed value are compared.
+ *
+ * fs:  mounted filesystem (feature flags and checksum seed)
+ * dp:  on-disk inode to check
+ * ino: inode number
+ *
+ * Returns 0 if the checksum is valid or METADATA_CSUM is not enabled,
+ * or EINVAL (after logging both values) if it doesn't match.
+ */
+int
+ext4fs_inode_csum_verify(struct m_ext4fs *fs,
+    struct ext4fs_dinode_256 *dp, u_int32_t ino)
+{
+	u_int32_t provided, calculated;
+	size_t hi_end;
+
+	if (!(fs->m_feature_ro_compat &
+	    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM))
+		return 0;
+
+	calculated = ext4fs_inode_csum(fs, dp, ino);
+	provided = letoh16(dp->dinode.i_checksum_lo);
+
+	/* Bytes of extra area needed for i_checksum_hi to be inside it */
+	hi_end = offsetof(struct ext4fs_dinode, i_checksum_hi) +
+	    sizeof(dp->dinode.i_checksum_hi) -
+	    offsetof(struct ext4fs_dinode, i_extra_isize);
+
+	if (letoh16(dp->dinode.i_extra_isize) >= hi_end)
+		provided |= (u_int32_t)letoh16(dp->dinode.i_checksum_hi) << 16;
+	else
+		calculated &= 0xFFFF;	/* only the low half is stored */
+
+	if (provided != calculated) {
+		printf("ext4fs: inode %u checksum mismatch: "
+		    "stored=0x%08x calculated=0x%08x\n",
+		    ino, provided, calculated);
+		return EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Checksum a block or inode bitmap.
+ *
+ * The CRC32C starts from the filesystem seed and runs over `size' bytes
+ * of `bitmap'; its complement is returned.  `group' is accepted but does
+ * not take part in the computation.  Returns 0 when METADATA_CSUM is not
+ * enabled.
+ */
+u_int32_t
+ext4fs_bitmap_csum(struct m_ext4fs *fs, u_int32_t group,
+    void *bitmap, size_t size)
+{
+	u_int32_t start;
+
+	if ((fs->m_feature_ro_compat &
+	    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) == 0)
+		return 0;
+
+	start = ext4fs_csum_seed(fs);
+
+	return ~ext4fs_crc32c(start, bitmap, size);
+}
+
+/*
+ * Fill in the checksum tail of a directory block.
+ *
+ * The tail occupies the last EXT4FS_DIR_TAIL_SIZE bytes of the block.  Its
+ * fixed fields are (re)initialised here, then the checksum is chained over
+ * the filesystem seed, the little-endian inode number, the generation
+ * (passed in already little-endian) and the block contents preceding the
+ * tail.  Does nothing when METADATA_CSUM is not enabled.
+ */
+void
+ext4fs_dir_set_csum(struct m_ext4fs *fs, u_int32_t ino, u_int32_t gen_le,
+    void *buf)
+{
+	struct ext4fs_directory_tail *dt;
+	u_int32_t ino_le, running;
+
+	if ((fs->m_feature_ro_compat &
+	    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) == 0)
+		return;
+
+	dt = (struct ext4fs_directory_tail *)
+	    ((char *)buf + fs->m_block_size - EXT4FS_DIR_TAIL_SIZE);
+	dt->det_reserved_zero1 = 0;
+	dt->det_rec_len = htole16(EXT4FS_DIR_TAIL_SIZE);
+	dt->det_reserved_zero2 = 0;
+	dt->det_reserved_ft = EXT4FS_DIR_TAIL_FT;
+	dt->det_checksum = 0;
+
+	ino_le = htole32(ino);
+	running = ext4fs_crc32c(ext4fs_csum_seed(fs), &ino_le,
+	    sizeof(ino_le));
+	running = ext4fs_crc32c(running, &gen_le, sizeof(gen_le));
+	running = ext4fs_crc32c(running, buf,
+	    fs->m_block_size - EXT4FS_DIR_TAIL_SIZE);
+
+	dt->det_checksum = htole32(~running);
+}
+
+/*
+ * Check the checksum stored in a superblock.
+ *
+ * The feature flag is read from the (little-endian) superblock itself.
+ * Returns 0 when METADATA_CSUM is not enabled or the stored value equals
+ * ext4fs_sb_csum(); otherwise logs both values and returns EINVAL.
+ */
+int
+ext4fs_sb_csum_verify(struct ext4fs *sb)
+{
+	u_int32_t stored, computed;
+
+	if ((letoh32(sb->sb_feature_ro_compat) &
+	    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) == 0)
+		return 0;
+
+	stored = letoh32(sb->sb_checksum);
+	computed = ext4fs_sb_csum(sb);
+	if (stored == computed)
+		return 0;
+
+	printf("ext4fs: superblock checksum mismatch: "
+	    "stored=0x%08x calculated=0x%08x\n",
+	    stored, computed);
+	return EINVAL;
+}
+
+/*
+ * Write the checksum tail of an extent tree block.
+ *
+ * The tail is a 4-byte le32 checksum placed right after eh_max entries.
+ * Checksum covers: UUID seed, inode number, inode generation, then the
+ * block data up to (but not including) the tail.
+ *
+ * fs:     mounted filesystem (feature flags, seed, block size)
+ * ino:    inode number owning the block
+ * gen_le: inode generation, already little-endian
+ * buf:    the extent tree block; assumed to hold fs->m_block_size bytes
+ *
+ * No-op if METADATA_CSUM is not enabled.  If eh_max would place the tail
+ * outside the block, the header is treated as corrupt: a message is
+ * logged and nothing is written.
+ */
+void
+ext4fs_extent_block_csum_set(struct m_ext4fs *fs, u_int32_t ino,
+    u_int32_t gen_le, void *buf)
+{
+	u_int32_t crc, seed, ino_le;
+	u_int32_t *tail;
+	struct ext4fs_extent_header *eh;
+	size_t tail_offset;
+
+	if (!(fs->m_feature_ro_compat &
+	    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM))
+		return;
+
+	eh = (struct ext4fs_extent_header *)buf;
+	/* Tail is right after eh_max entries */
+	tail_offset = sizeof(struct ext4fs_extent_header) +
+	    (size_t)letoh16(eh->eh_max) * sizeof(struct ext4fs_extent);
+
+	/* eh_max comes from disk; never write past the end of the block */
+	if (tail_offset + sizeof(*tail) > fs->m_block_size) {
+		printf("ext4fs: inode %u extent block: eh_max %u "
+		    "exceeds block size\n", ino, letoh16(eh->eh_max));
+		return;
+	}
+	tail = (u_int32_t *)((char *)buf + tail_offset);
+
+	seed = ext4fs_csum_seed(fs);
+	ino_le = htole32(ino);
+	crc = ext4fs_crc32c(seed, &ino_le, sizeof(ino_le));
+	crc = ext4fs_crc32c(crc, &gen_le, sizeof(gen_le));
+	crc = ext4fs_crc32c(crc, buf, tail_offset);
+	*tail = htole32(~crc);
+}
diff --git a/sys/ufs/ext4fs/ext4fs_crc32c.h b/sys/ufs/ext4fs/ext4fs_crc32c.h
new file mode 100644
index 000000000..2208a3d4d
--- /dev/null
+++ b/sys/ufs/ext4fs/ext4fs_crc32c.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2025 kmx.io.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef _EXT4FS_CRC32C_H_
+#define _EXT4FS_CRC32C_H_
+
+#include <sys/types.h>
+
+/*
+ * CRC32C uses the Castagnoli polynomial: 0x1EDC6F41
+ * This is different from the standard CRC32 (ISO 3309) polynomial.
+ *
+ * ext4 stores checksums as the bitwise inverse of the CRC32C value.
+ */
+
+/* Compute CRC32C of a buffer, starting from an initial CRC value */
+u_int32_t ext4fs_crc32c(u_int32_t crc, const void *buf, size_t len);
+
+/* Alias for ext4fs_crc32c(): same arguments, same result */
+u_int32_t ext4fs_crc32c_le(u_int32_t crc, const void *buf, size_t len);
+
+struct m_ext4fs;
+
+/* Compute block or inode bitmap checksum (seed + bitmap data; group is not used) */
+u_int32_t ext4fs_bitmap_csum(struct m_ext4fs *fs, u_int32_t group,
+ void *bitmap, size_t size);
+
+/*
+ * Write a directory block checksum tail at the end of buf.
+ * ino: directory inode number, gen_le: i_nfs_generation (already LE).
+ * No-op if METADATA_CSUM is not enabled.
+ */
+void ext4fs_dir_set_csum(struct m_ext4fs *fs, u_int32_t ino,
+ u_int32_t gen_le, void *buf);
+
+/*
+ * Write the extent tree block checksum tail.
+ * ino: inode number, gen_le: i_nfs_generation (already LE on disk).
+ * No-op if METADATA_CSUM is not enabled.
+ */
+void ext4fs_extent_block_csum_set(struct m_ext4fs *fs, u_int32_t ino,
+ u_int32_t gen_le, void *buf);
+
+#endif /* _EXT4FS_CRC32C_H_ */
diff --git a/sys/ufs/ext4fs/ext4fs_dinode.h b/sys/ufs/ext4fs/ext4fs_dinode.h
new file mode 100644
index 000000000..9ca795cb0
--- /dev/null
+++ b/sys/ufs/ext4fs/ext4fs_dinode.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2025 kmx.io.
+ * Copyright (c) 1997 Manuel Bouyer.
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Modified for ext4fs by kmx.io.
+ */
+
+#include <sys/stat.h>
+
+#define EXT4FS_EXTENT_HEADER_MAGIC 0xF30A
+
+/*
+ * Header found at the start of an extent tree node.  The code that reads
+ * it converts the 16-bit fields with letoh16().
+ */
+struct ext4fs_extent_header {
+ u_int16_t eh_magic; /* compared against EXT4FS_EXTENT_HEADER_MAGIC */
+ u_int16_t eh_entries; /* number of entries following the header */
+ u_int16_t eh_max; /* entry capacity; the checksum tail follows eh_max entries */
+ u_int16_t eh_depth; /* 0: entries are extents; otherwise index entries */
+ u_int32_t eh_generation;
+} __attribute__((packed));
+
+/*
+ * Leaf entry of an extent tree: a run of e_len blocks starting at logical
+ * block e_block and at the physical block formed from e_start_hi (upper
+ * bits) and e_start_lo (lower 32 bits).
+ */
+struct ext4fs_extent {
+ u_int32_t e_block; /* first logical block covered */
+ u_int16_t e_len; /* number of blocks */
+ u_int16_t e_start_hi; /* physical start block, bits 32 and up */
+ u_int32_t e_start_lo; /* physical start block, low 32 bits */
+} __attribute__((packed));
+
+/*
+ * Index entry of an extent tree: points at the block holding the next
+ * level node, formed from ei_leaf_hi (upper bits) and ei_leaf_lo.
+ */
+struct ext4fs_extent_idx {
+ u_int32_t ei_block;
+ u_int32_t ei_leaf_lo; /* child node block, low 32 bits */
+ u_int16_t ei_leaf_hi; /* child node block, bits 32 and up */
+ u_int16_t ei_unused;
+} __attribute__((packed));
+
+/*
+ * On-disk inode.  The structure is packed; the hexadecimal markers give
+ * the byte offset of the field that follows them.
+ */
+struct ext4fs_dinode {
+ u_int16_t i_mode;
+ u_int16_t i_uid_lo;
+ u_int32_t i_size_lo;
+ u_int32_t i_atime;
+ u_int32_t i_ctime;
+ /* 0x10 */
+ u_int32_t i_mtime;
+ u_int32_t i_dtime;
+ u_int16_t i_gid_lo;
+ u_int16_t i_links_count;
+ u_int32_t i_blocks_lo;
+ /* 0x20 */
+ u_int32_t i_flags;
+ u_int32_t i_version;
+ /* 60 bytes: either 15 block numbers or an extent header plus 4 entries */
+ union {
+ u_int32_t i_block[15];
+ struct {
+ struct ext4fs_extent_header i_extent_header;
+ union {
+ struct ext4fs_extent i_extent[4];
+ struct ext4fs_extent_idx i_extent_idx[4];
+ };
+ };
+ };
+ u_int32_t i_nfs_generation; /* chained into the inode checksum */
+ u_int32_t i_extended_attributes_lo;
+ u_int32_t i_size_hi;
+ /* 0x70 */
+ u_int32_t i_fragment_address;
+ u_int16_t i_blocks_hi;
+ u_int16_t i_extended_attributes_hi;
+ u_int16_t i_uid_hi;
+ u_int16_t i_gid_hi;
+ u_int16_t i_checksum_lo; /* low 16 bits of the inode checksum */
+ u_int16_t i_reserved_7e;
+ /* 0x80 */
+ u_int16_t i_extra_isize;
+ u_int16_t i_checksum_hi; /* high 16 bits of the inode checksum */
+ u_int32_t i_ctime_extra;
+ u_int32_t i_mtime_extra;
+ u_int32_t i_atime_extra;
+ /* 0x90 */
+ u_int32_t i_crtime;
+ u_int32_t i_crtime_extra;
+ u_int32_t i_version_hi;
+ u_int32_t i_project_id;
+ /* 0xA0 */
+} __attribute__((packed));
+
+/*
+ * An inode padded out to 256 bytes: the fixed fields of struct
+ * ext4fs_dinode followed by the bytes that remain up to 256.
+ */
+struct ext4fs_dinode_256 {
+ struct ext4fs_dinode dinode;
+ u_int8_t extended_attributes[256 - sizeof(struct ext4fs_dinode)];
+};
diff --git a/sys/ufs/ext4fs/ext4fs_extern.h b/sys/ufs/ext4fs/ext4fs_extern.h
new file mode 100644
index 000000000..783e7f3bf
--- /dev/null
+++ b/sys/ufs/ext4fs/ext4fs_extern.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2025 kmx.io.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <sys/vnode.h>
+
+extern const struct vops ext4fs_vops;
+
+/* True when the vnode's v_tag is VT_EXT4FS */
+#define IS_EXT4_VNODE(vp) ((vp)->v_tag == VT_EXT4FS)
diff --git a/sys/ufs/ext4fs/ext4fs_journal.c b/sys/ufs/ext4fs/ext4fs_journal.c
new file mode 100644
index 000000000..c109d3c3b
--- /dev/null
+++ b/sys/ufs/ext4fs/ext4fs_journal.c
@@ -0,0 +1,944 @@
+/*
+ * Copyright (c) 2025 kmx.io.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * JBD2 journal replay for ext4fs.
+ *
+ * Implements the standard three-pass replay algorithm:
+ * 1. SCAN - walk the journal to find valid transactions
+ * 2. REVOKE - collect revoked blocks
+ * 3. REPLAY - write surviving data blocks to the filesystem
+ *
+ * All JBD2 on-disk fields are big-endian.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext4fs/ext4fs.h>
+#include <ufs/ext4fs/ext4fs_journal.h>
+
+/*
+ * Record the journal-block -> filesystem-block mapping described by
+ * `count' leaf extents.  Logical blocks at or beyond rc_blockmap_count
+ * are silently ignored.
+ */
+static void
+jbd2_map_extents(struct jbd2_replay_ctx *ctx,
+    const struct ext4fs_extent *ext, u_int16_t count)
+{
+	u_int64_t pblock;
+	u_int32_t lblk, len, n, j;
+	u_int16_t i;
+
+	for (i = 0; i < count; i++) {
+		lblk = letoh32(ext[i].e_block);
+		len = letoh16(ext[i].e_len);
+		pblock = (u_int64_t)letoh16(ext[i].e_start_hi) << 32 |
+		    letoh32(ext[i].e_start_lo);
+
+		for (n = 0; n < len; n++) {
+			j = lblk + n;
+			if (j < ctx->rc_blockmap_count)
+				ctx->rc_blockmap[j].jb_fsblock = pblock + n;
+		}
+	}
+}
+
+/*
+ * Build the journal block map from sb_jnl_blocks[0..14].
+ *
+ * sb_jnl_blocks[0..14] is a copy of inode 8's i_block[0..14], which
+ * contains an extent tree in the same format as regular file inodes.
+ * The values are little-endian (copied from the inode).
+ *
+ * sb_jnl_blocks[15] = i_size_lo, sb_jnl_blocks[16] = i_size_hi.
+ *
+ * Only trees of depth 0 (extents stored in the root) and depth 1 (root
+ * index entries pointing at leaf blocks) are handled.  Entry counts read
+ * from disk are checked against the space that can actually hold them
+ * (the 60-byte root, or one filesystem block for a leaf) before any
+ * entry is dereferenced.
+ *
+ * Allocates ctx->rc_blockmap with rc_maxlen zeroed entries and sets
+ * rc_blockmap_count.  The map is not freed here on failure.
+ *
+ * Returns 0 on success, EINVAL for a malformed tree, or the bread()
+ * error if a leaf block cannot be read.
+ */
+static int
+jbd2_build_blockmap(struct jbd2_replay_ctx *ctx)
+{
+	struct m_ext4fs *fs = ctx->rc_fs;
+	struct ext4fs *sble = &fs->m_sble;
+	u_int32_t *iblock = sble->sb_jnl_blocks;
+	struct ext4fs_extent_header *eh;
+	struct ext4fs_extent_idx *idx;
+	u_int16_t depth, entries, i;
+	u_int32_t maxblocks;
+	size_t root_max, leaf_max;
+
+	maxblocks = ctx->rc_maxlen;
+	ctx->rc_blockmap = mallocarray(maxblocks,
+	    sizeof(struct jbd2_blockmap_entry), M_TEMP, M_WAITOK | M_ZERO);
+	ctx->rc_blockmap_count = maxblocks;
+
+	/* Parse extent header from i_block[0..2] (first 12 bytes) */
+	eh = (struct ext4fs_extent_header *)iblock;
+	if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC) {
+		printf("ext4fs: journal inode has bad extent magic 0x%x\n",
+		    letoh16(eh->eh_magic));
+		return (EINVAL);
+	}
+
+	depth = letoh16(eh->eh_depth);
+	entries = letoh16(eh->eh_entries);
+
+	/*
+	 * The root lives in the 15 words copied from i_block; extents and
+	 * index entries have the same size, so one limit serves both.
+	 */
+	root_max = (15 * sizeof(u_int32_t) - sizeof(*eh)) /
+	    sizeof(struct ext4fs_extent);
+	if (entries > root_max) {
+		printf("ext4fs: journal inode has too many root extent "
+		    "entries (%u)\n", entries);
+		return (EINVAL);
+	}
+
+	if (depth == 0) {
+		/* Leaf extents follow the header directly */
+		jbd2_map_extents(ctx, (struct ext4fs_extent *)(eh + 1),
+		    entries);
+		return (0);
+	}
+
+	/* Depth > 0: index nodes, need to read leaf blocks */
+	leaf_max = (fs->m_block_size - sizeof(*eh)) /
+	    sizeof(struct ext4fs_extent);
+	idx = (struct ext4fs_extent_idx *)(eh + 1);
+	for (i = 0; i < entries; i++) {
+		struct buf *bp;
+		struct ext4fs_extent_header *leh;
+		u_int16_t lentries;
+		u_int64_t leaf_block;
+		int error;
+
+		leaf_block = (u_int64_t)letoh16(idx[i].ei_leaf_hi) << 32 |
+		    letoh32(idx[i].ei_leaf_lo);
+
+		error = bread(ctx->rc_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, leaf_block),
+		    fs->m_block_size, &bp);
+		if (error) {
+			brelse(bp);
+			printf("ext4fs: journal blockmap: "
+			    "can't read index block\n");
+			return (error);
+		}
+
+		leh = (struct ext4fs_extent_header *)bp->b_data;
+		if (letoh16(leh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC) {
+			brelse(bp);
+			printf("ext4fs: journal blockmap: "
+			    "bad leaf magic\n");
+			return (EINVAL);
+		}
+		if (letoh16(leh->eh_depth) != 0) {
+			brelse(bp);
+			printf("ext4fs: journal blockmap: "
+			    "depth > 1 not supported\n");
+			return (EINVAL);
+		}
+
+		lentries = letoh16(leh->eh_entries);
+		if (lentries > leaf_max) {
+			brelse(bp);
+			printf("ext4fs: journal blockmap: "
+			    "too many leaf entries (%u)\n", lentries);
+			return (EINVAL);
+		}
+
+		jbd2_map_extents(ctx, (struct ext4fs_extent *)(leh + 1),
+		    lentries);
+		brelse(bp);
+	}
+
+	return (0);
+}
+
+/*
+ * Read one journal block, addressed by its journal-relative number.
+ *
+ * The number is translated through ctx->rc_blockmap.  Returns EIO (after
+ * logging) when the number is outside the map or the map entry is 0;
+ * otherwise returns whatever bread() returns, with the buffer in *bpp.
+ */
+static int
+jbd2_read_block(struct jbd2_replay_ctx *ctx, u_int32_t jblock,
+    struct buf **bpp)
+{
+	struct m_ext4fs *fs = ctx->rc_fs;
+	u_int64_t mapped;
+
+	if (jblock >= ctx->rc_blockmap_count) {
+		printf("ext4fs: journal block %u out of range (%u)\n",
+		    jblock, ctx->rc_blockmap_count);
+		return (EIO);
+	}
+
+	mapped = ctx->rc_blockmap[jblock].jb_fsblock;
+	if (mapped != 0)
+		return bread(ctx->rc_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, mapped),
+		    fs->m_block_size, bpp);
+
+	printf("ext4fs: journal block %u not mapped\n", jblock);
+	return (EIO);
+}
+
+/*
+ * Step to the following journal block, wrapping to rc_first once the
+ * incremented number reaches rc_maxlen.
+ */
+static u_int32_t
+jbd2_next_block(struct jbd2_replay_ctx *ctx, u_int32_t block)
+{
+	u_int32_t next = block + 1;
+
+	if (next < ctx->rc_maxlen)
+		return next;
+	return ctx->rc_first;
+}
+
+/*
+ * Parse one descriptor tag from a descriptor block.
+ *
+ * Returns 0 on success, sets *target to the filesystem block,
+ * *flags to the tag flags, and advances *offset past the tag (and past
+ * the 16-byte UUID that follows it when JBD2_FLAG_SAME_UUID is clear).
+ *
+ * Returns EINVAL when the tag would extend beyond bufsize.  In the
+ * CSUM_V3 case *offset, *target and *flags are left untouched on error;
+ * in the other case they may already have been updated when the 64-bit
+ * check fails.
+ *
+ * NOTE(review): only CSUM_V3 and 64BIT are consulted for the tag size.
+ * Confirm whether journals with the CSUM_V2 feature (which is not checked
+ * here) need a different tag length.
+ */
+static int
+jbd2_parse_tag(struct jbd2_replay_ctx *ctx, char *buf, u_int32_t bufsize,
+ u_int32_t *offset, u_int64_t *target, u_int32_t *flags)
+{
+ int has_csum_v3, has_64bit;
+ u_int32_t tag_size;
+
+ has_csum_v3 = ctx->rc_features_incompat &
+ JBD2_FEATURE_INCOMPAT_CSUM_V3;
+ has_64bit = ctx->rc_features_incompat &
+ JBD2_FEATURE_INCOMPAT_64BIT;
+
+ if (has_csum_v3) {
+ struct jbd2_block_tag3 *tag3;
+
+ /* v3 tags have a fixed size whether or not 64BIT is set */
+ tag_size = sizeof(struct jbd2_block_tag3);
+ if (*offset + tag_size > bufsize)
+ return (EINVAL);
+
+ tag3 = (struct jbd2_block_tag3 *)(buf + *offset);
+ *target = betoh32(tag3->t_blocknr);
+ *flags = betoh32(tag3->t_flags);
+ if (has_64bit)
+ *target |= (u_int64_t)betoh32(tag3->t_blocknr_high)
+ << 32;
+
+ *offset += tag_size;
+ if (!(*flags & JBD2_FLAG_SAME_UUID))
+ *offset += 16; /* skip UUID */
+ } else {
+ struct jbd2_block_tag *tag;
+
+ tag_size = 8; /* minimum: blocknr + checksum + flags */
+ if (*offset + tag_size > bufsize)
+ return (EINVAL);
+
+ tag = (struct jbd2_block_tag *)(buf + *offset);
+ *target = betoh32(tag->t_blocknr);
+ *flags = betoh16(tag->t_flags);
+
+ *offset += tag_size;
+ /* With 64BIT the tag carries 4 more bytes: the high block number */
+ if (has_64bit) {
+ if (*offset + 4 > bufsize)
+ return (EINVAL);
+ *target |= (u_int64_t)betoh32(tag->t_blocknr_high)
+ << 32;
+ *offset += 4;
+ }
+ if (!(*flags & JBD2_FLAG_SAME_UUID))
+ *offset += 16;
+ }
+
+ return (0);
+}
+
+/*
+ * Record that `block' was revoked by transaction `sequence'.
+ *
+ * If the block is already in the table its sequence is raised to the new
+ * one when the new one is larger, or when the new one is below 0x10000
+ * while the stored one is above 0xFFFF0000 (sequence wrap-around).
+ * Otherwise a new entry is appended; the table starts at 64 entries and
+ * doubles whenever it is full.
+ */
+static void
+jbd2_revoke_add(struct jbd2_replay_ctx *ctx, u_int64_t block,
+    u_int32_t sequence)
+{
+	struct jbd2_revoke_entry *ent;
+	u_int32_t n;
+
+	/* Already known?  Then at most refresh the sequence. */
+	for (n = 0; n < ctx->rc_revoke_count; n++) {
+		ent = &ctx->rc_revoke[n];
+		if (ent->re_block != block)
+			continue;
+		if (sequence > ent->re_sequence ||
+		    (sequence < 0x10000 && ent->re_sequence > 0xFFFF0000))
+			ent->re_sequence = sequence;
+		return;
+	}
+
+	/* Table full: allocate a bigger one and move the entries over */
+	if (ctx->rc_revoke_count >= ctx->rc_revoke_alloc) {
+		struct jbd2_revoke_entry *bigger;
+		u_int32_t capacity = 64;
+
+		if (ctx->rc_revoke_alloc)
+			capacity = ctx->rc_revoke_alloc * 2;
+
+		bigger = mallocarray(capacity,
+		    sizeof(struct jbd2_revoke_entry), M_TEMP,
+		    M_WAITOK | M_ZERO);
+		if (ctx->rc_revoke_count > 0)
+			memcpy(bigger, ctx->rc_revoke,
+			    ctx->rc_revoke_count *
+			    sizeof(struct jbd2_revoke_entry));
+		if (ctx->rc_revoke != NULL)
+			free(ctx->rc_revoke, M_TEMP,
+			    ctx->rc_revoke_alloc *
+			    sizeof(struct jbd2_revoke_entry));
+		ctx->rc_revoke = bigger;
+		ctx->rc_revoke_alloc = capacity;
+	}
+
+	ent = &ctx->rc_revoke[ctx->rc_revoke_count];
+	ent->re_block = block;
+	ent->re_sequence = sequence;
+	ctx->rc_revoke_count++;
+}
+
+/*
+ * Tell whether `block' has a revoke record that applies to transaction
+ * `sequence': one whose recorded sequence is >= sequence, or one recorded
+ * below 0x10000 while sequence is above 0xFFFF0000 (wrap-around).
+ *
+ * Returns 1 if such a record exists, 0 otherwise.
+ */
+static int
+jbd2_revoke_check(struct jbd2_replay_ctx *ctx, u_int64_t block,
+    u_int32_t sequence)
+{
+	const struct jbd2_revoke_entry *ent;
+	u_int32_t n;
+
+	for (n = 0; n < ctx->rc_revoke_count; n++) {
+		ent = &ctx->rc_revoke[n];
+		if (ent->re_block != block)
+			continue;
+		if (ent->re_sequence >= sequence)
+			return (1);
+		if (ent->re_sequence < 0x10000 && sequence > 0xFFFF0000)
+			return (1);
+	}
+	return (0);
+}
+
+/*
+ * Test whether a buffer starts with a journal header carrying the JBD2
+ * magic, the expected sequence number and, unless `type' is 0, the given
+ * block type.  Returns 1 on a match and 0 otherwise.
+ */
+static int
+jbd2_check_header(struct buf *bp, u_int32_t expected_seq, u_int32_t type)
+{
+	const struct jbd2_header *jh = (struct jbd2_header *)bp->b_data;
+
+	return (betoh32(jh->h_magic) == JBD2_MAGIC &&
+	    betoh32(jh->h_sequence) == expected_seq &&
+	    (type == 0 || betoh32(jh->h_blocktype) == type));
+}
+
+/*
+ * Count the tags in a descriptor block, i.e. the number of data blocks
+ * it describes.
+ *
+ * Tags are parsed from just after the journal header until one carries
+ * JBD2_FLAG_LAST_TAG, a tag fails to parse, or the end of the block is
+ * reached.  A tag that fails to parse simply ends the count; the function
+ * always returns 0.
+ */
+static int
+jbd2_count_tags(struct jbd2_replay_ctx *ctx, struct buf *bp,
+    u_int32_t *count)
+{
+	char *data = (char *)bp->b_data;
+	u_int32_t limit = ctx->rc_blocksize;
+	u_int32_t pos = sizeof(struct jbd2_header);
+	u_int32_t tagflags;
+	u_int64_t fsblock;
+
+	for (*count = 0; pos < limit; ) {
+		if (jbd2_parse_tag(ctx, data, limit, &pos, &fsblock,
+		    &tagflags) != 0)
+			break;
+		(*count)++;
+		if (tagflags & JBD2_FLAG_LAST_TAG)
+			break;
+	}
+	return (0);
+}
+
+/*
+ * Pass 1: SCAN
+ *
+ * Walk the journal from s_start/s_sequence, verify each transaction
+ * has a matching DESCRIPTOR and COMMIT block, and find the end of
+ * the valid journal.
+ *
+ * On return ctx->rc_end_sequence is one past the last transaction that
+ * was accepted (equal to rc_sequence if none was).  The function always
+ * returns 0; read errors and malformed blocks just end the scan.
+ *
+ * NOTE(review): a transaction is accepted only in the shape "zero or more
+ * revoke blocks, one descriptor block, its data blocks, one commit block".
+ * A second descriptor block in the same transaction, or a commit block
+ * that directly follows revoke blocks, ends the scan.  Confirm that this
+ * matches the journals that need to be replayed.
+ */
+static int
+jbd2_pass_scan(struct jbd2_replay_ctx *ctx)
+{
+ u_int32_t block, seq, next_seq;
+ u_int32_t tag_count;
+ struct buf *bp;
+ struct jbd2_header *hdr;
+ int error;
+
+ block = ctx->rc_start;
+ seq = ctx->rc_sequence;
+ ctx->rc_end_sequence = seq;
+
+ printf("ext4fs: journal scan: start block %u sequence %u\n",
+ block, seq);
+
+ while (1) {
+ error = jbd2_read_block(ctx, block, &bp);
+ if (error) {
+ printf("ext4fs: journal scan: read error at "
+ "block %u\n", block);
+ break;
+ }
+
+ /* A block without the magic or with another sequence ends the log */
+ hdr = (struct jbd2_header *)bp->b_data;
+ if (betoh32(hdr->h_magic) != JBD2_MAGIC) {
+ brelse(bp);
+ break;
+ }
+ if (betoh32(hdr->h_sequence) != seq) {
+ brelse(bp);
+ break;
+ }
+
+ switch (betoh32(hdr->h_blocktype)) {
+ case JBD2_DESCRIPTOR_BLOCK:
+ /* Count tags to know how many data blocks follow */
+ error = jbd2_count_tags(ctx, bp, &tag_count);
+ brelse(bp);
+ if (error)
+ goto done;
+
+ /* Skip over data blocks */
+ {
+ u_int32_t i;
+ for (i = 0; i < tag_count; i++)
+ block = jbd2_next_block(ctx, block);
+ }
+
+ /* Next block should be commit */
+ block = jbd2_next_block(ctx, block);
+ error = jbd2_read_block(ctx, block, &bp);
+ if (error) {
+ printf("ext4fs: journal scan: missing commit "
+ "for seq %u\n", seq);
+ goto done;
+ }
+
+ if (!jbd2_check_header(bp, seq,
+ JBD2_COMMIT_BLOCK)) {
+ brelse(bp);
+ printf("ext4fs: journal scan: bad commit "
+ "for seq %u\n", seq);
+ goto done;
+ }
+ brelse(bp);
+
+ /* Valid transaction */
+ next_seq = seq + 1;
+ ctx->rc_end_sequence = next_seq;
+ seq = next_seq;
+ block = jbd2_next_block(ctx, block);
+ break;
+
+ case JBD2_REVOKE_BLOCK:
+ /*
+ * A revoke block by itself is part of a transaction.
+ * There may be multiple revoke blocks before the
+ * commit.
+ */
+ brelse(bp);
+ block = jbd2_next_block(ctx, block);
+ break;
+
+ case JBD2_COMMIT_BLOCK:
+ /* Commit not preceded by a descriptor: treated as end of journal */
+ brelse(bp);
+ goto done;
+
+ default:
+ /* Any other block type also ends the scan */
+ brelse(bp);
+ goto done;
+ }
+ }
+
+done:
+ printf("ext4fs: journal scan: end sequence %u (%u transactions)\n",
+ ctx->rc_end_sequence,
+ ctx->rc_end_sequence - ctx->rc_sequence);
+ return (0);
+}
+
+/*
+ * Pass 2: REVOKE
+ *
+ * Walk the journal again from rc_start/rc_sequence up to, but not
+ * including, rc_end_sequence and hand every revoke record to
+ * jbd2_revoke_add(), tagged with the sequence number of the transaction
+ * it was found in.
+ *
+ * Returns 0 on success, the error of jbd2_read_block() or
+ * jbd2_count_tags(), or EINVAL when a revoke block claims to use more
+ * bytes than a journal block holds.
+ */
+static int
+jbd2_pass_revoke(struct jbd2_replay_ctx *ctx)
+{
+	u_int32_t block, seq;
+	u_int32_t tag_count, i;
+	u_int32_t rcount, off, entsize;
+	u_int64_t revblk;
+	struct buf *bp;
+	struct jbd2_header *hdr;
+	struct jbd2_revoke_header *rh;
+	char *data;
+	int error;
+
+	/* Revoke records are 8 bytes wide with the 64BIT feature, else 4. */
+	entsize = (ctx->rc_features_incompat &
+	    JBD2_FEATURE_INCOMPAT_64BIT) ? 8 : 4;
+
+	block = ctx->rc_start;
+	seq = ctx->rc_sequence;
+
+	while (seq < ctx->rc_end_sequence) {
+		error = jbd2_read_block(ctx, block, &bp);
+		if (error)
+			return (error);
+
+		hdr = (struct jbd2_header *)bp->b_data;
+		if (betoh32(hdr->h_magic) != JBD2_MAGIC ||
+		    betoh32(hdr->h_sequence) != seq) {
+			brelse(bp);
+			break;
+		}
+
+		switch (betoh32(hdr->h_blocktype)) {
+		case JBD2_DESCRIPTOR_BLOCK:
+			error = jbd2_count_tags(ctx, bp, &tag_count);
+			brelse(bp);
+			if (error)
+				return (error);
+			/* Step over the data blocks ... */
+			for (i = 0; i < tag_count; i++)
+				block = jbd2_next_block(ctx, block);
+			/* ... onto the commit block, then past it. */
+			block = jbd2_next_block(ctx, block);
+			block = jbd2_next_block(ctx, block);
+			seq++;
+			break;
+
+		case JBD2_REVOKE_BLOCK:
+			rh = (struct jbd2_revoke_header *)bp->b_data;
+			rcount = betoh32(rh->r_count);
+
+			/*
+			 * r_count comes straight from disk and bounds the
+			 * reads below; refuse a value larger than the block
+			 * instead of reading past the buffer.  (Assumes
+			 * journal buffers hold rc_blocksize bytes.)
+			 */
+			if (rcount > ctx->rc_blocksize) {
+				printf("ext4fs: journal revoke: bad r_count "
+				    "%u in block %u\n", rcount, block);
+				brelse(bp);
+				return (EINVAL);
+			}
+
+			data = (char *)bp->b_data;
+			for (off = sizeof(struct jbd2_revoke_header);
+			    off + entsize <= rcount; off += entsize) {
+				revblk = betoh32(*(u_int32_t *)(data + off));
+				if (entsize == 8)
+					revblk = revblk << 32 | betoh32(
+					    *(u_int32_t *)(data + off + 4));
+				jbd2_revoke_add(ctx, revblk, seq);
+			}
+			brelse(bp);
+			block = jbd2_next_block(ctx, block);
+			break;
+
+		case JBD2_COMMIT_BLOCK:
+			/* End of a transaction that had no descriptor. */
+			brelse(bp);
+			block = jbd2_next_block(ctx, block);
+			seq++;
+			break;
+
+		default:
+			brelse(bp);
+			block = jbd2_next_block(ctx, block);
+			break;
+		}
+	}
+
+	if (ctx->rc_revoke_count > 0)
+		printf("ext4fs: journal revoke: %u blocks revoked\n",
+		    ctx->rc_revoke_count);
+
+	return (0);
+}
+
+/*
+ * Copy one logged block from the journal to its home location.
+ *
+ * jblock is the journal block holding the data, target the filesystem
+ * block to overwrite, flags the descriptor tag flags.  Returns 0 or the
+ * error of the failing read or write.
+ */
+static int
+jbd2_replay_block(struct jbd2_replay_ctx *ctx, u_int32_t jblock,
+    u_int64_t target, u_int32_t flags)
+{
+	struct m_ext4fs *fs = ctx->rc_fs;
+	struct buf *dbp, *wbp;
+	int error;
+
+	/* Read the journal data block */
+	error = jbd2_read_block(ctx, jblock, &dbp);
+	if (error) {
+		printf("ext4fs: journal replay: "
+		    "read error block %u\n", jblock);
+		return (error);
+	}
+
+	/* Read the target filesystem block */
+	error = bread(ctx->rc_devvp,
+	    (daddr_t)EXT4FS_FSBTODB(fs, target),
+	    fs->m_block_size, &wbp);
+	if (error) {
+		/* bread() hands back a held buffer even on failure */
+		brelse(wbp);
+		brelse(dbp);
+		printf("ext4fs: journal replay: "
+		    "can't read target %llu\n",
+		    (unsigned long long)target);
+		return (error);
+	}
+
+	/* Copy data */
+	memcpy(wbp->b_data, dbp->b_data, fs->m_block_size);
+	brelse(dbp);
+
+	/* Un-escape: restore JBD2 magic if needed */
+	if (flags & JBD2_FLAG_ESCAPE) {
+		u_int32_t magic = htobe32(JBD2_MAGIC);
+		memcpy(wbp->b_data, &magic, 4);
+	}
+
+	/* Write to filesystem */
+	error = bwrite(wbp);
+	if (error)
+		printf("ext4fs: journal replay: "
+		    "write error target %llu\n",
+		    (unsigned long long)target);
+	return (error);
+}
+
+/*
+ * Pass 3: REPLAY
+ *
+ * Walk the journal a third time, writing data blocks to the filesystem
+ * that have not been revoked.
+ *
+ * Returns 0 on success.  Any I/O error while copying a block aborts the
+ * pass and is returned, so that the caller does not go on to mark the
+ * journal clean after an incomplete replay.  ctx->rc_replay_count
+ * receives the number of blocks written.
+ */
+static int
+jbd2_pass_replay(struct jbd2_replay_ctx *ctx)
+{
+	u_int32_t block, seq;
+	u_int32_t replayed = 0;
+	struct buf *bp;
+	struct jbd2_header *hdr;
+	char *buf;
+	u_int32_t offset, bufsize, flags;
+	u_int64_t target;
+	int error;
+
+	block = ctx->rc_start;
+	seq = ctx->rc_sequence;
+
+	while (seq < ctx->rc_end_sequence) {
+		error = jbd2_read_block(ctx, block, &bp);
+		if (error)
+			return (error);
+
+		hdr = (struct jbd2_header *)bp->b_data;
+		if (betoh32(hdr->h_magic) != JBD2_MAGIC ||
+		    betoh32(hdr->h_sequence) != seq) {
+			brelse(bp);
+			break;
+		}
+
+		switch (betoh32(hdr->h_blocktype)) {
+		case JBD2_DESCRIPTOR_BLOCK:
+			buf = (char *)bp->b_data;
+			bufsize = ctx->rc_blocksize;
+			offset = sizeof(struct jbd2_header);
+
+			/* Iterate over tags, each followed by a data block */
+			while (offset < bufsize) {
+				/* A tag that fails to parse ends the list. */
+				if (jbd2_parse_tag(ctx, buf, bufsize,
+				    &offset, &target, &flags) != 0)
+					break;
+
+				/* Advance to data block */
+				block = jbd2_next_block(ctx, block);
+
+				/* Skip if revoked */
+				if (!jbd2_revoke_check(ctx, target, seq)) {
+					error = jbd2_replay_block(ctx, block,
+					    target, flags);
+					if (error) {
+						brelse(bp);
+						ctx->rc_replay_count =
+						    replayed;
+						return (error);
+					}
+					replayed++;
+				}
+
+				if (flags & JBD2_FLAG_LAST_TAG)
+					break;
+			}
+
+			brelse(bp);
+
+			/* Step onto the commit block, then past it */
+			block = jbd2_next_block(ctx, block);
+			block = jbd2_next_block(ctx, block);
+			seq++;
+			break;
+
+		case JBD2_COMMIT_BLOCK:
+			/* End of a transaction that had no descriptor. */
+			brelse(bp);
+			block = jbd2_next_block(ctx, block);
+			seq++;
+			break;
+
+		case JBD2_REVOKE_BLOCK:
+		default:
+			/* Nothing to copy; move on. */
+			brelse(bp);
+			block = jbd2_next_block(ctx, block);
+			break;
+		}
+	}
+
+	ctx->rc_replay_count = replayed;
+	printf("ext4fs: journal replay: %u blocks replayed\n", replayed);
+
+	return (0);
+}
+
+/*
+ * Main entry point: replay the ext4 journal.
+ *
+ * Called during mount when the RECOVER incompat flag is set.
+ * Sizes the journal from sb_jnl_blocks, reads and validates the journal
+ * superblock, runs the three passes (scan, revoke, replay), then marks
+ * the journal clean and clears RECOVER in the ext4 superblock, both in
+ * memory (fs) and on disk.
+ *
+ * devvp is the device vnode, which must be open for writing.  Returns 0
+ * on success or an errno; on error the on-disk RECOVER flag is left as it
+ * was.  Every buffer obtained with bread() is released on failure too.
+ *
+ * NOTE(review): when the journal superblock says the log is clean
+ * (s_start == 0) this returns 0 without clearing RECOVER; confirm that
+ * is intended.
+ */
+int
+ext4fs_journal_replay(struct vnode *devvp, struct m_ext4fs *fs)
+{
+	struct jbd2_replay_ctx ctx;
+	struct jbd2_superblock *jsb;
+	struct buf *bp;
+	u_int64_t jblock0, jsize;
+	u_int32_t btype, incompat;
+	int error;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.rc_devvp = devvp;
+	ctx.rc_fs = fs;
+
+	/*
+	 * The journal superblock lives in journal block 0, which can only
+	 * be found through the block map.  Build a first map using the
+	 * journal size in bytes recorded in sb_jnl_blocks[15] (low) and
+	 * sb_jnl_blocks[16] (high).
+	 */
+	jsize = letoh32(fs->m_sble.sb_jnl_blocks[15]) |
+	    (u_int64_t)letoh32(fs->m_sble.sb_jnl_blocks[16]) << 32;
+	ctx.rc_maxlen = jsize / fs->m_block_size;
+
+	if (ctx.rc_maxlen == 0) {
+		printf("ext4fs: journal has zero size\n");
+		return (EINVAL);
+	}
+
+	/* Build journal block -> filesystem block mapping */
+	error = jbd2_build_blockmap(&ctx);
+	if (error)
+		goto out;
+
+	/* Read journal superblock (journal block 0) */
+	jblock0 = ctx.rc_blockmap[0].jb_fsblock;
+	if (jblock0 == 0) {
+		printf("ext4fs: journal block 0 not mapped\n");
+		error = EINVAL;
+		goto out;
+	}
+
+	error = bread(devvp, (daddr_t)EXT4FS_FSBTODB(fs, jblock0),
+	    fs->m_block_size, &bp);
+	if (error) {
+		brelse(bp);
+		printf("ext4fs: can't read journal superblock\n");
+		goto out;
+	}
+
+	jsb = (struct jbd2_superblock *)bp->b_data;
+
+	/* Validate journal superblock */
+	if (betoh32(jsb->s_header.h_magic) != JBD2_MAGIC) {
+		printf("ext4fs: bad journal magic 0x%x\n",
+		    betoh32(jsb->s_header.h_magic));
+		brelse(bp);
+		error = EINVAL;
+		goto out;
+	}
+	btype = betoh32(jsb->s_header.h_blocktype);
+	if (btype != JBD2_SUPERBLOCK_V1 && btype != JBD2_SUPERBLOCK_V2) {
+		printf("ext4fs: bad journal superblock version %u\n",
+		    btype);
+		brelse(bp);
+		error = EINVAL;
+		goto out;
+	}
+
+	ctx.rc_blocksize = betoh32(jsb->s_blocksize);
+	ctx.rc_maxlen = betoh32(jsb->s_maxlen);
+	ctx.rc_first = betoh32(jsb->s_first);
+	ctx.rc_sequence = betoh32(jsb->s_sequence);
+	ctx.rc_start = betoh32(jsb->s_start);
+
+	/* Feature words are only read from a V2 superblock. */
+	if (btype == JBD2_SUPERBLOCK_V2)
+		ctx.rc_features_incompat = betoh32(jsb->s_feature_incompat);
+	else
+		ctx.rc_features_incompat = 0;
+
+	brelse(bp);
+
+	if (ctx.rc_blocksize != fs->m_block_size) {
+		printf("ext4fs: journal blocksize %u != fs blocksize %llu\n",
+		    ctx.rc_blocksize, (unsigned long long)fs->m_block_size);
+		error = EINVAL;
+		goto out;
+	}
+
+	/* If s_start == 0, journal is clean: nothing to replay */
+	if (ctx.rc_start == 0) {
+		printf("ext4fs: journal is clean, no replay needed\n");
+		error = 0;
+		goto out;
+	}
+
+	/* Rebuild blockmap with correct maxlen from journal superblock */
+	free(ctx.rc_blockmap, M_TEMP,
+	    ctx.rc_blockmap_count * sizeof(struct jbd2_blockmap_entry));
+	ctx.rc_blockmap = NULL;
+	ctx.rc_blockmap_count = 0;
+	error = jbd2_build_blockmap(&ctx);
+	if (error)
+		goto out;
+
+	printf("ext4fs: replaying journal (sequence %u, start block %u, "
+	    "%u journal blocks)\n",
+	    ctx.rc_sequence, ctx.rc_start, ctx.rc_maxlen);
+
+	/* Pass 1: SCAN */
+	error = jbd2_pass_scan(&ctx);
+	if (error)
+		goto out;
+
+	if (ctx.rc_end_sequence == ctx.rc_sequence) {
+		printf("ext4fs: journal has no valid transactions\n");
+		goto clear;
+	}
+
+	/* Pass 2: REVOKE */
+	error = jbd2_pass_revoke(&ctx);
+	if (error)
+		goto out;
+
+	/* Pass 3: REPLAY */
+	error = jbd2_pass_replay(&ctx);
+	if (error)
+		goto out;
+
+clear:
+	/*
+	 * Mark journal clean: set s_start=0 in journal superblock.
+	 */
+	error = bread(devvp, (daddr_t)EXT4FS_FSBTODB(fs, jblock0),
+	    fs->m_block_size, &bp);
+	if (error) {
+		brelse(bp);
+		printf("ext4fs: can't reread journal superblock\n");
+		goto out;
+	}
+
+	jsb = (struct jbd2_superblock *)bp->b_data;
+	jsb->s_start = htobe32(0);
+	/* Advance sequence past what we replayed */
+	jsb->s_sequence = htobe32(ctx.rc_end_sequence);
+	error = bwrite(bp);
+	if (error) {
+		printf("ext4fs: can't write journal superblock\n");
+		goto out;
+	}
+
+	/*
+	 * Clear RECOVER flag and set STATE_VALID in ext4 superblock.
+	 */
+	incompat = letoh32(fs->m_sble.sb_feature_incompat);
+	incompat &= ~EXT4FS_FEATURE_INCOMPAT_RECOVER;
+	fs->m_sble.sb_feature_incompat = htole32(incompat);
+	fs->m_feature_incompat = incompat;
+
+	fs->m_sble.sb_state = htole16(EXT4FS_STATE_VALID);
+	fs->m_state = EXT4FS_STATE_VALID;
+
+	/* Recompute superblock checksum */
+	fs->m_sble.sb_checksum = htole32(ext4fs_sb_csum(&fs->m_sble));
+
+	/* Write superblock to disk */
+	error = bread(devvp,
+	    (daddr_t)(EXT4FS_SUPER_BLOCK_OFFSET / DEV_BSIZE),
+	    EXT4FS_SUPER_BLOCK_SIZE, &bp);
+	if (error) {
+		brelse(bp);
+		printf("ext4fs: can't read superblock for update\n");
+		goto out;
+	}
+	memcpy(bp->b_data, &fs->m_sble, sizeof(struct ext4fs));
+	error = bwrite(bp);
+	if (error) {
+		printf("ext4fs: can't write superblock\n");
+		goto out;
+	}
+
+	printf("ext4fs: journal replay complete\n");
+
+out:
+	if (ctx.rc_blockmap != NULL)
+		free(ctx.rc_blockmap, M_TEMP,
+		    ctx.rc_blockmap_count *
+		    sizeof(struct jbd2_blockmap_entry));
+	if (ctx.rc_revoke != NULL)
+		free(ctx.rc_revoke, M_TEMP,
+		    ctx.rc_revoke_alloc *
+		    sizeof(struct jbd2_revoke_entry));
+
+	return (error);
+}
diff --git a/sys/ufs/ext4fs/ext4fs_journal.h b/sys/ufs/ext4fs/ext4fs_journal.h
new file mode 100644
index 000000000..318e757e2
--- /dev/null
+++ b/sys/ufs/ext4fs/ext4fs_journal.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2025 kmx.io.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _EXT4FS_JOURNAL_H_
+#define _EXT4FS_JOURNAL_H_
+
+/*
+ * JBD2 journal on-disk structures.
+ * All JBD2 fields are big-endian.
+ */
+
+/* Value of h_magic in every journal block header (host order) */
+#define JBD2_MAGIC 0xC03B3998
+
+/* Block types: values of jbd2_header.h_blocktype */
+#define JBD2_DESCRIPTOR_BLOCK 1
+#define JBD2_COMMIT_BLOCK 2
+#define JBD2_SUPERBLOCK_V1 3
+#define JBD2_SUPERBLOCK_V2 4
+#define JBD2_REVOKE_BLOCK 5
+
+/* Descriptor tag flags */
+#define JBD2_FLAG_ESCAPE 0x01 /* replay writes JBD2_MAGIC back over the first 4 bytes */
+#define JBD2_FLAG_SAME_UUID 0x02
+#define JBD2_FLAG_DELETED 0x04
+#define JBD2_FLAG_LAST_TAG 0x08 /* last tag of the descriptor block */
+
+/* Journal feature flags (in journal superblock) */
+#define JBD2_FEATURE_COMPAT_CHECKSUM 0x01
+
+#define JBD2_FEATURE_INCOMPAT_REVOKE 0x01
+#define JBD2_FEATURE_INCOMPAT_64BIT 0x02 /* revoke records are 8 bytes instead of 4 */
+#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x04
+#define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x08
+#define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x10
+
+/*
+ * Common block header (12 bytes).
+ * The replay code reads it from the start of a journal block to classify
+ * the block; all fields are big-endian on disk.
+ */
+struct jbd2_header {
+ u_int32_t h_magic; /* JBD2_MAGIC */
+ u_int32_t h_blocktype; /* one of the JBD2_*_BLOCK / JBD2_SUPERBLOCK_* values */
+ u_int32_t h_sequence; /* sequence number of the owning transaction */
+} __attribute__((packed));
+
+/*
+ * Journal superblock, stored in journal block 0.
+ * The hex comments give the byte offset of the field that follows.
+ * Fields are big-endian on disk.
+ */
+struct jbd2_superblock {
+ struct jbd2_header s_header; /* h_blocktype is JBD2_SUPERBLOCK_V1 or _V2 */
+ /* 0x0C */
+ u_int32_t s_blocksize; /* journal block size; must equal the fs block size */
+ u_int32_t s_maxlen; /* number of blocks in the journal */
+ u_int32_t s_first;
+ /* 0x18 */
+ u_int32_t s_sequence; /* sequence number replay starts from */
+ u_int32_t s_start; /* journal block replay starts at; 0 = journal clean */
+ /* 0x20 */
+ u_int32_t s_errno;
+ /* V2+ fields */
+ u_int32_t s_feature_compat;
+ u_int32_t s_feature_incompat; /* JBD2_FEATURE_INCOMPAT_* bits */
+ u_int32_t s_feature_ro_compat;
+ /* 0x30 */
+ u_int8_t s_uuid[16];
+ /* 0x40 */
+ u_int32_t s_nr_users;
+ u_int32_t s_dynsuper;
+ /* 0x48 */
+ u_int32_t s_max_transaction;
+ u_int32_t s_max_trans_data;
+ /* 0x50 */
+ u_int8_t s_checksum_type;
+ u_int8_t s_padding2[3];
+ /* 0x54 */
+ u_int8_t s_padding[168];
+ /* 0xFC */
+ u_int32_t s_checksum;
+ /* 0x100 */
+ u_int8_t s_users[16 * 48];
+} __attribute__((packed));
+
+/*
+ * Descriptor block tag v3 (CSUM_V3, 16 bytes without UUID).
+ * On-disk layout: do not reorder or resize fields.
+ */
+struct jbd2_block_tag3 {
+ u_int32_t t_blocknr;
+ u_int32_t t_flags; /* presumably JBD2_FLAG_* bits; confirm against the tag parser */
+ u_int32_t t_blocknr_high;
+ u_int32_t t_checksum;
+} __attribute__((packed));
+
+/*
+ * Descriptor block tag v2 (no CSUM_V3).
+ * On-disk layout: do not reorder or resize fields.
+ */
+struct jbd2_block_tag {
+ u_int32_t t_blocknr;
+ u_int16_t t_checksum;
+ u_int16_t t_flags; /* presumably JBD2_FLAG_* bits; confirm against the tag parser */
+ u_int32_t t_blocknr_high; /* only if 64BIT */
+} __attribute__((packed));
+
+/*
+ * Revoke block header.
+ * The revoke records (4 bytes each, 8 with the 64BIT feature) follow
+ * directly after it in the block.
+ */
+struct jbd2_revoke_header {
+ struct jbd2_header r_header;
+ u_int32_t r_count; /* bytes used in this block; the revoke pass starts counting at the header */
+} __attribute__((packed));
+
+/*
+ * Revocation table entry (in-memory only, host byte order).
+ * Presumably filled from the (block, sequence) pairs handed to
+ * jbd2_revoke_add(); confirm against that function.
+ */
+struct jbd2_revoke_entry {
+ u_int64_t re_block;
+ u_int32_t re_sequence;
+};
+
+/*
+ * Block map entry: journal block -> filesystem block.
+ * The map is indexed by journal block number.
+ */
+struct jbd2_blockmap_entry {
+ u_int64_t jb_fsblock; /* filesystem block number; 0 is treated as "not mapped" */
+};
+
+/*
+ * In-memory replay context.
+ * Zeroed and filled in by ext4fs_journal_replay(), then passed to each
+ * replay pass.
+ */
+struct jbd2_replay_ctx {
+ struct vnode *rc_devvp; /* device vnode all block I/O goes through */
+ struct m_ext4fs *rc_fs; /* in-memory superblock of the filesystem */
+
+ /* Journal geometry (from journal superblock, host order) */
+ u_int32_t rc_blocksize;
+ u_int32_t rc_maxlen; /* journal length in blocks */
+ u_int32_t rc_first;
+ u_int32_t rc_sequence; /* starting sequence */
+ u_int32_t rc_start; /* starting block */
+
+ /* Journal feature flags */
+ u_int32_t rc_features_incompat;
+
+ /* Block map: journal block number -> filesystem block */
+ struct jbd2_blockmap_entry *rc_blockmap; /* M_TEMP array, freed by ext4fs_journal_replay() */
+ u_int32_t rc_blockmap_count; /* number of entries in rc_blockmap */
+
+ /* Revocation table */
+ struct jbd2_revoke_entry *rc_revoke; /* M_TEMP array, freed by ext4fs_journal_replay() */
+ u_int32_t rc_revoke_count; /* reported as the number of revoked blocks */
+ u_int32_t rc_revoke_alloc; /* entries allocated in rc_revoke */
+
+ /* Scan result */
+ u_int32_t rc_end_sequence; /* one past the last complete transaction */
+ u_int32_t rc_replay_count; /* blocks written by the replay pass */
+};
+
+int ext4fs_journal_replay(struct vnode *, struct m_ext4fs *);
+
+#endif /* _EXT4FS_JOURNAL_H_ */
diff --git a/sys/ufs/ext4fs/ext4fs_vfsops.c b/sys/ufs/ext4fs/ext4fs_vfsops.c
new file mode 100644
index 000000000..5440f9898
--- /dev/null
+++ b/sys/ufs/ext4fs/ext4fs_vfsops.c
@@ -0,0 +1,1460 @@
+/*
+ * Copyright (c) 2025 kmx.io.
+ * Copyright (c) 1997 Manuel Bouyer.
+ * Copyright (c) 1989, 1991, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Modified for ext4fs by kmx.io.
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+//#include <sys/socket.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+#include <sys/disk.h>
+//#include <sys/mbuf.h>
+#include <sys/fcntl.h>
+//#include <sys/disklabel.h>
+//#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/rwlock.h>
+#include <sys/stat.h>
+#include <sys/dkio.h>
+#include <sys/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+//#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext4fs/ext4fs.h>
+#include <ufs/ext4fs/ext4fs_extern.h>
+#include <ufs/ext4fs/ext4fs_journal.h>
+
+/* Allocation pools, initialised in ext4fs_init() */
+struct pool ext4fs_inode_pool; /* items of sizeof(struct inode) */
+struct pool ext4fs_dinode_pool; /* items of sizeof(struct ext4fs_dinode_256) */
+
+/*
+ * Print the name of every entry of the feature table `features' whose
+ * f_mask has a bit in common with `mask', each followed by a space.
+ *
+ * Uses an integer variable `i' that must be in scope at the call site.
+ * The expansion is wrapped in do { } while (0) so that the macro acts as
+ * a single statement and cannot capture an `else' that follows it.
+ */
+#define PRINTF_FEATURES(mask, features)					\
+	do {								\
+		for (i = 0; i < nitems(features); i++)			\
+			if ((mask) & (features)[i].f_mask)		\
+				printf("%s ", (features)[i].f_name);	\
+	} while (0)
+
+/* Helpers defined later in this file */
+int ext4fs_block_group_has_super_block(int);
+int ext4fs_mountfs(struct vnode *, struct mount *, struct proc *);
+int ext4fs_sbcheck(struct ext4fs *, int);
+void ext4fs_sbload(struct ext4fs *, struct m_ext4fs *);
+int ext4fs_sbfill(struct vnode *, struct m_ext4fs *);
+
+/*
+ * VFS operations vector for ext4fs.
+ * Entries named ufs_* point at functions with the ufs_ prefix rather
+ * than at ext4fs-specific ones.
+ */
+const struct vfsops ext4fs_vfsops = {
+ .vfs_mount = ext4fs_mount,
+ .vfs_start = ufs_start,
+ .vfs_unmount = ext4fs_unmount,
+ .vfs_root = ufs_root,
+ .vfs_quotactl = ufs_quotactl,
+ .vfs_statfs = ext4fs_statfs,
+ .vfs_sync = ext4fs_sync,
+ .vfs_vget = ext4fs_vget,
+ .vfs_fhtovp = ext4fs_fhtovp, /* stub: always fails with EOPNOTSUPP */
+ .vfs_vptofh = ext4fs_vptofh,
+ .vfs_init = ext4fs_init,
+ .vfs_sysctl = ext4fs_sysctl,
+ .vfs_checkexp = ufs_check_export,
+};
+
+/* NOTE(review): second declaration of ext4fs_inode_pool in this file; redundant, can be dropped. */
+struct pool ext4fs_inode_pool;
+
+/*
+ * Return 1 if block group `group' is 0, 1, or a power of 3, 5 or 7,
+ * and 0 otherwise (including for negative group numbers).
+ *
+ * Each base is tested on its own and a power is only multiplied while
+ * the product still fits below `group', so no intermediate value can
+ * overflow however large the group number is.
+ */
+int
+ext4fs_block_group_has_super_block(int group)
+{
+	static const int bases[] = { 3, 5, 7 };
+	unsigned int b;
+	int base, power;
+
+	if (group == 0 || group == 1)
+		return 1;
+
+	for (b = 0; b < sizeof(bases) / sizeof(bases[0]); b++) {
+		base = bases[b];
+		/* power <= group / base  implies  power * base <= group */
+		for (power = base; power <= group / base; power *= base)
+			continue;
+		if (power == group)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * File handle to vnode translation is not supported: log a notice and
+ * fail with EOPNOTSUPP.  None of the arguments is used.
+ */
+int
+ext4fs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
+{
+	(void)vpp;
+	(void)fhp;
+	(void)mp;
+
+	printf("ext4fs_fhtovp: not implemented\n");
+
+	return (EOPNOTSUPP);
+}
+
+/*
+ * Flush out all the files in a filesystem.
+ *
+ * First vflush()es every vnode of mount `mp' with `flags', then fsyncs
+ * the device vnode (MNT_WAIT) with the credentials of `p' while holding
+ * its lock.  Returns the first error encountered, 0 otherwise.
+ */
+int
+ext4fs_flushfiles(struct mount *mp, int flags, struct proc *p)
+{
+	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
+	int rv;
+
+	/* Files first ... */
+	rv = vflush(mp, NULL, flags);
+	if (rv != 0)
+		return (rv);
+
+	/* ... then the filesystem metadata held on the device vnode. */
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+	rv = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
+	VOP_UNLOCK(devvp);
+
+	return (rv);
+}
+
+/*
+ * One-time initialisation: set up the two ext4fs pools, then hand over
+ * to ufs_init() and return its result.
+ */
+int
+ext4fs_init(struct vfsconf *vfsp)
+{
+	pool_init(&ext4fs_inode_pool, sizeof(struct inode), 0,
+	    IPL_NONE, PR_WAITOK, "ext4inopl", NULL);
+	pool_init(&ext4fs_dinode_pool, sizeof(struct ext4fs_dinode_256), 0,
+	    IPL_NONE, PR_WAITOK, "ext4dinopl", NULL);
+
+	return (ufs_init(vfsp));
+}
+
+/*
+ * VFS mount entry point.
+ *
+ * `data' is used as a struct ufs_args; its fspec string is copied in
+ * with copyinstr(), passed through disk_map() and looked up with
+ * namei().  The result must be a block device with a valid major
+ * number.
+ *
+ * For a new mount the work is done by ext4fs_mountfs().  For MNT_UPDATE
+ * the only check is that the device matches the one already mounted;
+ * nothing else is changed.
+ *
+ * Returns 0 on success or an errno.  On success of a new mount the
+ * reference on the device vnode obtained from namei() is kept.
+ */
+int
+ext4fs_mount(struct mount *mp, const char *path, void *data,
+    struct nameidata *ndp, struct proc *p)
+{
+	struct ufs_args *args = data;
+	struct vnode *devvp;
+	char fname[MNAMELEN];
+	char fspec[MNAMELEN];
+	int error;
+
+	error = copyinstr(args->fspec, fspec, sizeof(fspec), NULL);
+	if (error)
+		return (error);
+
+	/* Fall back to the name as given when disk_map() fails. */
+	if (disk_map(fspec, fname, MNAMELEN, DM_OPENBLCK) == -1)
+		memcpy(fname, fspec, sizeof(fname));
+
+	NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fname, p);
+	if ((error = namei(ndp)) != 0)
+		return (error);
+	devvp = ndp->ni_vp;
+
+	if (devvp->v_type != VBLK) {
+		error = ENOTBLK;
+		goto fail;
+	}
+	if (major(devvp->v_rdev) >= nblkdev) {
+		error = ENXIO;
+		goto fail;
+	}
+
+	if ((mp->mnt_flag & MNT_UPDATE) == 0)
+		error = ext4fs_mountfs(devvp, mp, p);
+	else if (devvp != VFSTOUFS(mp)->um_devvp)
+		error = EINVAL;	/* XXX needs translation */
+	else
+		vrele(devvp);	/* update: the mount already holds devvp */
+	if (error)
+		goto fail;
+
+	strlcpy(mp->mnt_stat.f_mntfromname, fname,
+	    sizeof(mp->mnt_stat.f_mntfromname));
+	strlcpy(mp->mnt_stat.f_mntonname, path,
+	    sizeof(mp->mnt_stat.f_mntonname));
+
+	return (0);
+
+fail:
+	vrele(devvp);
+	return (error);
+}
+
+/*
+ * Common code for mount and mountroot
+ *
+ * Opens the device, reads and checks the superblock, loads the group
+ * descriptors, replays the journal when RECOVER is set, and fills in
+ * the ufsmount attached to `mp'.  Returns 0 or an errno; on failure
+ * everything allocated here is freed and the device is closed.
+ *
+ * `openmode' records the flags the device is currently open with (0
+ * when closed), so that the error path closes it exactly once, also
+ * when a reopen around the journal replay has failed.
+ */
+int
+ext4fs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
+{
+	struct ufsmount *ump = NULL;
+	struct buf *bp = NULL;
+	struct ext4fs *sble;
+	struct m_ext4fs *mfs = NULL;
+	dev_t dev;
+	int error, ronly, openmode;
+	struct ucred *cred;
+
+	dev = devvp->v_rdev;
+	cred = p ? p->p_ucred : NOCRED;
+	/*
+	 * Disallow multiple mounts of the same device.
+	 * Disallow mounting of a device that is currently in use
+	 * (except for root, which might share swap device for miniroot).
+	 * Flush out any old buffers remaining from a previous use.
+	 */
+	if ((error = vfs_mountedon(devvp)) != 0)
+		return (error);
+	if (vcount(devvp) > 1 && devvp != rootvp)
+		return (EBUSY);
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+	error = vinvalbuf(devvp, V_SAVE, cred, p, 0, INFSLP);
+	VOP_UNLOCK(devvp);
+	if (error != 0)
+		return (error);
+
+	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+	openmode = ronly ? FREAD : FREAD|FWRITE;
+	error = VOP_OPEN(devvp, openmode, FSCRED, p);
+	if (error)
+		return (error);
+
+	/*
+	 * Read the superblock from disk.
+	 */
+	error = bread(devvp, (daddr_t)(EXT4FS_SUPER_BLOCK_OFFSET /
+	    DEV_BSIZE), EXT4FS_SUPER_BLOCK_SIZE, &bp);
+	if (error)
+		goto out;
+	sble = (struct ext4fs *)bp->b_data;
+	error = ext4fs_sbcheck(sble, ronly);
+	if (error)
+		goto out;
+
+	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
+	mfs = ump->um_e4fs = malloc(sizeof(struct m_ext4fs), M_UFSMNT,
+	    M_WAITOK | M_ZERO);
+
+	/*
+	 * Copy in the superblock, compute in-memory values
+	 * and load group descriptors.
+	 */
+	ext4fs_sbload(sble, mfs);
+	if ((error = ext4fs_sbfill(devvp, mfs)) != 0)
+		goto out;
+	brelse(bp);
+	bp = NULL;
+
+	/*
+	 * If the filesystem needs journal recovery, replay it now.
+	 * For r/o mounts, we temporarily reopen the device r/w.
+	 */
+	if ((mfs->m_feature_compat & EXT4FS_FEATURE_COMPAT_HAS_JOURNAL) &&
+	    (mfs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_RECOVER)) {
+		if (ronly) {
+			/* Reopen device r/w for replay */
+			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+			VOP_CLOSE(devvp, openmode, cred, p);
+			VOP_UNLOCK(devvp);
+			openmode = 0;
+			error = VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, p);
+			if (error) {
+				printf("ext4fs: can't reopen device r/w "
+				    "for journal replay\n");
+				goto out;
+			}
+			openmode = FREAD | FWRITE;
+		}
+
+		error = ext4fs_journal_replay(devvp, mfs);
+		if (error) {
+			printf("ext4fs: journal replay failed: %d\n", error);
+			printf("ext4fs: use Linux e2fsck to repair\n");
+			/* Leave RECOVER set, fail the mount */
+		}
+
+		/* Reopen device r/o if it was a r/o mount */
+		if (ronly) {
+			int reopen_error;
+
+			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+			VOP_CLOSE(devvp, openmode, cred, p);
+			VOP_UNLOCK(devvp);
+			openmode = 0;
+			reopen_error = VOP_OPEN(devvp, FREAD, FSCRED, p);
+			if (reopen_error == 0) {
+				openmode = FREAD;
+			} else if (error == 0) {
+				printf("ext4fs: can't reopen device r/o\n");
+				error = reopen_error;
+			}
+		}
+		if (error)
+			goto out;
+
+		/*
+		 * Replay may have changed group descriptors and
+		 * superblock counters. Reload them.
+		 */
+		if (mfs->m_gd != NULL) {
+			size_t gd_size = mfs->m_block_group_count *
+			    sizeof(struct ext4fs_block_group_descriptor);
+			free(mfs->m_gd, M_UFSMNT, gd_size);
+			mfs->m_gd = NULL;
+		}
+		/* Re-read superblock from disk */
+		error = bread(devvp,
+		    (daddr_t)(EXT4FS_SUPER_BLOCK_OFFSET / DEV_BSIZE),
+		    EXT4FS_SUPER_BLOCK_SIZE, &bp);
+		if (error)
+			goto out;
+		ext4fs_sbload((struct ext4fs *)bp->b_data, mfs);
+		brelse(bp);
+		bp = NULL;
+
+		error = ext4fs_sbfill(devvp, mfs);
+		if (error)
+			goto out;
+	}
+
+	mfs->m_read_only = ronly;
+	ump->um_fstype = UM_EXT4FS;
+
+	if (ronly == 0) {
+		/* In-memory state while mounted r/w: 0 if the filesystem
+		 * was valid, the error state otherwise. */
+		if (mfs->m_state == EXT4FS_STATE_VALID)
+			mfs->m_state = 0;
+		else
+			mfs->m_state = EXT4FS_STATE_ERROR;
+		mfs->m_fs_was_modified = 1;
+	}
+
+	mp->mnt_data = ump;
+	mp->mnt_stat.f_fsid.val[0] = (long)dev;
+	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
+	mp->mnt_stat.f_namemax = MAXNAMLEN;
+	mp->mnt_flag |= MNT_LOCAL;
+	ump->um_mountp = mp;
+
+	ump->um_dev = dev;
+	ump->um_devvp = devvp;
+	ump->um_nindir = EXT4FS_NINDIR(mfs);
+	ump->um_bptrtodb = mfs->m_fs_block_to_disk_block;
+	ump->um_seqinc = 1; /* no frags */
+	ump->um_maxsymlinklen = EXT4FS_SYMLINK_LEN_MAX;
+	devvp->v_specmountpoint = mp;
+
+	if (ronly == 0)
+		ext4fs_sbwrite(mp);
+
+	return (0);
+out:
+	if (devvp->v_specinfo)
+		devvp->v_specmountpoint = NULL;
+	if (bp)
+		brelse(bp);
+	/* Close the device only if it is still open. */
+	if (openmode != 0) {
+		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+		(void)VOP_CLOSE(devvp, openmode, cred, p);
+		VOP_UNLOCK(devvp);
+	}
+	if (ump) {
+		if (mfs && mfs->m_gd != NULL) {
+			size_t gd_size = mfs->m_block_group_count *
+			    sizeof(struct ext4fs_block_group_descriptor);
+			free(mfs->m_gd, M_UFSMNT, gd_size);
+		}
+		free(mfs, M_UFSMNT, sizeof *mfs);
+		free(ump, M_UFSMNT, sizeof *ump);
+		mp->mnt_data = NULL;
+	}
+	return (error);
+}
+
+/*
+ * Validate an on-disk (little-endian) superblock before mounting.
+ *
+ * sble is the raw superblock; ronly is non-zero for a read-only mount.
+ * Returns 0 if the filesystem can be mounted, otherwise EIO, EINVAL or
+ * EROFS after printing the reason.  A set RECOVER flag is accepted as
+ * long as the filesystem has a journal; the replay itself is done by
+ * ext4fs_mountfs().
+ *
+ * NOTE(review): sb_block_group_descriptor_size is converted with
+ * letoh32(); confirm the field is declared 32 bits wide.
+ */
+int
+ext4fs_sbcheck(struct ext4fs *sble, int ronly)
+{
+	u_int32_t mask, tmp;
+	int i;
+
+	tmp = letoh16(sble->sb_magic);
+	if (tmp != EXT4FS_MAGIC) {
+		printf("ext4fs: wrong magic number 0x%x\n", tmp);
+		return (EIO); /* XXX needs translation */
+	}
+
+	if (ext4fs_sb_csum_verify(sble) != 0) {
+		printf("ext4fs: superblock checksum verification failed\n");
+		return (EINVAL);
+	}
+
+	tmp = letoh32(sble->sb_log_block_size);
+	if (tmp > 2) {
+		/* skewed log(block size): 1024 -> 0 | 2048 -> 1 | 4096 -> 2 */
+		tmp += 10;
+		printf("ext4fs: wrong log2(block size) %u\n", tmp);
+		return (EIO); /* XXX needs translation */
+	}
+
+	/* Both values are used as divisors when the geometry is computed. */
+	if (letoh32(sble->sb_blocks_per_group) == 0) {
+		printf("ext4fs: zero blocks per group\n");
+		return (EIO);
+	}
+
+	if (letoh32(sble->sb_inodes_per_group) == 0) {
+		printf("ext4fs: zero inodes per group\n");
+		return (EIO);
+	}
+
+	tmp = letoh32(sble->sb_revision_level);
+	if (tmp != EXT4FS_REV_DYNAMIC) {
+		printf("ext4fs: wrong revision number 0x%x\n", tmp);
+		return (EIO); /* XXX needs translation */
+	}
+
+	tmp = letoh16(sble->sb_inode_size);
+	if (tmp != 256) {
+		printf("ext4fs: unsupported inode size: %u\n", tmp);
+		return (EINVAL);
+	}
+
+	tmp = letoh32(sble->sb_first_non_reserved_inode);
+	if (tmp != EXT4FS_INODE_FIRST) {
+		printf("ext4fs: first inode at 0x%x\n", tmp);
+		return (EINVAL); /* XXX needs translation */
+	}
+
+	tmp = letoh32(sble->sb_block_group_descriptor_size);
+	if (tmp != sizeof(struct ext4fs_block_group_descriptor)) {
+		printf("ext4fs: block group descriptor size is 0x%x\n",
+		    tmp);
+		return (EINVAL);
+	}
+
+	/* Any unknown incompat feature makes the filesystem unusable. */
+	tmp = letoh32(sble->sb_feature_incompat);
+	mask = tmp & ~EXT4FS_FEATURE_INCOMPAT_SUPPORTED;
+	if (mask) {
+		printf("ext4fs: unsupported incompat features: ");
+		PRINTF_FEATURES(mask, ext4fs_feature_incompat);
+		printf("\n");
+		return (EINVAL); /* XXX needs translation */
+	}
+
+	if (tmp & EXT4FS_FEATURE_INCOMPAT_RECOVER) {
+		printf("ext4fs: file system needs journal recovery\n");
+		if (!(letoh32(sble->sb_feature_compat) &
+		    EXT4FS_FEATURE_COMPAT_HAS_JOURNAL)) {
+			printf("ext4fs: RECOVER set but no journal\n");
+			return (EINVAL);
+		}
+		/* Allow mount to proceed; replay happens in mountfs */
+	}
+
+	/* Unknown ro_compat features only forbid writing. */
+	tmp = letoh32(sble->sb_feature_ro_compat) &
+	    ~EXT4FS_FEATURE_RO_COMPAT_SUPPORTED;
+	if (!ronly && tmp) {
+		printf("ext4fs: unsupported R/O compat features: ");
+		PRINTF_FEATURES(tmp, ext4fs_feature_ro_compat);
+		printf("\n");
+		return (EROFS);
+	}
+
+	/* An unclean filesystem with nothing to replay needs e2fsck. */
+	if (!ronly &&
+	    !(letoh32(sble->sb_feature_incompat) &
+	    EXT4FS_FEATURE_INCOMPAT_RECOVER) &&
+	    !(letoh16(sble->sb_state) & EXT4FS_STATE_VALID)) {
+		printf("ext4fs: file system not clean, run e2fsck\n");
+		return (EROFS);
+	}
+
+	return (0);
+}
+
+/*
+ * Compute the derived geometry of `mfs' from the fields loaded by
+ * ext4fs_sbload(), read the block group descriptors from `devvp' into a
+ * freshly allocated mfs->m_gd, verify their checksums, and record the
+ * doubly-indirect block pointer of the resize inode.
+ *
+ * Returns 0 on success or an errno; on failure m_gd is freed and left
+ * NULL.  The geometry is re-validated here because this function is
+ * also called on the superblock re-read after a journal replay, which
+ * does not go through ext4fs_sbcheck() again.
+ *
+ * NOTE(review): only the low 32 bits of the block count and of the
+ * inode table location are used here.
+ */
+int
+ext4fs_sbfill(struct vnode *devvp, struct m_ext4fs *mfs)
+{
+	struct ext4fs_dinode *rdp;
+	struct buf *bp;
+	daddr_t dblk;
+	u_int64_t ritb, rblk;
+	u_int32_t rgroup, rindex, roff;
+	size_t gd_size;
+	int error, i;
+
+	/* These values are used as divisors below. */
+	if (mfs->m_blocks_per_group == 0 || mfs->m_inodes_per_group == 0 ||
+	    mfs->m_inode_size == 0) {
+		printf("ext4fs_sbfill: invalid superblock geometry\n");
+		return (EINVAL);
+	}
+
+	mfs->m_block_group_count = howmany(mfs->m_blocks_count -
+	    mfs->m_first_data_block, mfs->m_blocks_per_group);
+
+	mfs->m_block_size_shift = EXT4FS_LOG_MIN_BLOCK_SIZE +
+	    mfs->m_log_block_size;
+	mfs->m_block_size = 1 << mfs->m_block_size_shift;
+	mfs->m_block_group_descriptor_blocks_count =
+	    howmany(mfs->m_block_group_count,
+	    mfs->m_block_size /
+	    sizeof(struct ext4fs_block_group_descriptor));
+	mfs->m_fs_block_to_disk_block = mfs->m_log_block_size + 1;
+	mfs->m_inodes_per_block = mfs->m_block_size / mfs->m_inode_size;
+	if (mfs->m_inodes_per_block == 0 || mfs->m_block_group_count == 0) {
+		printf("ext4fs_sbfill: invalid superblock geometry\n");
+		return (EINVAL);
+	}
+	mfs->m_inode_table_blocks_per_group = mfs->m_inodes_per_group /
+	    mfs->m_inodes_per_block;
+
+	gd_size = mfs->m_block_group_count *
+	    sizeof(struct ext4fs_block_group_descriptor);
+	mfs->m_gd = malloc(gd_size, M_UFSMNT, M_WAITOK);
+
+	/* The descriptor table starts in the block after the first data
+	 * block. */
+	dblk = (mfs->m_first_data_block + 1) << mfs->m_fs_block_to_disk_block;
+	for (i = 0; i < mfs->m_block_group_descriptor_blocks_count; i++) {
+		size_t off = (size_t)i * mfs->m_block_size;
+		size_t n = mfs->m_block_size;
+
+		/* Don't copy past end of m_gd allocation */
+		if (off + n > gd_size)
+			n = gd_size - off;
+
+		error = bread(devvp,
+		    dblk + (i << mfs->m_fs_block_to_disk_block),
+		    mfs->m_block_size, &bp);
+		if (error) {
+			/* bread() hands back a held buffer even on failure */
+			brelse(bp);
+			printf("ext4fs_sbfill: failed to read block group descriptors: %d\n", error);
+			free(mfs->m_gd, M_UFSMNT, gd_size);
+			mfs->m_gd = NULL;
+			return (error);
+		}
+		memcpy((char *)mfs->m_gd + off, bp->b_data, n);
+		brelse(bp);
+	}
+
+	/* Verify block group descriptor checksums */
+	for (i = 0; i < mfs->m_block_group_count; i++) {
+		if ((error = ext4fs_bgd_csum_verify(mfs, &mfs->m_gd[i],
+		    i)) != 0) {
+			printf("ext4fs_sbfill: block group %d checksum "
+			    "verification failed\n", i);
+			free(mfs->m_gd, M_UFSMNT, gd_size);
+			mfs->m_gd = NULL;
+			return (error);
+		}
+	}
+
+	/*
+	 * Read the resize inode (inode 7) to get its doubly-indirect
+	 * block pointer. Needed for BLOCK_UNINIT bitmap reconstruction.
+	 * Best effort: on any failure the pointer simply stays 0.
+	 */
+	mfs->m_resize_dind_block = 0;
+	rgroup = (7 - 1) / mfs->m_inodes_per_group;
+	rindex = (7 - 1) % mfs->m_inodes_per_group;
+	/* Inode 7 may lie outside the descriptor table when there are
+	 * very few inodes per group. */
+	if (rgroup >= mfs->m_block_group_count)
+		return (0);
+	ritb = letoh32(mfs->m_gd[rgroup].bgd_inode_table_block_lo);
+	rblk = ritb + (rindex * mfs->m_inode_size) / mfs->m_block_size;
+	roff = (rindex * mfs->m_inode_size) % mfs->m_block_size;
+	error = bread(devvp, (daddr_t)EXT4FS_FSBTODB(mfs, rblk),
+	    mfs->m_block_size, &bp);
+	if (error == 0) {
+		rdp = (struct ext4fs_dinode *)
+		    ((char *)bp->b_data + roff);
+		mfs->m_resize_dind_block = letoh32(rdp->i_block[13]);
+	}
+	brelse(bp);
+
+	return (0);
+}
+
+/*
+ * Decode the raw little-endian superblock `sble' into the host-endian
+ * in-memory superblock `dest'.  A verbatim copy of the raw superblock
+ * is also kept in dest->m_sble.  Only fields that are read directly
+ * from `sble' are assigned here.
+ */
+void
+ext4fs_sbload(struct ext4fs *sble, struct m_ext4fs *dest)
+{
+ int feature_incompat_64bit;
+ /* Non-zero when the 64BIT incompat feature bit is set on disk */
+ feature_incompat_64bit = letoh32(sble->sb_feature_incompat) &
+ EXT4FS_FEATURE_INCOMPAT_64BIT;
+ /* Keep a copy of the raw little-endian superblock */
+ memcpy(&dest->m_sble, sble, sizeof(dest->m_sble));
+ /* Low 32 bits first; high halves are merged in at the end */
+ dest->m_inodes_count = letoh32(sble->sb_inodes_count);
+ dest->m_blocks_count = letoh32(sble->sb_blocks_count_lo);
+ dest->m_reserved_blocks_count =
+ letoh32(sble->sb_reserved_blocks_count_lo);
+ dest->m_free_blocks_count =
+ letoh32(sble->sb_free_blocks_count_lo);
+ dest->m_free_inodes_count = letoh32(sble->sb_free_inodes_count);
+ dest->m_first_data_block = letoh32(sble->sb_first_data_block);
+ dest->m_log_block_size = letoh32(sble->sb_log_block_size);
+ dest->m_log_cluster_size = letoh32(sble->sb_log_cluster_size);
+ dest->m_blocks_per_group = letoh32(sble->sb_blocks_per_group);
+ dest->m_clusters_per_group = letoh32(sble->sb_clusters_per_group);
+ dest->m_inodes_per_group = letoh32(sble->sb_inodes_per_group);
+ dest->m_mount_time = letoh32(sble->sb_mount_time_lo);
+ /* Note: no high half is merged into m_write_time below */
+ dest->m_write_time = letoh32(sble->sb_write_time_lo);
+ dest->m_mount_count = letoh16(sble->sb_mount_count);
+ /* Converted as a signed 16-bit quantity */
+ dest->m_max_mount_count_before_fsck = (int16_t)letoh16(sble->sb_max_mount_count_before_fsck);
+ dest->m_state = letoh16(sble->sb_state);
+ dest->m_errors = letoh16(sble->sb_errors);
+ dest->m_revision_level_minor = letoh16(sble->sb_revision_level_minor);
+ dest->m_check_time = letoh32(sble->sb_check_time_lo);
+ dest->m_check_interval = letoh32(sble->sb_check_interval);
+ dest->m_creator_os = letoh32(sble->sb_creator_os);
+ dest->m_revision_level = letoh32(sble->sb_revision_level);
+ dest->m_default_reserved_uid = letoh16(sble->sb_default_reserved_uid);
+ dest->m_default_reserved_gid = letoh16(sble->sb_default_reserved_gid);
+ dest->m_first_non_reserved_inode = letoh32(sble->sb_first_non_reserved_inode);
+ dest->m_inode_size = letoh16(sble->sb_inode_size);
+ dest->m_block_group_id = letoh16(sble->sb_block_group_id);
+ dest->m_feature_compat = letoh32(sble->sb_feature_compat);
+ dest->m_feature_incompat = letoh32(sble->sb_feature_incompat);
+ dest->m_feature_ro_compat = letoh32(sble->sb_feature_ro_compat);
+ dest->m_algorithm_usage_bitmap = letoh32(sble->sb_algorithm_usage_bitmap);
+ dest->m_reserved_bgdt_blocks = letoh16(sble->sb_reserved_bgdt_blocks);
+ dest->m_journal_inode_number = letoh32(sble->sb_journal_inode_number);
+ dest->m_journal_device_number = letoh32(sble->sb_journal_device_number);
+ dest->m_last_orphan = letoh32(sble->sb_last_orphan);
+ dest->m_block_group_descriptor_size = letoh16(sble->sb_block_group_descriptor_size);
+ dest->m_default_mount_opts = letoh32(sble->sb_default_mount_opts);
+ dest->m_first_meta_block_group = letoh32(sble->sb_first_meta_block_group);
+ dest->m_newfs_time = letoh32(sble->sb_newfs_time_lo);
+ dest->m_inode_size_extra_min = letoh16(sble->sb_inode_size_extra_min);
+ dest->m_inode_size_extra_want = letoh16(sble->sb_inode_size_extra_want);
+ dest->m_flags = letoh32(sble->sb_flags);
+ dest->m_raid_stride_block_count = letoh16(sble->sb_raid_stride_block_count);
+ dest->m_mmp_interval = letoh16(sble->sb_mmp_interval);
+ dest->m_mmp_block = letoh64(sble->sb_mmp_block);
+ dest->m_raid_stripe_width_block_count = letoh32(sble->sb_raid_stripe_width_block_count);
+ dest->m_kilobytes_written = letoh64(sble->sb_kilobytes_written);
+ dest->m_error_count = letoh32(sble->sb_error_count);
+ dest->m_first_error_time = letoh32(sble->sb_first_error_time_lo);
+ dest->m_first_error_inode = letoh32(sble->sb_first_error_inode);
+ dest->m_first_error_block = letoh64(sble->sb_first_error_block);
+ dest->m_first_error_line = letoh32(sble->sb_first_error_line);
+ dest->m_last_error_time = letoh32(sble->sb_last_error_time_lo);
+ dest->m_last_error_inode = letoh32(sble->sb_last_error_inode);
+ dest->m_last_error_line = letoh32(sble->sb_last_error_line);
+ dest->m_last_error_block = letoh64(sble->sb_last_error_block);
+ dest->m_user_quota_inode = letoh32(sble->sb_user_quota_inode);
+ dest->m_group_quota_inode = letoh32(sble->sb_group_quota_inode);
+ dest->m_overhead_clusters = letoh32(sble->sb_overhead_clusters);
+ dest->m_backup_block_groups[0] = letoh32(sble->sb_backup_block_groups[0]);
+ dest->m_backup_block_groups[1] = letoh32(sble->sb_backup_block_groups[1]);
+ dest->m_lost_and_found_inode = letoh32(sble->sb_lost_and_found_inode);
+ dest->m_project_quota_inode = letoh32(sble->sb_project_quota_inode);
+ dest->m_checksum_seed = letoh32(sble->sb_checksum_seed);
+ dest->m_encoding = letoh16(sble->sb_encoding);
+ dest->m_encoding_flags = letoh16(sble->sb_encoding_flags);
+ /*
+ * NOTE(review): Linux documents the orphan-file inode number as a
+ * 32-bit field; confirm the 16-bit conversion against struct ext4fs.
+ */
+ dest->m_orphan_file_inode = letoh16(sble->sb_orphan_file_inode);
+ /*
+ * Merge the high 32 bits into the 64-bit counters and timestamps,
+ * but only when the 64BIT feature is enabled.
+ * NOTE(review): Linux documents the *_time_hi superblock fields as
+ * single bytes that are not tied to the 64BIT feature; confirm the
+ * letoh32() width and the feature gating against struct ext4fs.
+ */
+ if (feature_incompat_64bit) {
+ dest->m_blocks_count |= (u_int64_t)
+ letoh32(sble->sb_blocks_count_hi) << 32;
+ dest->m_reserved_blocks_count |= (u_int64_t)
+ letoh32(sble->sb_reserved_blocks_count_hi)
+ << 32;
+ dest->m_free_blocks_count |= (u_int64_t)
+ letoh32(sble->sb_free_blocks_count_hi) << 32;
+ dest->m_mount_time |= (u_int64_t)
+ letoh32(sble->sb_mount_time_hi) << 32;
+ dest->m_check_time |= (u_int64_t)
+ letoh32(sble->sb_check_time_hi) << 32;
+ dest->m_newfs_time |= (u_int64_t)
+ letoh32(sble->sb_newfs_time_hi) << 32;
+ dest->m_first_error_time |= (u_int64_t)
+ letoh32(sble->sb_first_error_time_hi) << 32;
+ dest->m_last_error_time |= (u_int64_t)
+ letoh32(sble->sb_last_error_time_hi) << 32;
+ }
+}
+
+/*
+ * Fill in `sbp' with the statistics of the file system mounted on `mp'.
+ *
+ * The reported block total excludes an estimate of the metadata
+ * overhead: the blocks before the first data block, one block bitmap,
+ * one inode bitmap and the inode table of every group, plus one
+ * superblock copy and the descriptor table for every group that carries
+ * a superblock backup.  The estimate is accumulated in 64 bits so it
+ * cannot wrap on large volumes, and the total is clamped at zero if the
+ * estimate exceeds the block count.
+ *
+ * Always returns 0.  `p' is unused.
+ */
+int
+ext4fs_statfs(struct mount *mp, struct statfs *sbp, struct proc *p)
+{
+	struct ufsmount *ump;
+	struct m_ext4fs *mfs;
+	const u_int32_t overhead_per_group_block_bitmap = 1;
+	const u_int32_t overhead_per_group_inode_bitmap = 1;
+	u_int64_t overhead, overhead_per_group, ngroups;
+	u_int32_t i;
+
+	(void)p;
+	ump = VFSTOUFS(mp);
+	mfs = ump->um_e4fs;
+
+	overhead_per_group = (u_int64_t)overhead_per_group_block_bitmap +
+	    overhead_per_group_inode_bitmap +
+	    mfs->m_inode_table_blocks_per_group;
+	overhead = (u_int64_t)mfs->m_first_data_block +
+	    (u_int64_t)mfs->m_block_group_count * overhead_per_group;
+
+	/* Count the groups holding a superblock + descriptor table copy */
+	if (mfs->m_feature_ro_compat &
+	    EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER) {
+		ngroups = 0;
+		for (i = 0; i < mfs->m_block_group_count; i++) {
+			if (ext4fs_block_group_has_super_block(i))
+				ngroups++;
+		}
+	} else {
+		ngroups = mfs->m_block_group_count;
+	}
+	overhead += ngroups *
+	    (1 + (u_int64_t)mfs->m_block_group_descriptor_blocks_count);
+
+	sbp->f_bsize = mfs->m_block_size;
+	sbp->f_iosize = mfs->m_block_size;
+	/* Never let a bogus overhead estimate wrap the unsigned total */
+	if (mfs->m_blocks_count > overhead)
+		sbp->f_blocks = mfs->m_blocks_count - overhead;
+	else
+		sbp->f_blocks = 0;
+	sbp->f_bfree = mfs->m_free_blocks_count;
+	if (sbp->f_bfree > mfs->m_reserved_blocks_count)
+		sbp->f_bavail = sbp->f_bfree - mfs->m_reserved_blocks_count;
+	else
+		sbp->f_bavail = 0;
+	sbp->f_files = mfs->m_inodes_count;
+	sbp->f_favail = sbp->f_ffree = mfs->m_free_inodes_count;
+	copy_statfs_info(sbp, mp);
+
+	return (0);
+}
+
+/*
+ * Flush one in-memory block group descriptor to the descriptor table.
+ *
+ * The table block holding descriptor `group' is read from `devvp', the
+ * descriptor checksum is recomputed in memory, the descriptor is copied
+ * into its slot and the buffer is queued as a delayed write.
+ * Returns 0, or the error reported by bread().
+ */
+int
+ext4fs_bgd_write(struct m_ext4fs *fs, struct vnode *devvp, u_int32_t group)
+{
+	struct ext4fs_block_group_descriptor *desc;
+	struct buf *bp;
+	u_int32_t per_block, table_blk, slot_off;
+	daddr_t dblk;
+	int error;
+
+	/* Which table block, and which byte offset inside it */
+	per_block = fs->m_block_size /
+	    sizeof(struct ext4fs_block_group_descriptor);
+	table_blk = group / per_block;
+	slot_off = (group % per_block) *
+	    sizeof(struct ext4fs_block_group_descriptor);
+
+	/* The table starts in the block after the first data block */
+	dblk = (fs->m_first_data_block + 1 + table_blk) <<
+	    fs->m_fs_block_to_disk_block;
+
+	error = bread(devvp, dblk, fs->m_block_size, &bp);
+	if (error != 0) {
+		brelse(bp);
+		return (error);
+	}
+
+	/* Refresh the checksum stored in the in-memory descriptor */
+	desc = &fs->m_gd[group];
+	desc->bgd_checksum = htole16(ext4fs_bgd_csum(fs, desc, group));
+
+	/* Patch the slot and schedule the block for writing */
+	memcpy((char *)bp->b_data + slot_off, desc,
+	    sizeof(struct ext4fs_block_group_descriptor));
+	bdwrite(bp);
+
+	return (0);
+}
+
+/*
+ * Synchronously write the primary superblock.
+ *
+ * The free block/inode counters, the low word of the write time and the
+ * state word of the cached raw superblock are refreshed from the
+ * in-memory superblock, the checksum is recomputed, and the result is
+ * written at the fixed superblock offset of the device.
+ * Returns 0, or the error from bread() or bwrite().
+ */
+int
+ext4fs_sbwrite(struct mount *mp)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct m_ext4fs *fs = ump->um_e4fs;
+	struct ext4fs *raw = &fs->m_sble;
+	struct timespec now;
+	struct buf *bp;
+	u_int32_t free_lo, free_hi;
+	int error;
+
+	/* Counters */
+	free_lo = (u_int32_t)fs->m_free_blocks_count;
+	free_hi = (u_int32_t)(fs->m_free_blocks_count >> 32);
+	raw->sb_free_blocks_count_lo = htole32(free_lo);
+	raw->sb_free_blocks_count_hi = htole32(free_hi);
+	raw->sb_free_inodes_count = htole32(fs->m_free_inodes_count);
+
+	/* Time of this write */
+	getnanotime(&now);
+	raw->sb_write_time_lo = htole32((u_int32_t)now.tv_sec);
+
+	raw->sb_state = htole16(fs->m_state);
+
+	/* The checksum must be computed last, over the updated fields */
+	raw->sb_checksum = htole32(ext4fs_sb_csum(raw));
+
+	/* Fetch the on-disk superblock buffer, overwrite it, push it out */
+	error = bread(ump->um_devvp,
+	    (daddr_t)(EXT4FS_SUPER_BLOCK_OFFSET / DEV_BSIZE),
+	    EXT4FS_SUPER_BLOCK_SIZE, &bp);
+	if (error != 0) {
+		brelse(bp);
+		return (error);
+	}
+
+	memcpy(bp->b_data, raw, sizeof(struct ext4fs));
+	error = bwrite(bp);
+	return (error);
+}
+
+/* Most recent inode generation number handed out by ext4fs_inode_alloc() */
+static u_long ext4fs_gennumber;
+
+/*
+ * Allocate an inode in the file system.
+ *
+ * pip:  inode of the parent directory; selects the file system and,
+ *       for non-directories, the preferred block group.
+ * mode: file mode of the new inode; only the S_IFMT bits are examined.
+ * cred: unused.
+ * vpp:  on success receives the vnode returned by VFS_VGET() for the
+ *       new inode; set to NULL on failure.
+ *
+ * Directories start in the group with the most free inodes, everything
+ * else starts in the parent's group.  Groups are then scanned round
+ * robin for a clear bit in the inode bitmap.  Groups whose bitmap
+ * cannot be read are skipped.
+ *
+ * The free-inode and directory counters are adjusted BEFORE VFS_VGET()
+ * is called, so that the ext4fs_inode_free() rollback performed when
+ * VFS_VGET() fails undoes exactly what was done here.  The INODE_UNINIT
+ * flag and the unused-inode-table count are only updated after
+ * VFS_VGET(), because ext4fs_vget() consults them.
+ *
+ * Returns 0, ENOSPC when no inode is free, or the error reported by
+ * bwrite() or VFS_VGET().
+ */
+int
+ext4fs_inode_alloc(struct inode *pip, mode_t mode, struct ucred *cred,
+    struct vnode **vpp)
+{
+	struct m_ext4fs *fs = pip->i_e4fs;
+	struct vnode *pvp = ITOV(pip);
+	struct ext4fs_block_group_descriptor *gd;
+	struct buf *bp, *tbp;
+	struct inode *ip;
+	u_int32_t group, ngroups, ino_in_group, pbit, tb, it_blocks;
+	u_int32_t best, best_free, fi, g, free_inodes, icsum;
+	u_int32_t itu, first_unused, dirs, i;
+	u_int64_t bitmap_blk, itb;
+	ufsino_t ino;
+	char *ibp;
+	int error, is_dir, is_64bit;
+
+	*vpp = NULL;
+
+	if (fs->m_free_inodes_count == 0)
+		return (ENOSPC);
+
+	ngroups = fs->m_block_group_count;
+	is_dir = ((mode & S_IFMT) == S_IFDIR);
+	is_64bit =
+	    (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) != 0;
+
+	/* Pick starting group */
+	if (is_dir) {
+		best = 0;
+		best_free = 0;
+		for (i = 0; i < ngroups; i++) {
+			fi = letoh16(fs->m_gd[i].bgd_free_inodes_count_lo);
+			if (fi > best_free) {
+				best_free = fi;
+				best = i;
+			}
+		}
+		group = best;
+	} else {
+		group = (pip->i_number - 1) / fs->m_inodes_per_group;
+	}
+
+	/* Scan groups starting from preferred */
+	for (i = 0; i < ngroups; i++) {
+		g = (group + i) % ngroups;
+		gd = &fs->m_gd[g];
+		free_inodes = letoh16(gd->bgd_free_inodes_count_lo);
+		if (is_64bit)
+			free_inodes |= (u_int32_t)
+			    letoh16(gd->bgd_free_inodes_count_hi) << 16;
+		if (free_inodes == 0)
+			continue;
+
+		/* Read inode bitmap */
+		bitmap_blk = letoh32(gd->bgd_inode_bitmap_block_lo);
+		if (is_64bit)
+			bitmap_blk |= (u_int64_t)
+			    letoh32(gd->bgd_inode_bitmap_block_hi) << 32;
+
+		error = bread(pip->i_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk),
+		    fs->m_block_size, &bp);
+		if (error) {
+			brelse(bp);
+			continue;
+		}
+
+		ibp = (char *)bp->b_data;
+
+		/*
+		 * If INODE_UNINIT, the bitmap is stale. Zero it,
+		 * set padding bits, and zero all inode table blocks.
+		 */
+		if (letoh16(gd->bgd_flags) & EXT4FS_BGD_FLAG_INODE_UNINIT) {
+			memset(ibp, 0, fs->m_block_size);
+			for (pbit = fs->m_inodes_per_group;
+			    pbit < fs->m_block_size * 8; pbit++)
+				setbit(ibp, pbit);
+
+			itb = letoh32(gd->bgd_inode_table_block_lo);
+			if (is_64bit)
+				itb |= (u_int64_t)letoh32(
+				    gd->bgd_inode_table_block_hi) << 32;
+			it_blocks = fs->m_inode_table_blocks_per_group;
+			for (tb = 0; tb < it_blocks; tb++) {
+				error = bread(pip->i_devvp,
+				    (daddr_t)EXT4FS_FSBTODB(fs, itb + tb),
+				    fs->m_block_size, &tbp);
+				if (error) {
+					brelse(tbp);
+					continue;
+				}
+				memset(tbp->b_data, 0, fs->m_block_size);
+				/* Best effort: a failed write is ignored */
+				(void)bwrite(tbp);
+			}
+		}
+
+		/* Find free inode bit */
+		for (ino_in_group = 0; ino_in_group < fs->m_inodes_per_group;
+		    ino_in_group++) {
+			if (isclr(ibp, ino_in_group))
+				break;
+		}
+		if (ino_in_group >= fs->m_inodes_per_group) {
+			/* Descriptor claimed free inodes, bitmap has none */
+			brelse(bp);
+			continue;
+		}
+
+		/* Claim the bit and write the bitmap synchronously */
+		setbit(ibp, ino_in_group);
+
+		icsum = ext4fs_bitmap_csum(fs, g, ibp,
+		    fs->m_inodes_per_group / 8);
+		gd->bgd_inode_bitmap_checksum_lo = htole16(icsum & 0xFFFF);
+		if (is_64bit)
+			gd->bgd_inode_bitmap_checksum_hi =
+			    htole16((icsum >> 16) & 0xFFFF);
+
+		error = bwrite(bp);
+		if (error)
+			return (error);
+
+		/* Compute inode number (1-based) */
+		ino = g * fs->m_inodes_per_group + ino_in_group + 1;
+
+		/*
+		 * Account for the inode now.  ext4fs_inode_free() reverses
+		 * every one of these updates, so the rollback below leaves
+		 * the counters exactly as they were.
+		 */
+		free_inodes--;
+		gd->bgd_free_inodes_count_lo = htole16(free_inodes & 0xFFFF);
+		if (is_64bit)
+			gd->bgd_free_inodes_count_hi =
+			    htole16((free_inodes >> 16) & 0xFFFF);
+
+		if (is_dir) {
+			dirs = letoh16(gd->bgd_used_dirs_count_lo);
+			if (is_64bit)
+				dirs |= (u_int32_t)letoh16(
+				    gd->bgd_used_dirs_count_hi) << 16;
+			dirs++;
+			gd->bgd_used_dirs_count_lo = htole16(dirs & 0xFFFF);
+			if (is_64bit)
+				gd->bgd_used_dirs_count_hi =
+				    htole16((dirs >> 16) & 0xFFFF);
+		}
+
+		fs->m_free_inodes_count--;
+		fs->m_sble.sb_free_inodes_count =
+		    htole32(fs->m_free_inodes_count);
+		fs->m_fs_was_modified = 1;
+
+		/* Get vnode for new inode */
+		error = VFS_VGET(pvp->v_mount, ino, vpp);
+		if (error) {
+			ext4fs_inode_free(pip, ino, mode);
+			*vpp = NULL;
+			return (error);
+		}
+
+		/* Clear INODE_UNINIT flag if set */
+		gd->bgd_flags = htole16(letoh16(gd->bgd_flags) &
+		    ~EXT4FS_BGD_FLAG_INODE_UNINIT);
+
+		/* Shrink the unused tail of the inode table if we used it */
+		itu = letoh16(gd->bgd_inode_table_unused_lo);
+		if (is_64bit)
+			itu |= (u_int32_t)
+			    letoh16(gd->bgd_inode_table_unused_hi) << 16;
+		first_unused = fs->m_inodes_per_group - itu;
+		if (ino_in_group >= first_unused) {
+			itu = fs->m_inodes_per_group - ino_in_group - 1;
+			gd->bgd_inode_table_unused_lo = htole16(itu & 0xFFFF);
+			if (is_64bit)
+				gd->bgd_inode_table_unused_hi =
+				    htole16((itu >> 16) & 0xFFFF);
+		}
+
+		ext4fs_bgd_write(fs, pip->i_devvp, g);
+
+		ip = VTOI(*vpp);
+
+		/* Zero the dinode */
+		memset(ip->i_e4din, 0, sizeof(struct ext4fs_dinode_256));
+
+		/* Initialize an empty in-inode extent tree root */
+		ip->i_e4din->dinode.i_extent_header.eh_magic =
+		    htole16(EXT4FS_EXTENT_HEADER_MAGIC);
+		ip->i_e4din->dinode.i_extent_header.eh_entries = htole16(0);
+		ip->i_e4din->dinode.i_extent_header.eh_max = htole16(4);
+		ip->i_e4din->dinode.i_extent_header.eh_depth = htole16(0);
+		ip->i_e4din->dinode.i_flags =
+		    htole32(EXTFS_INODE_FLAG_EXTENTS);
+
+		/* Set extra_isize: bytes of the dinode beyond the first 128 */
+		ip->i_e4din->dinode.i_extra_isize =
+		    htole16(sizeof(struct ext4fs_dinode) - 128);
+
+		/* Set generation number: monotonic, never behind the clock */
+		if (++ext4fs_gennumber < (u_long)gettime())
+			ext4fs_gennumber = gettime();
+		ip->i_e4din->dinode.i_nfs_generation =
+		    htole32(ext4fs_gennumber);
+
+		return (0);
+	}
+
+	return (ENOSPC);
+}
+
+/*
+ * Free an inode.
+ *
+ * pip:  any inode of the same file system; supplies the file system and
+ *       the device vnode.
+ * ino:  number of the inode to release (1-based).
+ * mode: mode of the inode being freed; when its type is S_IFDIR the
+ *       group's used-directories count is decremented as well.
+ *
+ * The inode's bit is cleared in the group's inode bitmap, which is
+ * written synchronously, and then the group descriptor and superblock
+ * free counts are incremented.  The function returns nothing: an inode
+ * number outside the file system is reported and ignored, and if the
+ * bitmap cannot be read or written the counters are left untouched.
+ */
+void
+ext4fs_inode_free(struct inode *pip, ufsino_t ino, mode_t mode)
+{
+	struct m_ext4fs *fs = pip->i_e4fs;
+	struct ext4fs_block_group_descriptor *gd;
+	struct buf *bp;
+	u_int64_t bitmap_blk;
+	u_int32_t group, ino_in_group, free_inodes, icsum, dirs;
+	char *ibp;
+	int error, is_64bit;
+
+	/* Refuse numbers that would index outside m_gd[] */
+	if (ino == 0 || ino > fs->m_inodes_count) {
+		printf("ext4fs_inode_free: bad inode number %u\n", ino);
+		return;
+	}
+	group = (ino - 1) / fs->m_inodes_per_group;
+	if (group >= fs->m_block_group_count) {
+		printf("ext4fs_inode_free: bad inode number %u\n", ino);
+		return;
+	}
+	ino_in_group = (ino - 1) % fs->m_inodes_per_group;
+	gd = &fs->m_gd[group];
+	is_64bit =
+	    (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) != 0;
+
+	bitmap_blk = letoh32(gd->bgd_inode_bitmap_block_lo);
+	if (is_64bit)
+		bitmap_blk |=
+		    (u_int64_t)letoh32(gd->bgd_inode_bitmap_block_hi) << 32;
+
+	error = bread(pip->i_devvp,
+	    (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk),
+	    fs->m_block_size, &bp);
+	if (error) {
+		brelse(bp);
+		return;
+	}
+
+	ibp = (char *)bp->b_data;
+	clrbit(ibp, ino_in_group);
+
+	/* The bitmap checksum lives in the group descriptor */
+	icsum = ext4fs_bitmap_csum(fs, group, ibp,
+	    fs->m_inodes_per_group / 8);
+	gd->bgd_inode_bitmap_checksum_lo = htole16(icsum & 0xFFFF);
+	if (is_64bit)
+		gd->bgd_inode_bitmap_checksum_hi =
+		    htole16((icsum >> 16) & 0xFFFF);
+
+	error = bwrite(bp);
+	if (error)
+		return;
+
+	/* Update BGD */
+	free_inodes = letoh16(gd->bgd_free_inodes_count_lo);
+	if (is_64bit)
+		free_inodes |=
+		    (u_int32_t)letoh16(gd->bgd_free_inodes_count_hi) << 16;
+	free_inodes++;
+	gd->bgd_free_inodes_count_lo = htole16(free_inodes & 0xFFFF);
+	if (is_64bit)
+		gd->bgd_free_inodes_count_hi =
+		    htole16((free_inodes >> 16) & 0xFFFF);
+
+	if ((mode & S_IFMT) == S_IFDIR) {
+		dirs = letoh16(gd->bgd_used_dirs_count_lo);
+		if (is_64bit)
+			dirs |= (u_int32_t)
+			    letoh16(gd->bgd_used_dirs_count_hi) << 16;
+		/* Do not let an already-zero count wrap around */
+		if (dirs > 0)
+			dirs--;
+		gd->bgd_used_dirs_count_lo = htole16(dirs & 0xFFFF);
+		if (is_64bit)
+			gd->bgd_used_dirs_count_hi =
+			    htole16((dirs >> 16) & 0xFFFF);
+	}
+
+	ext4fs_bgd_write(fs, pip->i_devvp, group);
+
+	/* Update superblock counters */
+	fs->m_free_inodes_count++;
+	fs->m_sble.sb_free_inodes_count =
+	    htole32(fs->m_free_inodes_count);
+	fs->m_fs_was_modified = 1;
+}
+
+/*
+ * Per-vnode callback used by ext4fs_sync().
+ *
+ * `arg' points to a struct ext4fs_sync_args.  Vnodes of type VNON,
+ * vnodes without an in-core ext4 inode, and vnodes that have neither a
+ * pending inode flag nor dirty buffers are skipped, as are vnodes that
+ * cannot be locked without sleeping.  Everything else is flushed with
+ * VOP_FSYNC(); a failure is recorded in the argument's allerror field.
+ * Always returns 0 so that the walk continues.
+ */
+static int
+ext4fs_sync_vnode(struct vnode *vp, void *arg)
+{
+	struct ext4fs_sync_args *sa = arg;
+	struct inode *ip;
+	int dirty, error, s;
+
+	if (vp->v_type == VNON)
+		return (0);
+
+	ip = VTOI(vp);
+	if (ip == NULL || ip->i_e4din == NULL)
+		return (0);
+
+	/* Sample the dirty state with block I/O interrupts held off */
+	s = splbio();
+	dirty = (ip->i_flag &
+	    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0 ||
+	    !LIST_EMPTY(&vp->v_dirtyblkhd);
+	splx(s);
+
+	if (!dirty)
+		return (0);
+
+	/* Busy vnodes are skipped rather than waited for */
+	if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
+		return (0);
+
+	error = VOP_FSYNC(vp, sa->cred, sa->waitfor, sa->p);
+	if (error != 0)
+		sa->allerror = error;
+
+	vput(vp);
+	return (0);
+}
+
+/*
+ * Flush the file system mounted on `mp'.
+ *
+ * Nothing is done on a read-only mount.  Otherwise every vnode of the
+ * mount is passed to ext4fs_sync_vnode(), the device vnode is fsynced
+ * unless `waitfor' is MNT_LAZY, and the superblock is rewritten when
+ * the file system is marked as modified.  `stall' is unused.
+ * Returns 0 or the last error encountered.
+ */
+int
+ext4fs_sync(struct mount *mp, int waitfor, int stall,
+    struct ucred *cred, struct proc *p)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct m_ext4fs *fs = ump->um_e4fs;
+	struct ext4fs_sync_args esa;
+	struct vnode *devvp;
+	int error;
+
+	if (fs->m_read_only)
+		return (0);
+
+	esa.allerror = 0;
+	esa.waitfor = waitfor;
+	esa.cred = cred;
+	esa.p = p;
+
+	/* Regular vnodes first */
+	vfs_mount_foreach_vnode(mp, ext4fs_sync_vnode, &esa);
+
+	/* Then the metadata buffers hanging off the device vnode */
+	if (waitfor != MNT_LAZY) {
+		devvp = ump->um_devvp;
+		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+		error = VOP_FSYNC(devvp, cred, waitfor, p);
+		if (error != 0)
+			esa.allerror = error;
+		VOP_UNLOCK(devvp);
+	}
+
+	/* Write superblock if modified */
+	if (fs->m_fs_was_modified) {
+		error = ext4fs_sbwrite(mp);
+		if (error != 0)
+			esa.allerror = error;
+	}
+
+	return (esa.allerror);
+}
+
+/*
+ * sysctl entry point.  No variables are implemented: every argument is
+ * ignored, a notice is printed and EOPNOTSUPP is returned.
+ */
+int
+ext4fs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen, struct proc *p)
+{
+	/* Mark every parameter as deliberately unused */
+	(void)name, (void)namelen;
+	(void)oldp, (void)oldlenp;
+	(void)newp, (void)newlen;
+	(void)p;
+
+	printf("ext4fs_sysctl: not implemented\n");
+
+	return (EOPNOTSUPP);
+}
+
+/*
+ * Unmount the file system on `mp'.
+ *
+ * All vnodes are flushed first (forcibly when MNT_FORCE is given); a
+ * failure there aborts the unmount and is returned.  On a writable,
+ * modified file system the state is set to EXT4FS_STATE_VALID and the
+ * superblock is written; the result of that write is not checked.  The
+ * device is then closed and the descriptor table, the in-memory
+ * superblock and the ufsmount are freed.
+ */
+int
+ext4fs_unmount(struct mount *mp, int mntflags, struct proc *p)
+{
+	struct ufsmount *ump;
+	struct m_ext4fs *mfs;
+	struct vnode *devvp;
+	size_t gd_size;
+	int error, flags, openmode;
+
+	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
+	error = ext4fs_flushfiles(mp, flags, p);
+	if (error != 0)
+		return (error);
+
+	ump = VFSTOUFS(mp);
+	mfs = ump->um_e4fs;
+	devvp = ump->um_devvp;
+
+	/* Record a clean state on the way out */
+	if (!mfs->m_read_only && mfs->m_fs_was_modified) {
+		mfs->m_state = EXT4FS_STATE_VALID;
+		ext4fs_sbwrite(mp);
+	}
+
+	if (devvp->v_type != VBAD)
+		devvp->v_specmountpoint = NULL;
+
+	/* Close the device with the mode implied by the mount */
+	openmode = mfs->m_read_only ? FREAD : FREAD|FWRITE;
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+	(void)VOP_CLOSE(devvp, openmode, NOCRED, p);
+	vput(devvp);
+
+	/* Release the in-memory file system state */
+	if (mfs->m_gd != NULL) {
+		gd_size = mfs->m_block_group_count *
+		    sizeof(struct ext4fs_block_group_descriptor);
+		free(mfs->m_gd, M_UFSMNT, gd_size);
+	}
+	free(mfs, M_UFSMNT, sizeof *mfs);
+	free(ump, M_UFSMNT, sizeof *ump);
+
+	mp->mnt_data = NULL;
+	mp->mnt_flag &= ~MNT_LOCAL;
+
+	return (0);
+}
+
+/*
+ * Return the vnode for inode `ino' of the file system mounted on `mp'.
+ *
+ * A vnode already present in the inode hash is returned as is.
+ * Otherwise a new vnode and in-core inode are created, entered into the
+ * hash, and the on-disk inode is read from the inode table.
+ *
+ * If the inode's group is flagged INODE_UNINIT, or the inode lies in the
+ * unused tail of the group's inode table, the on-disk slot is treated
+ * as garbage: the in-core copy stays zeroed and, on a writable mount
+ * only, the slot is zeroed on disk.  A read-only mount never writes.
+ *
+ * The copy from the buffer into the in-core inode is limited to
+ * sizeof(struct ext4fs_dinode_256), so a larger on-disk inode size
+ * cannot overrun the in-core inode buffer.
+ *
+ * On success *vpp holds the vnode and 0 is returned.  ESTALE is
+ * returned for an inode number outside the file system; other errors
+ * come from getnewvnode(), ufs_ihashins(), bread(), bwrite() or the
+ * inode checksum verification.
+ */
+int
+ext4fs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
+{
+	struct m_ext4fs *fs;
+	struct inode *ip;
+	struct ufsmount *ump;
+	struct buf *bp;
+	struct vnode *vp;
+	struct ext4fs_block_group_descriptor *gd;
+	struct ext4fs_dinode *dp;
+	dev_t dev;
+	daddr_t disk_block;
+	size_t copy_len;
+	u_int64_t inode_table_block;
+	u_int32_t inode_group, inode_index, block_in_table, offset_in_block;
+	u_int32_t itable_unused;
+	u_int16_t bgd_flags, imode;
+	int error;
+
+	if (ino > (ufsino_t)-1)
+		panic("ext4fs_vget: alien ino_t %llu",
+		    (unsigned long long)ino);
+
+	ump = VFSTOUFS(mp);
+	dev = ump->um_dev;
+	fs = ump->um_e4fs;
+
+retry:
+	if ((*vpp = ufs_ihashget(dev, ino)) != NULL) {
+		return (0);
+	}
+
+	/* Allocate a new vnode/inode. */
+	if ((error = getnewvnode(VT_EXT4FS, mp, &ext4fs_vops, &vp)) != 0) {
+		*vpp = NULL;
+		return (error);
+	}
+	ip = pool_get(&ext4fs_inode_pool, PR_WAITOK|PR_ZERO);
+	rrw_init_flags(&ip->i_lock, "inode", RWL_DUPOK | RWL_IS_VNODE);
+	vp->v_data = ip;
+	ip->i_vnode = vp;
+	ip->i_ump = ump;
+	ip->i_e4fs = fs;
+	ip->i_dev = dev;
+	ip->i_number = ino;
+
+	/*
+	 * Put it onto its hash chain and lock it so that other requests for
+	 * this inode will block if they arrive while we are sleeping waiting
+	 * for old data structures to be purged or for the contents of the
+	 * disk portion of this inode to be read.
+	 */
+	error = ufs_ihashins(ip);
+
+	if (error) {
+		/*
+		 * The vnode was locked by ufs_ihashins, then unlocked on error.
+		 * We need to properly clean up the inode and vnode.
+		 * vrele will trigger reclaim which will free the inode.
+		 */
+		vrele(vp);
+
+		if (error == EEXIST)
+			goto retry;
+
+		return (error);
+	}
+
+	vref(ip->i_devvp);
+
+	/* Calculate inode location on disk */
+	if (ino == 0 || ino > fs->m_inodes_count) {
+		vput(vp);
+		*vpp = NULL;
+		return (ESTALE);
+	}
+	inode_group = (ino - 1) / fs->m_inodes_per_group;
+	if (inode_group >= fs->m_block_group_count) {
+		vput(vp);
+		*vpp = NULL;
+		return (ESTALE);
+	}
+	inode_index = (ino - 1) % fs->m_inodes_per_group;
+	block_in_table = inode_index / fs->m_inodes_per_block;
+	offset_in_block = (inode_index % fs->m_inodes_per_block) *
+	    fs->m_inode_size;
+
+	gd = &fs->m_gd[inode_group];
+	inode_table_block = letoh32(gd->bgd_inode_table_block_lo);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		inode_table_block |= (u_int64_t)
+		    letoh32(gd->bgd_inode_table_block_hi) << 32;
+
+	/* Read the block containing this inode */
+	disk_block = (inode_table_block + block_in_table) <<
+	    fs->m_fs_block_to_disk_block;
+	error = bread(ump->um_devvp, disk_block, fs->m_block_size, &bp);
+	if (error) {
+		vput(vp);
+		brelse(bp);
+		*vpp = NULL;
+		return (error);
+	}
+
+	dp = (struct ext4fs_dinode *)((char *)bp->b_data + offset_in_block);
+
+	/* Allocate space for on-disk inode */
+	ip->i_e4din = pool_get(&ext4fs_dinode_pool, PR_WAITOK|PR_ZERO);
+
+	/*
+	 * If the group has INODE_UNINIT set, or the inode is in the
+	 * unused portion of the inode table, the on-disk data is
+	 * garbage. Keep the zeroed buffer and skip checksum verification.
+	 */
+	bgd_flags = letoh16(gd->bgd_flags);
+	itable_unused = letoh16(gd->bgd_inode_table_unused_lo);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		itable_unused |= (u_int32_t)
+		    letoh16(gd->bgd_inode_table_unused_hi) << 16;
+	if ((bgd_flags & EXT4FS_BGD_FLAG_INODE_UNINIT) ||
+	    inode_index >= fs->m_inodes_per_group - itable_unused) {
+		if (fs->m_read_only) {
+			/* Never modify the disk on a read-only mount */
+			brelse(bp);
+		} else {
+			/* Scrub the stale slot so later reads see zeroes */
+			memset(dp, 0, fs->m_inode_size);
+			error = bwrite(bp);
+			if (error) {
+				pool_put(&ext4fs_dinode_pool, ip->i_e4din);
+				ip->i_e4din = NULL;
+				vput(vp);
+				*vpp = NULL;
+				return (error);
+			}
+		}
+	} else {
+		/* Never copy more than the in-core inode can hold */
+		copy_len = fs->m_inode_size;
+		if (copy_len > sizeof(struct ext4fs_dinode_256))
+			copy_len = sizeof(struct ext4fs_dinode_256);
+		memcpy(ip->i_e4din, dp, copy_len);
+		brelse(bp);
+
+		/* Verify inode checksum for initialized slots */
+		if (letoh16(ip->i_e4din->dinode.i_mode) != 0 ||
+		    letoh16(ip->i_e4din->dinode.i_links_count) != 0 ||
+		    letoh32(ip->i_e4din->dinode.i_dtime) != 0) {
+			error = ext4fs_inode_csum_verify(fs,
+			    ip->i_e4din, ino);
+			if (error) {
+				pool_put(&ext4fs_dinode_pool,
+				    ip->i_e4din);
+				ip->i_e4din = NULL;
+				vput(vp);
+				*vpp = NULL;
+				return (error);
+			}
+		}
+	}
+
+	/* Set vnode type based on inode mode */
+	imode = letoh16(ip->i_e4din->dinode.i_mode);
+	switch (imode & S_IFMT) {
+	case S_IFDIR:
+		vp->v_type = VDIR;
+		break;
+	case S_IFREG:
+		vp->v_type = VREG;
+		break;
+	case S_IFLNK:
+		vp->v_type = VLNK;
+		break;
+	case S_IFBLK:
+		vp->v_type = VBLK;
+		break;
+	case S_IFCHR:
+		vp->v_type = VCHR;
+		break;
+	case S_IFIFO:
+		vp->v_type = VFIFO;
+		break;
+	case S_IFSOCK:
+		vp->v_type = VSOCK;
+		break;
+	default:
+		vp->v_type = VNON;
+		break;
+	}
+
+	/* Set effective link count */
+	ip->i_effnlink = letoh16(ip->i_e4din->dinode.i_links_count);
+
+	/* Set VROOT flag for root inode */
+	if (ip->i_number == EXT4FS_INODE_ROOT_DIR)
+		vp->v_flag |= VROOT;
+
+	/* If the inode was deleted, reset all fields */
+	if (letoh32(ip->i_e4din->dinode.i_dtime) != 0) {
+		vp->v_type = VNON;
+		ip->i_effnlink = 0;
+	}
+
+	*vpp = vp;
+	return (0);
+}
+
+/*
+ * Vnode to file handle conversion.  Not implemented: both arguments
+ * are ignored, a notice is printed and EOPNOTSUPP is returned.
+ */
+int
+ext4fs_vptofh(struct vnode *vp, struct fid *fhp)
+{
+	(void)vp, (void)fhp;
+
+	printf("ext4fs_vptofh: not implemented\n");
+
+	return (EOPNOTSUPP);
+}
diff --git a/sys/ufs/ext4fs/ext4fs_vnops.c b/sys/ufs/ext4fs/ext4fs_vnops.c
new file mode 100644
index 000000000..4968514a8
--- /dev/null
+++ b/sys/ufs/ext4fs/ext4fs_vnops.c
@@ -0,0 +1,3842 @@
+/*
+ * Copyright (c) 2025 kmx.io.
+ * Copyright (c) 1997 Manuel Bouyer.
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Modified for ext4fs by kmx.io.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/kernel.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/pool.h>
+#include <sys/dirent.h>
+#include <sys/fcntl.h>
+#include <sys/lockf.h>
+#include <sys/specdev.h>
+#include <sys/unistd.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext4fs/ext4fs.h>
+#include <ufs/ext4fs/ext4fs_crc32c.h>
+
+/*
+ * Translation table from the file type code of an ext4 directory entry
+ * (EXT4FS_FT_*, used as the index) to the BSD dirent type (DT_*).
+ */
+static const u_int8_t ext4fs_type_to_dt[EXT4FS_FT_MAX] = {
+	/* files, directories and symbolic links */
+	[EXT4FS_FT_REG_FILE]	= DT_REG,
+	[EXT4FS_FT_DIR]		= DT_DIR,
+	[EXT4FS_FT_SYMLINK]	= DT_LNK,
+	/* device nodes */
+	[EXT4FS_FT_BLKDEV]	= DT_BLK,
+	[EXT4FS_FT_CHRDEV]	= DT_CHR,
+	/* IPC endpoints */
+	[EXT4FS_FT_FIFO]	= DT_FIFO,
+	[EXT4FS_FT_SOCK]	= DT_SOCK,
+	/* no type recorded in the entry */
+	[EXT4FS_FT_UNKNOWN]	= DT_UNKNOWN,
+};
+
+/*
+ * Look up the physical block number for a given logical block number
+ * using the extent tree in the inode.
+ *
+ * ip:      inode whose extent tree is searched.
+ * lbn:     logical block number within the file.
+ * pblk:    receives the physical block, or 0 when `lbn' falls in a hole.
+ * ncontig: if not NULL, receives the number of blocks from `lbn' to the
+ *          end of the matching extent (1 for a hole).
+ *
+ * Returns 0 on success, EIO when the tree is malformed, or the error
+ * reported by bread().
+ *
+ * The tree comes straight from disk, so every node is validated before
+ * it is searched: the magic must match, the entry count must fit in the
+ * node (4 entries for the root stored in the inode, otherwise what fits
+ * in one file system block), and each child must be exactly one level
+ * below its parent, which also guarantees the walk terminates.
+ *
+ * NOTE(review): extents with the "uninitialized" length bit set are
+ * mapped exactly like initialized ones and the caller is not told;
+ * confirm that callers do not expose the stale block contents.
+ */
+static int
+ext4fs_extent_pblk(struct inode *ip, u_int64_t lbn, u_int64_t *pblk,
+    u_int64_t *ncontig)
+{
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	struct ext4fs_extent_header *eh;
+	struct ext4fs_extent_idx *idx;
+	struct ext4fs_extent *ext;
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct buf *bp = NULL;
+	const u_int16_t root_max = 4;	/* capacity of the in-inode root */
+	u_int64_t child_blk, start;
+	size_t entry_size, node_max;
+	u_int32_t e_block;
+	u_int16_t entries, depth, child_depth, e_len;
+	int error, found, i;
+
+	/* Start with the extent header in the inode */
+	eh = &din->i_extent_header;
+	if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC)
+		return (EIO);
+
+	depth = letoh16(eh->eh_depth);
+	entries = letoh16(eh->eh_entries);
+	if (entries > root_max)
+		return (EIO);
+
+	/* Walk down the extent tree */
+	while (depth > 0) {
+		/* Index node: find the child that covers lbn */
+		idx = (struct ext4fs_extent_idx *)(eh + 1);
+		found = -1;
+		for (i = 0; i < (int)entries; i++) {
+			if (letoh32(idx[i].ei_block) <= lbn)
+				found = i;
+			else
+				break;
+		}
+		if (found < 0) {
+			if (bp != NULL)
+				brelse(bp);
+			return (EIO);
+		}
+
+		/* Read the child node block */
+		child_blk = letoh32(idx[found].ei_leaf_lo);
+		child_blk |= (u_int64_t)letoh16(idx[found].ei_leaf_hi) << 32;
+
+		if (bp != NULL)
+			brelse(bp);
+
+		error = bread(ip->i_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, child_blk),
+		    fs->m_block_size, &bp);
+		if (error) {
+			if (bp != NULL)
+				brelse(bp);
+			return (error);
+		}
+
+		eh = (struct ext4fs_extent_header *)bp->b_data;
+		if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC) {
+			brelse(bp);
+			return (EIO);
+		}
+
+		/* A child must sit exactly one level below its parent */
+		child_depth = letoh16(eh->eh_depth);
+		if (child_depth != depth - 1) {
+			brelse(bp);
+			return (EIO);
+		}
+
+		/* The entries must all lie inside the block just read */
+		entries = letoh16(eh->eh_entries);
+		entry_size = (child_depth > 0) ?
+		    sizeof(struct ext4fs_extent_idx) :
+		    sizeof(struct ext4fs_extent);
+		node_max = (fs->m_block_size - sizeof(*eh)) / entry_size;
+		if (entries > node_max) {
+			brelse(bp);
+			return (EIO);
+		}
+
+		depth = child_depth;
+	}
+
+	/* Leaf node: search for the extent containing lbn */
+	ext = (struct ext4fs_extent *)(eh + 1);
+	for (i = 0; i < (int)entries; i++) {
+		e_block = letoh32(ext[i].e_block);
+		e_len = letoh16(ext[i].e_len);
+
+		/* High bit of e_len marks uninitialized extents */
+		if (e_len > 32768)
+			e_len -= 32768;
+
+		/* 64-bit sum: e_block + e_len must not wrap at 2^32 */
+		if (lbn >= e_block && lbn < (u_int64_t)e_block + e_len) {
+			start = letoh32(ext[i].e_start_lo);
+			start |= (u_int64_t)letoh16(ext[i].e_start_hi) << 32;
+			*pblk = start + (lbn - e_block);
+			if (ncontig != NULL)
+				*ncontig = e_len - (lbn - e_block);
+			if (bp != NULL)
+				brelse(bp);
+			return (0);
+		}
+	}
+
+	if (bp != NULL)
+		brelse(bp);
+
+	/* Block not covered by any extent — hole */
+	*pblk = 0;
+	if (ncontig != NULL)
+		*ncontig = 1;
+	return (0);
+}
+
+/*
+ * Write inode back to disk with checksum update.
+ *
+ * ip:      in-core inode to write.
+ * waitfor: non-zero for a synchronous write (bwrite), zero for a
+ *          delayed write (bdwrite).
+ *
+ * Nothing is written on a read-only mount or when the inode is not
+ * marked IN_MODIFIED after its timestamps have been brought up to date.
+ * IN_MODIFIED is cleared before the inode table block is read.
+ * An inode that claims to use extents but carries a bad extent header
+ * magic is refused with EIO rather than persisted.
+ *
+ * The number of bytes copied into the inode table is limited to
+ * sizeof(struct ext4fs_dinode_256), so that an on-disk inode size
+ * larger than the in-core inode never causes memory beyond the in-core
+ * inode to be written to disk.
+ *
+ * Returns 0, EIO, or the error from bread() or bwrite().
+ */
+int
+ext4fs_update(struct inode *ip, int waitfor)
+{
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct buf *bp;
+	u_int32_t inode_group, inode_index, block_in_table, offset_in_block;
+	struct ext4fs_block_group_descriptor *gd;
+	u_int64_t inode_table_block;
+	daddr_t disk_block;
+	size_t copy_len;
+	u_int32_t csum, wr_flags;
+	u_int16_t wr_mode, wr_magic;
+	int error;
+
+	if (ITOV(ip)->v_mount->mnt_flag & MNT_RDONLY)
+		return (0);
+
+	EXT4FS_ITIMES(ip);
+
+	if ((ip->i_flag & IN_MODIFIED) == 0)
+		return (0);
+
+	ip->i_flag &= ~IN_MODIFIED;
+
+	/* Locate inode on disk */
+	inode_group = (ip->i_number - 1) / fs->m_inodes_per_group;
+	inode_index = (ip->i_number - 1) % fs->m_inodes_per_group;
+	block_in_table = inode_index / fs->m_inodes_per_block;
+	offset_in_block = (inode_index % fs->m_inodes_per_block) *
+	    fs->m_inode_size;
+
+	gd = &fs->m_gd[inode_group];
+	inode_table_block = letoh32(gd->bgd_inode_table_block_lo);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		inode_table_block |=
+		    (u_int64_t)letoh32(gd->bgd_inode_table_block_hi) << 32;
+
+	disk_block = (inode_table_block + block_in_table) <<
+	    fs->m_fs_block_to_disk_block;
+
+	error = bread(ip->i_devvp, disk_block, fs->m_block_size, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	/*
+	 * Verify extent header integrity before writing.
+	 * If the inode uses extents (not a fast symlink), the magic
+	 * must be valid. Refuse to persist corruption.
+	 */
+	wr_mode = letoh16(ip->i_e4din->dinode.i_mode);
+	wr_flags = letoh32(ip->i_e4din->dinode.i_flags);
+	wr_magic = letoh16(ip->i_e4din->dinode.i_extent_header.eh_magic);
+	if (wr_mode != 0 &&
+	    (wr_flags & EXTFS_INODE_FLAG_EXTENTS) &&
+	    wr_magic != EXT4FS_EXTENT_HEADER_MAGIC) {
+		printf("ext4fs_update: REFUSING to write ino=%u "
+		    "with corrupt extent header! "
+		    "magic=0x%x mode=0%o flags=0x%x\n",
+		    ip->i_number, wr_magic, wr_mode, wr_flags);
+		brelse(bp);
+		return (EIO);
+	}
+
+	/* Recompute inode checksum */
+	csum = ext4fs_inode_csum(fs, ip->i_e4din, ip->i_number);
+	ip->i_e4din->dinode.i_checksum_lo = htole16(csum & 0xFFFF);
+	ip->i_e4din->dinode.i_checksum_hi = htole16((csum >> 16) & 0xFFFF);
+
+	/* Copy inode to buffer, never reading past the in-core inode */
+	copy_len = fs->m_inode_size;
+	if (copy_len > sizeof(struct ext4fs_dinode_256))
+		copy_len = sizeof(struct ext4fs_dinode_256);
+	memcpy((char *)bp->b_data + offset_in_block, ip->i_e4din, copy_len);
+
+	if (waitfor) {
+		error = bwrite(bp);
+		return (error);
+	}
+
+	bdwrite(bp);
+	return (0);
+}
+
+/*
+ * Store a 64-bit file size in the in-core inode image, split into its
+ * little-endian low and high 32-bit fields.
+ */
+void
+ext4fs_setsize(struct inode *ip, u_int64_t size)
+{
+	u_int32_t lo = (u_int32_t)size;
+	u_int32_t hi = (u_int32_t)(size >> 32);
+
+	ip->i_e4din->dinode.i_size_lo = htole32(lo);
+	ip->i_e4din->dinode.i_size_hi = htole32(hi);
+}
+
+/*
+ * Allocate up to `count' contiguous filesystem blocks.
+ *
+ * The group containing `goal' is tried first (or the inode's own group
+ * when the goal is out of range), then every other group in ascending
+ * order with wrap-around.  Within a group the bitmap is scanned from the
+ * goal bit to the end and then from bit 0 up to the goal, so free blocks
+ * located before the goal are not missed.
+ *
+ * On success returns 0, stores the first allocated block in *bnp and the
+ * length of the run actually obtained (1..count) in *countp.  Returns
+ * ENOSPC when no free block was found; *bnp and *countp are then 0.
+ * A bitmap that cannot be read makes its group be skipped.
+ *
+ * Side effects: updates the block bitmap, the group descriptor (free
+ * count, bitmap checksum, BLOCK_UNINIT flag) and the in-core superblock
+ * free block counters.
+ */
+int
+ext4fs_blkalloc(struct inode *ip, u_int64_t goal, u_int32_t count,
+    u_int64_t *bnp, u_int32_t *countp)
+{
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct ext4fs_block_group_descriptor *gd;
+	struct buf *bp, *dbp;
+	u_int64_t bitmap_blk, grp_start, ib, itb;
+	u_int32_t group, ngroups, g, blk_in_group, free_blocks;
+	u_int32_t it_blocks, mb, pbit, rb, bcsum;
+	u_int32_t start_bit, scanned, nalloced, k;
+	u_int32_t *dind;
+	char *bbp;
+	int error, i, j, has_sb, bitmap_dirty;
+
+	*bnp = 0;
+	*countp = 0;
+
+	if (count == 0)
+		count = 1;
+
+	if (fs->m_free_blocks_count == 0)
+		return (ENOSPC);
+
+	ngroups = fs->m_block_group_count;
+
+	/* Pick starting group from goal */
+	if (goal >= fs->m_first_data_block && goal < fs->m_blocks_count)
+		group = (goal - fs->m_first_data_block) /
+		    fs->m_blocks_per_group;
+	else
+		group = (ip->i_number - 1) / fs->m_inodes_per_group;
+
+	for (i = 0; i < ngroups; i++) {
+		g = (group + i) % ngroups;
+		gd = &fs->m_gd[g];
+
+		free_blocks = letoh16(gd->bgd_free_blocks_count_lo);
+		if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+			free_blocks |= (u_int32_t)
+			    letoh16(gd->bgd_free_blocks_count_hi) << 16;
+		if (free_blocks == 0)
+			continue;
+
+		/* Read block bitmap */
+		bitmap_blk = letoh32(gd->bgd_block_bitmap_block_lo);
+		if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+			bitmap_blk |= (u_int64_t)
+			    letoh32(gd->bgd_block_bitmap_block_hi) << 32;
+
+		error = bread(ip->i_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk),
+		    fs->m_block_size, &bp);
+		if (error) {
+			brelse(bp);
+			continue;
+		}
+		bbp = (char *)bp->b_data;
+		bitmap_dirty = 0;
+
+		/* First filesystem block covered by this group */
+		grp_start = (u_int64_t)g * fs->m_blocks_per_group +
+		    fs->m_first_data_block;
+
+		/*
+		 * If BLOCK_UNINIT is set, the on-disk bitmap block
+		 * may contain garbage.  Zero it and mark metadata
+		 * blocks (bitmaps, inode table) as used.
+		 */
+		if (letoh16(gd->bgd_flags) &
+		    EXT4FS_BGD_FLAG_BLOCK_UNINIT) {
+			memset(bbp, 0, fs->m_block_size);
+			bitmap_dirty = 1;
+			/*
+			 * Mark superblock, GDT, and reserved
+			 * GDT blocks for groups that have them.
+			 * With SPARSE_SUPER only groups 0, 1 and
+			 * powers of 3, 5 and 7 carry a backup.
+			 */
+			has_sb = 0;
+			if (!(fs->m_feature_ro_compat &
+			    EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER))
+				has_sb = 1;
+			else if (g == 0 || g == 1)
+				has_sb = 1;
+			else {
+				u_int64_t n;
+
+				for (n = 3; n <= g; n *= 3)
+					if (n == g)
+						has_sb = 1;
+				for (n = 5; n <= g; n *= 5)
+					if (n == g)
+						has_sb = 1;
+				for (n = 7; n <= g; n *= 7)
+					if (n == g)
+						has_sb = 1;
+			}
+			if (has_sb) {
+				u_int32_t overhead = 1 +
+				    fs->m_block_group_descriptor_blocks_count +
+				    fs->m_reserved_bgdt_blocks;
+
+				for (mb = 0; mb < overhead; mb++)
+					setbit(bbp, mb);
+			}
+			/*
+			 * Block bitmap.  bitmap_blk already includes the
+			 * high 32 bits on 64BIT filesystems.
+			 */
+			if (bitmap_blk >= grp_start &&
+			    bitmap_blk < grp_start + fs->m_blocks_per_group)
+				setbit(bbp, bitmap_blk - grp_start);
+			/*
+			 * Inode bitmap.
+			 * NOTE(review): only the low 32 bits are read
+			 * here; a 64BIT filesystem presumably needs the
+			 * high half of this field as well - confirm the
+			 * descriptor field name and add it.
+			 */
+			ib = letoh32(gd->bgd_inode_bitmap_block_lo);
+			if (ib >= grp_start &&
+			    ib < grp_start + fs->m_blocks_per_group)
+				setbit(bbp, ib - grp_start);
+			/* Inode table (full 64-bit location) */
+			itb = letoh32(gd->bgd_inode_table_block_lo);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				itb |= (u_int64_t)letoh32(
+				    gd->bgd_inode_table_block_hi) << 32;
+			it_blocks = (fs->m_inodes_per_group *
+			    fs->m_inode_size + fs->m_block_size - 1) /
+			    fs->m_block_size;
+			for (mb = 0; mb < it_blocks; mb++) {
+				u_int64_t b = itb + mb;
+
+				if (b >= grp_start &&
+				    b < grp_start +
+				    fs->m_blocks_per_group)
+					setbit(bbp, b - grp_start);
+			}
+			/* Padding bits past the end of the group */
+			for (pbit = fs->m_blocks_per_group;
+			    pbit < fs->m_block_size * 8; pbit++)
+				setbit(bbp, pbit);
+			/* Mark resize inode (inode 7) blocks */
+			if (fs->m_resize_dind_block != 0) {
+				if (fs->m_resize_dind_block >= grp_start &&
+				    fs->m_resize_dind_block <
+				    grp_start + fs->m_blocks_per_group)
+					setbit(bbp,
+					    fs->m_resize_dind_block -
+					    grp_start);
+
+				error = bread(ip->i_devvp,
+				    (daddr_t)EXT4FS_FSBTODB(fs,
+				    fs->m_resize_dind_block),
+				    fs->m_block_size, &dbp);
+				if (!error) {
+					dind = (u_int32_t *)dbp->b_data;
+					for (j = 0;
+					    j < fs->m_block_size / 4;
+					    j++) {
+						rb = letoh32(dind[j]);
+						if (rb == 0)
+							continue;
+						if (rb >= grp_start &&
+						    rb < grp_start +
+						    fs->m_blocks_per_group)
+							setbit(bbp,
+							    rb - grp_start);
+					}
+					brelse(dbp);
+				} else {
+					brelse(dbp);
+				}
+			}
+			gd->bgd_flags = htole16(letoh16(
+			    gd->bgd_flags) &
+			    ~EXT4FS_BGD_FLAG_BLOCK_UNINIT);
+			ext4fs_bgd_write(fs, ip->i_devvp, g);
+		}
+
+		/* Start scan from goal bit if goal is in this group */
+		start_bit = 0;
+		if (goal >= fs->m_first_data_block &&
+		    goal < fs->m_blocks_count) {
+			u_int32_t goal_group =
+			    (goal - fs->m_first_data_block) /
+			    fs->m_blocks_per_group;
+
+			if (goal_group == g)
+				start_bit =
+				    (goal - fs->m_first_data_block) %
+				    fs->m_blocks_per_group;
+		}
+
+		/*
+		 * Scan every bit of the group exactly once, beginning at
+		 * the goal bit and wrapping to bit 0, so free blocks that
+		 * sit before the goal are still found.
+		 */
+		for (scanned = 0; scanned < fs->m_blocks_per_group;
+		    scanned++) {
+			blk_in_group = start_bit + scanned;
+			if (blk_in_group >= fs->m_blocks_per_group)
+				blk_in_group -= fs->m_blocks_per_group;
+
+			/* Never hand out a block past the end of the fs */
+			if (grp_start + blk_in_group >= fs->m_blocks_count)
+				continue;
+			if (!isclr(bbp, blk_in_group))
+				continue;
+
+			/* Found first free bit; grab contiguous run */
+			nalloced = 1;
+			setbit(bbp, blk_in_group);
+			for (k = 1; k < count &&
+			    blk_in_group + k < fs->m_blocks_per_group &&
+			    grp_start + blk_in_group + k <
+			    fs->m_blocks_count &&
+			    isclr(bbp, blk_in_group + k); k++) {
+				setbit(bbp, blk_in_group + k);
+				nalloced++;
+			}
+
+			bcsum = ext4fs_bitmap_csum(fs, g, bbp,
+			    fs->m_block_size);
+			gd->bgd_block_bitmap_checksum_lo =
+			    htole16(bcsum & 0xFFFF);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				gd->bgd_block_bitmap_checksum_hi =
+				    htole16((bcsum >> 16) & 0xFFFF);
+
+			bdwrite(bp);
+
+			/*
+			 * Update BGD.  Clamp instead of wrapping if the
+			 * descriptor count was lower than the bitmap.
+			 */
+			if (free_blocks >= nalloced)
+				free_blocks -= nalloced;
+			else
+				free_blocks = 0;
+			gd->bgd_free_blocks_count_lo =
+			    htole16(free_blocks & 0xFFFF);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				gd->bgd_free_blocks_count_hi =
+				    htole16((free_blocks >> 16) &
+				    0xFFFF);
+
+			ext4fs_bgd_write(fs, ip->i_devvp, g);
+
+			/* Update superblock counters (same clamping) */
+			if (fs->m_free_blocks_count >= nalloced)
+				fs->m_free_blocks_count -= nalloced;
+			else
+				fs->m_free_blocks_count = 0;
+			fs->m_sble.sb_free_blocks_count_lo =
+			    htole32((u_int32_t)
+			    fs->m_free_blocks_count);
+			fs->m_sble.sb_free_blocks_count_hi =
+			    htole32((u_int32_t)
+			    (fs->m_free_blocks_count >> 32));
+			fs->m_fs_was_modified = 1;
+
+			*bnp = grp_start + blk_in_group;
+			*countp = nalloced;
+
+			return (0);
+		}
+
+		/*
+		 * Nothing free in this group after all.  If the bitmap was
+		 * just rebuilt from BLOCK_UNINIT it must still reach the
+		 * disk, because the flag has already been cleared in the
+		 * descriptor; otherwise the buffer is unchanged.
+		 */
+		if (bitmap_dirty) {
+			bcsum = ext4fs_bitmap_csum(fs, g, bbp,
+			    fs->m_block_size);
+			gd->bgd_block_bitmap_checksum_lo =
+			    htole16(bcsum & 0xFFFF);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				gd->bgd_block_bitmap_checksum_hi =
+				    htole16((bcsum >> 16) & 0xFFFF);
+			bdwrite(bp);
+			ext4fs_bgd_write(fs, ip->i_devvp, g);
+		} else
+			brelse(bp);
+	}
+
+	return (ENOSPC);
+}
+
+/*
+ * Free a single filesystem block.
+ *
+ * Clears the block's bit in its group's block bitmap, refreshes the
+ * bitmap checksum and free count in the group descriptor, and bumps the
+ * in-core superblock free block counters.
+ *
+ * Block numbers outside the filesystem are ignored, as is a bitmap that
+ * cannot be read.  A block whose bit is already clear is reported and
+ * left alone so that a double free cannot inflate the free counters.
+ */
+void
+ext4fs_blkfree(struct inode *ip, u_int64_t bno)
+{
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct ext4fs_block_group_descriptor *gd;
+	struct buf *bp;
+	u_int64_t bitmap_blk;
+	u_int32_t group, blk_in_group, free_blocks, bcsum;
+	char *bbp;
+	int error;
+
+	if (bno < fs->m_first_data_block || bno >= fs->m_blocks_count)
+		return;
+
+	group = (bno - fs->m_first_data_block) / fs->m_blocks_per_group;
+	if (group >= fs->m_block_group_count)
+		return;
+	blk_in_group = (bno - fs->m_first_data_block) %
+	    fs->m_blocks_per_group;
+	gd = &fs->m_gd[group];
+
+	/* Read block bitmap */
+	bitmap_blk = letoh32(gd->bgd_block_bitmap_block_lo);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		bitmap_blk |=
+		    (u_int64_t)letoh32(gd->bgd_block_bitmap_block_hi) << 32;
+
+	error = bread(ip->i_devvp,
+	    (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk),
+	    fs->m_block_size, &bp);
+	if (error) {
+		brelse(bp);
+		return;
+	}
+
+	bbp = (char *)bp->b_data;
+
+	/* Refuse a double free: the counters would drift upwards. */
+	if (isclr(bbp, blk_in_group)) {
+		printf("ext4fs_blkfree: freeing free block %llu (ino=%u)\n",
+		    (unsigned long long)bno, ip->i_number);
+		brelse(bp);
+		return;
+	}
+	clrbit(bbp, blk_in_group);
+
+	/* Update block bitmap checksum in BGD */
+	bcsum = ext4fs_bitmap_csum(fs, group, bbp, fs->m_block_size);
+	gd->bgd_block_bitmap_checksum_lo = htole16(bcsum & 0xFFFF);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		gd->bgd_block_bitmap_checksum_hi =
+		    htole16((bcsum >> 16) & 0xFFFF);
+
+	bdwrite(bp);
+
+	/* Update BGD free block count */
+	free_blocks = letoh16(gd->bgd_free_blocks_count_lo);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		free_blocks |=
+		    (u_int32_t)letoh16(gd->bgd_free_blocks_count_hi) << 16;
+	free_blocks++;
+	gd->bgd_free_blocks_count_lo = htole16(free_blocks & 0xFFFF);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		gd->bgd_free_blocks_count_hi =
+		    htole16((free_blocks >> 16) & 0xFFFF);
+
+	ext4fs_bgd_write(fs, ip->i_devvp, group);
+
+	/* Update superblock counters */
+	fs->m_free_blocks_count++;
+	fs->m_sble.sb_free_blocks_count_lo =
+	    htole32((u_int32_t)fs->m_free_blocks_count);
+	fs->m_sble.sb_free_blocks_count_hi =
+	    htole32((u_int32_t)(fs->m_free_blocks_count >> 32));
+	fs->m_fs_was_modified = 1;
+}
+
+/*
+ * Turn a full inline (depth-0) extent root into a depth-1 tree.
+ *
+ * A fresh block is allocated and initialised as a leaf holding the four
+ * inline extents; the inode root then becomes an index node whose single
+ * entry points at that leaf.  The inode's block count grows by one
+ * filesystem block.
+ *
+ * Returns EIO unless the root is at depth 0 with exactly 4 entries, or
+ * the error from the block allocator.
+ */
+static int
+ext4fs_extent_grow_tree(struct inode *ip)
+{
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	struct ext4fs_extent_header *root = &din->i_extent_header;
+	struct ext4fs_extent_header *lh;
+	struct ext4fs_extent_idx *slot;
+	struct buf *lbp;
+	u_int64_t lblk, nsect;
+	u_int32_t ngot;
+	u_int16_t cap;
+	int error;
+
+	/* Only a completely full inline root may be promoted */
+	if (letoh16(root->eh_depth) != 0 ||
+	    letoh16(root->eh_entries) != 4)
+		return (EIO);
+
+	/* One block for the new leaf, no placement preference */
+	error = ext4fs_blkalloc(ip, 0, 1, &lblk, &ngot);
+	if (error != 0)
+		return (error);
+
+	/* The leaf is brand new: take a zeroed buffer, do not read it */
+	lbp = getblk(ip->i_devvp,
+	    (daddr_t)EXT4FS_FSBTODB(fs, lblk),
+	    fs->m_block_size, 0, INFSLP);
+	clrbuf(lbp);
+
+	/* Number of extents that fit behind the header in one block */
+	cap = (fs->m_block_size - sizeof(struct ext4fs_extent_header)) /
+	    sizeof(struct ext4fs_extent);
+
+	lh = (struct ext4fs_extent_header *)lbp->b_data;
+	lh->eh_magic = htole16(EXT4FS_EXTENT_HEADER_MAGIC);
+	lh->eh_entries = htole16(4);
+	lh->eh_max = htole16(cap);
+	lh->eh_depth = htole16(0);
+	lh->eh_generation = htole32(0);
+
+	/* The four inline extents go right behind the leaf header */
+	memcpy(lh + 1, din->i_extent, 4 * sizeof(struct ext4fs_extent));
+
+	ext4fs_extent_block_csum_set(fs, ip->i_number,
+	    din->i_nfs_generation, lbp->b_data);
+	bdwrite(lbp);
+
+	/*
+	 * Root becomes an index node with one entry.  eh_max is left
+	 * alone: four index entries fit in the inline area as well.
+	 */
+	root->eh_depth = htole16(1);
+	root->eh_entries = htole16(1);
+
+	slot = din->i_extent_idx;
+	/* Index key: first logical block of the former first extent */
+	slot[0].ei_block = din->i_extent[0].e_block;
+	slot[0].ei_leaf_lo = htole32((u_int32_t)lblk);
+	slot[0].ei_leaf_hi = htole16((u_int16_t)(lblk >> 32));
+	slot[0].ei_unused = 0;
+	/* Clear the three unused index slots */
+	memset(&slot[1], 0, 3 * sizeof(struct ext4fs_extent_idx));
+
+	/* Charge the leaf block to the inode (DEV_BSIZE units) */
+	nsect = letoh32(din->i_blocks_lo) |
+	    ((u_int64_t)letoh16(din->i_blocks_hi) << 32);
+	nsect += fs->m_block_size / DEV_BSIZE;
+	din->i_blocks_lo = htole32((u_int32_t)nsect);
+	din->i_blocks_hi = htole16((u_int16_t)(nsect >> 32));
+
+	ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+	return (0);
+}
+
+/*
+ * Split a full leaf block into two.
+ * The old leaf keeps the first half, a new leaf gets the second half.
+ * A new index entry is added to the parent (the inode root).
+ *
+ * old_bp is the buffer holding the full leaf and old_eh points at the
+ * extent header inside it.  The buffer is always consumed: released
+ * with brelse() on failure, queued with bdwrite() on success.
+ *
+ * Returns ENOSPC if the parent index is also full (depth 2+ needed),
+ * or the error from the block allocator.  On success the inode's block
+ * count grows by one filesystem block and the inode is marked modified.
+ */
+static int
+ext4fs_leaf_split(struct inode *ip, struct buf *old_bp,
+ struct ext4fs_extent_header *old_eh)
+{
+ struct m_ext4fs *fs = ip->i_e4fs;
+ struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+ struct ext4fs_extent_header *root_eh = &din->i_extent_header;
+ struct ext4fs_extent_idx *root_idx = din->i_extent_idx;
+ struct ext4fs_extent_header *new_eh;
+ struct ext4fs_extent *old_ext, *new_ext;
+ struct buf *new_bp;
+ u_int64_t new_blk;
+ u_int32_t new_first_block;
+ u_int16_t old_entries, new_entries, maxleaf;
+ u_int16_t root_entries, root_max;
+ u_int64_t i_blocks;
+ u_int32_t got;
+ int error, i;
+
+ old_entries = letoh16(old_eh->eh_entries);
+ maxleaf = letoh16(old_eh->eh_max);
+ /* The extent array starts right behind the header */
+ old_ext = (struct ext4fs_extent *)(old_eh + 1);
+ /* Check parent has room for new index entry */
+ root_entries = letoh16(root_eh->eh_entries);
+ root_max = letoh16(root_eh->eh_max);
+ if (root_entries >= root_max) {
+ brelse(old_bp);
+ return (ENOSPC); /* Would need depth 2+, phase 4 */
+ }
+
+ /* Allocate block for new leaf */
+ error = ext4fs_blkalloc(ip, 0, 1, &new_blk, &got);
+ if (error) {
+ brelse(old_bp);
+ return (error);
+ }
+
+ /* Split ~50/50; the old leaf keeps the extra entry on odd counts */
+ new_entries = old_entries / 2;
+ old_entries = old_entries - new_entries;
+
+ /*
+ * Save the first logical block of the new (second) half
+ * BEFORE any buffer is handed to bdwrite(): it becomes the
+ * key of the new index entry, and neither buffer may be
+ * touched once it has been queued for writing.
+ */
+ new_first_block = letoh32(old_ext[old_entries].e_block);
+
+ /* The new leaf is fresh: take a zeroed buffer instead of reading */
+ new_bp = getblk(ip->i_devvp,
+ (daddr_t)EXT4FS_FSBTODB(fs, new_blk),
+ fs->m_block_size, 0, INFSLP);
+ clrbuf(new_bp);
+
+ /* Initialize new leaf */
+ new_eh = (struct ext4fs_extent_header *)new_bp->b_data;
+ new_eh->eh_magic = htole16(EXT4FS_EXTENT_HEADER_MAGIC);
+ new_eh->eh_entries = htole16(new_entries);
+ new_eh->eh_max = htole16(maxleaf);
+ new_eh->eh_depth = htole16(0);
+ new_eh->eh_generation = htole32(0);
+
+ /* Copy the upper half of the old leaf's extents into the new leaf */
+ new_ext = (struct ext4fs_extent *)(new_eh + 1);
+ memcpy(new_ext, &old_ext[old_entries],
+ new_entries * sizeof(struct ext4fs_extent));
+
+ ext4fs_extent_block_csum_set(fs, ip->i_number, din->i_nfs_generation, new_bp->b_data);
+ bdwrite(new_bp);
+
+ /*
+ * Update old leaf: only the entry count shrinks, the moved
+ * extents are left in place beyond it.
+ */
+ old_eh->eh_entries = htole16(old_entries);
+ ext4fs_extent_block_csum_set(fs, ip->i_number, din->i_nfs_generation, old_bp->b_data);
+ bdwrite(old_bp);
+
+ /* Add new index entry in parent root (keep sorted by ei_block) */
+ {
+ struct ext4fs_extent_idx entry;
+
+ entry.ei_block = htole32(new_first_block);
+ entry.ei_leaf_lo = htole32((u_int32_t)new_blk);
+ entry.ei_leaf_hi = htole16((u_int16_t)(new_blk >> 32));
+ entry.ei_unused = 0;
+
+ /* Find insertion point */
+ for (i = 0; i < root_entries; i++) {
+ if (letoh32(root_idx[i].ei_block) > new_first_block)
+ break;
+ }
+ /* Shift later entries up by one slot to open a gap */
+ if (i < root_entries)
+ memmove(&root_idx[i + 1], &root_idx[i],
+ (root_entries - i) *
+ sizeof(struct ext4fs_extent_idx));
+ root_idx[i] = entry;
+ root_eh->eh_entries = htole16(root_entries + 1);
+ }
+
+ /* Update inode block count for the new leaf block (DEV_BSIZE units) */
+ i_blocks = letoh32(din->i_blocks_lo) |
+ ((u_int64_t)letoh16(din->i_blocks_hi) << 32);
+ i_blocks += fs->m_block_size / DEV_BSIZE;
+ din->i_blocks_lo = htole32((u_int32_t)i_blocks);
+ din->i_blocks_hi = htole16((u_int16_t)(i_blocks >> 32));
+
+ ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+ return (0);
+}
+
+/*
+ * Insert an extent into a depth-1 extent tree (inline root index that
+ * points directly at leaf blocks).
+ *
+ * Picks the leaf whose index key covers lbn, tries to merge with the
+ * last extent of that leaf, inserts in sorted position if there is
+ * room, and otherwise splits the leaf and retries.
+ *
+ * Returns 0 on success; EOPNOTSUPP for trees deeper than one level
+ * (the block below the root would be an index node, and writing extents
+ * into it would corrupt the file); EIO for an empty or inconsistent
+ * root or leaf; or an error from bread() or the leaf split.
+ */
+static int
+ext4fs_extent_insert_depth(struct inode *ip, u_int32_t lbn, u_int64_t pblk,
+    u_int16_t len)
+{
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	struct ext4fs_extent_header *root_eh = &din->i_extent_header;
+	struct ext4fs_extent_idx *idx;
+	struct ext4fs_extent_header *leaf_eh;
+	struct ext4fs_extent *ext;
+	struct buf *bp;
+	u_int64_t leaf_blk;
+	u_int16_t root_entries, leaf_entries, leaf_max, leaf_cap;
+	int error, found, i;
+
+	/* Only one index level is implemented */
+	if (letoh16(root_eh->eh_depth) != 1)
+		return (EOPNOTSUPP);
+
+	/* The inline root holds at most 4 index entries */
+	root_entries = letoh16(root_eh->eh_entries);
+	if (root_entries == 0 || root_entries > 4)
+		return (EIO);
+
+	/*
+	 * Find the index entry whose subtree covers lbn: the last one
+	 * whose key is <= lbn, or the first entry if lbn precedes all.
+	 */
+	idx = din->i_extent_idx;
+	found = 0;
+	for (i = 0; i < root_entries; i++) {
+		if (letoh32(idx[i].ei_block) <= lbn)
+			found = i;
+		else
+			break;
+	}
+
+	/* Read the leaf block */
+	leaf_blk = letoh32(idx[found].ei_leaf_lo) |
+	    ((u_int64_t)letoh16(idx[found].ei_leaf_hi) << 32);
+
+	error = bread(ip->i_devvp,
+	    (daddr_t)EXT4FS_FSBTODB(fs, leaf_blk),
+	    fs->m_block_size, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	leaf_eh = (struct ext4fs_extent_header *)bp->b_data;
+	leaf_entries = letoh16(leaf_eh->eh_entries);
+	leaf_max = letoh16(leaf_eh->eh_max);
+	/* Most extents that physically fit in one block */
+	leaf_cap = (fs->m_block_size - sizeof(struct ext4fs_extent_header)) /
+	    sizeof(struct ext4fs_extent);
+
+	/*
+	 * Validate the header before trusting it: wrong magic, a node
+	 * that is not a leaf, or counts that would run past the buffer.
+	 */
+	if (letoh16(leaf_eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC ||
+	    letoh16(leaf_eh->eh_depth) != 0 ||
+	    leaf_max > leaf_cap || leaf_entries > leaf_max) {
+		brelse(bp);
+		return (EIO);
+	}
+
+	ext = (struct ext4fs_extent *)(leaf_eh + 1);
+
+	/*
+	 * Try to merge with last extent in this leaf: it must end
+	 * exactly where the new one starts, logically and physically,
+	 * and the combined length must not exceed 32768 blocks.
+	 */
+	if (leaf_entries > 0) {
+		struct ext4fs_extent *last = &ext[leaf_entries - 1];
+		u_int32_t last_block = letoh32(last->e_block);
+		u_int16_t last_len = letoh16(last->e_len);
+		u_int64_t last_start = letoh32(last->e_start_lo) |
+		    ((u_int64_t)letoh16(last->e_start_hi) << 32);
+
+		if (last_block + last_len == lbn &&
+		    last_start + last_len == pblk &&
+		    last_len + len <= 32768) {
+			last->e_len = htole16(last_len + len);
+			ext4fs_extent_block_csum_set(fs, ip->i_number,
+			    din->i_nfs_generation, bp->b_data);
+			bdwrite(bp);
+			ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+			return (0);
+		}
+	}
+
+	/* Room in leaf? */
+	if (leaf_entries < leaf_max) {
+		/* Find insertion point (keep sorted) */
+		for (i = 0; i < leaf_entries; i++) {
+			if (letoh32(ext[i].e_block) > lbn)
+				break;
+		}
+		if (i < leaf_entries)
+			memmove(&ext[i + 1], &ext[i],
+			    (leaf_entries - i) *
+			    sizeof(struct ext4fs_extent));
+
+		ext[i].e_block = htole32(lbn);
+		ext[i].e_len = htole16(len);
+		ext[i].e_start_lo = htole32((u_int32_t)pblk);
+		ext[i].e_start_hi = htole16((u_int16_t)(pblk >> 32));
+
+		leaf_eh->eh_entries = htole16(leaf_entries + 1);
+		ext4fs_extent_block_csum_set(fs, ip->i_number,
+		    din->i_nfs_generation, bp->b_data);
+		bdwrite(bp);
+
+		/*
+		 * If the new extent became the first one of the leaf and
+		 * starts below the parent's key, lower the key so the
+		 * index still describes the leaf's first logical block.
+		 */
+		if (i == 0 && lbn < letoh32(idx[found].ei_block))
+			idx[found].ei_block = htole32(lbn);
+
+		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+		return (0);
+	}
+
+	/* Leaf is full - need to split */
+	error = ext4fs_leaf_split(ip, bp, leaf_eh);
+	if (error)
+		return (error);
+
+	/* bp was consumed by leaf_split.  Retry the insert. */
+	return (ext4fs_extent_insert_depth(ip, lbn, pblk, len));
+}
+
+/*
+ * Insert an extent (lbn -> pblk, len blocks) into the inode's extent
+ * tree.
+ *
+ * A tree with depth > 0 is handed to ext4fs_extent_insert_depth().
+ * For the inline (depth 0) root the extent is merged into the last
+ * entry when logically and physically contiguous, otherwise inserted
+ * in sorted position.  When the inline array is full the tree is grown
+ * to depth 1 first.
+ *
+ * Returns 0 on success, EIO if the root header has a bad magic or
+ * entry counts that do not fit the 4-slot inline area, or the error
+ * from growing the tree / inserting into it.
+ */
+static int
+ext4fs_extent_insert(struct inode *ip, u_int32_t lbn, u_int64_t pblk,
+    u_int16_t len)
+{
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	struct ext4fs_extent_header *eh = &din->i_extent_header;
+	struct ext4fs_extent *ext = din->i_extent;
+	u_int16_t entries, maxe, depth;
+	int error, i;
+
+	if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC)
+		return (EIO);
+
+	depth = letoh16(eh->eh_depth);
+
+	/* Depth > 0: delegate to tree insert */
+	if (depth > 0)
+		return (ext4fs_extent_insert_depth(ip, lbn, pblk, len));
+
+	/* Depth 0: inline extents */
+	entries = letoh16(eh->eh_entries);
+	maxe = letoh16(eh->eh_max);
+
+	/*
+	 * The inline area holds four extents.  Do not trust larger
+	 * on-disk counts: inserting would write past the array into
+	 * the inode fields that follow it.
+	 */
+	if (maxe > 4 || entries > maxe)
+		return (EIO);
+
+	/* Try to merge with last extent */
+	if (entries > 0) {
+		struct ext4fs_extent *last = &ext[entries - 1];
+		u_int32_t last_block = letoh32(last->e_block);
+		u_int16_t last_len = letoh16(last->e_len);
+		u_int64_t last_start = letoh32(last->e_start_lo) |
+		    ((u_int64_t)letoh16(last->e_start_hi) << 32);
+
+		if (last_block + last_len == lbn &&
+		    last_start + last_len == pblk &&
+		    last_len + len <= 32768) {
+			last->e_len = htole16(last_len + len);
+			ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+			return (0);
+		}
+	}
+
+	/* Room for a new inline entry? */
+	if (entries < maxe) {
+		/* Find insertion point (keep sorted by lbn) */
+		for (i = 0; i < entries; i++) {
+			if (letoh32(ext[i].e_block) > lbn)
+				break;
+		}
+
+		/* Shift entries to make room */
+		if (i < entries)
+			memmove(&ext[i + 1], &ext[i],
+			    (entries - i) * sizeof(struct ext4fs_extent));
+
+		/* Insert new extent */
+		ext[i].e_block = htole32(lbn);
+		ext[i].e_len = htole16(len);
+		ext[i].e_start_lo = htole32((u_int32_t)pblk);
+		ext[i].e_start_hi = htole16((u_int16_t)(pblk >> 32));
+
+		eh->eh_entries = htole16(entries + 1);
+		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+
+		return (0);
+	}
+
+	/* Inline full - grow tree to depth 1, then insert */
+	error = ext4fs_extent_grow_tree(ip);
+	if (error)
+		return (error);
+
+	return (ext4fs_extent_insert_depth(ip, lbn, pblk, len));
+}
+
+/*
+ * Get a buffer for logical block lbn of an inode.
+ * If the block is already mapped, just read it.
+ * Otherwise, allocate a new physical block, insert an extent for it,
+ * charge it to the inode's block count and return a buffer for it
+ * (zeroed when B_CLRBUF is set in flags).
+ *
+ * Returns 0 with the buffer in *bpp, or an error from bread(), the
+ * block allocator or the extent insert.
+ *
+ * NOTE(review): the size and cred arguments are not used in this
+ * function; buffers are always fs->m_block_size bytes.
+ */
+static int
+ext4fs_buf_alloc(struct inode *ip, u_int64_t lbn, int size,
+ struct ucred *cred, struct buf **bpp, int flags)
+{
+ struct m_ext4fs *fs = ip->i_e4fs;
+ struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+ u_int64_t pblk, goal, ncontig, i_blocks;
+ int error;
+
+ /* Check if already mapped */
+ error = ext4fs_extent_pblk(ip, lbn, &pblk, &ncontig);
+ if (error == 0 && pblk != 0) {
+ /* Already mapped, just read */
+ error = bread(ip->i_devvp,
+ (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+ fs->m_block_size, bpp);
+ /*
+ * NOTE(review): on failure the buffer is released here but
+ * *bpp still points at it; callers must not touch or
+ * release it again.
+ */
+ if (error)
+ brelse(*bpp);
+ return (error);
+ }
+ /*
+ * NOTE(review): a lookup error is discarded here and treated like
+ * an unmapped block - confirm that ext4fs_extent_pblk() only fails
+ * for holes, otherwise an I/O error leads to a fresh allocation.
+ */
+ error = 0;
+
+ /* Not mapped - allocate a new block */
+ /* Goal: try to be contiguous with last extent */
+ goal = 0;
+ if (letoh16(din->i_extent_header.eh_entries) > 0) {
+ u_int16_t depth = letoh16(din->i_extent_header.eh_depth);
+
+ if (depth == 0) {
+ /* Inline root: goal is the block after the last extent */
+ u_int16_t ent = letoh16(din->i_extent_header.eh_entries);
+ struct ext4fs_extent *last = &din->i_extent[ent - 1];
+ u_int64_t last_start = letoh32(last->e_start_lo) |
+ ((u_int64_t)letoh16(last->e_start_hi) << 32);
+ goal = last_start + letoh16(last->e_len);
+ } else {
+ /*
+ * Walk to last leaf to find last extent.  Only the block
+ * referenced by the last root index entry is read; a read
+ * failure or bad header simply leaves goal at 0.
+ */
+ u_int16_t ent = letoh16(din->i_extent_header.eh_entries);
+ struct ext4fs_extent_idx *idx = din->i_extent_idx;
+ u_int64_t leaf_blk;
+ struct buf *gbp;
+
+ leaf_blk = letoh32(idx[ent - 1].ei_leaf_lo) |
+ ((u_int64_t)letoh16(idx[ent - 1].ei_leaf_hi) << 32);
+ error = bread(ip->i_devvp,
+ (daddr_t)EXT4FS_FSBTODB(fs, leaf_blk),
+ fs->m_block_size, &gbp);
+ if (error == 0) {
+ struct ext4fs_extent_header *leh =
+ (struct ext4fs_extent_header *)gbp->b_data;
+ u_int16_t lent = letoh16(leh->eh_entries);
+ if (lent > 0 && letoh16(leh->eh_magic) ==
+ EXT4FS_EXTENT_HEADER_MAGIC) {
+ struct ext4fs_extent *le =
+ (struct ext4fs_extent *)(leh + 1);
+ u_int64_t ls =
+ letoh32(le[lent - 1].e_start_lo) |
+ ((u_int64_t)letoh16(
+ le[lent - 1].e_start_hi) << 32);
+ goal = ls + letoh16(le[lent - 1].e_len);
+ }
+ brelse(gbp);
+ } else {
+ brelse(gbp);
+ }
+ }
+ }
+
+ {
+ u_int32_t got;
+
+ /* One block at a time; the extent insert merges contiguous ones */
+ error = ext4fs_blkalloc(ip, goal, 1, &pblk, &got);
+ if (error)
+ return (error);
+ error = ext4fs_extent_insert(ip, lbn, pblk, 1);
+ if (error) {
+ /* Could not map it: give the block back */
+ ext4fs_blkfree(ip, pblk);
+ return (error);
+ }
+ /* Update inode block count (i_blocks is in 512-byte sectors) */
+ i_blocks = letoh32(din->i_blocks_lo) |
+ ((u_int64_t)letoh16(din->i_blocks_hi) << 32);
+ i_blocks += fs->m_block_size / DEV_BSIZE;
+ din->i_blocks_lo = htole32((u_int32_t)i_blocks);
+ din->i_blocks_hi = htole16((u_int16_t)(i_blocks >> 32));
+ }
+
+ /* Set extents flag */
+ din->i_flags |= htole32(EXTFS_INODE_FLAG_EXTENTS);
+
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+
+ /* Get buffer for the new block; nothing is read from disk */
+ *bpp = getblk(ip->i_devvp,
+ (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+ fs->m_block_size, 0, INFSLP);
+ if (flags & B_CLRBUF)
+ clrbuf(*bpp);
+
+ return (0);
+}
+
+/*
+ * Release every block covered by an array of extents.
+ *
+ * Each extent is processed in runs that stay inside one block group,
+ * so a group's bitmap is read, cleared and written once per run
+ * instead of once per block.  The group descriptor and the in-core
+ * superblock free counters are updated for every run.
+ *
+ * Blocks outside the filesystem are skipped one at a time; a run whose
+ * bitmap cannot be read is skipped as a whole.
+ */
+static void
+ext4fs_free_extents(struct inode *ip, struct ext4fs_extent *ext,
+    u_int16_t entries)
+{
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct ext4fs_block_group_descriptor *gd;
+	struct buf *mapbp;
+	u_int64_t first, bno, map_blk;
+	u_int32_t total, done, grp, bit, run, b, nfree, sum;
+	int e, rerr;
+
+	for (e = 0; e < entries; e++) {
+		first = letoh32(ext[e].e_start_lo) |
+		    ((u_int64_t)letoh16(ext[e].e_start_hi) << 32);
+		total = letoh16(ext[e].e_len);
+
+		/* Lengths above 32768 carry a bias of 32768; strip it */
+		if (total > 32768)
+			total -= 32768;
+
+		for (done = 0; done < total; ) {
+			bno = first + done;
+
+			/* Step over block numbers outside the filesystem */
+			if (bno < fs->m_first_data_block ||
+			    bno >= fs->m_blocks_count) {
+				done++;
+				continue;
+			}
+			grp = (bno - fs->m_first_data_block) /
+			    fs->m_blocks_per_group;
+			if (grp >= fs->m_block_group_count) {
+				done++;
+				continue;
+			}
+			bit = (bno - fs->m_first_data_block) %
+			    fs->m_blocks_per_group;
+			gd = &fs->m_gd[grp];
+
+			/* Length of the run that stays inside this group */
+			run = fs->m_blocks_per_group - bit;
+			if (run > total - done)
+				run = total - done;
+
+			map_blk = letoh32(gd->bgd_block_bitmap_block_lo);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				map_blk |= (u_int64_t)letoh32(
+				    gd->bgd_block_bitmap_block_hi) << 32;
+
+			rerr = bread(ip->i_devvp,
+			    (daddr_t)EXT4FS_FSBTODB(fs, map_blk),
+			    fs->m_block_size, &mapbp);
+			if (rerr) {
+				/* Unreadable bitmap: skip the whole run */
+				brelse(mapbp);
+				done += run;
+				continue;
+			}
+
+			for (b = 0; b < run; b++)
+				clrbit((char *)mapbp->b_data, bit + b);
+
+			/* New bitmap checksum goes into the descriptor */
+			sum = ext4fs_bitmap_csum(fs, grp,
+			    mapbp->b_data, fs->m_block_size);
+			gd->bgd_block_bitmap_checksum_lo =
+			    htole16(sum & 0xFFFF);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				gd->bgd_block_bitmap_checksum_hi =
+				    htole16((sum >> 16) & 0xFFFF);
+			bdwrite(mapbp);
+
+			/* Group free count */
+			nfree = letoh16(gd->bgd_free_blocks_count_lo);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				nfree |= (u_int32_t)letoh16(
+				    gd->bgd_free_blocks_count_hi) << 16;
+			nfree += run;
+			gd->bgd_free_blocks_count_lo =
+			    htole16(nfree & 0xFFFF);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				gd->bgd_free_blocks_count_hi =
+				    htole16((nfree >> 16) & 0xFFFF);
+			ext4fs_bgd_write(fs, ip->i_devvp, grp);
+
+			/* Filesystem-wide free count */
+			fs->m_free_blocks_count += run;
+			fs->m_sble.sb_free_blocks_count_lo =
+			    htole32((u_int32_t)fs->m_free_blocks_count);
+			fs->m_sble.sb_free_blocks_count_hi =
+			    htole32((u_int32_t)
+			    (fs->m_free_blocks_count >> 32));
+			fs->m_fs_was_modified = 1;
+
+			done += run;
+		}
+	}
+}
+
+/*
+ * Cut an extent array down to the first new_nblocks logical blocks.
+ *
+ * Extents that start at or beyond new_nblocks are freed entirely, an
+ * extent that crosses the boundary has its tail freed and its length
+ * shortened, and extents below the boundary are kept.  Survivors are
+ * compacted to the front of the array and *entries_p is set to their
+ * number.
+ *
+ * Returns the number of filesystem blocks freed.
+ */
+static u_int64_t
+ext4fs_trim_extents(struct inode *ip, struct ext4fs_extent *ext,
+    u_int16_t *entries_p, u_int32_t new_nblocks)
+{
+	struct ext4fs_extent scratch;
+	u_int64_t nfreed = 0, pstart;
+	u_int32_t lstart, nkeep, ndrop;
+	u_int16_t total = *entries_p;
+	u_int16_t kept = 0;
+	u_int16_t span;
+	int flagged, n;
+
+	for (n = 0; n < total; n++) {
+		lstart = letoh32(ext[n].e_block);
+		span = letoh16(ext[n].e_len);
+
+		/* Lengths above 32768 are biased; remember and strip */
+		flagged = 0;
+		if (span > 32768) {
+			span -= 32768;
+			flagged = 1;
+		}
+		pstart = letoh32(ext[n].e_start_lo) |
+		    ((u_int64_t)letoh16(ext[n].e_start_hi) << 32);
+
+		if (lstart >= new_nblocks) {
+			/* Whole extent lies past the boundary: free it */
+			scratch = ext[n];
+			scratch.e_len = htole16(span);
+			ext4fs_free_extents(ip, &scratch, 1);
+			nfreed += span;
+			continue;
+		}
+
+		if (lstart + span <= new_nblocks) {
+			/* Whole extent lies before the boundary: keep it */
+			if (kept != n)
+				ext[kept] = ext[n];
+			kept++;
+			continue;
+		}
+
+		/* Extent crosses the boundary: free the tail, keep the head */
+		nkeep = new_nblocks - lstart;
+		ndrop = span - nkeep;
+
+		scratch.e_block = htole32(lstart + nkeep);
+		scratch.e_start_lo = htole32((u_int32_t)(pstart + nkeep));
+		scratch.e_start_hi = htole16(
+		    (u_int16_t)((pstart + nkeep) >> 32));
+		scratch.e_len = htole16(ndrop);
+		ext4fs_free_extents(ip, &scratch, 1);
+		nfreed += ndrop;
+
+		ext[kept] = ext[n];
+		/* Restore the length bias on the shortened extent */
+		if (flagged)
+			ext[kept].e_len = htole16(nkeep | 32768);
+		else
+			ext[kept].e_len = htole16(nkeep | 0);
+		kept++;
+	}
+
+	*entries_p = kept;
+	return (nfreed);
+}
+
+/*
+ * Truncate inode to given length.
+ * Handles grow (extend with hole), shrink to 0, and shrink to non-zero.
+ * Shrinking supports depth-0 (inline) and depth-1 extent trees.
+ *
+ * Returns 0 if the length is unchanged or on success; EINVAL for a
+ * negative length; EIO if the extent root is invalid; EOPNOTSUPP when
+ * asked to shrink a tree deeper than one level (its second-level blocks
+ * are index nodes, and walking them as leaves would free blocks that
+ * do not belong to the file); or an error from bread() or from writing
+ * the inode.
+ *
+ * NOTE(review): the flags and cred arguments are not used here.
+ */
+int
+ext4fs_truncate(struct inode *ip, off_t length, int flags, struct ucred *cred)
+{
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	struct ext4fs_extent_header *eh = &din->i_extent_header;
+	struct vnode *vp = ITOV(ip);
+	off_t cursize;
+	u_int16_t entries, depth, leaf_cap;
+	u_int64_t blocks_freed;
+
+	cursize = (off_t)letoh32(din->i_size_lo) |
+	    ((off_t)letoh32(din->i_size_hi) << 32);
+
+	if (length == cursize)
+		return (0);
+
+	if (length < 0)
+		return (EINVAL);
+
+	if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC)
+		return (EIO);
+
+	depth = letoh16(eh->eh_depth);
+	entries = letoh16(eh->eh_entries);
+
+	if (length > cursize) {
+		/* Grow: just update size.  Gap becomes hole. */
+		ext4fs_setsize(ip, length);
+		ip->i_flag |= IN_CHANGE | IN_UPDATE;
+		uvm_vnp_setsize(vp, length);
+		return (ext4fs_update(ip, 1));
+	}
+
+	/*
+	 * Shrink.  Refuse anything this code cannot walk safely before
+	 * touching the tree: more than one index level, or a root entry
+	 * count larger than the 4-slot inline area.
+	 */
+	if (depth > 1)
+		return (EOPNOTSUPP);
+	if (entries > 4)
+		return (EIO);
+
+	/* Most extents that physically fit in one leaf block */
+	leaf_cap = (fs->m_block_size - sizeof(struct ext4fs_extent_header)) /
+	    sizeof(struct ext4fs_extent);
+
+	blocks_freed = 0;
+
+	if (length == 0) {
+		/* Truncate to 0: free everything */
+		if (depth == 0) {
+			ext4fs_free_extents(ip, din->i_extent, entries);
+		} else {
+			struct ext4fs_extent_idx *idx = din->i_extent_idx;
+			int i;
+
+			for (i = 0; i < entries; i++) {
+				u_int64_t leaf_blk;
+				struct ext4fs_extent_header *leaf_eh;
+				struct ext4fs_extent *leaf_ext;
+				struct buf *bp;
+				u_int16_t leaf_entries;
+				int error;
+
+				leaf_blk = letoh32(idx[i].ei_leaf_lo) |
+				    ((u_int64_t)letoh16(
+				    idx[i].ei_leaf_hi) << 32);
+
+				error = bread(ip->i_devvp,
+				    (daddr_t)EXT4FS_FSBTODB(fs, leaf_blk),
+				    fs->m_block_size, &bp);
+				if (error) {
+					brelse(bp);
+					continue;
+				}
+
+				/*
+				 * Skip anything that is not a sane leaf
+				 * rather than freeing blocks based on
+				 * its contents.
+				 */
+				leaf_eh = (struct ext4fs_extent_header *)
+				    bp->b_data;
+				leaf_entries = letoh16(leaf_eh->eh_entries);
+				if (letoh16(leaf_eh->eh_magic) !=
+				    EXT4FS_EXTENT_HEADER_MAGIC ||
+				    letoh16(leaf_eh->eh_depth) != 0 ||
+				    leaf_entries > leaf_cap) {
+					brelse(bp);
+					continue;
+				}
+
+				leaf_ext = (struct ext4fs_extent *)
+				    (leaf_eh + 1);
+
+				ext4fs_free_extents(ip, leaf_ext,
+				    leaf_entries);
+				brelse(bp);
+
+				/* The leaf block itself goes too */
+				ext4fs_blkfree(ip, leaf_blk);
+			}
+		}
+
+		/* Reset inode root to depth 0, 0 entries */
+		memset(din->i_extent, 0,
+		    4 * sizeof(struct ext4fs_extent));
+		eh->eh_entries = htole16(0);
+		eh->eh_depth = htole16(0);
+
+		/* Zero block count */
+		din->i_blocks_lo = htole32(0);
+		din->i_blocks_hi = htole16(0);
+	} else {
+		/* Truncate to non-zero length */
+		u_int32_t new_nblocks;
+		u_int64_t i_blocks, freed_512;
+
+		/* Number of blocks needed to hold `length' bytes */
+		new_nblocks = (length + fs->m_block_size - 1) /
+		    fs->m_block_size;
+
+		/* Zero out partial block tail */
+		if (length % fs->m_block_size != 0) {
+			u_int32_t offset = length % fs->m_block_size;
+			u_int64_t pblk;
+
+			if (ext4fs_extent_pblk(ip, new_nblocks - 1,
+			    &pblk, NULL) == 0 && pblk != 0) {
+				struct buf *bp;
+				int error;
+
+				error = bread(ip->i_devvp,
+				    (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+				    fs->m_block_size, &bp);
+				if (error) {
+					brelse(bp);
+					return (error);
+				}
+				memset((char *)bp->b_data + offset, 0,
+				    fs->m_block_size - offset);
+				bdwrite(bp);
+			}
+		}
+
+		/* Free/trim extents past new_nblocks */
+		if (depth == 0) {
+			blocks_freed = ext4fs_trim_extents(ip,
+			    din->i_extent, &entries, new_nblocks);
+			eh->eh_entries = htole16(entries);
+		} else {
+			struct ext4fs_extent_idx *idx = din->i_extent_idx;
+			u_int16_t new_idx_count = 0;
+			int i;
+
+			for (i = 0; i < entries; i++) {
+				u_int64_t leaf_blk;
+				struct ext4fs_extent_header *leaf_eh;
+				struct ext4fs_extent *leaf_ext;
+				struct buf *bp;
+				u_int16_t leaf_entries;
+				int error, keep_idx;
+
+				/*
+				 * An index entry is only dropped once its
+				 * leaf has been emptied and freed.  A leaf
+				 * that cannot be read or validated keeps
+				 * its entry, so blocks inside the new size
+				 * do not lose their mapping.
+				 */
+				keep_idx = 1;
+
+				leaf_blk = letoh32(idx[i].ei_leaf_lo) |
+				    ((u_int64_t)letoh16(
+				    idx[i].ei_leaf_hi) << 32);
+
+				error = bread(ip->i_devvp,
+				    (daddr_t)EXT4FS_FSBTODB(fs, leaf_blk),
+				    fs->m_block_size, &bp);
+				if (error) {
+					brelse(bp);
+				} else {
+					leaf_eh =
+					    (struct ext4fs_extent_header *)
+					    bp->b_data;
+					leaf_entries =
+					    letoh16(leaf_eh->eh_entries);
+					leaf_ext = (struct ext4fs_extent *)
+					    (leaf_eh + 1);
+
+					if (letoh16(leaf_eh->eh_magic) !=
+					    EXT4FS_EXTENT_HEADER_MAGIC ||
+					    letoh16(leaf_eh->eh_depth) != 0 ||
+					    leaf_entries > leaf_cap) {
+						brelse(bp);
+					} else {
+						blocks_freed +=
+						    ext4fs_trim_extents(ip,
+						    leaf_ext, &leaf_entries,
+						    new_nblocks);
+
+						if (leaf_entries == 0) {
+							/*
+							 * Leaf now empty:
+							 * free it and count
+							 * it, since it was
+							 * charged to
+							 * i_blocks when it
+							 * was created.
+							 */
+							brelse(bp);
+							ext4fs_blkfree(ip,
+							    leaf_blk);
+							blocks_freed++;
+							keep_idx = 0;
+						} else {
+							leaf_eh->eh_entries =
+							    htole16(
+							    leaf_entries);
+							ext4fs_extent_block_csum_set(
+							    fs, ip->i_number,
+							    din->i_nfs_generation,
+							    bp->b_data);
+							bdwrite(bp);
+						}
+					}
+				}
+
+				if (keep_idx) {
+					if (new_idx_count != i)
+						idx[new_idx_count] = idx[i];
+					new_idx_count++;
+				}
+			}
+
+			entries = new_idx_count;
+			eh->eh_entries = htole16(entries);
+
+			/* If all index entries gone, collapse to depth 0 */
+			if (entries == 0) {
+				memset(din->i_extent, 0,
+				    4 * sizeof(struct ext4fs_extent));
+				eh->eh_depth = htole16(0);
+			}
+		}
+
+		/* Update i_blocks (counted in DEV_BSIZE units) */
+		i_blocks = letoh32(din->i_blocks_lo);
+		if (fs->m_feature_ro_compat &
+		    EXT4FS_FEATURE_RO_COMPAT_HUGE_FILE)
+			i_blocks |=
+			    (u_int64_t)letoh16(din->i_blocks_hi) << 32;
+		freed_512 = blocks_freed *
+		    (fs->m_block_size / DEV_BSIZE);
+		if (i_blocks >= freed_512)
+			i_blocks -= freed_512;
+		else
+			i_blocks = 0;
+		din->i_blocks_lo = htole32((u_int32_t)i_blocks);
+		din->i_blocks_hi = htole16((u_int16_t)(i_blocks >> 32));
+	}
+
+	/* Update size */
+	ext4fs_setsize(ip, length);
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+
+	/*
+	 * Purge cached data.
+	 * NOTE(review): vinvalbuf() is called with flags 0 even for a
+	 * shrink to a non-zero length - confirm that no dirty buffers
+	 * for the retained range can be attached to this vnode.
+	 */
+	uvm_vnp_setsize(vp, length);
+	vinvalbuf(vp, 0, NOCRED, curproc, 0, INFSLP);
+
+	return (ext4fs_update(ip, 1));
+}
+
+/*
+ * Forward declarations for the vnode operations defined below.
+ *
+ * The handlers taking a single void * receive a pointer to the argument
+ * structure of the corresponding operation (each one casts it to its own
+ * struct vop_*_args).  ext4fs_chmod() and ext4fs_chown() are helpers
+ * called from ext4fs_setattr().
+ */
+
+int ext4fs_access(void *);
+int ext4fs_advlock(void *);
+int ext4fs_bmap(void *);
+int ext4fs_chmod(struct vnode *, mode_t, struct ucred *);
+int ext4fs_chown(struct vnode *, uid_t, gid_t, struct ucred *);
+int ext4fs_create(void *);
+int ext4fs_fsync(void *);
+int ext4fs_getattr(void *);
+int ext4fs_inactive(void *);
+int ext4fs_link(void *);
+int ext4fs_lookup(void *);
+int ext4fs_mkdir(void *);
+int ext4fs_mknod(void *);
+int ext4fs_open(void *);
+int ext4fs_pathconf(void *);
+int ext4fs_print(void *);
+int ext4fs_read(void *);
+int ext4fs_readdir(void *);
+int ext4fs_readlink(void *);
+int ext4fs_reclaim(void *);
+int ext4fs_remove(void *);
+int ext4fs_rename(void *);
+int ext4fs_rmdir(void *);
+int ext4fs_setattr(void *);
+int ext4fs_strategy(void *);
+int ext4fs_symlink(void *);
+int ext4fs_write(void *);
+
+/*
+ * Vnode operations vector for ext4fs.
+ *
+ * Locking, close, ioctl and kqfilter are shared with UFS.  Operations
+ * that need no filesystem-specific behaviour use the vop_generic_*()
+ * handlers instead of NULL: the VOP_*() wrappers return EOPNOTSUPP for
+ * a NULL slot, and for VOP_ABORTOP() (called from ext4fs_rename()) that
+ * would mean the pathname buffer saved by lookup is never released.
+ */
+const struct vops ext4fs_vops = {
+	.vop_lookup	= ext4fs_lookup,
+	.vop_create	= ext4fs_create,
+	.vop_mknod	= ext4fs_mknod,
+	.vop_open	= ext4fs_open,
+	.vop_close	= ufs_close,
+	.vop_access	= ext4fs_access,
+	.vop_getattr	= ext4fs_getattr,
+	.vop_setattr	= ext4fs_setattr,
+	.vop_read	= ext4fs_read,
+	.vop_write	= ext4fs_write,
+	.vop_ioctl	= ufs_ioctl,
+	.vop_kqfilter	= ufs_kqfilter,
+	.vop_revoke	= vop_generic_revoke,
+	.vop_fsync	= ext4fs_fsync,
+	.vop_remove	= ext4fs_remove,
+	.vop_link	= ext4fs_link,
+	.vop_rename	= ext4fs_rename,
+	.vop_mkdir	= ext4fs_mkdir,
+	.vop_rmdir	= ext4fs_rmdir,
+	.vop_symlink	= ext4fs_symlink,
+	.vop_readdir	= ext4fs_readdir,
+	.vop_readlink	= ext4fs_readlink,
+	.vop_abortop	= vop_generic_abortop,
+	.vop_inactive	= ext4fs_inactive,
+	.vop_reclaim	= ext4fs_reclaim,
+	.vop_lock	= ufs_lock,
+	.vop_unlock	= ufs_unlock,
+	/* Pairs with ufs_lock/ufs_unlock above */
+	.vop_islocked	= ufs_islocked,
+	.vop_bmap	= ext4fs_bmap,
+	.vop_strategy	= ext4fs_strategy,
+	.vop_print	= ext4fs_print,
+	.vop_pathconf	= ext4fs_pathconf,
+	.vop_advlock	= ext4fs_advlock,
+	.vop_bwrite	= vop_generic_bwrite,
+};
+
+/* Vnode operation implementations */
+
+/*
+ * Look up the pathname component described by ap->a_cnp in directory
+ * ap->a_dvp by scanning the directory's blocks linearly.
+ *
+ * On a hit, *ap->a_vpp is set to the vnode found and 0 is returned; the
+ * directory inode records i_ino, i_reclen, i_offset and i_count for a
+ * following remove or rewrite.  For the last component of a CREATE or
+ * RENAME that does not exist, the position of a usable slot is saved in
+ * i_offset/i_count and EJUSTRETURN is returned.  Otherwise the result is
+ * ENOENT or an errno from the access checks, block mapping or I/O.
+ *
+ * Directory records come straight from disk, so each record length is
+ * checked against the space left in the block before it is used, and a
+ * name is only compared if it lies inside its record.  A malformed
+ * record yields EIO.
+ */
+int
+ext4fs_lookup(void *v)
+{
+	struct vop_lookup_args *ap = v;
+	struct vnode *vdp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
+	struct componentname *cnp = ap->a_cnp;
+	struct inode *dp = VTOI(vdp);
+	struct m_ext4fs *fs = dp->i_e4fs;
+	struct ext4fs_dinode *din = &dp->i_e4din->dinode;
+	struct ext4fs_directory *ep;
+	struct vnode *tdp;
+	struct buf *bp;
+	int flags = cnp->cn_flags;
+	int nameiop = cnp->cn_nameiop;
+	int lockparent = flags & LOCKPARENT;
+	ino_t foundino = 0;
+	off_t off, filesz;
+	u_int64_t lbn, pblk, blkoff;
+	u_int16_t reclen;
+	int error;
+	/* Size of the fixed fields this function reads ahead of the name */
+	const size_t dirhdrsz = sizeof(ep->e4d_ino) + sizeof(ep->e4d_reclen) +
+	    sizeof(ep->e4d_namlen) + sizeof(ep->e4d_type);
+
+	/* For CREATE: track free slot info */
+	int slotneeded = 0;
+	int slotsize = 0;
+	off_t slotoffset = -1;
+	off_t prevoff = -1;
+
+	*vpp = NULL;
+
+	/* Check accessibility of directory */
+	if ((error = VOP_ACCESS(vdp, VEXEC, cnp->cn_cred, cnp->cn_proc)) != 0) {
+		return (error);
+	}
+
+	if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
+	    (nameiop == DELETE || nameiop == RENAME))
+		return (EROFS);
+
+	/* Check the name cache */
+	if ((error = cache_lookup(vdp, vpp, cnp)) >= 0)
+		return (error);
+
+	/* Search directory for the name */
+	filesz = (off_t)letoh32(din->i_size_lo) |
+	    ((off_t)letoh32(din->i_size_hi) << 32);
+
+	if (nameiop == CREATE || nameiop == RENAME)
+		slotneeded = EXT4FS_DIRSIZ(cnp->cn_namelen);
+
+	for (off = 0; off < filesz; ) {
+		lbn = EXT4FS_LBLKNO(fs, off);
+
+		error = ext4fs_extent_pblk(dp, lbn, &pblk, NULL);
+		if (error || pblk == 0) {
+			return (error ? error : EIO);
+		}
+
+		error = bread(dp->i_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+		    fs->m_block_size, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+		blkoff = EXT4FS_BLKOFF(fs, off);
+		prevoff = -1;
+
+		while (blkoff < fs->m_block_size && off < filesz) {
+			/* The record header itself must fit in the block */
+			if (fs->m_block_size - blkoff < dirhdrsz) {
+				brelse(bp);
+				return (EIO);
+			}
+
+			ep = (struct ext4fs_directory *)
+			    ((char *)bp->b_data + blkoff);
+			reclen = letoh16(ep->e4d_reclen);
+
+			/*
+			 * Reject records that are too short to hold a
+			 * header (this includes zero, which would loop
+			 * forever) or that run past the end of the block.
+			 */
+			if (reclen < dirhdrsz ||
+			    reclen > fs->m_block_size - blkoff) {
+				brelse(bp);
+				return (EIO);
+			}
+
+			/* Skip directory checksum tail entry */
+			if (letoh32(ep->e4d_ino) == 0 &&
+			    ep->e4d_namlen == 0 &&
+			    ep->e4d_type == EXT4FS_DIR_TAIL_FT &&
+			    reclen == EXT4FS_DIR_TAIL_SIZE) {
+				off += reclen;
+				blkoff += reclen;
+				continue;
+			}
+
+			/* Track free space for CREATE/RENAME */
+			if ((nameiop == CREATE || nameiop == RENAME) &&
+			    slotoffset == -1) {
+				int freespace;
+
+				if (letoh32(ep->e4d_ino) == 0) {
+					freespace = reclen;
+				} else {
+					freespace = reclen -
+					    EXT4FS_DIRSIZ(ep->e4d_namlen);
+				}
+				if (freespace >= slotneeded) {
+					slotoffset = off;
+					slotsize = reclen;
+				}
+			}
+
+			/*
+			 * Only compare names that lie entirely inside their
+			 * record, so memcmp() never leaves the buffer.
+			 */
+			if (letoh32(ep->e4d_ino) != 0 &&
+			    ep->e4d_namlen == cnp->cn_namelen &&
+			    dirhdrsz + ep->e4d_namlen <= reclen &&
+			    memcmp(cnp->cn_nameptr, ep->e4d_name,
+			    cnp->cn_namelen) == 0) {
+				foundino = letoh32(ep->e4d_ino);
+				dp->i_ino = foundino;
+				dp->i_reclen = reclen;
+				dp->i_offset = off;
+				/* For DELETE: count = prev entry to this */
+				if (nameiop == DELETE && prevoff != -1)
+					dp->i_count = off - prevoff;
+				else
+					dp->i_count = 0;
+				brelse(bp);
+				goto found;
+			}
+
+			prevoff = off;
+			off += reclen;
+			blkoff += reclen;
+		}
+
+		brelse(bp);
+	}
+
+	/* Not found */
+	if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) {
+		if (vdp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		if ((error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred,
+		    cnp->cn_proc)) != 0)
+			return (error);
+		/* Save free slot info for direnter */
+		if (slotoffset == -1) {
+			dp->i_offset = filesz;
+			dp->i_count = 0;
+		} else {
+			dp->i_offset = slotoffset;
+			dp->i_count = slotsize;
+		}
+		cnp->cn_flags |= SAVENAME;
+		if (!lockparent) {
+			VOP_UNLOCK(vdp);
+			cnp->cn_flags |= PDIRUNLOCK;
+		}
+		return (EJUSTRETURN);
+	}
+
+	/* *vpp is still NULL here, so this enters a negative cache entry */
+	if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
+		cache_enter(vdp, *vpp, cnp);
+	return (ENOENT);
+
+found:
+	if ((flags & ISLASTCN) && nameiop == LOOKUP)
+		dp->i_diroff = EXT4FS_LBLKNO(fs, dp->i_offset) *
+		    fs->m_block_size;
+
+	/*
+	 * If deleting, and at end of pathname, return parameters
+	 * which can be used to remove file.  If the wantparent flag
+	 * isn't set, we return only the directory (in ndp->ni_dvp),
+	 * otherwise we go on and lock the inode, being careful with ".".
+	 */
+	if (nameiop == DELETE && (flags & ISLASTCN)) {
+		if ((error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred,
+		    cnp->cn_proc)) != 0)
+			return (error);
+		if (dp->i_number == foundino) {
+			vref(vdp);
+			*vpp = vdp;
+			return (0);
+		}
+		if ((error = VFS_VGET(vdp->v_mount, foundino, &tdp)) != 0)
+			return (error);
+		*vpp = tdp;
+		if (!lockparent) {
+			VOP_UNLOCK(vdp);
+			cnp->cn_flags |= PDIRUNLOCK;
+		}
+		return (0);
+	}
+
+	/*
+	 * If rewriting (RENAME), return the inode and the
+	 * information required to rewrite the present directory.
+	 * Must get inode of directory entry to verify it's a
+	 * regular file, or empty directory.
+	 */
+	if (nameiop == RENAME && (flags & ISLASTCN)) {
+		if ((error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred,
+		    cnp->cn_proc)) != 0)
+			return (error);
+		if (dp->i_number == foundino)
+			return (EISDIR);
+		if ((error = VFS_VGET(vdp->v_mount, foundino, &tdp)) != 0)
+			return (error);
+		*vpp = tdp;
+		cnp->cn_flags |= SAVENAME;
+		if (!lockparent) {
+			VOP_UNLOCK(vdp);
+			cnp->cn_flags |= PDIRUNLOCK;
+		}
+		return (0);
+	}
+
+	if (flags & ISDOTDOT) {
+		/* ".." - unlock parent, get child, optionally relock */
+		VOP_UNLOCK(vdp);
+		cnp->cn_flags |= PDIRUNLOCK;
+		error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+		if (error) {
+			if (vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY) == 0)
+				cnp->cn_flags &= ~PDIRUNLOCK;
+			return (error);
+		}
+		if (lockparent && (flags & ISLASTCN)) {
+			if ((error = vn_lock(vdp, LK_EXCLUSIVE)) != 0) {
+				vput(tdp);
+				return (error);
+			}
+			cnp->cn_flags &= ~PDIRUNLOCK;
+		}
+		*vpp = tdp;
+	} else if (dp->i_number == foundino) {
+		/* "." - return same vnode */
+		vref(vdp);
+		*vpp = vdp;
+	} else {
+		if ((error = VFS_VGET(vdp->v_mount, foundino, &tdp)) != 0)
+			return (error);
+		if (!lockparent || !(flags & ISLASTCN)) {
+			VOP_UNLOCK(vdp);
+			cnp->cn_flags |= PDIRUNLOCK;
+		}
+		*vpp = tdp;
+	}
+
+	if (cnp->cn_flags & MAKEENTRY)
+		cache_enter(vdp, *vpp, cnp);
+	return (0);
+}
+
+/*
+ * Shared back end of create and mknod: allocate an inode of the given
+ * mode, initialise ownership, mode and link count, write it out and then
+ * enter it in directory dvp under the name in cnp.
+ *
+ * On success the new vnode is returned through *vpp.  On failure *vpp is
+ * NULL, the pathname buffer has been released and, if an inode had
+ * already been allocated, it is left with a zero link count and vput.
+ */
+static int
+ext4fs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
+    struct componentname *cnp)
+{
+	struct inode *parent = VTOI(dvp);
+	struct inode *ip;
+	struct ext4fs_dinode *din;
+	struct vnode *newvp;
+	uid_t owner;
+	gid_t group;
+	int error;
+
+	*vpp = NULL;
+
+	/* A mode without a file type means a regular file */
+	if ((mode & S_IFMT) == 0)
+		mode |= S_IFREG;
+
+	error = ext4fs_inode_alloc(parent, mode, cnp->cn_cred, &newvp);
+	if (error) {
+		pool_put(&namei_pool, cnp->cn_pnbuf);
+		return (error);
+	}
+	ip = VTOI(newvp);
+	din = &ip->i_e4din->dinode;
+
+	/* Owner comes from the caller's credentials */
+	owner = cnp->cn_cred->cr_uid;
+	din->i_uid_lo = htole16(owner & 0xFFFF);
+	din->i_uid_hi = htole16((owner >> 16) & 0xFFFF);
+
+	/* Group is inherited from the parent directory */
+	group = letoh16(parent->i_e4din->dinode.i_gid_lo) |
+	    ((gid_t)letoh16(parent->i_e4din->dinode.i_gid_hi) << 16);
+	din->i_gid_lo = htole16(group & 0xFFFF);
+	din->i_gid_hi = htole16((group >> 16) & 0xFFFF);
+
+	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+	din->i_mode = htole16(mode);
+	newvp->v_type = IFTOVT(mode);
+	ip->i_effnlink = 1;
+	din->i_links_count = htole16(1);
+
+	/*
+	 * Drop the set-group-id bit unless the caller belongs to the
+	 * inherited group or is the superuser.
+	 */
+	if ((mode & ISGID) && !groupmember(group, cnp->cn_cred) &&
+	    suser_ucred(cnp->cn_cred))
+		din->i_mode = htole16(letoh16(din->i_mode) & ~ISGID);
+
+	/* The inode must reach the disk before the entry that names it */
+	error = ext4fs_update(ip, 1);
+	if (error == 0)
+		error = ext4fs_direnter(ip, dvp, cnp);
+	if (error == 0) {
+		if ((cnp->cn_flags & SAVESTART) == 0)
+			pool_put(&namei_pool, cnp->cn_pnbuf);
+		*vpp = newvp;
+		return (0);
+	}
+
+	/* Failure: give the inode a zero link count and release it */
+	pool_put(&namei_pool, cnp->cn_pnbuf);
+	ip->i_effnlink = 0;
+	din->i_links_count = htole16(0);
+	ip->i_flag |= IN_CHANGE;
+	newvp->v_type = VNON;
+	vput(newvp);
+	return (error);
+}
+
+/*
+ * Create vnode operation: build the on-disk mode from the requested
+ * vnode type and permission bits, then hand off to ext4fs_makeinode().
+ */
+int
+ext4fs_create(void *v)
+{
+	struct vop_create_args *ap = v;
+	struct vattr *vap = ap->a_vap;
+	int mode;
+
+	mode = MAKEIMODE(vap->va_type, vap->va_mode);
+	return (ext4fs_makeinode(mode, ap->a_dvp, ap->a_vpp, ap->a_cnp));
+}
+
+/*
+ * Mknod vnode operation: create the inode via ext4fs_makeinode(), then
+ * record the device number (when one was supplied) and write the inode.
+ * The new vnode is returned through *ap->a_vpp.
+ *
+ * NOTE(review): the result of the final ext4fs_update() is discarded.
+ * NOTE(review): the same raw va_rdev value is stored in both i_block[0]
+ * and i_block[1]; confirm this matches the encoding other ext4
+ * implementations expect.
+ */
+int
+ext4fs_mknod(void *v)
+{
+	struct vop_mknod_args *ap = v;
+	struct vattr *vap = ap->a_vap;
+	struct vnode *newvp;
+	struct inode *ip;
+	struct ext4fs_dinode *din;
+	int error;
+
+	error = ext4fs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
+	    ap->a_dvp, &newvp, ap->a_cnp);
+	if (error != 0)
+		return (error);
+
+	ip = VTOI(newvp);
+	din = &ip->i_e4din->dinode;
+
+	if (vap->va_rdev != VNOVAL) {
+		/* Old format lives in i_block[0], new format in i_block[1] */
+		din->i_block[0] = htole32(vap->va_rdev);
+		din->i_block[1] = htole32(vap->va_rdev);
+	}
+
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	ext4fs_update(ip, 1);
+
+	*ap->a_vpp = newvp;
+	return (0);
+}
+
+/*
+ * Open vnode operation.  Only the inode flags are examined:
+ *  - an immutable file may not be opened for writing or truncation;
+ *  - an append-only file may be opened for writing or truncation only
+ *    when O_APPEND is also requested.
+ * Either violation returns EPERM; everything else succeeds.
+ */
+int
+ext4fs_open(void *v)
+{
+	struct vop_open_args *ap = v;
+	struct inode *ip = VTOI(ap->a_vp);
+	u_int32_t iflags = letoh32(ip->i_e4din->dinode.i_flags);
+	int omode = ap->a_mode;
+
+	/* Neither restriction applies unless the open modifies the file */
+	if ((omode & (FWRITE | O_TRUNC)) == 0)
+		return (0);
+
+	if (iflags & EXTFS_INODE_FLAG_IMMUTABLE)
+		return (EPERM);
+	if ((iflags & EXTFS_INODE_FLAG_APPEND) && !(omode & O_APPEND))
+		return (EPERM);
+
+	return (0);
+}
+
+/*
+ * Access vnode operation.  Write access to an immutable inode is refused
+ * with EPERM; otherwise the decision is delegated to vaccess() using the
+ * mode, owner and group stored in the on-disk inode.
+ */
+int
+ext4fs_access(void *v)
+{
+	struct vop_access_args *ap = v;
+	struct vnode *vp = ap->a_vp;
+	struct ext4fs_dinode *din = &VTOI(vp)->i_e4din->dinode;
+	uid_t owner;
+	gid_t group;
+
+	if ((ap->a_mode & VWRITE) != 0 &&
+	    (letoh32(din->i_flags) & EXTFS_INODE_FLAG_IMMUTABLE) != 0)
+		return (EPERM);
+
+	/* Owner and group ids are each split over two 16-bit fields */
+	owner = letoh16(din->i_uid_lo) |
+	    ((uid_t)letoh16(din->i_uid_hi) << 16);
+	group = letoh16(din->i_gid_lo) |
+	    ((gid_t)letoh16(din->i_gid_hi) << 16);
+
+	return (vaccess(vp->v_type, (mode_t)letoh16(din->i_mode), owner,
+	    group, ap->a_mode, ap->a_cred));
+}
+
+/*
+ * Getattr vnode operation: fill *ap->a_vap from the in-core copy of the
+ * on-disk inode.  Always returns 0.
+ *
+ * The append-only and immutable inode flags are reported as SF_APPEND
+ * and SF_IMMUTABLE, mirroring what ext4fs_setattr() accepts.  For block
+ * and character devices the device number is read back from i_block[],
+ * where ext4fs_mknod() stores it.
+ */
+int
+ext4fs_getattr(void *v)
+{
+	struct vop_getattr_args *ap = v;
+	struct vnode *vp = ap->a_vp;
+	struct inode *ip = VTOI(vp);
+	struct ext4fs_dinode_256 *din = ip->i_e4din;
+	struct vattr *vap = ap->a_vap;
+	u_int32_t iflags;
+
+	/* Copy from inode table */
+	vap->va_fsid = ip->i_dev;
+	vap->va_fileid = ip->i_number;
+	vap->va_mode = letoh16(din->dinode.i_mode) & ALLPERMS;
+	vap->va_nlink = letoh16(din->dinode.i_links_count);
+	vap->va_uid = letoh16(din->dinode.i_uid_lo);
+	vap->va_uid |= (uid_t)letoh16(din->dinode.i_uid_hi) << 16;
+	vap->va_gid = letoh16(din->dinode.i_gid_lo);
+	vap->va_gid |= (gid_t)letoh16(din->dinode.i_gid_hi) << 16;
+	vap->va_size = letoh32(din->dinode.i_size_lo);
+	vap->va_size |= (off_t)letoh32(din->dinode.i_size_hi) << 32;
+
+	/*
+	 * Device nodes: prefer i_block[0] and fall back to i_block[1]
+	 * when the former is zero.
+	 * NOTE(review): the value is used as stored; inodes created by
+	 * another ext4 implementation may use a different encoding.
+	 */
+	vap->va_rdev = 0;
+	if (vp->v_type == VBLK || vp->v_type == VCHR) {
+		u_int32_t rdev = letoh32(din->dinode.i_block[0]);
+
+		if (rdev == 0)
+			rdev = letoh32(din->dinode.i_block[1]);
+		vap->va_rdev = (dev_t)rdev;
+	}
+
+	/* Nanoseconds live in the upper 30 bits of the *_extra fields */
+	vap->va_atime.tv_sec = letoh32(din->dinode.i_atime);
+	vap->va_atime.tv_nsec = letoh32(din->dinode.i_atime_extra) >> 2;
+	vap->va_mtime.tv_sec = letoh32(din->dinode.i_mtime);
+	vap->va_mtime.tv_nsec = letoh32(din->dinode.i_mtime_extra) >> 2;
+	vap->va_ctime.tv_sec = letoh32(din->dinode.i_ctime);
+	vap->va_ctime.tv_nsec = letoh32(din->dinode.i_ctime_extra) >> 2;
+
+	/* Report the flags that ext4fs_setattr() is able to set */
+	iflags = letoh32(din->dinode.i_flags);
+	vap->va_flags = 0;
+	if (iflags & EXTFS_INODE_FLAG_APPEND)
+		vap->va_flags |= SF_APPEND;
+	if (iflags & EXTFS_INODE_FLAG_IMMUTABLE)
+		vap->va_flags |= SF_IMMUTABLE;
+	vap->va_gen = letoh32(din->dinode.i_nfs_generation);
+
+	/* Set appropriate block size */
+	if (vp->v_type == VBLK)
+		vap->va_blocksize = BLKDEV_IOSIZE;
+	else if (vp->v_type == VCHR)
+		vap->va_blocksize = MAXBSIZE;
+	else
+		vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+
+	/* i_blocks is multiplied by DEV_BSIZE to obtain bytes */
+	vap->va_bytes = letoh32(din->dinode.i_blocks_lo);
+	vap->va_bytes |= (off_t)letoh16(din->dinode.i_blocks_hi) << 32;
+	vap->va_bytes *= DEV_BSIZE;
+	vap->va_type = vp->v_type;
+	vap->va_filerev = 0;
+
+	return (0);
+}
+
+/*
+ * Change the permission bits of vp to `mode' on behalf of `cred'.
+ *
+ * Only the owner or the superuser may do so.  A non-root caller may not
+ * set the sticky bit on a non-directory (EFTYPE) nor the set-group-id
+ * bit for a group it does not belong to (EPERM).  On success the
+ * ALLPERMS bits of the inode mode are replaced and IN_CHANGE is set.
+ */
+int
+ext4fs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred)
+{
+	struct inode *ip = VTOI(vp);
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	uid_t owner;
+	gid_t group;
+	u_int16_t newmode;
+	int error;
+
+	owner = letoh16(din->i_uid_lo) |
+	    ((uid_t)letoh16(din->i_uid_hi) << 16);
+	group = letoh16(din->i_gid_lo) |
+	    ((gid_t)letoh16(din->i_gid_hi) << 16);
+
+	if (cred->cr_uid != owner) {
+		error = suser_ucred(cred);
+		if (error)
+			return (error);
+	}
+	if (cred->cr_uid != 0) {
+		if ((mode & S_ISTXT) && vp->v_type != VDIR)
+			return (EFTYPE);
+		if ((mode & ISGID) && !groupmember(group, cred))
+			return (EPERM);
+	}
+
+	/* Keep the file-type bits, replace the permission bits */
+	newmode = letoh16(din->i_mode);
+	newmode = (newmode & ~ALLPERMS) | (mode & ALLPERMS);
+	din->i_mode = htole16(newmode);
+	ip->i_flag |= IN_CHANGE;
+
+	if ((vp->v_flag & VTEXT) && (newmode & S_ISTXT) == 0)
+		(void)uvm_vnp_uncache(vp);
+
+	return (0);
+}
+
+/*
+ * Change the owner and/or group of vp on behalf of `cred'.  A value of
+ * VNOVAL for uid or gid leaves that id unchanged.
+ *
+ * Without superuser rights the caller must own the file, must not change
+ * the owner, and may only switch to a group it is a member of.  When a
+ * non-root caller changes the owner or group, the matching set-id bit is
+ * cleared from the mode.
+ */
+int
+ext4fs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred)
+{
+	struct inode *ip = VTOI(vp);
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	uid_t olduid;
+	gid_t oldgid;
+	int error;
+
+	olduid = letoh16(din->i_uid_lo) |
+	    ((uid_t)letoh16(din->i_uid_hi) << 16);
+	oldgid = letoh16(din->i_gid_lo) |
+	    ((gid_t)letoh16(din->i_gid_hi) << 16);
+
+	if (uid == (uid_t)VNOVAL)
+		uid = olduid;
+	if (gid == (gid_t)VNOVAL)
+		gid = oldgid;
+
+	if ((cred->cr_uid != olduid || uid != olduid ||
+	    (gid != oldgid && !groupmember(gid, cred))) &&
+	    (error = suser_ucred(cred)))
+		return (error);
+
+	din->i_uid_lo = htole16(uid & 0xFFFF);
+	din->i_uid_hi = htole16((uid >> 16) & 0xFFFF);
+	din->i_gid_lo = htole16(gid & 0xFFFF);
+	din->i_gid_hi = htole16((gid >> 16) & 0xFFFF);
+
+	if (olduid != uid || oldgid != gid)
+		ip->i_flag |= IN_CHANGE;
+
+	/* Non-root callers lose set-id bits for whichever id changed */
+	if (cred->cr_uid != 0) {
+		u_int16_t mode = letoh16(din->i_mode);
+
+		if (olduid != uid)
+			mode &= ~S_ISUID;
+		if (oldgid != gid)
+			mode &= ~S_ISGID;
+		din->i_mode = htole16(mode);
+	}
+
+	return (0);
+}
+
+/*
+ * Setattr vnode operation: apply the attributes in ap->a_vap that are
+ * not VNOVAL (flags, owner/group, size, timestamps, mode), in that order.
+ *
+ * Returns EINVAL if a read-only attribute is specified, EROFS on a
+ * read-only mount, EISDIR for a size change on a directory, and EPERM
+ * when the inode is immutable or append-only.  A request that sets one
+ * of those two flags returns right after storing the flags, so that the
+ * call establishing the restriction is not itself rejected by it.
+ */
+int
+ext4fs_setattr(void *v)
+{
+	struct vop_setattr_args *ap = v;
+	struct vattr *vap = ap->a_vap;
+	struct vnode *vp = ap->a_vp;
+	struct inode *ip = VTOI(vp);
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	struct ucred *cred = ap->a_cred;
+	int error = 0;
+
+	/* These attributes can never be set */
+	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
+	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
+	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
+	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL))
+		return (EINVAL);
+
+	if (vap->va_flags != VNOVAL) {
+		u_int32_t iflags;
+
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		if ((error = suser_ucred(cred)))
+			return (error);
+		/* Only SF_APPEND and SF_IMMUTABLE map onto inode flags */
+		iflags = letoh32(din->i_flags);
+		iflags &= ~(EXTFS_INODE_FLAG_APPEND |
+		    EXTFS_INODE_FLAG_IMMUTABLE);
+		iflags |= (vap->va_flags & SF_APPEND) ?
+		    EXTFS_INODE_FLAG_APPEND : 0;
+		iflags |= (vap->va_flags & SF_IMMUTABLE) ?
+		    EXTFS_INODE_FLAG_IMMUTABLE : 0;
+		din->i_flags = htole32(iflags);
+		ip->i_flag |= IN_CHANGE;
+		if (iflags & (EXTFS_INODE_FLAG_APPEND |
+		    EXTFS_INODE_FLAG_IMMUTABLE))
+			return (0);
+	}
+
+	/* No other attribute of a protected inode may be changed */
+	if (letoh32(din->i_flags) &
+	    (EXTFS_INODE_FLAG_APPEND | EXTFS_INODE_FLAG_IMMUTABLE))
+		return (EPERM);
+
+	if (vap->va_uid != (uid_t)VNOVAL ||
+	    vap->va_gid != (gid_t)VNOVAL) {
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		error = ext4fs_chown(vp, vap->va_uid, vap->va_gid, cred);
+		if (error)
+			return (error);
+	}
+
+	if (vap->va_size != VNOVAL) {
+		switch (vp->v_type) {
+		case VDIR:
+			return (EISDIR);
+		case VLNK:
+		case VREG:
+			if (vp->v_mount->mnt_flag & MNT_RDONLY)
+				return (EROFS);
+			break;
+		default:
+			break;
+		}
+		error = ext4fs_truncate(ip, vap->va_size, 0, cred);
+		if (error)
+			return (error);
+	}
+
+	if ((vap->va_vaflags & VA_UTIMES_CHANGE) ||
+	    vap->va_atime.tv_nsec != VNOVAL ||
+	    vap->va_mtime.tv_nsec != VNOVAL) {
+		uid_t uid;
+
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		uid = letoh16(din->i_uid_lo) |
+		    ((uid_t)letoh16(din->i_uid_hi) << 16);
+		/*
+		 * Owner and superuser may always set times; anyone else
+		 * only with VA_UTIMES_NULL and write access.
+		 */
+		if (cred->cr_uid != uid &&
+		    (error = suser_ucred(cred)) &&
+		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
+		    (error = VOP_ACCESS(vp, VWRITE, cred, ap->a_p))))
+			return (error);
+		if (vap->va_mtime.tv_nsec != VNOVAL)
+			ip->i_flag |= IN_CHANGE | IN_UPDATE;
+		else if (vap->va_vaflags & VA_UTIMES_CHANGE)
+			ip->i_flag |= IN_CHANGE;
+		if (vap->va_atime.tv_nsec != VNOVAL)
+			ip->i_flag |= IN_ACCESS;
+		EXT4FS_ITIMES(ip);
+		/* Explicit times are stored after EXT4FS_ITIMES() has run */
+		if (vap->va_mtime.tv_nsec != VNOVAL) {
+			din->i_mtime =
+			    htole32((u_int32_t)vap->va_mtime.tv_sec);
+			din->i_mtime_extra =
+			    htole32(vap->va_mtime.tv_nsec << 2);
+		}
+		if (vap->va_atime.tv_nsec != VNOVAL) {
+			din->i_atime =
+			    htole32((u_int32_t)vap->va_atime.tv_sec);
+			din->i_atime_extra =
+			    htole32(vap->va_atime.tv_nsec << 2);
+		}
+		ip->i_flag |= IN_MODIFIED;
+		error = ext4fs_update(ip, 1);
+		if (error)
+			return (error);
+	}
+
+	if (vap->va_mode != (mode_t)VNOVAL) {
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		error = ext4fs_chmod(vp, vap->va_mode, cred);
+	}
+
+	return (error);
+}
+
+/*
+ * Read vnode operation: copy data from the file into ap->a_uio one
+ * filesystem block at a time, stopping at end of file.
+ *
+ * Directories are refused with EISDIR and negative offsets with EINVAL.
+ * bread_cluster() is used instead of bread() when the block is not the
+ * last one of the file and either the previous block was the last one
+ * read or more data is wanted than this block supplies.  Unless the
+ * mount has MNT_NOATIME, the inode is flagged IN_ACCESS.
+ */
+int
+ext4fs_read(void *v)
+{
+	struct vop_read_args *ap = v;
+	struct vnode *vp = ap->a_vp;
+	struct inode *ip = VTOI(vp);
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	struct uio *uio = ap->a_uio;
+	struct buf *bp;
+	off_t filesz, bytesinfile;
+	daddr_t lbn, nextlbn;
+	int error, blkoffset, xfersize, size;
+	int clustered;
+
+	if (vp->v_type == VDIR)
+		return (EISDIR);
+	if (uio->uio_offset < 0)
+		return (EINVAL);
+	if (uio->uio_resid == 0)
+		return (0);
+
+	filesz = (off_t)letoh32(din->i_size_lo) |
+	    ((off_t)letoh32(din->i_size_hi) << 32);
+
+	error = 0;
+	bp = NULL;
+	while (uio->uio_resid > 0) {
+		bytesinfile = filesz - uio->uio_offset;
+		if (bytesinfile <= 0)
+			break;
+
+		lbn = EXT4FS_LBLKNO(fs, uio->uio_offset);
+		nextlbn = lbn + 1;
+		blkoffset = EXT4FS_BLKOFF(fs, uio->uio_offset);
+		size = fs->m_block_size;
+
+		/* Limit the copy to the block, the request and the file */
+		xfersize = size - blkoffset;
+		if (uio->uio_resid < xfersize)
+			xfersize = uio->uio_resid;
+		if (bytesinfile < xfersize)
+			xfersize = bytesinfile;
+
+		clustered =
+		    (u_int64_t)nextlbn * fs->m_block_size < filesz &&
+		    (lbn - 1 == ip->i_ci.ci_lastr ||
+		    uio->uio_resid > xfersize);
+		if (clustered)
+			error = bread_cluster(vp, lbn, size, &bp);
+		else
+			error = bread(vp, lbn, size, &bp);
+		if (error)
+			break;
+		ip->i_ci.ci_lastr = lbn;
+
+		/*
+		 * A non-zero b_resid is only expected together with an
+		 * I/O error, which has already ended the loop above.
+		 * Should a short read arrive without an error, never
+		 * copy out more than was actually read.
+		 */
+		size -= bp->b_resid;
+		if (size < xfersize) {
+			if (size == 0)
+				break;
+			xfersize = size;
+		}
+
+		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+		if (error)
+			break;
+		brelse(bp);
+		bp = NULL;
+	}
+	/* A buffer is still held if the loop was left early */
+	if (bp != NULL)
+		brelse(bp);
+
+	if (!(vp->v_mount->mnt_flag & MNT_NOATIME))
+		ip->i_flag |= IN_ACCESS;
+
+	return (error);
+}
+
+/*
+ * Write vnode operation: copy data from ap->a_uio into the file one
+ * filesystem block at a time, allocating blocks as needed.
+ *
+ * Each iteration takes one of three paths to obtain a buffer:
+ *  1. whole-block write at or past EOF to an unmapped block: allocate a
+ *     run of up to 32768 blocks in one go and insert it as an extent;
+ *  2. the block is already mapped: use its physical block number;
+ *  3. anything else: allocate a single block via ext4fs_buf_alloc().
+ * The data is then copied in and the buffer written synchronously
+ * (IO_SYNC), asynchronously (block now full) or delayed.
+ *
+ * Returns EOPNOTSUPP for directories, EINVAL for a negative offset, an
+ * error from vn_fsizechk(), or the first allocation/copy error.
+ *
+ * NOTE(review): paths 1 and 2 obtain buffers on ip->i_devvp while
+ * ext4fs_read() reads through the file vnode - confirm both views of the
+ * same block stay coherent.
+ */
+int
+ext4fs_write(void *v)
+{
+ struct vop_write_args *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct uio *uio = ap->a_uio;
+ struct inode *ip = VTOI(vp);
+ struct m_ext4fs *fs = ip->i_e4fs;
+ struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+ struct buf *bp;
+ off_t filesz;
+ u_int64_t lbn, pblk, ncontig, prealloc_start;
+ u_int32_t prealloc_count, prealloc_got, prealloc_i;
+ u_int64_t i_blocks;
+ int ioflag = ap->a_ioflag;
+ int blkoffset, xfersize;
+ int error;
+ size_t resid;
+ ssize_t overrun;
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ switch (vp->v_type) {
+ case VREG:
+ break;
+ case VLNK:
+ break;
+ case VDIR:
+ return (EOPNOTSUPP);
+ default:
+ panic("ext4fs_write: type");
+ }
+
+ filesz = (off_t)letoh32(din->i_size_lo) |
+ ((off_t)letoh32(din->i_size_hi) << 32);
+
+ /* Appending writes always start at the current end of file */
+ if (ioflag & IO_APPEND)
+ uio->uio_offset = filesz;
+
+ if (uio->uio_offset < 0)
+ return (EINVAL);
+
+ if ((error = vn_fsizechk(vp, uio, ioflag, &overrun)))
+ return (error);
+
+ /* Remember the request size to detect later whether anything was written */
+ resid = uio->uio_resid;
+
+ for (error = 0; uio->uio_resid > 0; ) {
+ lbn = EXT4FS_LBLKNO(fs, uio->uio_offset);
+ blkoffset = EXT4FS_BLKOFF(fs, uio->uio_offset);
+ xfersize = fs->m_block_size - blkoffset;
+ if (uio->uio_resid < xfersize)
+ xfersize = uio->uio_resid;
+
+ /*
+ * For full-block writes past EOF, batch-allocate
+ * contiguous blocks for the remaining write.
+ */
+ if (blkoffset == 0 && xfersize == fs->m_block_size &&
+ uio->uio_offset >= filesz &&
+ (ext4fs_extent_pblk(ip, lbn, &pblk, &ncontig) != 0 ||
+ pblk == 0)) {
+ /* Count full blocks remaining in this write */
+ prealloc_count = uio->uio_resid / fs->m_block_size;
+ if (prealloc_count > 32768)
+ prealloc_count = 32768;
+ if (prealloc_count == 0)
+ prealloc_count = 1;
+ /* Goal: contiguous with last extent */
+ pblk = 0;
+ if (letoh16(din->i_extent_header.eh_entries) > 0) {
+ u_int64_t dummy;
+ u_int64_t nc;
+ /* Use lbn-1 to find last mapped block */
+ if (lbn > 0 && ext4fs_extent_pblk(ip,
+ lbn - 1, &dummy, &nc) == 0 &&
+ dummy != 0)
+ pblk = dummy + 1;
+ }
+ error = ext4fs_blkalloc(ip, pblk, prealloc_count,
+ &prealloc_start, &prealloc_got);
+ if (error)
+ break;
+ error = ext4fs_extent_insert(ip, lbn,
+ prealloc_start, prealloc_got);
+ if (error) {
+ /* Give back every block of the run that could not be mapped */
+ for (prealloc_i = 0;
+ prealloc_i < prealloc_got;
+ prealloc_i++)
+ ext4fs_blkfree(ip,
+ prealloc_start + prealloc_i);
+ break;
+ }
+ /* Account the new blocks in i_blocks (DEV_BSIZE units) */
+ i_blocks = letoh32(din->i_blocks_lo) |
+ ((u_int64_t)letoh16(din->i_blocks_hi) << 32);
+ i_blocks += (u_int64_t)prealloc_got *
+ (fs->m_block_size / DEV_BSIZE);
+ din->i_blocks_lo = htole32((u_int32_t)i_blocks);
+ din->i_blocks_hi =
+ htole16((u_int16_t)(i_blocks >> 32));
+ din->i_flags |=
+ htole32(EXTFS_INODE_FLAG_EXTENTS);
+ ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+ /* Now use the first allocated block */
+ pblk = prealloc_start;
+ } else if (ext4fs_extent_pblk(ip, lbn, &pblk,
+ &ncontig) == 0 && pblk != 0) {
+ /* Already mapped */
+ } else {
+ /* Partial block or not past EOF: single alloc */
+ error = ext4fs_buf_alloc(ip, lbn, fs->m_block_size,
+ ap->a_cred, &bp, B_CLRBUF);
+ if (error)
+ break;
+ goto do_io;
+ }
+
+ /* Full block: getblk without read; partial: bread */
+ if (blkoffset == 0 && xfersize == fs->m_block_size) {
+ bp = getblk(ip->i_devvp,
+ (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+ fs->m_block_size, 0, INFSLP);
+ } else {
+ error = bread(ip->i_devvp,
+ (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+ fs->m_block_size, &bp);
+ if (error) {
+ brelse(bp);
+ break;
+ }
+ }
+do_io:
+ /* Common tail: bp holds the target block on every path */
+ error = uiomove((char *)bp->b_data + blkoffset, xfersize,
+ uio);
+ if (error) {
+ brelse(bp);
+ break;
+ }
+
+ /* Synchronous, asynchronous (block filled to its end) or delayed write */
+ if (ioflag & IO_SYNC)
+ (void)bwrite(bp);
+ else if (xfersize + blkoffset == fs->m_block_size)
+ bawrite(bp);
+ else
+ bdwrite(bp);
+ (void)uvm_vnp_uncache(vp);
+ if (xfersize == 0)
+ break;
+
+ /* Update file size if we wrote past end */
+ if (uio->uio_offset > filesz) {
+ ext4fs_setsize(ip, uio->uio_offset);
+ filesz = uio->uio_offset;
+ uvm_vnp_setsize(vp, filesz);
+ }
+
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ }
+
+ /* Clear setuid/setgid bits on write by non-root */
+ if (resid > uio->uio_resid && ap->a_cred &&
+ ap->a_cred->cr_uid != 0) {
+ u_int16_t mode = letoh16(din->i_mode);
+ mode &= ~(S_ISUID | S_ISGID);
+ din->i_mode = htole16(mode);
+ }
+
+ /* Synchronous writes also push the inode once data was written */
+ if (error == 0 && resid > uio->uio_resid && (ioflag & IO_SYNC))
+ error = ext4fs_update(ip, 1);
+
+ /* Add back the byte count returned by vn_fsizechk() */
+ uio->uio_resid += overrun;
+ return (error);
+}
+
+/*
+ * Fsync vnode operation: flush the vnode's buffers and then write the
+ * inode, both synchronously when ap->a_waitfor is MNT_WAIT.  Nothing is
+ * done on a read-only mount.
+ */
+int
+ext4fs_fsync(void *v)
+{
+	struct vop_fsync_args *ap = v;
+	struct vnode *vp = ap->a_vp;
+	int wait;
+
+	if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
+		return (0);
+
+	wait = (ap->a_waitfor == MNT_WAIT);
+	vflushbuf(vp, wait);
+	return (ext4fs_update(VTOI(vp), wait));
+}
+
+/*
+ * Remove vnode operation: delete the directory entry named by ap->a_cnp
+ * from ap->a_dvp and drop one link from the inode of ap->a_vp.
+ *
+ * Directories, and inodes flagged immutable or append-only, are refused
+ * with EPERM.  The link count is never taken below zero.
+ */
+int
+ext4fs_remove(void *v)
+{
+	struct vop_remove_args *ap = v;
+	struct vnode *vp = ap->a_vp;
+	struct inode *ip = VTOI(vp);
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	u_int16_t links;
+	int error;
+
+	if (vp->v_type == VDIR)
+		return (EPERM);
+
+	/* Cannot remove immutable or append-only files */
+	if ((letoh32(din->i_flags) & (EXTFS_INODE_FLAG_IMMUTABLE |
+	    EXTFS_INODE_FLAG_APPEND)) != 0)
+		return (EPERM);
+
+	error = ext4fs_dirremove(ap->a_dvp, ap->a_cnp);
+	if (error != 0)
+		return (error);
+
+	links = letoh16(din->i_links_count);
+	if (links != 0)
+		links--;
+	din->i_links_count = htole16(links);
+	ip->i_effnlink = links;
+	ip->i_flag |= IN_CHANGE;
+
+	return (error);
+}
+
+/*
+ * Link vnode operation: add a directory entry named by ap->a_cnp in
+ * ap->a_dvp that refers to the existing inode of ap->a_vp.
+ *
+ * Returns EPERM for directories and for immutable or append-only inodes,
+ * EXDEV across mounts, EMLINK when the link count is at EXT4FS_LINK_MAX,
+ * or an error from locking, the inode update or the directory insert.
+ * dvp is always vput before returning.
+ *
+ * The pathname buffer saved by lookup is released on every path: through
+ * VOP_ABORTOP() when the operation is refused up front, directly once
+ * the directory insert has been attempted (ext4fs_direnter() leaves it
+ * to its caller, as ext4fs_makeinode() shows).  The link count bump is
+ * rolled back whenever the inode update or the insert fails.
+ */
+int
+ext4fs_link(void *v)
+{
+	struct vop_link_args *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode *vp = ap->a_vp;
+	struct componentname *cnp = ap->a_cnp;
+	struct inode *ip = VTOI(vp);
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	u_int16_t nlink;
+	int error;
+
+	if (vp->v_type == VDIR) {
+		VOP_ABORTOP(dvp, cnp);
+		error = EPERM;
+		goto out2;
+	}
+	if (dvp->v_mount != vp->v_mount) {
+		VOP_ABORTOP(dvp, cnp);
+		error = EXDEV;
+		goto out2;
+	}
+
+	/* Lock under the same condition that out1 uses to unlock */
+	if (dvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE)) != 0) {
+		VOP_ABORTOP(dvp, cnp);
+		goto out2;
+	}
+
+	/* Inspect the inode only once it is locked */
+	nlink = letoh16(din->i_links_count);
+	if (nlink >= EXT4FS_LINK_MAX) {
+		VOP_ABORTOP(dvp, cnp);
+		error = EMLINK;
+		goto out1;
+	}
+	if (letoh32(din->i_flags) &
+	    (EXTFS_INODE_FLAG_IMMUTABLE | EXTFS_INODE_FLAG_APPEND)) {
+		VOP_ABORTOP(dvp, cnp);
+		error = EPERM;
+		goto out1;
+	}
+
+	/* Raise the link count on disk before the new name appears */
+	nlink++;
+	din->i_links_count = htole16(nlink);
+	ip->i_effnlink = nlink;
+	ip->i_flag |= IN_CHANGE;
+	error = ext4fs_update(ip, 1);
+	if (error == 0)
+		error = ext4fs_direnter(ip, dvp, cnp);
+	if (error) {
+		/* Undo the bump whichever step failed */
+		nlink--;
+		din->i_links_count = htole16(nlink);
+		ip->i_effnlink = nlink;
+		ip->i_flag |= IN_CHANGE;
+	}
+	pool_put(&namei_pool, cnp->cn_pnbuf);
+
+out1:
+	if (dvp != vp)
+		VOP_UNLOCK(vp);
+out2:
+	vput(dvp);
+	return (error);
+}
+
+/*
+ * Check whether `source' is an ancestor of `target' in the directory
+ * hierarchy, to stop rename from creating a directory loop.
+ *
+ * Starting at target, the ".." entry of each directory is read from its
+ * first block and followed until the root or source is reached.
+ *
+ * The target vnode must be locked on entry.  It, and every directory
+ * visited on the way up, is vput before the next one is fetched or
+ * before returning, so no reference acquired here outlives the call.
+ *
+ * Returns 0 if source is not above target, EEXIST if they are the same
+ * inode, EINVAL if source is an ancestor, ENOTDIR if a visited vnode is
+ * not a directory or its first block does not hold a well-formed "..",
+ * or an error from block mapping, I/O or VFS_VGET().
+ */
+static int
+ext4fs_checkpath(struct inode *source, struct inode *target, struct ucred *cred)
+{
+	struct vnode *vp;
+	struct mount *mp;
+	struct m_ext4fs *fs = source->i_e4fs;
+	u_int32_t ino;
+	int error = 0;
+
+	vp = ITOV(target);
+	/* Saved up front so it is not read from a vnode already vput */
+	mp = vp->v_mount;
+	if (target->i_number == source->i_number) {
+		error = EEXIST;
+		goto out;
+	}
+	if (target->i_number == ROOTINO)
+		goto out;
+
+	for (;;) {
+		struct inode *ip = VTOI(vp);
+		struct buf *bp;
+		struct ext4fs_directory *dot, *dotdot;
+		u_int64_t pblk;
+		u_int32_t dotlen;
+		/* Fixed fields ahead of the name in a directory record */
+		const size_t hdrsz = sizeof(dot->e4d_ino) +
+		    sizeof(dot->e4d_reclen) + sizeof(dot->e4d_namlen) +
+		    sizeof(dot->e4d_type);
+
+		if (vp->v_type != VDIR) {
+			error = ENOTDIR;
+			break;
+		}
+
+		/* Read ".." from first directory block */
+		error = ext4fs_extent_pblk(ip, 0, &pblk, NULL);
+		if (error || pblk == 0) {
+			if (!error)
+				error = EIO;
+			break;
+		}
+		error = bread(ip->i_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+		    fs->m_block_size, &bp);
+		if (error) {
+			brelse(bp);
+			break;
+		}
+
+		/*
+		 * ".." is the second entry, located by the record length
+		 * of ".".  That length comes from disk: make sure "." is
+		 * at least a header long and that the ".." header plus
+		 * its two name bytes still lie inside the block.
+		 */
+		dot = (struct ext4fs_directory *)bp->b_data;
+		dotlen = letoh16(dot->e4d_reclen);
+		if (dotlen < hdrsz ||
+		    dotlen + hdrsz + 2 > fs->m_block_size) {
+			brelse(bp);
+			error = ENOTDIR;
+			break;
+		}
+		dotdot = (struct ext4fs_directory *)
+		    ((char *)bp->b_data + dotlen);
+		if (dotdot->e4d_namlen != 2 ||
+		    dotdot->e4d_name[0] != '.' ||
+		    dotdot->e4d_name[1] != '.') {
+			brelse(bp);
+			error = ENOTDIR;
+			break;
+		}
+		ino = letoh32(dotdot->e4d_ino);
+		brelse(bp);
+
+		if (ino == source->i_number) {
+			error = EINVAL;
+			break;
+		}
+		if (ino == ROOTINO)
+			break;
+
+		/*
+		 * Release (not merely unlock) this directory before
+		 * moving up, otherwise its reference would be leaked.
+		 */
+		vput(vp);
+		error = VFS_VGET(mp, ino, &vp);
+		if (error) {
+			vp = NULL;
+			break;
+		}
+	}
+
+out:
+	if (error == ENOTDIR)
+		printf("ext4fs_checkpath: .. not a directory\n");
+	if (vp != NULL)
+		vput(vp);
+	return (error);
+}
+
+int
+ext4fs_rename(void *v)
+{
+ struct vop_rename_args *ap = v;
+ struct vnode *tvp = ap->a_tvp;
+ struct vnode *tdvp = ap->a_tdvp;
+ struct vnode *fvp = ap->a_fvp;
+ struct vnode *fdvp = ap->a_fdvp;
+ struct componentname *tcnp = ap->a_tcnp;
+ struct componentname *fcnp = ap->a_fcnp;
+ struct inode *ip, *xp = NULL, *dp;
+ struct ext4fs_dinode *din;
+ int doingdirectory = 0, oldparent = 0, newparent = 0;
+ int error = 0;
+ u_int16_t nlink;
+
+ /* Check for cross-device rename */
+ if ((fvp->v_mount != tdvp->v_mount) ||
+ (tvp && (fvp->v_mount != tvp->v_mount))) {
+ error = EXDEV;
+abortit:
+ VOP_ABORTOP(tdvp, tcnp);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ VOP_ABORTOP(fdvp, fcnp);
+ vrele(fdvp);
+ vrele(fvp);
+ return (error);
+ }
+
+ /* Lock source */
+ if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
+ goto abortit;
+
+ dp = VTOI(fdvp);
+ ip = VTOI(fvp);
+ din = &ip->i_e4din->dinode;
+
+ nlink = letoh16(din->i_links_count);
+ if ((letoh32(din->i_flags) &
+ (EXTFS_INODE_FLAG_IMMUTABLE | EXTFS_INODE_FLAG_APPEND))) {
+ VOP_UNLOCK(fvp);
+ error = EPERM;
+ goto abortit;
+ }
+
+ if ((letoh16(din->i_mode) & S_IFMT) == S_IFDIR) {
+ doingdirectory = 1;
+ oldparent = dp->i_number;
+ }
+
+ /* Bump link count temporarily for crash safety */
+ nlink++;
+ din->i_links_count = htole16(nlink);
+ ip->i_effnlink = nlink;
+ ip->i_flag |= IN_CHANGE;
+ if ((error = ext4fs_update(ip, 1)) != 0) {
+ VOP_UNLOCK(fvp);
+ goto abortit;
+ }
+
+ /* Check write access for changing ".." */
+ if (doingdirectory)
+ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred,
+ tcnp->cn_proc);
+ VOP_UNLOCK(fvp);
+ vrele(fdvp);
+
+ /*
+ * If ".." must be changed (ie the directory gets a new parent)
+ * then the source directory must not be in the directory
+ * hierarchy above the target.
+ */
+ dp = VTOI(tdvp);
+ if (oldparent != dp->i_number)
+ newparent = dp->i_number;
+
+ if (doingdirectory && newparent) {
+ if (error) /* write access check above */
+ goto bad;
+ if (tvp)
+ vput(tvp);
+ /* checkpath vput's tdvp, compensate */
+ vref(tdvp);
+ error = ext4fs_checkpath(ip, dp, tcnp->cn_cred);
+ if (error) {
+ vrele(tdvp);
+ goto out;
+ }
+ if ((tcnp->cn_flags & SAVESTART) == 0)
+ panic("ext4fs_rename: lost to startdir");
+ error = vfs_relookup(tdvp, &tvp, tcnp);
+ if (error) {
+ vrele(tdvp);
+ goto out;
+ }
+ vrele(tdvp);
+ dp = VTOI(tdvp);
+ }
+
+ xp = NULL;
+ if (tvp)
+ xp = VTOI(tvp);
+
+ /*
+ * 2) If target doesn't exist, link the target to the source
+ * and unlink the source. Otherwise, rewrite the target
+ * directory entry to reference the source inode.
+ */
+ if (xp == NULL) {
+ /*
+ * Account for ".." in new directory.
+ * When source and destination have the same
+ * parent we don't fool with the link count.
+ */
+ if (doingdirectory && newparent) {
+ u_int16_t pnlink = letoh16(
+ dp->i_e4din->dinode.i_links_count);
+ pnlink++;
+ dp->i_e4din->dinode.i_links_count = htole16(pnlink);
+ dp->i_effnlink = pnlink;
+ dp->i_flag |= IN_CHANGE;
+ if ((error = ext4fs_update(dp, 1)) != 0)
+ goto bad;
+ }
+ error = ext4fs_direnter(ip, tdvp, tcnp);
+ if (error) {
+ if (doingdirectory && newparent) {
+ u_int16_t pnlink = letoh16(
+ dp->i_e4din->dinode.i_links_count);
+ if (pnlink > 1)
+ pnlink--;
+ dp->i_e4din->dinode.i_links_count =
+ htole16(pnlink);
+ dp->i_effnlink = pnlink;
+ dp->i_flag |= IN_CHANGE;
+ (void)ext4fs_update(dp, 1);
+ }
+ goto bad;
+ }
+ vput(tdvp);
+ } else {
+ /*
+ * Target exists. If replacing a directory,
+ * check that it is empty BEFORE rewriting
+ * the directory entry.
+ */
+ if (doingdirectory) {
+ if (!ext4fs_dirempty(xp, dp->i_number,
+ tcnp->cn_cred)) {
+ error = ENOTEMPTY;
+ goto bad;
+ }
+ }
+
+ /* Rewrite the entry to point to source inode */
+ error = ext4fs_dirrewrite(dp, ip, tcnp);
+ if (error)
+ goto bad;
+
+ /*
+ * If the target directory is in the same
+ * directory as the source directory,
+ * decrement the link count on the parent
+ * of the target directory.
+ */
+ if (doingdirectory && !newparent) {
+ u_int16_t pnlink = letoh16(
+ dp->i_e4din->dinode.i_links_count);
+ if (pnlink > 1)
+ pnlink--;
+ dp->i_e4din->dinode.i_links_count = htole16(pnlink);
+ dp->i_effnlink = pnlink;
+ dp->i_flag |= IN_CHANGE;
+ }
+ vput(tdvp);
+
+ /*
+ * Adjust the link count of the target to
+ * reflect the dirrewrite above.
+ */
+ {
+ u_int16_t xnlink =
+ letoh16(xp->i_e4din->dinode.i_links_count);
+ if (xnlink > 0)
+ xnlink--;
+ if (doingdirectory) {
+ if (xnlink > 0)
+ xnlink--;
+ error = ext4fs_truncate(xp, 0, 0,
+ tcnp->cn_cred);
+ }
+ xp->i_e4din->dinode.i_links_count = htole16(xnlink);
+ xp->i_effnlink = xnlink;
+ xp->i_flag |= IN_CHANGE;
+ }
+ vput(tvp);
+ xp = NULL;
+ }
+
+ /*
+ * 3) Unlink the source.
+ * Re-lookup the source entry to get correct i_offset/i_count,
+ * since the target lookup overwrites them (especially when
+ * fdvp == tdvp, i.e., same-directory rename).
+ */
+ fcnp->cn_flags &= ~MODMASK;
+ fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+ if ((fcnp->cn_flags & SAVESTART) == 0)
+ panic("ext4fs_rename: lost from startdir");
+ (void) vfs_relookup(fdvp, &fvp, fcnp);
+ if (fvp != NULL) {
+ xp = VTOI(fvp);
+ dp = VTOI(fdvp);
+ } else {
+ if (doingdirectory)
+ panic("ext4fs_rename: lost dir entry");
+ vrele(ap->a_fvp);
+ return (0);
+ }
+
+ if (xp != ip) {
+ if (doingdirectory)
+ panic("ext4fs_rename: lost dir entry");
+ } else {
+ /* If directory moved to new parent, update ".." */
+ if (doingdirectory && newparent) {
+ struct buf *dbp;
+ struct ext4fs_directory *dotdot;
+ u_int64_t dpblk;
+
+ dp->i_e4din->dinode.i_links_count = htole16(
+ letoh16(dp->i_e4din->dinode.i_links_count) - 1);
+ dp->i_effnlink--;
+ dp->i_flag |= IN_CHANGE;
+
+ error = ext4fs_extent_pblk(ip, 0, &dpblk, NULL);
+ if (error == 0 && dpblk != 0) {
+ error = bread(ip->i_devvp,
+ (daddr_t)EXT4FS_FSBTODB(ip->i_e4fs,
+ dpblk), ip->i_e4fs->m_block_size, &dbp);
+ if (error == 0) {
+ dotdot = (struct ext4fs_directory *)
+ ((char *)dbp->b_data +
+ letoh16(((struct ext4fs_directory *)
+ dbp->b_data)->e4d_reclen));
+ dotdot->e4d_ino = htole32(newparent);
+ ext4fs_dir_set_csum(ip->i_e4fs,
+ ip->i_number,
+ ip->i_e4din->dinode.
+ i_nfs_generation,
+ dbp->b_data);
+ bwrite(dbp);
+ } else
+ brelse(dbp);
+ }
+ }
+
+ error = ext4fs_dirremove(fdvp, fcnp);
+ if (!error) {
+ nlink = letoh16(
+ xp->i_e4din->dinode.i_links_count);
+ if (nlink > 0)
+ nlink--;
+ xp->i_e4din->dinode.i_links_count = htole16(nlink);
+ xp->i_effnlink = nlink;
+ xp->i_flag |= IN_CHANGE;
+ }
+ }
+ if (dp)
+ vput(fdvp);
+ if (xp)
+ vput(fvp);
+ vrele(ap->a_fvp);
+ return (error);
+
+bad:
+ if (xp)
+ vput(ITOV(xp));
+ vput(ITOV(dp));
+out:
+ if (doingdirectory)
+ ip->i_flag &= ~IN_RENAME;
+ if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
+ nlink = letoh16(ip->i_e4din->dinode.i_links_count);
+ if (nlink > 0)
+ nlink--;
+ ip->i_e4din->dinode.i_links_count = htole16(nlink);
+ ip->i_effnlink = nlink;
+ ip->i_flag |= IN_CHANGE;
+ vput(fvp);
+ } else
+ vrele(fvp);
+ return (error);
+}
+
+/*
+ * Create the directory named by ap->a_cnp inside ap->a_dvp.
+ *
+ * A new inode is allocated, its first block is filled with the "." and
+ * ".." entries, the parent's link count is raised to account for the new
+ * "..", and finally the name is entered in the parent.  On success the
+ * new vnode is handed back through ap->a_vpp.
+ *
+ * ap->a_dvp is vput() on every path.  The pathname buffer is released on
+ * every error path, and on success unless SAVESTART is set.
+ *
+ * Returns 0 on success, EMLINK when the parent already holds
+ * EXT4FS_LINK_MAX links, or the error of the failing helper.
+ */
+int
+ext4fs_mkdir(void *v)
+{
+	struct vop_mkdir_args *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vattr *vap = ap->a_vap;
+	struct componentname *cnp = ap->a_cnp;
+	struct inode *dp = VTOI(dvp);
+	struct inode *ip;
+	struct vnode *tvp;
+	struct buf *bp;
+	struct ext4fs_directory *dirp;
+	struct ext4fs_dinode *din;
+	struct m_ext4fs *fs = dp->i_e4fs;
+	int error;
+	u_int16_t nlink;
+
+	nlink = letoh16(dp->i_e4din->dinode.i_links_count);
+	if (nlink >= EXT4FS_LINK_MAX) {
+		error = EMLINK;
+		pool_put(&namei_pool, cnp->cn_pnbuf);
+		goto out;
+	}
+
+	/* Allocate inode for new directory */
+	error = ext4fs_inode_alloc(dp, S_IFDIR | vap->va_mode,
+	    cnp->cn_cred, &tvp);
+	if (error) {
+		pool_put(&namei_pool, cnp->cn_pnbuf);
+		goto out;
+	}
+
+	ip = VTOI(tvp);
+	din = &ip->i_e4din->dinode;
+
+	/* Owner is the caller; the group is inherited from the parent. */
+	din->i_uid_lo = htole16(cnp->cn_cred->cr_uid & 0xFFFF);
+	din->i_uid_hi = htole16((cnp->cn_cred->cr_uid >> 16) & 0xFFFF);
+	{
+		gid_t gid = letoh16(dp->i_e4din->dinode.i_gid_lo) |
+		    ((gid_t)letoh16(dp->i_e4din->dinode.i_gid_hi) << 16);
+		din->i_gid_lo = htole16(gid & 0xFFFF);
+		din->i_gid_hi = htole16((gid >> 16) & 0xFFFF);
+	}
+
+	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+	din->i_mode = htole16(S_IFDIR | vap->va_mode);
+	tvp->v_type = VDIR;
+	/* Two links: the entry in the parent and the directory's own ".". */
+	ip->i_effnlink = 2;
+	din->i_links_count = htole16(2);
+
+	/* Allocate first block for "." and ".." */
+	error = ext4fs_buf_alloc(ip, 0, fs->m_block_size, cnp->cn_cred,
+	    &bp, B_CLRBUF);
+	if (error)
+		goto bad;
+
+	/* Write "." entry */
+	dirp = (struct ext4fs_directory *)bp->b_data;
+	dirp->e4d_ino = htole32((u_int32_t)ip->i_number);
+	dirp->e4d_reclen = htole16(12);
+	dirp->e4d_namlen = 1;
+	dirp->e4d_type = EXT4FS_FT_DIR;
+	dirp->e4d_name[0] = '.';
+
+	/*
+	 * Write ".." entry.  It spans the rest of the block, minus the
+	 * checksum tail when metadata checksums are enabled.
+	 */
+	dirp = (struct ext4fs_directory *)((char *)bp->b_data + 12);
+	dirp->e4d_ino = htole32((u_int32_t)dp->i_number);
+	dirp->e4d_reclen = htole16(fs->m_block_size - 12 -
+	    ((fs->m_feature_ro_compat &
+	    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) ? EXT4FS_DIR_TAIL_SIZE : 0));
+	dirp->e4d_namlen = 2;
+	dirp->e4d_type = EXT4FS_FT_DIR;
+	dirp->e4d_name[0] = '.';
+	dirp->e4d_name[1] = '.';
+
+	ext4fs_dir_set_csum(fs, ip->i_number,
+	    ip->i_e4din->dinode.i_nfs_generation, bp->b_data);
+	error = bwrite(bp);
+	if (error)
+		goto bad;
+
+	/* Set directory size */
+	ext4fs_setsize(ip, fs->m_block_size);
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+
+	/* Write inode before directory entry */
+	if ((error = ext4fs_update(ip, 1)) != 0)
+		goto bad;
+
+	/* Increment parent's link count for ".." */
+	nlink++;
+	dp->i_e4din->dinode.i_links_count = htole16(nlink);
+	dp->i_effnlink = nlink;
+	dp->i_flag |= IN_CHANGE;
+	if ((error = ext4fs_update(dp, 1)) != 0)
+		goto undo_parent;
+
+	/* Enter new directory in parent */
+	error = ext4fs_direnter(ip, dvp, cnp);
+	if (error)
+		goto undo_parent;
+
+	if ((cnp->cn_flags & SAVESTART) == 0)
+		pool_put(&namei_pool, cnp->cn_pnbuf);
+	*ap->a_vpp = tvp;
+
+	vput(dvp);
+	return (0);
+
+undo_parent:
+	/*
+	 * The parent's link count was already raised; take it back so a
+	 * failed mkdir does not leave a stray link on the parent.
+	 */
+	nlink--;
+	dp->i_e4din->dinode.i_links_count = htole16(nlink);
+	dp->i_effnlink = nlink;
+	dp->i_flag |= IN_CHANGE;
+bad:
+	/* Drop the half-built directory: zero links, then release it. */
+	pool_put(&namei_pool, cnp->cn_pnbuf);
+	ip->i_effnlink = 0;
+	din->i_links_count = htole16(0);
+	ip->i_flag |= IN_CHANGE;
+	tvp->v_type = VNON;
+	vput(tvp);
+out:
+	vput(dvp);
+	return (error);
+}
+
+/*
+ * Remove the directory ap->a_vp, whose entry lives in ap->a_dvp.
+ *
+ * The directory must be empty.  Its entry is removed from the parent,
+ * the parent loses the link held by "..", and the victim's link count
+ * is forced to zero before its contents are truncated.  Both vnodes are
+ * released before returning.
+ */
+int
+ext4fs_rmdir(void *v)
+{
+	struct vop_rmdir_args *ap = v;
+	struct componentname *cnp = ap->a_cnp;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode *vp = ap->a_vp;
+	struct inode *pip = VTOI(dvp);
+	struct inode *ip = VTOI(vp);
+	u_int16_t plinks;
+	int error;
+
+	/* Refuse non-empty directories, otherwise drop the parent's entry. */
+	if (!ext4fs_dirempty(ip, pip->i_number, cnp->cn_cred))
+		error = ENOTEMPTY;
+	else
+		error = ext4fs_dirremove(dvp, cnp);
+
+	if (error == 0) {
+		/* The parent loses the link contributed by "..". */
+		plinks = letoh16(pip->i_e4din->dinode.i_links_count);
+		if (plinks > 1)
+			plinks--;
+		pip->i_e4din->dinode.i_links_count = htole16(plinks);
+		pip->i_effnlink = plinks;
+		pip->i_flag |= IN_CHANGE;
+
+		cache_purge(dvp);
+
+		/* The removed directory ends up with no links at all. */
+		ip->i_e4din->dinode.i_links_count = htole16(0);
+		ip->i_effnlink = 0;
+		ip->i_flag |= IN_CHANGE;
+
+		/* Release the directory's data blocks. */
+		error = ext4fs_truncate(ip, 0, 0, cnp->cn_cred);
+
+		cache_purge(vp);
+	}
+
+	if (dvp != vp)
+		vput(vp);
+	else
+		vrele(vp);
+	vput(dvp);
+	return (error);
+}
+
+/*
+ * Create a symbolic link named by ap->a_cnp in ap->a_dvp that points at
+ * ap->a_target.
+ *
+ * Targets of at most EXT4FS_SYMLINK_LEN_MAX bytes are stored inline in
+ * the inode's i_block[] area; longer targets are written through
+ * VOP_WRITE.  The new vnode and the parent are both vput() before
+ * returning.
+ */
+int
+ext4fs_symlink(void *v)
+{
+	struct vop_symlink_args *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode *lvp;
+	struct inode *lip;
+	struct iovec wiov;
+	struct uio wuio;
+	int error, tlen;
+
+	error = ext4fs_makeinode(S_IFLNK | ap->a_vap->va_mode, dvp,
+	    ap->a_vpp, ap->a_cnp);
+	if (error) {
+		vput(dvp);
+		return (error);
+	}
+
+	lvp = *ap->a_vpp;
+	lip = VTOI(lvp);
+	tlen = strlen(ap->a_target);
+
+	if (tlen > EXT4FS_SYMLINK_LEN_MAX) {
+		/* Slow symlink: the target goes into data blocks. */
+		wiov.iov_base = ap->a_target;
+		wiov.iov_len = tlen;
+		wuio.uio_iov = &wiov;
+		wuio.uio_iovcnt = 1;
+		wuio.uio_offset = 0;
+		wuio.uio_rw = UIO_WRITE;
+		wuio.uio_segflg = UIO_SYSSPACE;
+		wuio.uio_procp = NULL;
+		wuio.uio_resid = tlen;
+		error = VOP_WRITE(lvp, &wuio, IO_NODELOCKED,
+		    ap->a_cnp->cn_cred);
+	} else {
+		/* Fast symlink: the target lives inline in i_block[]. */
+		memcpy(lip->i_e4din->dinode.i_block, ap->a_target, tlen);
+		ext4fs_setsize(lip, tlen);
+		/* An inline target is not an extent tree; drop the flag. */
+		lip->i_e4din->dinode.i_flags &=
+		    ~htole32(EXTFS_INODE_FLAG_EXTENTS);
+		lip->i_flag |= IN_CHANGE | IN_UPDATE;
+		error = ext4fs_update(lip, 1);
+	}
+
+	vput(lvp);
+	vput(dvp);
+	return (error);
+}
+
+/*
+ * Read directory entries from ap->a_vp into ap->a_uio as struct dirent
+ * records, starting at uio_offset.
+ *
+ * The directory is walked block by block.  Records with a zero inode
+ * number are skipped.  The walk stops when the directory ends, when the
+ * next dirent no longer fits in the caller's buffer, or on error.  On
+ * return uio_offset holds the offset of the next unread record and
+ * *ap->a_eofflag says whether the end of the directory was reached.
+ *
+ * Returns ENOTDIR for non-directories, EIO for unmapped blocks or
+ * malformed records, or the error from bread()/uiomove().
+ */
+int
+ext4fs_readdir(void *v)
+{
+	struct vop_readdir_args *ap = v;
+	struct uio *uio = ap->a_uio;
+	struct vnode *vp = ap->a_vp;
+	struct inode *ip = VTOI(vp);
+
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	struct ext4fs_directory *ep;
+	struct dirent dstd;
+	struct buf *bp;
+	off_t off, filesz;
+	u_int64_t lbn, pblk, blkoff;
+	u_int16_t reclen;
+	int error = 0;
+
+	if (vp->v_type != VDIR)
+		return (ENOTDIR);
+
+	filesz = (off_t)letoh32(din->i_size_lo) |
+	    ((off_t)letoh32(din->i_size_hi) << 32);
+	off = uio->uio_offset;
+
+	while (off < filesz && uio->uio_resid > 0) {
+		lbn = EXT4FS_LBLKNO(fs, off);
+
+		error = ext4fs_extent_pblk(ip, lbn, &pblk, NULL);
+		if (error || pblk == 0) {
+			/* An unmapped directory block is treated as EIO. */
+			if (!error)
+				error = EIO;
+			break;
+		}
+
+		error = bread(ip->i_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+		    fs->m_block_size, &bp);
+		if (error) {
+			brelse(bp);
+			break;
+		}
+
+		blkoff = EXT4FS_BLKOFF(fs, off);
+
+		while (blkoff < fs->m_block_size && off < filesz) {
+			/*
+			 * The 8-byte fixed header must fit in the block
+			 * before e4d_reclen can be read.  The starting
+			 * offset comes from the caller and record lengths
+			 * come from disk, so neither can be trusted to
+			 * leave room for a header.
+			 */
+			if (blkoff + 8 > fs->m_block_size) {
+				error = EIO;
+				brelse(bp);
+				goto done;
+			}
+
+			ep = (struct ext4fs_directory *)
+			    ((char *)bp->b_data + blkoff);
+			reclen = letoh16(ep->e4d_reclen);
+
+			if (reclen < 8 || reclen > fs->m_block_size ||
+			    blkoff + reclen > fs->m_block_size) {
+				error = EIO;
+				brelse(bp);
+				goto done;
+			}
+
+			if (letoh32(ep->e4d_ino) != 0) {
+				u_int8_t namlen = ep->e4d_namlen;
+
+				/* The name must fit inside its own record. */
+				if (namlen > reclen - 8 ||
+				    namlen > MAXNAMLEN) {
+					error = EIO;
+					brelse(bp);
+					goto done;
+				}
+
+				memset(&dstd, 0, sizeof(dstd));
+				dstd.d_fileno = letoh32(ep->e4d_ino);
+				dstd.d_namlen = namlen;
+
+				if (ep->e4d_type < EXT4FS_FT_MAX)
+					dstd.d_type =
+					    ext4fs_type_to_dt[ep->e4d_type];
+				else
+					dstd.d_type = DT_UNKNOWN;
+
+				memcpy(dstd.d_name, ep->e4d_name,
+				    namlen);
+				dstd.d_name[dstd.d_namlen] = '\0';
+				dstd.d_reclen = DIRENT_SIZE(&dstd);
+				dstd.d_off = off + reclen;
+
+				/*
+				 * Out of room: stop without advancing so
+				 * this record is returned by the next call.
+				 */
+				if (dstd.d_reclen > uio->uio_resid) {
+					brelse(bp);
+					goto done;
+				}
+
+				error = uiomove(&dstd, dstd.d_reclen, uio);
+				if (error) {
+					brelse(bp);
+					goto done;
+				}
+			}
+
+			off += reclen;
+			blkoff += reclen;
+		}
+
+		brelse(bp);
+	}
+
+done:
+	uio->uio_offset = off;
+	*ap->a_eofflag = (off >= filesz);
+	return (error);
+}
+
+/*
+ * Return the target of the symbolic link ap->a_vp through ap->a_uio.
+ *
+ * A link whose size is at most EXT4FS_SYMLINK_LEN_MAX and whose inode
+ * does not carry the EXTENTS flag keeps its target inline in i_block[];
+ * any other link is read from its data blocks with VOP_READ.
+ */
+int
+ext4fs_readlink(void *v)
+{
+	struct vop_readlink_args *ap = v;
+	struct vnode *vp = ap->a_vp;
+	struct ext4fs_dinode *din = &VTOI(vp)->i_e4din->dinode;
+	u_int64_t tlen;
+
+	tlen = (u_int64_t)letoh32(din->i_size_hi) << 32;
+	tlen |= (u_int64_t)letoh32(din->i_size_lo);
+
+	/* Slow symlink: target stored in data blocks */
+	if (tlen > EXT4FS_SYMLINK_LEN_MAX ||
+	    (letoh32(din->i_flags) & EXTFS_INODE_FLAG_EXTENTS) != 0)
+		return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
+
+	/* Fast symlink: target stored inline in i_block[] area */
+	return (uiomove((char *)din->i_block, tlen, ap->a_uio));
+}
+
+/*
+ * Enter a directory entry for inode ip into directory dvp.
+ *
+ * The name comes from cnp.  The slot is described by the parent's
+ * i_offset/i_count fields:
+ *  - i_count == 0: no free slot; the entry is appended at the current
+ *    end of the directory, allocating a new block when the size is
+ *    block-aligned.
+ *  - otherwise: the record at i_offset is reused if it is unused
+ *    (inode 0), or split so the new entry takes the slack behind the
+ *    existing name.
+ *
+ * The on-disk record at i_offset is validated before it is modified, so
+ * a corrupt record length cannot make the new entry spill over a
+ * neighbouring record or past the end of the block; EIO is returned
+ * instead.
+ *
+ * Returns 0 on success or an errno value.
+ */
+int
+ext4fs_direnter(struct inode *ip, struct vnode *dvp,
+    struct componentname *cnp)
+{
+	struct inode *dp = VTOI(dvp);
+	struct m_ext4fs *fs = dp->i_e4fs;
+	struct ext4fs_dinode *ddin = &dp->i_e4din->dinode;
+	struct ext4fs_directory *ep, *nep;
+	struct buf *bp;
+	u_int64_t pblk;
+	off_t filesz;
+	int entrysize, error, loc;
+	u_int16_t reclen, mode;
+
+	entrysize = EXT4FS_DIRSIZ(cnp->cn_namelen);
+	mode = letoh16(ip->i_e4din->dinode.i_mode);
+
+	filesz = (off_t)letoh32(ddin->i_size_lo) |
+	    ((off_t)letoh32(ddin->i_size_hi) << 32);
+
+	if (dp->i_count == 0) {
+		/*
+		 * No free slot found - append at end of directory.
+		 * Allocate a new block if needed.
+		 */
+		u_int64_t lbn = EXT4FS_LBLKNO(fs, filesz);
+		u_int64_t blkoff = EXT4FS_BLKOFF(fs, filesz);
+
+		if (blkoff == 0) {
+			/* Need a new block */
+			error = ext4fs_buf_alloc(dp, lbn, fs->m_block_size,
+			    cnp->cn_cred, &bp, B_CLRBUF);
+			if (error)
+				return (error);
+		} else {
+			error = ext4fs_extent_pblk(dp, lbn, &pblk, NULL);
+			if (error || pblk == 0)
+				return (error ? error : EIO);
+			error = bread(dp->i_devvp,
+			    (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+			    fs->m_block_size, &bp);
+			if (error) {
+				brelse(bp);
+				return (error);
+			}
+		}
+
+		/* Write entry at end */
+		ep = (struct ext4fs_directory *)
+		    ((char *)bp->b_data + blkoff);
+		ep->e4d_ino = htole32((u_int32_t)ip->i_number);
+		{
+			/* Leave room for the checksum tail if enabled. */
+			int tail = (fs->m_feature_ro_compat &
+			    EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) ?
+			    EXT4FS_DIR_TAIL_SIZE : 0;
+			if (blkoff == 0)
+				ep->e4d_reclen =
+				    htole16(fs->m_block_size - tail);
+			else
+				ep->e4d_reclen =
+				    htole16(fs->m_block_size - blkoff - tail);
+		}
+		ep->e4d_namlen = cnp->cn_namelen;
+		ep->e4d_type = ext4fs_mode_to_ft(mode);
+		memcpy(ep->e4d_name, cnp->cn_nameptr, cnp->cn_namelen);
+
+		ext4fs_dir_set_csum(fs, dp->i_number,
+		    dp->i_e4din->dinode.i_nfs_generation, bp->b_data);
+		error = bwrite(bp);
+		if (error)
+			return (error);
+
+		/* Update directory size */
+		if (blkoff == 0)
+			ext4fs_setsize(dp, filesz + fs->m_block_size);
+		else
+			ext4fs_setsize(dp, filesz + entrysize);
+		dp->i_flag |= IN_CHANGE | IN_UPDATE;
+		return (ext4fs_update(dp, 1));
+	}
+
+	/*
+	 * Found a free slot at dp->i_offset with dp->i_count bytes.
+	 * Read the block and compact entries to make room.
+	 */
+	{
+		u_int64_t lbn = EXT4FS_LBLKNO(fs, dp->i_offset);
+
+		error = ext4fs_extent_pblk(dp, lbn, &pblk, NULL);
+		if (error || pblk == 0)
+			return (error ? error : EIO);
+
+		error = bread(dp->i_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+		    fs->m_block_size, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+	}
+
+	loc = EXT4FS_BLKOFF(fs, dp->i_offset);
+
+	/* The 8-byte record header must lie inside the block. */
+	if ((u_int64_t)loc + 8 > fs->m_block_size) {
+		brelse(bp);
+		return (EIO);
+	}
+
+	ep = (struct ext4fs_directory *)((char *)bp->b_data + loc);
+	reclen = letoh16(ep->e4d_reclen);
+
+	/* The record itself must be sane and stay inside the block. */
+	if (reclen < 8 || (u_int64_t)loc + reclen > fs->m_block_size) {
+		brelse(bp);
+		return (EIO);
+	}
+
+	if (letoh32(ep->e4d_ino) == 0) {
+		/* Unused entry - just overwrite, if the new one fits. */
+		if (reclen < entrysize) {
+			brelse(bp);
+			return (EIO);
+		}
+		ep->e4d_ino = htole32((u_int32_t)ip->i_number);
+		/* Keep reclen as is */
+		ep->e4d_namlen = cnp->cn_namelen;
+		ep->e4d_type = ext4fs_mode_to_ft(mode);
+		memcpy(ep->e4d_name, cnp->cn_nameptr, cnp->cn_namelen);
+	} else {
+		/* Compact: shrink current entry, add new one after it */
+		int oldentsz = EXT4FS_DIRSIZ(ep->e4d_namlen);
+
+		/* The slack behind the existing name must hold the entry. */
+		if (reclen < oldentsz + entrysize) {
+			brelse(bp);
+			return (EIO);
+		}
+
+		nep = (struct ext4fs_directory *)
+		    ((char *)ep + oldentsz);
+		nep->e4d_ino = htole32((u_int32_t)ip->i_number);
+		nep->e4d_reclen = htole16(reclen - oldentsz);
+		nep->e4d_namlen = cnp->cn_namelen;
+		nep->e4d_type = ext4fs_mode_to_ft(mode);
+		memcpy(nep->e4d_name, cnp->cn_nameptr, cnp->cn_namelen);
+		ep->e4d_reclen = htole16(oldentsz);
+	}
+
+	ext4fs_dir_set_csum(fs, dp->i_number,
+	    dp->i_e4din->dinode.i_nfs_generation, bp->b_data);
+	error = bwrite(bp);
+	dp->i_flag |= IN_CHANGE | IN_UPDATE;
+	if (error == 0)
+		error = ext4fs_update(dp, 1);
+	return (error);
+}
+
+/*
+ * Remove the directory entry found at the parent's i_offset.
+ *
+ * When i_count is zero only the entry's inode number is cleared.
+ * Otherwise the entry's record length is folded into the record that
+ * starts i_count bytes earlier.  The block checksum is refreshed and
+ * the block written back; the result of that write is returned.
+ */
+int
+ext4fs_dirremove(struct vnode *dvp, struct componentname *cnp)
+{
+	struct inode *dp = VTOI(dvp);
+	struct m_ext4fs *fs = dp->i_e4fs;
+	struct ext4fs_directory *victim, *before;
+	struct buf *bp;
+	u_int64_t pblk;
+	int error;
+
+	error = ext4fs_extent_pblk(dp, EXT4FS_LBLKNO(fs, dp->i_offset),
+	    &pblk, NULL);
+	if (error)
+		return (error);
+	if (pblk == 0)
+		return (EIO);
+
+	error = bread(dp->i_devvp, (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+	    fs->m_block_size, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	victim = (struct ext4fs_directory *)
+	    ((char *)bp->b_data + (int)EXT4FS_BLKOFF(fs, dp->i_offset));
+
+	if (dp->i_count != 0) {
+		/* Merge with previous entry */
+		int beforeloc = EXT4FS_BLKOFF(fs, dp->i_offset - dp->i_count);
+
+		before = (struct ext4fs_directory *)
+		    ((char *)bp->b_data + beforeloc);
+		before->e4d_reclen = htole16(
+		    letoh16(before->e4d_reclen) + letoh16(victim->e4d_reclen));
+	} else {
+		/* First entry in block: just zero the inode field */
+		victim->e4d_ino = 0;
+	}
+
+	ext4fs_dir_set_csum(fs, dp->i_number,
+	    dp->i_e4din->dinode.i_nfs_generation, bp->b_data);
+	error = bwrite(bp);
+	dp->i_flag |= IN_CHANGE | IN_UPDATE;
+	return (error);
+}
+
+/*
+ * Check if a directory is empty (contains only "." and "..").
+ *
+ * Every block of the directory is walked.  Records with inode 0 are
+ * ignored; any in-use record whose name is not "." or ".." makes the
+ * directory non-empty.  An unmapped block, a read error, or a malformed
+ * record also counts as "not empty", so a damaged directory is never
+ * removed.
+ *
+ * parentino and cred are accepted for interface compatibility but are
+ * not consulted here.
+ *
+ * Returns 1 if the directory is empty, 0 otherwise.
+ */
+int
+ext4fs_dirempty(struct inode *ip, ufsino_t parentino, struct ucred *cred)
+{
+	struct m_ext4fs *fs = ip->i_e4fs;
+	struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+	struct ext4fs_directory *ep;
+	struct buf *bp;
+	off_t off, filesz;
+	u_int64_t lbn, pblk, blkoff;
+	u_int16_t reclen;
+	int error;
+
+	filesz = (off_t)letoh32(din->i_size_lo) |
+	    ((off_t)letoh32(din->i_size_hi) << 32);
+
+	for (off = 0; off < filesz; ) {
+		lbn = EXT4FS_LBLKNO(fs, off);
+
+		error = ext4fs_extent_pblk(ip, lbn, &pblk, NULL);
+		if (error || pblk == 0)
+			return (0);
+
+		error = bread(ip->i_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+		    fs->m_block_size, &bp);
+		if (error) {
+			brelse(bp);
+			return (0);
+		}
+
+		blkoff = EXT4FS_BLKOFF(fs, off);
+
+		while (blkoff < fs->m_block_size && off < filesz) {
+			/*
+			 * The 8-byte fixed header must fit in the block
+			 * before e4d_reclen is read; a corrupt record
+			 * chain could otherwise point the next header
+			 * past the end of the buffer.
+			 */
+			if (blkoff + 8 > fs->m_block_size) {
+				brelse(bp);
+				return (0);
+			}
+
+			ep = (struct ext4fs_directory *)
+			    ((char *)bp->b_data + blkoff);
+			reclen = letoh16(ep->e4d_reclen);
+
+			if (reclen < 8 || reclen > fs->m_block_size ||
+			    blkoff + reclen > fs->m_block_size) {
+				brelse(bp);
+				return (0);
+			}
+
+			if (letoh32(ep->e4d_ino) != 0) {
+				/* Only "." and ".." may be in use. */
+				if (ep->e4d_namlen > 2) {
+					brelse(bp);
+					return (0);
+				}
+				if (ep->e4d_name[0] != '.') {
+					brelse(bp);
+					return (0);
+				}
+				if (ep->e4d_namlen == 1) {
+					/* "." - ok */
+				} else if (ep->e4d_name[1] == '.') {
+					/* ".." - ok */
+				} else {
+					brelse(bp);
+					return (0);
+				}
+			}
+
+			off += reclen;
+			blkoff += reclen;
+		}
+
+		brelse(bp);
+	}
+
+	return (1);
+}
+
+/*
+ * Point the existing directory entry at dp->i_offset to inode ip.
+ *
+ * Only the entry's inode number and file type are replaced; its name
+ * and record length stay as they are.  The block checksum is refreshed
+ * and the block written back; the result of that write is returned.
+ */
+int
+ext4fs_dirrewrite(struct inode *dp, struct inode *ip,
+    struct componentname *cnp)
+{
+	struct m_ext4fs *fs = dp->i_e4fs;
+	struct ext4fs_directory *slot;
+	struct buf *bp;
+	u_int64_t pblk;
+	u_int16_t newmode;
+	int error;
+
+	error = ext4fs_extent_pblk(dp, EXT4FS_LBLKNO(fs, dp->i_offset),
+	    &pblk, NULL);
+	if (error)
+		return (error);
+	if (pblk == 0)
+		return (EIO);
+
+	error = bread(dp->i_devvp, (daddr_t)EXT4FS_FSBTODB(fs, pblk),
+	    fs->m_block_size, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	/* Retarget the entry in place. */
+	slot = (struct ext4fs_directory *)
+	    ((char *)bp->b_data + (int)EXT4FS_BLKOFF(fs, dp->i_offset));
+	slot->e4d_ino = htole32((u_int32_t)ip->i_number);
+	newmode = letoh16(ip->i_e4din->dinode.i_mode);
+	slot->e4d_type = ext4fs_mode_to_ft(newmode);
+
+	ext4fs_dir_set_csum(fs, dp->i_number,
+	    dp->i_e4din->dinode.i_nfs_generation, bp->b_data);
+	error = bwrite(bp);
+	dp->i_flag |= IN_CHANGE | IN_UPDATE;
+	return (error);
+}
+
+/*
+ * Called when the last reference to the vnode goes away.
+ *
+ * An inode whose on-disk link count is zero, on a filesystem that is
+ * not mounted read-only, is truncated, freed and stamped with a
+ * deletion time.  Any pending inode flags are then flushed with
+ * ext4fs_update().  The vnode is unlocked before returning, and is
+ * recycled when it has no dinode or its mode is zero.
+ *
+ * The local error variable is never assigned after initialization, so
+ * this function always returns 0; results of the truncate and update
+ * helpers are not propagated.
+ */
+int
+ext4fs_inactive(void *v)
+{
+ struct vop_inactive_args *ap = v;
+ struct vnode *vp = ap->a_vp;
+ struct inode *ip = VTOI(vp);
+ /* nlink starts at 1 and is only refreshed from disk further below. */
+ u_int16_t mode, nlink = 1;
+ int error = 0;
+#ifdef DIAGNOSTIC
+ extern int prtactive;
+
+ if (prtactive && vp->v_usecount != 0)
+ vprint("ext4fs_inactive: pushing active", vp);
+#endif
+
+ /*
+ * Ignore inodes related to stale file handles.
+ */
+ if (ip->i_e4din == NULL) {
+ goto out;
+ }
+
+ /* A zero mode is treated as "nothing to do" as well. */
+ mode = letoh16(ip->i_e4din->dinode.i_mode);
+ if (mode == 0) {
+ goto out;
+ }
+
+ /*
+ * If the inode was deleted (dtime != 0), skip further processing.
+ */
+ if (letoh32(ip->i_e4din->dinode.i_dtime) != 0) {
+ goto out;
+ }
+
+ nlink = letoh16(ip->i_e4din->dinode.i_links_count);
+
+ /*
+ * Handle file deletion: if nlink == 0, truncate data,
+ * free inode, and mark as deleted.
+ */
+ if (nlink == 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+ struct timespec ts;
+
+ (void)ext4fs_truncate(ip, 0, 0, NOCRED);
+
+ /*
+  * Keep the old mode for ext4fs_inode_free() and zero the on-disk
+  * mode; the zero mode is what triggers vrecycle() below.
+  */
+ mode = letoh16(ip->i_e4din->dinode.i_mode);
+ ip->i_e4din->dinode.i_mode = htole16(0);
+
+ ext4fs_inode_free(ip, ip->i_number, mode);
+
+ /* Record the deletion time (seconds) in i_dtime. */
+ getnanotime(&ts);
+ ip->i_e4din->dinode.i_dtime =
+ htole32((u_int32_t)ts.tv_sec);
+ ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ }
+
+ /* Flush pending changes; second argument is 1 when nlink is 0. */
+ if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) {
+ ext4fs_update(ip, nlink == 0 ? 1 : 0);
+ }
+
+out:
+ VOP_UNLOCK(vp);
+
+ /*
+ * If we are done with the inode, reclaim it
+ * so that it can be reused immediately.
+ * NOTE: after vrecycle, ip is freed (use-after-free danger).
+ */
+ if (ip->i_e4din == NULL ||
+ letoh16(ip->i_e4din->dinode.i_mode) == 0)
+ vrecycle(vp, ap->a_p);
+
+ return (error);
+}
+
+/*
+ * Release the filesystem-private state attached to a vnode.
+ *
+ * After ufs_reclaim() succeeds, the dinode (if any) and the in-core
+ * inode are returned to their pools and v_data is cleared.  A failure
+ * from ufs_reclaim() is returned unchanged and nothing is freed.
+ */
+int
+ext4fs_reclaim(void *v)
+{
+	struct vop_reclaim_args *ap = v;
+	struct vnode *vp = ap->a_vp;
+	struct inode *ip = VTOI(vp);
+	int error;
+
+	error = ufs_reclaim(vp);
+	if (error == 0) {
+		if (ip->i_e4din != NULL)
+			pool_put(&ext4fs_dinode_pool, ip->i_e4din);
+		pool_put(&ext4fs_inode_pool, ip);
+		vp->v_data = NULL;
+	}
+
+	return (error);
+}
+
+/*
+ * Translate the logical block ap->a_bn of a file into a device block.
+ *
+ * *ap->a_vpp (if requested) receives the device vnode.  *ap->a_bnp
+ * receives the device block number, or -1 for a hole or on error.
+ * *ap->a_runp (if requested) receives the number of further contiguous
+ * blocks, capped so that one transfer stays within MAXBSIZE.
+ */
+int
+ext4fs_bmap(void *v)
+{
+	struct vop_bmap_args *ap = v;
+	struct inode *ip = VTOI(ap->a_vp);
+	struct m_ext4fs *fs = ip->i_e4fs;
+	u_int64_t pblk, ncontig;
+	int error, run;
+
+	if (ap->a_vpp != NULL)
+		*ap->a_vpp = ip->i_devvp;
+	if (ap->a_bnp == NULL)
+		return (0);
+
+	error = ext4fs_extent_pblk(ip, (u_int64_t)ap->a_bn, &pblk, &ncontig);
+	if (error != 0 || pblk == 0) {
+		/* Mapping failed, or a hole with no physical block. */
+		*ap->a_bnp = -1;
+		if (error == 0 && ap->a_runp != NULL)
+			*ap->a_runp = 0;
+		return (error);
+	}
+
+	*ap->a_bnp = (daddr_t)EXT4FS_FSBTODB(fs, pblk);
+
+	if (ap->a_runp != NULL) {
+		int maxrun = MAXBSIZE / fs->m_block_size - 1;
+
+		/* Blocks following this one, clamped to [0, maxrun]. */
+		run = (int)(ncontig - 1);
+		if (run > maxrun)
+			run = maxrun;
+		if (run < 0)
+			run = 0;
+		*ap->a_runp = run;
+	}
+
+	return (0);
+}
+
+/*
+ * Start I/O on a buffer that belongs to a file of this filesystem.
+ *
+ * If the buffer still carries its logical block number it is mapped
+ * with VOP_BMAP first.  A mapping error fails the buffer; a hole (-1)
+ * is zero-filled and completed at once.  Otherwise the buffer is handed
+ * to the underlying device vnode.  Block and character device vnodes
+ * must never get here.
+ */
+int
+ext4fs_strategy(void *v)
+{
+	struct vop_strategy_args *ap = v;
+	struct buf *bp = ap->a_bp;
+	struct vnode *vp = bp->b_vp;
+	struct vnode *devvp;
+	struct inode *ip = VTOI(vp);
+	int error;
+	int s;
+
+	switch (vp->v_type) {
+	case VBLK:
+	case VCHR:
+		panic("ext4fs_strategy: spec");
+	default:
+		break;
+	}
+
+	if (bp->b_blkno == bp->b_lblkno) {
+		/* Not mapped yet: resolve the physical block now. */
+		error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
+		    NULL);
+		if (error) {
+			bp->b_error = error;
+			bp->b_flags |= B_ERROR;
+			s = splbio();
+			biodone(bp);
+			splx(s);
+			return (error);
+		}
+		if (bp->b_blkno == -1)
+			clrbuf(bp);
+	}
+
+	if (bp->b_blkno == -1) {
+		/* Hole: nothing to transfer, complete immediately. */
+		s = splbio();
+		biodone(bp);
+		splx(s);
+		return (0);
+	}
+
+	devvp = ip->i_devvp;
+	bp->b_dev = devvp->v_rdev;
+	VOP_STRATEGY(devvp, bp);
+	return (0);
+}
+
+/*
+ * Print a one-line description of the inode behind a vnode: inode
+ * number, device major/minor, inode flags and effective link count.
+ * Always returns 0.
+ */
+int
+ext4fs_print(void *v)
+{
+	struct vop_print_args *ap = v;
+	struct vnode *vp = ap->a_vp;
+	struct inode *ip = VTOI(vp);
+
+	printf("tag VT_EXT4FS, ino %llu, on dev %d, %d",
+	    (unsigned long long)ip->i_number,
+	    major(ip->i_dev),
+	    minor(ip->i_dev));
+	printf(" flags 0x%x, effnlink %d\n",
+	    ip->i_flag,
+	    ip->i_effnlink);
+
+	return (0);
+}
+
+/*
+ * Answer pathconf(2) queries for this filesystem.
+ *
+ * The value for a known ap->a_name is stored in *ap->a_retval and 0 is
+ * returned.  Unknown names return EINVAL and leave *ap->a_retval
+ * untouched.
+ */
+int
+ext4fs_pathconf(void *v)
+{
+	struct vop_pathconf_args *ap = v;
+
+	switch (ap->a_name) {
+	case _PC_LINK_MAX:
+		*ap->a_retval = EXT4FS_LINK_MAX;
+		return (0);
+	case _PC_NAME_MAX:
+		*ap->a_retval = EXT4FS_MAXNAMLEN;
+		return (0);
+	case _PC_PATH_MAX:
+		*ap->a_retval = PATH_MAX;
+		return (0);
+	case _PC_PIPE_BUF:
+		*ap->a_retval = PIPE_BUF;
+		return (0);
+	/* These three queries all report 1. */
+	case _PC_CHOWN_RESTRICTED:
+	case _PC_NO_TRUNC:
+	case _PC_TIMESTAMP_RESOLUTION:
+		*ap->a_retval = 1;
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
+/*
+ * Advisory record locking.  The current file size is assembled from
+ * the two 32-bit halves of the on-disk size and passed to lf_advlock()
+ * together with the caller's lock request.
+ */
+int
+ext4fs_advlock(void *v)
+{
+	struct vop_advlock_args *ap = v;
+	struct inode *ip = VTOI(ap->a_vp);
+	off_t size;
+
+	size = (off_t)letoh32(ip->i_e4din->dinode.i_size_hi) << 32;
+	size |= (off_t)letoh32(ip->i_e4din->dinode.i_size_lo);
+
+	return (lf_advlock(&ip->i_lockf, size, ap->a_id, ap->a_op,
+	    ap->a_fl, ap->a_flags));
+}
diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h
index 16aaac8b7..2d9224f0f 100644
--- a/sys/ufs/ufs/inode.h
+++ b/sys/ufs/ufs/inode.h
@@ -43,6 +43,7 @@
#include <ufs/ufs/dir.h>
#include <ufs/ext2fs/ext2fs_dinode.h>
#include <ufs/ext2fs/ext2fs_extents.h>
+#include <ufs/ext4fs/ext4fs_dinode.h>
/*
@@ -76,11 +77,13 @@ struct inode {
union { /* Associated filesystem. */
struct fs *fs; /* FFS */
- struct m_ext2fs *e2fs; /* EXT2FS */
+ struct m_ext2fs *e2fs; /* EXT2FS */
+ struct m_ext4fs *e4fs; /* EXT4FS */
} inode_u;
#define i_fs inode_u.fs
#define i_e2fs inode_u.e2fs
+#define i_e4fs inode_u.e4fs
struct cluster_info i_ci;
struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */
@@ -117,14 +120,16 @@ struct inode {
* The on-disk dinode itself.
*/
union {
- struct ufs1_dinode *ffs1_din;
- struct ufs2_dinode *ffs2_din;
- struct ext2fs_dinode *e2fs_din;
+ struct ufs1_dinode *ffs1_din;
+ struct ufs2_dinode *ffs2_din;
+ struct ext2fs_dinode *e2fs_din;
+ struct ext4fs_dinode_256 *e4fs_din;
} dinode_u;
#define i_din1 dinode_u.ffs1_din
#define i_din2 dinode_u.ffs2_din
#define i_e2din dinode_u.e2fs_din
+#define i_e4din dinode_u.e4fs_din
struct inode_vtbl *i_vtbl;
};
@@ -224,6 +229,8 @@ struct inode_vtbl {
#define i_uid i_din1->di_uid
#endif /* _KERNEL */
+#define i_e4fs_nlink i_e4din->dinode.i_links_count
+
#define i_e2fs_mode i_e2din->e2di_mode
#define i_e2fs_size i_e2din->e2di_size
#define i_e2fs_atime i_e2din->e2di_atime
diff --git a/sys/ufs/ufs/ufs_ihash.c b/sys/ufs/ufs/ufs_ihash.c
index f470326e5..a192c9a0b 100644
--- a/sys/ufs/ufs/ufs_ihash.c
+++ b/sys/ufs/ufs/ufs_ihash.c
@@ -43,6 +43,7 @@
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/ext4fs/ext4fs_extern.h>
#include <crypto/siphash.h>
@@ -119,6 +120,11 @@ loop:
*/
IS_EXT2_VNODE(vp) ? ip->i_e2fs_nlink <= 0 :
#endif
+ /*
+ * XXX DIP does not cover ext4fs either;
+ * use i_e4din directly like ext2fs uses i_e2din.
+ */
+ vp->v_tag == VT_EXT4FS ? ip->i_e4fs_nlink <= 0 :
DIP(ip, nlink) <= 0) &&
(vp->v_mount->mnt_flag & MNT_RDONLY) == 0)) {
/*
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 80f403861..8c2345021 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -68,6 +68,9 @@
#include <ufs/ufs/dirhash.h>
#endif
#include <ufs/ext2fs/ext2fs_extern.h>
+#ifdef EXT4FS
+#include <ufs/ext4fs/ext4fs.h>
+#endif
#include <uvm/uvm_extern.h>
@@ -108,6 +111,12 @@ ufs_itimes(struct vnode *vp)
goto out;
}
#endif
+#ifdef EXT4FS
+ if (vp->v_tag == VT_EXT4FS) {
+ EXT4FS_ITIMES(ip);
+ goto out;
+ }
+#endif
if ((vp->v_type == VBLK || vp->v_type == VCHR))
ip->i_flag |= IN_LAZYMOD;
@@ -1864,6 +1873,14 @@ filt_ufsread(struct knote *kn, long hint)
if (IS_EXT2_VNODE(ip->i_vnode))
kn->kn_data = ext2fs_size(ip) - foffset(kn->kn_fp);
else
+#endif
+#ifdef EXT4FS
+ if (ip->i_vnode->v_tag == VT_EXT4FS) {
+ struct ext4fs_dinode *din = &ip->i_e4din->dinode;
+ kn->kn_data = ((off_t)letoh32(din->i_size_lo) |
+ ((off_t)letoh32(din->i_size_hi) << 32)) -
+ foffset(kn->kn_fp);
+ } else
#endif
kn->kn_data = DIP(ip, size) - foffset(kn->kn_fp);
if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h
index 3ad8938e4..40ea54993 100644
--- a/sys/ufs/ufs/ufsmount.h
+++ b/sys/ufs/ufs/ufsmount.h
@@ -51,10 +51,13 @@ struct ufsmount {
union { /* pointer to superblock */
struct fs *fs; /* FFS */
struct m_ext2fs *e2fs; /* EXT2FS */
+ struct m_ext4fs *e4fs; /* EXT4FS */
} ufsmount_u;
#define um_fs ufsmount_u.fs
#define um_e2fs ufsmount_u.e2fs
#define um_e2fsb ufsmount_u.e2fs->s_es
+#define um_e4fs ufsmount_u.e4fs
+//#define um_e4fsb ufsmount_u.e4fs->s_es
struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */
struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */
@@ -72,9 +75,10 @@ struct ufsmount {
/*
* Filesystem types
*/
-#define UM_UFS1 1
-#define UM_UFS2 2
+#define UM_UFS1 1
+#define UM_UFS2 2
#define UM_EXT2FS 3
+#define UM_EXT4FS 4
/*
* Flags describing the state of quotas.
[patch] ext4fs rw