From: Thomas de Grivel Subject: [patch] ext4fs rw To: tech@openbsd.org Date: Tue, 17 Mar 2026 19:26:07 +0100 Hi tech@, Here is a patch to attach ext4fs drives to OpenBSD with full ext4 support (compatible with recent Linux). Performance is 610MB/s read/write vs 830MB/s for FFS2 on an NVMe. No journalling. Recovery at mount time not tested. All tests pass e2fstools / e2fsck without trouble. Please test and reply without too many flames as I'm rather new to the kernel side of development. I hope this helps, Cheers, diff --git a/sbin/mount_ext4fs/Makefile b/sbin/mount_ext4fs/Makefile new file mode 100644 index 000000000..939f3e8a5 --- /dev/null +++ b/sbin/mount_ext4fs/Makefile @@ -0,0 +1,11 @@ +# $OpenBSD: Makefile,v 1.1 1996/06/27 07:20:28 downsj Exp $ + +PROG= mount_ext4fs +SRCS= mount_ext4fs.c getmntopts.c +MAN= mount_ext4fs.8 + +MOUNT= ${.CURDIR}/../mount +CFLAGS+= -I${MOUNT} +.PATH: ${MOUNT} + +.include <bsd.prog.mk> diff --git a/sbin/mount_ext4fs/mount_ext4fs.8 b/sbin/mount_ext4fs/mount_ext4fs.8 new file mode 100644 index 000000000..95aef0db2 --- /dev/null +++ b/sbin/mount_ext4fs/mount_ext4fs.8 @@ -0,0 +1,88 @@ +.\" $OpenBSD: $ +.\" +.\" Copyright (c) 1993, 1994 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" +.Dd $Mdocdate: April 23 2020 $ +.Dt MOUNT_EXT4FS 8 +.Os +.Sh NAME +.Nm mount_ext4fs +.Nd mount an ext4fs file system +.Sh SYNOPSIS +.Nm mount_ext4fs +.Op Fl o Ar options +.Ar special +.Ar node +.Sh DESCRIPTION +The +.Nm +command attaches an ext4fs file system +.Ar special +device on to the file system tree at the point +.Ar node . +This command is invoked by +.Xr mount 8 +when using the syntax +.Bd -ragged -offset 4n +.Nm mount Op options +-t ext4fs +.Ar special node +.Ed +.Pp +The +.Ar special +device must correspond to a partition registered in the +.Xr disklabel 5 . +.Pp +This command is normally executed by +.Xr mount 8 +at boot time. +.Pp +The options are as follows: +.Bl -tag -width Ds +.It Fl o Ar options +Options are specified with a +.Fl o +flag followed by a comma separated string of options. +See the +.Xr mount 8 +man page for possible options and their meanings. +.El +.Sh SEE ALSO +.Xr mount 2 , +.Xr disklabel 5 , +.Xr fstab 5 , +.Xr disklabel 8 , +.Xr mount 8 , +.Xr umount 8 +.Sh HISTORY +The +.Nm +function first appeared in +.Fx 2.2 . 
diff --git a/sbin/mount_ext4fs/mount_ext4fs.c b/sbin/mount_ext4fs/mount_ext4fs.c new file mode 100644 index 000000000..1148a9cab --- /dev/null +++ b/sbin/mount_ext4fs/mount_ext4fs.c @@ -0,0 +1,113 @@ +/* $OpenBSD: $ */ +/* $NetBSD: mount_ffs.c,v 1.3 1996/04/13 01:31:19 jtc Exp $ */ + +/*- + * Copyright (c) 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "mntopts.h" + +void ext4fs_usage(void); + +static const struct mntopt mopts[] = { + MOPT_STDOPTS, + MOPT_UPDATE, + { NULL } +}; + +int +main(int argc, char *argv[]) /* mount_ext4fs [-o options] special node: parse -o, then mount(2) special on node as MOUNT_EXT4FS */ +{ + struct ufs_args args; /* XXX ffs_args; NOTE(review): never zeroed, only fspec, export_info.ex_root and export_info.ex_flags are assigned below */ + int ch, mntflags; + char fs_name[PATH_MAX], *errcause; + + mntflags = 0; /* start from no flags; getmntopts() is handed &mntflags for each -o */ + optind = optreset = 1; /* Reset for parse of new argv. */ + while ((ch = getopt(argc, argv, "o:")) != -1) + switch (ch) { + case 'o': + getmntopts(optarg, mopts, &mntflags); + break; + default: + ext4fs_usage(); /* unknown flag: usage message on stderr, exit 1 */ + } + argc -= optind; + argv += optind; /* step past the parsed options */ + + if (argc != 2) /* need exactly the two operands: special and node */ + ext4fs_usage(); + + args.fspec = argv[0]; /* The name of the device file. */ + if (realpath(argv[1], fs_name) == NULL) /* The mount point. */ + err(1, "realpath %s", argv[1]); + + #define DEFAULT_ROOTUID -2 + args.export_info.ex_root = DEFAULT_ROOTUID; + + if (mntflags & MNT_RDONLY) /* a read-only mount is flagged read-only in the export info as well */ + args.export_info.ex_flags = MNT_EXRDONLY; + else + args.export_info.ex_flags = 0; + if (mount(MOUNT_EXT4FS, fs_name, mntflags, &args) == -1) { + switch (errno) { /* map a few errno values to friendlier text, else strerror() */ + case EMFILE: + errcause = "mount table full"; + break; + case EINVAL: /* NOTE(review): this text is used for every EINVAL from mount(2), whatever the cause -- confirm it fits */ + errcause = + "specified device does not match mounted device"; + break; + case EOPNOTSUPP: + errcause = "filesystem not supported by kernel"; + break; + default: + errcause = strerror(errno); + break; + } + errx(1, "%s on %s: %s", args.fspec, fs_name, errcause); /* prints: special on node: cause; exits 1 */ + } + exit(0); /* reached only when mount(2) did not return -1 */ +} + +void +ext4fs_usage(void) /* print the synopsis on stderr and exit with status 1 */ +{ + (void)fprintf(stderr, + "usage: mount_ext4fs [-o options] special node\n"); + exit(1); /* never returns to the caller */ +} diff --git a/sys/conf/GENERIC b/sys/conf/GENERIC index f9f615912..86e3e597c 100644 --- a/sys/conf/GENERIC +++ b/sys/conf/GENERIC @@ -32,6 +32,7 @@ option FFS2 # UFS2 option UFS_DIRHASH # hash large directories option QUOTA # UFS quotas option EXT2FS # Second Extended Filesystem +option EXT4FS # Fourth Extended Filesystem option MFS # memory file system option NFSCLIENT # Network File System client option 
NFSSERVER # Network File System server diff --git a/sys/conf/files b/sys/conf/files index e5e66dca6..873653e75 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -994,15 +994,15 @@ file ufs/ffs/ffs_vfsops.c ffs | mfs file ufs/ffs/ffs_vnops.c ffs | mfs file ufs/mfs/mfs_vfsops.c mfs file ufs/mfs/mfs_vnops.c mfs -file ufs/ufs/ufs_bmap.c ffs | mfs | ext2fs +file ufs/ufs/ufs_bmap.c ffs | mfs | ext2fs | ext4fs file ufs/ufs/ufs_dirhash.c ufs_dirhash & (ffs | mfs) -file ufs/ufs/ufs_ihash.c ffs | mfs | ext2fs | fuse -file ufs/ufs/ufs_inode.c ffs | mfs | ext2fs -file ufs/ufs/ufs_lookup.c ffs | mfs | ext2fs -file ufs/ufs/ufs_quota.c quota & ( ffs | mfs | ext2fs ) +file ufs/ufs/ufs_ihash.c ffs | mfs | ext2fs | ext4fs | fuse +file ufs/ufs/ufs_inode.c ffs | mfs | ext2fs | ext4fs +file ufs/ufs/ufs_lookup.c ffs | mfs | ext2fs | ext4fs +file ufs/ufs/ufs_quota.c quota & ( ffs | mfs | ext2fs | ext4fs ) file ufs/ufs/ufs_quota_stub.c ffs | mfs -file ufs/ufs/ufs_vfsops.c ffs | mfs | ext2fs -file ufs/ufs/ufs_vnops.c ffs | mfs | ext2fs +file ufs/ufs/ufs_vfsops.c ffs | mfs | ext2fs | ext4fs +file ufs/ufs/ufs_vnops.c ffs | mfs | ext2fs | ext4fs file ufs/ext2fs/ext2fs_alloc.c ext2fs file ufs/ext2fs/ext2fs_balloc.c ext2fs file ufs/ext2fs/ext2fs_bmap.c ext2fs @@ -1014,6 +1014,10 @@ file ufs/ext2fs/ext2fs_readwrite.c ext2fs file ufs/ext2fs/ext2fs_subr.c ext2fs file ufs/ext2fs/ext2fs_vfsops.c ext2fs file ufs/ext2fs/ext2fs_vnops.c ext2fs +file ufs/ext4fs/ext4fs_crc32c.c ext4fs +file ufs/ext4fs/ext4fs_journal.c ext4fs +file ufs/ext4fs/ext4fs_vfsops.c ext4fs +file ufs/ext4fs/ext4fs_vnops.c ext4fs file uvm/uvm_addr.c file uvm/uvm_amap.c file uvm/uvm_anon.c diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c index 8d31047ac..4045a0bd5 100644 --- a/sys/kern/vfs_init.c +++ b/sys/kern/vfs_init.c @@ -100,6 +100,11 @@ static struct vfsconf vfsconflist[] = { { &tmpfs_vfsops, MOUNT_TMPFS, 19, 0, MNT_LOCAL, sizeof(struct tmpfs_args) }, #endif + +#ifdef EXT4FS + { &ext4fs_vfsops, MOUNT_EXT4FS, 22, 0, 
MNT_LOCAL, + sizeof(struct ufs_args) }, +#endif }; diff --git a/sys/sys/disklabel.h b/sys/sys/disklabel.h index 3d938666e..86a0fb267 100644 --- a/sys/sys/disklabel.h +++ b/sys/sys/disklabel.h @@ -275,6 +275,7 @@ static const char * const dktypenames[] = { #define FS_RAID 19 /* RAIDframe or softraid */ #define FS_NTFS 20 /* Windows/NT file system */ #define FS_UDF 21 /* UDF (DVD) filesystem */ +#define FS_EXT4FS 22 /* ext4fs */ #ifdef DKTYPENAMES static const char * const fstypenames[] = { @@ -300,6 +301,7 @@ static const char * const fstypenames[] = { "RAID", "NTFS", "UDF", + "ext4fs", NULL }; @@ -327,6 +329,7 @@ static char *fstypesnames[] = { "", /* 19 */ "ntfs", /* 20 */ "udf", /* 21 */ + "ext4fs", /* 22 */ NULL }; diff --git a/sys/sys/mount.h b/sys/sys/mount.h index a0010c55f..9cbe4607e 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -320,6 +320,7 @@ struct statfs { #define MOUNT_AFS "afs" /* Andrew Filesystem */ #define MOUNT_CD9660 "cd9660" /* ISO9660 (aka CDROM) Filesystem */ #define MOUNT_EXT2FS "ext2fs" /* Second Extended Filesystem */ +#define MOUNT_EXT4FS "ext4fs" /* Fourth Extended Filesystem */ #define MOUNT_NCPFS "ncpfs" /* NetWare Network File System */ #define MOUNT_NTFS "ntfs" /* NTFS */ #define MOUNT_UDF "udf" /* UDF */ @@ -556,6 +557,7 @@ extern const struct vfsops msdosfs_vfsops; extern const struct vfsops nfs_vfsops; extern const struct vfsops cd9660_vfsops; extern const struct vfsops ext2fs_vfsops; +extern const struct vfsops ext4fs_vfsops; extern const struct vfsops ntfs_vfsops; extern const struct vfsops udf_vfsops; extern const struct vfsops fusefs_vfsops; diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 6f1664ce1..c9970f6a9 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -67,12 +67,14 @@ enum vtagtype { VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_MSDOSFS, VT_PORTAL, VT_PROCFS, VT_AFS, VT_ISOFS, VT_ADOSFS, VT_EXT2FS, VT_VFS, VT_NTFS, VT_UDF, VT_FUSEFS, VT_TMPFS, + VT_EXT4FS, }; #define VTAG_NAMES \ "NON", "UFS", "NFS", "MFS", 
"MSDOSFS", \ "unused", "unused", "unused", "ISOFS", "unused", \ - "EXT2FS", "VFS", "NTFS", "UDF", "FUSEFS", "TMPFS" + "EXT2FS", "VFS", "NTFS", "UDF", "FUSEFS", "TMPFS", \ + "EXT4FS" /* * Each underlying filesystem allocates its own private area and hangs diff --git a/sys/ufs/ext4fs/ext4fs.h b/sys/ufs/ext4fs/ext4fs.h new file mode 100644 index 000000000..e95191eff --- /dev/null +++ b/sys/ufs/ext4fs/ext4fs.h @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2025 kmx.io. + * Copyright (c) 1997 Manuel Bouyer. + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Modified for ext4fs by kmx.io. + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct fid; +struct inode; +struct nameidata; +struct statfs; +struct vfsconf; + +#define EXT4FS_EXTENT_DEPTH_MAX 5 +#define EXT4FS_FUNCTION_MAX 32 +#define EXT4FS_REV_EXT2 0 +#define EXT4FS_REV_DYNAMIC 1 +#define EXT4FS_REV_MINOR 0 +#define EXT4FS_LAST_MOUNTED_MAX 64 +#define EXT4FS_LOG_MIN_BLOCK_SIZE 10 +#define EXT4FS_MAGIC 0xEF53 +#define EXT4FS_MOUNT_OPTS_MAX 64 +#define EXT4FS_LINK_MAX 65000 +#define EXT4FS_MAXNAMLEN 255 +#define EXT4FS_SUPER_BLOCK_OFFSET 1024 +#define EXT4FS_SUPER_BLOCK_SIZE 1024 +#define EXT4FS_VOLUME_NAME_MAX 16 + +#define EXT4FS_DIRECT_ADDR_IN_INODE 12 +#define EXT4FS_INDIRECT_ADDR_IN_INODE 3 +#define EXT4FS_SYMLINK_LEN_MAX \ + ((EXT4FS_DIRECT_ADDR_IN_INODE + \ + EXT4FS_INDIRECT_ADDR_IN_INODE) * sizeof(u_int32_t)) + +#define EXT4FS_NINDIR(fs) ((fs)->m_block_size / sizeof(u_int32_t)) + +#define EXT4FS_LBLKNO(fs, offset) ((offset) >> (fs)->m_block_size_shift) +#define EXT4FS_BLKOFF(fs, offset) ((offset) & ((fs)->m_block_size - 1)) +#define EXT4FS_FSBTODB(fs, b) ((b) << (fs)->m_fs_block_to_disk_block) + +#define EXT4FS_CHECKSUM_TYPE_NONE 0x0000 +#define EXT4FS_CHECKSUM_TYPE_CRC32C 0x0001 + +#define EXT4FS_ENCODING_NONE 0x0000 // legacy behavior +#define EXT4FS_ENCODING_UTF8 0x0001 // UTF-8, Unicode 12.1.0 + +#define EXT4FS_ENCODING_FLAG_NONE 0x0000 +#define 
EXT4FS_ENCODING_FLAG_STRICT_MODE 0x0001 // Reject invalid encoding + +#define EXT4FS_ERRORS_CONTINUE 1 // Log and keep going +#define EXT4FS_ERRORS_RO 2 // Remount read-only +#define EXT4FS_ERRORS_PANIC 3 // Kernel panic + +#define EXT4FS_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4FS_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4FS_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4FS_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4FS_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4FS_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT4FS_FEATURE_INCOMPAT_COMPRESSION 0x00001 +#define EXT4FS_FEATURE_INCOMPAT_FILETYPE 0x00002 +#define EXT4FS_FEATURE_INCOMPAT_RECOVER 0x00004 +#define EXT4FS_FEATURE_INCOMPAT_JOURNAL_DEV 0x00008 +#define EXT4FS_FEATURE_INCOMPAT_META_BG 0x00010 +#define EXT4FS_FEATURE_INCOMPAT_EXTENTS 0x00040 +#define EXT4FS_FEATURE_INCOMPAT_64BIT 0x00080 +#define EXT4FS_FEATURE_INCOMPAT_MMP 0x00100 +#define EXT4FS_FEATURE_INCOMPAT_FLEX_BG 0x00200 +#define EXT4FS_FEATURE_INCOMPAT_EA_INODE 0x00400 +#define EXT4FS_FEATURE_INCOMPAT_DIRDATA 0x01000 +#define EXT4FS_FEATURE_INCOMPAT_CSUM_SEED 0x02000 +#define EXT4FS_FEATURE_INCOMPAT_LARGEDIR 0x04000 +#define EXT4FS_FEATURE_INCOMPAT_INLINE_DATA 0x08000 +#define EXT4FS_FEATURE_INCOMPAT_ENCRYPT 0x10000 + +#define EXT4FS_FEATURE_INCOMPAT_SUPPORTED \ + (EXT4FS_FEATURE_INCOMPAT_FILETYPE | \ + EXT4FS_FEATURE_INCOMPAT_RECOVER | \ + EXT4FS_FEATURE_INCOMPAT_EXTENTS | \ + EXT4FS_FEATURE_INCOMPAT_64BIT | \ + EXT4FS_FEATURE_INCOMPAT_FLEX_BG | \ + EXT4FS_FEATURE_INCOMPAT_CSUM_SEED) + +#define EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4FS_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4FS_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4FS_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4FS_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4FS_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4FS_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4FS_FEATURE_RO_COMPAT_HAS_SNAPSHOT 0x0080 +#define 
EXT4FS_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4FS_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +#define EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4FS_FEATURE_RO_COMPAT_REPLICA 0x0800 +#define EXT4FS_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4FS_FEATURE_RO_COMPAT_PROJECT 0x2000 + +#define EXT4FS_FEATURE_RO_COMPAT_SUPPORTED \ + (EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER | \ + EXT4FS_FEATURE_RO_COMPAT_LARGE_FILE | \ + EXT4FS_FEATURE_RO_COMPAT_HUGE_FILE | \ + EXT4FS_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4FS_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) + +#define EXT4FS_FLAG_SIGNED_HASH 0x0001 +#define EXT4FS_FLAG_UNSIGNED_HASH 0x0002 +#define EXT4FS_FLAG_TEST_FILESYS 0x0004 +#define EXT4FS_FLAG_64BIT 0x0008 +#define EXT4FS_FLAG_MOUNT_OPT_CHECK 0x0010 + +#define EXT4FS_INODE_BAD_BLOCKS 1 +#define EXT4FS_INODE_ROOT_DIR 2 +#define EXT4FS_INODE_USER_QUOTA 3 +#define EXT4FS_INODE_GROUP_QUOTA 4 +#define EXT4FS_INODE_BOOT_LOADER 5 +#define EXT4FS_INODE_JOURNAL 8 +#define EXT4FS_INODE_FIRST 11 + +#define EXTFS_INODE_FLAG_SECURE_RM 0x00000001 +#define EXTFS_INODE_FLAG_UN_RM 0x00000002 +#define EXTFS_INODE_FLAG_COMPRESSION 0x00000004 +#define EXTFS_INODE_FLAG_SYNC 0x00000008 +#define EXTFS_INODE_FLAG_IMMUTABLE 0x00000010 +#define EXTFS_INODE_FLAG_APPEND 0x00000020 +#define EXTFS_INODE_FLAG_NO_DUMP 0x00000040 +#define EXTFS_INODE_FLAG_NO_ATIME 0x00000080 +#define EXTFS_INODE_FLAG_DIRTY 0x00000100 +#define EXTFS_INODE_FLAG_COMPRESSED_BLOCKS 0x00000200 +#define EXTFS_INODE_FLAG_NO_COMPRESSION 0x00000400 +#define EXTFS_INODE_FLAG_ENCRYPTED 0x00000800 +#define EXTFS_INODE_FLAG_INDEX 0x00001000 +#define EXTFS_INODE_FLAG_IMAGIC 0x00002000 +#define EXTFS_INODE_FLAG_JOURNAL_DATA 0x00004000 +#define EXTFS_INODE_FLAG_NO_TAIL 0x00008000 +#define EXTFS_INODE_FLAG_DIR_SYNC 0x00010000 +#define EXTFS_INODE_FLAG_TOP_DIR 0x00020000 +#define EXTFS_INODE_FLAG_HUGE_FILE 0x00040000 +#define EXTFS_INODE_FLAG_EXTENTS 0x00080000 +#define 
EXTFS_INODE_FLAG_EXTENDED_ATTRIBUTES_INODE 0x00200000 +#define EXTFS_INODE_FLAG_EOF_BLOCKS 0x00400000 +#define EXTFS_INODE_FLAG_INLINE_DATA 0x10000000 +#define EXTFS_INODE_FLAG_PROJECT_ID_INHERITANCE 0x20000000 +#define EXTFS_INODE_FLAG_CASEFOLD 0x40000000 + +#define EXT4FS_MOUNT_READONLY 0x0001 +#define EXT4FS_MOUNT_NO_ATIME 0x0002 +#define EXT4FS_MOUNT_DIRSYNC 0x0004 +#define EXT4FS_MOUNT_DATA_JOURNAL 0x0008 +#define EXT4FS_MOUNT_DATA_ORDERED 0x0010 +#define EXT4FS_MOUNT_DATA_WRITEBACK 0x0020 +#define EXT4FS_MOUNT_ERRORS_CONTINUE 0x0040 +#define EXT4FS_MOUNT_ERRORS_REMOUNT_RO 0x0080 +#define EXT4FS_MOUNT_ERRORS_PANIC 0x0100 +#define EXT4FS_MOUNT_DISCARD 0x0200 +#define EXT4FS_MOUNT_NO_BUFFER_HEADS 0x0400 +#define EXT4FS_MOUNT_SKIP_JOURNAL 0x0800 +#define EXT4FS_MOUNT_NOAUTO_DELAYED_ALLOC 0x1000 + +#define EXT4FS_OS_LINUX 0 +#define EXT4FS_OS_HURD 1 +#define EXT4FS_OS_MASIX 2 +#define EXT4FS_OS_FREEBSD 3 +#define EXT4FS_OS_LITES 4 +#define EXT4FS_OS_OPENBSD 5 + +#define EXT4FS_STATE_VALID 0x0001 // Clean unmount +#define EXT4FS_STATE_ERROR 0x0002 // Errors detected (fsck needed) + +#define EXT4FS_BGD_FLAG_INODE_UNINIT 0x0001 +#define EXT4FS_BGD_FLAG_BLOCK_UNINIT 0x0002 +#define EXT4FS_BGD_FLAG_INODE_ZEROED 0x0004 +#define EXT4FS_BGD_FLAG_DIRTY 0x0008 +#define EXT4FS_BGD_FLAG_BLOCK_ZEROED 0x0010 +#define EXT4FS_BGD_FLAG_READ_ONLY 0x0020 + +struct ext4fs { + u_int32_t sb_inodes_count; + u_int32_t sb_blocks_count_lo; + u_int32_t sb_reserved_blocks_count_lo; + u_int32_t sb_free_blocks_count_lo; + // 0x10 + u_int32_t sb_free_inodes_count; + u_int32_t sb_first_data_block; + u_int32_t sb_log_block_size; // log2(block size) - 10 + u_int32_t sb_log_cluster_size; // log2(cluster size) - 10 + // 0x20 + u_int32_t sb_blocks_per_group; + u_int32_t sb_clusters_per_group; + u_int32_t sb_inodes_per_group; + u_int32_t sb_mount_time_lo; + // 0x30 + u_int32_t sb_write_time_lo; + u_int16_t sb_mount_count; + int16_t sb_max_mount_count_before_fsck; + u_int16_t sb_magic; + u_int16_t 
sb_state; // EXT4FS_STATE_* + u_int16_t sb_errors; // EXT4FS_ERRORS_* + u_int16_t sb_revision_level_minor; + // 0x40 + u_int32_t sb_check_time_lo; + u_int32_t sb_check_interval; + u_int32_t sb_creator_os; // EXT4FS_OS_* + u_int32_t sb_revision_level; + // 0x50 + u_int16_t sb_default_reserved_uid; + u_int16_t sb_default_reserved_gid; + u_int32_t sb_first_non_reserved_inode; + u_int16_t sb_inode_size; + u_int16_t sb_block_group_id; + u_int32_t sb_feature_compat; + // 0x60 + u_int32_t sb_feature_incompat; + u_int32_t sb_feature_ro_compat; + u_int8_t sb_uuid[16]; + char sb_volume_name[EXT4FS_VOLUME_NAME_MAX]; + char sb_last_mounted[EXT4FS_LAST_MOUNTED_MAX]; + u_int32_t sb_algorithm_usage_bitmap; + u_int8_t sb_preallocate_blocks; + u_int8_t sb_preallocate_dir_blocks; + u_int16_t sb_reserved_bgdt_blocks; + // 0xD0 + u_int8_t sb_journal_uuid[16]; // UUID of journal superblock + // 0xE0 + u_int32_t sb_journal_inode_number; + u_int32_t sb_journal_device_number; + u_int32_t sb_last_orphan; + u_int32_t sb_hash_seed[4]; + u_int8_t sb_default_hash_version; + u_int8_t sb_journal_backup_type; + u_int16_t sb_block_group_descriptor_size; + // 0x100 + u_int32_t sb_default_mount_opts; + u_int32_t sb_first_meta_block_group; + u_int32_t sb_newfs_time_lo; + u_int32_t sb_jnl_blocks[17]; // Backup of journal inode + // 0x150 + u_int32_t sb_blocks_count_hi; + u_int32_t sb_reserved_blocks_count_hi; + u_int32_t sb_free_blocks_count_hi; + u_int16_t sb_inode_size_extra_min; + u_int16_t sb_inode_size_extra_want; + // 0x160 + u_int32_t sb_flags; + u_int16_t sb_raid_stride_block_count; + u_int16_t sb_mmp_interval; + u_int64_t sb_mmp_block; + // 0x170 + u_int32_t sb_raid_stripe_width_block_count; + u_int8_t sb_log_groups_per_flex; + u_int8_t sb_checksum_type; + u_int16_t sb_reserved_176; + u_int64_t sb_kilobytes_written; + // 0x180 + u_int32_t sb_ext3_snapshot_inode; + u_int32_t sb_ext3_snapshot_id; + u_int64_t sb_ext3_snapshot_reserved_blocks_count; + // 0x190 + u_int32_t sb_ext3_snapshot_list; + 
u_int32_t sb_error_count; + u_int32_t sb_first_error_time_lo; + u_int32_t sb_first_error_inode; + // 0x1A0 + u_int64_t sb_first_error_block; + char sb_first_error_function[EXT4FS_FUNCTION_MAX]; + u_int32_t sb_first_error_line; + u_int32_t sb_last_error_time_lo; + // 0x1D0 + u_int32_t sb_last_error_inode; + u_int32_t sb_last_error_line; + u_int64_t sb_last_error_block; + // 0x1E0 + char sb_last_error_function[EXT4FS_FUNCTION_MAX]; + // 0x200 + char sb_mount_opts[EXT4FS_MOUNT_OPTS_MAX]; + // 0x240 + u_int32_t sb_user_quota_inode; + u_int32_t sb_group_quota_inode; + u_int32_t sb_overhead_clusters; + u_int32_t sb_backup_block_groups[2]; + u_int8_t sb_encrypt_algos[4]; + u_int8_t sb_encrypt_pw_salt[16]; + u_int32_t sb_lost_and_found_inode; + u_int32_t sb_project_quota_inode; + // 0x270 + u_int32_t sb_checksum_seed; + u_int8_t sb_write_time_hi; + u_int8_t sb_mount_time_hi; + u_int8_t sb_newfs_time_hi; + u_int8_t sb_check_time_hi; + u_int8_t sb_first_error_time_hi; + u_int8_t sb_last_error_time_hi; + u_int8_t sb_first_error_code; + u_int8_t sb_last_error_code; + u_int16_t sb_encoding; + u_int16_t sb_encoding_flags; + // 0x280 + u_int16_t sb_orphan_file_inode; + u_int16_t sb_reserved_284; + u_int32_t sb_reserved_288[94]; + u_int32_t sb_checksum; +} __attribute__((packed)); + +struct m_ext4fs { + /* little-endian super-block */ + struct ext4fs m_sble; + /* computed from little-endian super-block */ + u_int32_t m_inodes_count; + u_int64_t m_blocks_count; + u_int64_t m_reserved_blocks_count; + u_int64_t m_free_blocks_count; + u_int32_t m_free_inodes_count; + u_int32_t m_first_data_block; + u_int32_t m_log_block_size; // log2(block size) - 10 + u_int32_t m_log_cluster_size; // log2(cluster size) - 10 + u_int32_t m_blocks_per_group; + u_int32_t m_clusters_per_group; + u_int32_t m_inodes_per_group; + u_int64_t m_mount_time; + u_int32_t m_write_time; + u_int16_t m_mount_count; + int16_t m_max_mount_count_before_fsck; + u_int16_t m_state; // EXT4FS_STATE_* + u_int16_t m_errors; // 
EXT4FS_ERRORS_* + u_int16_t m_revision_level_minor; + u_int64_t m_check_time; + u_int32_t m_check_interval; + u_int32_t m_creator_os; // EXT4FS_OS_* + u_int32_t m_revision_level; + u_int16_t m_default_reserved_uid; + u_int16_t m_default_reserved_gid; + u_int32_t m_first_non_reserved_inode; + u_int16_t m_inode_size; + u_int16_t m_block_group_id; + u_int32_t m_feature_compat; + u_int32_t m_feature_incompat; + u_int32_t m_feature_ro_compat; + u_int32_t m_algorithm_usage_bitmap; + u_int16_t m_reserved_bgdt_blocks; + u_int32_t m_journal_inode_number; + u_int32_t m_journal_device_number; + u_int32_t m_last_orphan; + u_int16_t m_block_group_descriptor_size; + u_int32_t m_default_mount_opts; + u_int32_t m_first_meta_block_group; + u_int64_t m_newfs_time; + u_int16_t m_inode_size_extra_min; + u_int16_t m_inode_size_extra_want; + u_int32_t m_flags; + u_int16_t m_raid_stride_block_count; + u_int16_t m_mmp_interval; + u_int64_t m_mmp_block; + u_int32_t m_raid_stripe_width_block_count; + u_int64_t m_kilobytes_written; + u_int32_t m_error_count; + u_int64_t m_first_error_time; + u_int32_t m_first_error_inode; + u_int64_t m_first_error_block; + u_int32_t m_first_error_line; + u_int64_t m_last_error_time; + u_int32_t m_last_error_inode; + u_int32_t m_last_error_line; + u_int64_t m_last_error_block; + u_int32_t m_user_quota_inode; + u_int32_t m_group_quota_inode; + u_int32_t m_overhead_clusters; + u_int32_t m_backup_block_groups[2]; + u_int32_t m_lost_and_found_inode; + u_int32_t m_project_quota_inode; + u_int32_t m_checksum_seed; + u_int16_t m_encoding; + u_int16_t m_encoding_flags; + u_int16_t m_orphan_file_inode; + int m_read_only; + int m_fs_was_modified; + /* computed by ext4fs_sbfill */ + u_int64_t m_block_group_descriptor_blocks_count; + u_int64_t m_block_group_count; + u_int64_t m_block_size; + u_int64_t m_block_size_shift; + u_int32_t m_fs_block_to_disk_block; + u_int32_t m_inodes_per_block; + u_int32_t m_inode_table_blocks_per_group; + u_int32_t m_resize_dind_block; + 
struct ext4fs_block_group_descriptor *m_gd; +}; + +struct ext4fs_block_group_descriptor { + u_int32_t bgd_block_bitmap_block_lo; + u_int32_t bgd_inode_bitmap_block_lo; + u_int32_t bgd_inode_table_block_lo; + u_int16_t bgd_free_blocks_count_lo; + u_int16_t bgd_free_inodes_count_lo; + // 0x10 + u_int16_t bgd_used_dirs_count_lo; + u_int16_t bgd_flags; + u_int32_t bgd_exclude_bitmap_block_lo; + u_int16_t bgd_block_bitmap_checksum_lo; + u_int16_t bgd_inode_bitmap_checksum_lo; + u_int16_t bgd_inode_table_unused_lo; + u_int16_t bgd_checksum; + // 0x20 + u_int32_t bgd_block_bitmap_block_hi; + u_int32_t bgd_inode_bitmap_block_hi; + u_int32_t bgd_inode_table_block_hi; + u_int16_t bgd_free_blocks_count_hi; + u_int16_t bgd_free_inodes_count_hi; + // 0x30 + u_int16_t bgd_used_dirs_count_hi; + u_int16_t bgd_inode_table_unused_hi; + u_int32_t bgd_exclude_bitmap_block_hi; + u_int16_t bgd_block_bitmap_checksum_hi; + u_int16_t bgd_inode_bitmap_checksum_hi; + u_int32_t bgd_reserved_3c; + // 0x40 +} __attribute__((packed)); + + +/* Directory entry file types */ +#define EXT4FS_FT_UNKNOWN 0 +#define EXT4FS_FT_REG_FILE 1 +#define EXT4FS_FT_DIR 2 +#define EXT4FS_FT_CHRDEV 3 +#define EXT4FS_FT_BLKDEV 4 +#define EXT4FS_FT_FIFO 5 +#define EXT4FS_FT_SOCK 6 +#define EXT4FS_FT_SYMLINK 7 +#define EXT4FS_FT_MAX 8 + +struct ext4fs_directory { + u_int32_t e4d_ino; + u_int16_t e4d_reclen; + u_int8_t e4d_namlen; + u_int8_t e4d_type; + char e4d_name[EXT4FS_MAXNAMLEN]; +} __attribute__((packed)); + +/* Directory block checksum tail (last 12 bytes of block when metadata_csum) */ +#define EXT4FS_DIR_TAIL_FT 0xDE +#define EXT4FS_DIR_TAIL_SIZE 12 + +struct ext4fs_directory_tail { + u_int32_t det_reserved_zero1; /* must be 0 (fake inode = 0) */ + u_int16_t det_rec_len; /* always EXT4FS_DIR_TAIL_SIZE */ + u_int8_t det_reserved_zero2; /* must be 0 (namlen = 0) */ + u_int8_t det_reserved_ft; /* EXT4FS_DIR_TAIL_FT */ + u_int32_t det_checksum; +} __attribute__((packed)); + +struct ext4fs_feature { + int f_mask; 
+ const char * f_name; +}; + +static const struct ext4fs_feature ext4fs_feature_incompat[] = { + {EXT4FS_FEATURE_INCOMPAT_COMPRESSION, "compression"}, + {EXT4FS_FEATURE_INCOMPAT_FILETYPE, "filetype"}, + {EXT4FS_FEATURE_INCOMPAT_RECOVER, "recover"}, + {EXT4FS_FEATURE_INCOMPAT_JOURNAL_DEV, "journal_dev"}, + {EXT4FS_FEATURE_INCOMPAT_META_BG, "meta_bg"}, + {EXT4FS_FEATURE_INCOMPAT_EXTENTS, "extents"}, + {EXT4FS_FEATURE_INCOMPAT_64BIT, "64bit"}, + {EXT4FS_FEATURE_INCOMPAT_MMP, "mmp"}, + {EXT4FS_FEATURE_INCOMPAT_FLEX_BG, "flex_bg"}, + {EXT4FS_FEATURE_INCOMPAT_EA_INODE, "ea_inode"}, + {EXT4FS_FEATURE_INCOMPAT_DIRDATA, "dirdata"}, + {EXT4FS_FEATURE_INCOMPAT_CSUM_SEED, "csum_seed"}, + {EXT4FS_FEATURE_INCOMPAT_LARGEDIR, "largedir"}, + {EXT4FS_FEATURE_INCOMPAT_INLINE_DATA, "inline_data"}, + {EXT4FS_FEATURE_INCOMPAT_ENCRYPT, "encrypt"}, +}; + +static const struct ext4fs_feature ext4fs_feature_ro_compat[] = { + {EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER, "sparse-super"}, + {EXT4FS_FEATURE_RO_COMPAT_LARGE_FILE, "large-file"}, + {EXT4FS_FEATURE_RO_COMPAT_BTREE_DIR, "btree-dir"}, + {EXT4FS_FEATURE_RO_COMPAT_HUGE_FILE, "huge-file"}, + {EXT4FS_FEATURE_RO_COMPAT_GDT_CSUM, "gdt-csum"}, + {EXT4FS_FEATURE_RO_COMPAT_DIR_NLINK, "dir-nlink"}, + {EXT4FS_FEATURE_RO_COMPAT_EXTRA_ISIZE, "extra-isize"}, + {EXT4FS_FEATURE_RO_COMPAT_HAS_SNAPSHOT, "has-snapshot"}, + {EXT4FS_FEATURE_RO_COMPAT_QUOTA, "quota"}, + {EXT4FS_FEATURE_RO_COMPAT_BIGALLOC, "bigalloc"}, + {EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM, "metadata-csum"}, + {EXT4FS_FEATURE_RO_COMPAT_REPLICA, "replica"}, + {EXT4FS_FEATURE_RO_COMPAT_READONLY, "readonly"}, + {EXT4FS_FEATURE_RO_COMPAT_PROJECT, "project"}, +}; + +#define EXT4FS_ITIMES(ip) do { \ + if ((ip)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) { \ + struct timespec _ts; \ + (ip)->i_flag |= IN_MODIFIED; \ + getnanotime(&_ts); \ + if ((ip)->i_flag & IN_ACCESS) { \ + (ip)->i_e4din->dinode.i_atime = \ + htole32((u_int32_t)_ts.tv_sec); \ + (ip)->i_e4din->dinode.i_atime_extra = \ + 
htole32(_ts.tv_nsec << 2); \ + } \ + if ((ip)->i_flag & IN_UPDATE) { \ + (ip)->i_e4din->dinode.i_mtime = \ + htole32((u_int32_t)_ts.tv_sec); \ + (ip)->i_e4din->dinode.i_mtime_extra = \ + htole32(_ts.tv_nsec << 2); \ + } \ + if ((ip)->i_flag & IN_CHANGE) { \ + (ip)->i_e4din->dinode.i_ctime = \ + htole32((u_int32_t)_ts.tv_sec); \ + (ip)->i_e4din->dinode.i_ctime_extra = \ + htole32(_ts.tv_nsec << 2); \ + (ip)->i_modrev++; \ + } \ + (ip)->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); \ + } \ +} while (0) + +struct ext4fs_sync_args { + int allerror; + int waitfor; + struct proc *p; + struct ucred *cred; +}; + +extern struct pool ext4fs_inode_pool; +extern struct pool ext4fs_dinode_pool; + +/* VFS operations */ +int ext4fs_fhtovp(struct mount *, struct fid *, struct vnode **); +int ext4fs_init(struct vfsconf *); +int ext4fs_mount(struct mount *, const char *, void *, + struct nameidata *, struct proc *); +int ext4fs_statfs(struct mount *, struct statfs *, struct proc *); +int ext4fs_sync(struct mount *, int, int, struct ucred *, + struct proc *); +int ext4fs_sysctl(int *, u_int, void *, size_t *, void *, size_t, + struct proc *); +int ext4fs_unmount(struct mount *, int, struct proc *); +int ext4fs_vget(struct mount *, ino_t, struct vnode **); +int ext4fs_vptofh(struct vnode *, struct fid *); + +/* VNode operations */ + +int ext4fs_lookup(void *); +int ext4fs_create(void *); +int ext4fs_mknod(void *); +int ext4fs_open(void *); +int ext4fs_access(void *); +int ext4fs_getattr(void *); +int ext4fs_setattr(void *); +int ext4fs_read(void *); +int ext4fs_write(void *); +int ext4fs_fsync(void *); +int ext4fs_remove(void *); +int ext4fs_link(void *); +int ext4fs_rename(void *); +int ext4fs_mkdir(void *); +int ext4fs_rmdir(void *); +int ext4fs_symlink(void *); +int ext4fs_readdir(void *); +int ext4fs_readlink(void *); +int ext4fs_inactive(void *); +int ext4fs_reclaim(void *); +int ext4fs_bmap(void *); +int ext4fs_strategy(void *); +int ext4fs_print(void *); +int 
ext4fs_pathconf(void *); +int ext4fs_advlock(void *); + +int ext4fs_update(struct inode *, int); + +u_int32_t ext4fs_sb_csum(struct ext4fs *); +int ext4fs_sb_csum_verify(struct ext4fs *); +u_int32_t ext4fs_csum_seed(struct m_ext4fs *); +u_int32_t ext4fs_bitmap_csum(struct m_ext4fs *, u_int32_t, void *, size_t); +u_int16_t ext4fs_bgd_csum(struct m_ext4fs *, + struct ext4fs_block_group_descriptor *, u_int32_t); +int ext4fs_bgd_csum_verify(struct m_ext4fs *, + struct ext4fs_block_group_descriptor *, u_int32_t); +u_int32_t ext4fs_inode_csum(struct m_ext4fs *, + struct ext4fs_dinode_256 *, u_int32_t); +int ext4fs_inode_csum_verify(struct m_ext4fs *, + struct ext4fs_dinode_256 *, u_int32_t); + +/* Directory entry size: 8 bytes header + name, rounded up to 4 */ +#define EXT4FS_DIRSIZ(namlen) (((8 + (namlen)) + 3) & ~3) + +/* Convert inode mode to directory file type */ +static inline u_int8_t +ext4fs_mode_to_ft(u_int16_t mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: return EXT4FS_FT_REG_FILE; + case S_IFDIR: return EXT4FS_FT_DIR; + case S_IFCHR: return EXT4FS_FT_CHRDEV; + case S_IFBLK: return EXT4FS_FT_BLKDEV; + case S_IFIFO: return EXT4FS_FT_FIFO; + case S_IFSOCK: return EXT4FS_FT_SOCK; + case S_IFLNK: return EXT4FS_FT_SYMLINK; + default: return EXT4FS_FT_UNKNOWN; + } +} + +/* Block allocation / free */ +int ext4fs_blkalloc(struct inode *, u_int64_t, u_int32_t, u_int64_t *, + u_int32_t *); +void ext4fs_blkfree(struct inode *, u_int64_t); + +/* Inode allocation / free */ +int ext4fs_inode_alloc(struct inode *, mode_t, struct ucred *, + struct vnode **); +void ext4fs_inode_free(struct inode *, ufsino_t, mode_t); + +/* Directory operations */ +int ext4fs_direnter(struct inode *, struct vnode *, + struct componentname *); +int ext4fs_dirremove(struct vnode *, struct componentname *); +int ext4fs_dirempty(struct inode *, ufsino_t, struct ucred *); +int ext4fs_dirrewrite(struct inode *, struct inode *, + struct componentname *); + +/* Truncation */ +int 
ext4fs_truncate(struct inode *, off_t, int, struct ucred *); + +/* Size update */ +void ext4fs_setsize(struct inode *, u_int64_t); + +/* Superblock / BGD write-back */ +int ext4fs_bgd_write(struct m_ext4fs *, struct vnode *, u_int32_t); +int ext4fs_sbwrite(struct mount *); diff --git a/sys/ufs/ext4fs/ext4fs_crc32c.c b/sys/ufs/ext4fs/ext4fs_crc32c.c new file mode 100644 index 000000000..a28e8a05c --- /dev/null +++ b/sys/ufs/ext4fs/ext4fs_crc32c.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2025 kmx.io. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include +#include + +/* + * CRC32C lookup table, generated using the Castagnoli polynomial + * 0x1EDC6F41 (bit-reversed: 0x82F63B78). + * + * This table is for little-endian CRC computation. 
+ */ +static const u_int32_t crc32c_table[256] = { + 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, + 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, + 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, + 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, + 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, + 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, + 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, + 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, + 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, + 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, + 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, + 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, + 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, + 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, + 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, + 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, + 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, + 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, + 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, + 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, + 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, + 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, + 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, + 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, + 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, + 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, + 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, + 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, + 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, + 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, + 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, + 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, + 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, + 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, + 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, + 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, + 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, + 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, + 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 
0x0B21572C, + 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, + 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, + 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, + 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, + 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, + 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, + 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, + 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, + 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, + 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, + 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, + 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, + 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, + 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, + 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, + 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, + 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, + 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, + 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, + 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, + 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, + 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, + 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, + 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, + 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 +}; + +/* + * Compute CRC32C of a buffer. + * + * crc: initial CRC value (use ~0 to start fresh, or previous CRC to continue) + * buf: pointer to data buffer + * len: length of data in bytes + * + * Returns the updated CRC32C value. + */ +u_int32_t +ext4fs_crc32c(u_int32_t crc, const void *buf, size_t len) +{ + const u_int8_t *p = buf; + + crc = ~crc; + while (len--) + crc = (crc >> 8) ^ crc32c_table[(crc & 0xff) ^ *p++]; + + return ~crc; +} + +/* + * Compute CRC32C in the style used by ext4. + * + * ext4 computes CRC32C starting with ~0 (or a seed), then inverts the + * final result before storing it. 
+ * + * crc: seed value (use ~0 for standard ext4 checksum, or sb_checksum_seed) + * buf: pointer to data buffer + * len: length of data in bytes + * + * Returns the final CRC32C value (NOT inverted - caller should invert + * if comparing against stored checksum, or pass result to next call). + */ +u_int32_t +ext4fs_crc32c_le(u_int32_t crc, const void *buf, size_t len) +{ + return ext4fs_crc32c(crc, buf, len); +} + +/* + * Compute the checksum seed for an ext4 filesystem. + * + * If the CSUM_SEED feature is set, use the pre-computed seed from the + * superblock. Otherwise, compute it from the filesystem UUID. + */ +u_int32_t +ext4fs_csum_seed(struct m_ext4fs *fs) +{ + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_CSUM_SEED) + return ~fs->m_checksum_seed; + + /* Compute seed from UUID */ + return ext4fs_crc32c(0, fs->m_sble.sb_uuid, + sizeof(fs->m_sble.sb_uuid)); +} + +/* + * Compute the CRC32C checksum of an ext4 superblock. + * + * The checksum covers the entire superblock except for the checksum + * field itself (last 4 bytes). The checksum field is treated as zero + * during computation. + */ +u_int32_t +ext4fs_sb_csum(struct ext4fs *sb) +{ + u_int32_t crc; + size_t offset; + + /* Offset of sb_checksum field within the superblock */ + offset = offsetof(struct ext4fs, sb_checksum); + + /* Compute CRC up to (but not including) the checksum field */ + crc = ext4fs_crc32c(0, sb, offset); + + return ~crc; +} + +/* + * Compute the CRC32C checksum of a block group descriptor. + * + * When CSUM_SEED is set, the seed comes from sb_checksum_seed. + * Otherwise, compute it from the UUID. + * The block_group_id is always chained into the CRC (after the seed). 
+ */ +u_int16_t +ext4fs_bgd_csum(struct m_ext4fs *fs, + struct ext4fs_block_group_descriptor *bgd, u_int32_t block_group_id) +{ + u_int32_t crc; + u_int32_t seed; + u_int32_t block_group_id_le; + size_t size; + struct ext4fs_block_group_descriptor tmp; + + if (!(fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; + + seed = ext4fs_csum_seed(fs); + block_group_id_le = htole32(block_group_id); + seed = ext4fs_crc32c(seed, &block_group_id_le, + sizeof(block_group_id_le)); + + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + size = fs->m_block_group_descriptor_size; + else + size = 32; + if (size > sizeof(tmp)) + size = sizeof(tmp); + + memcpy(&tmp, bgd, size); + tmp.bgd_checksum = 0; + crc = ext4fs_crc32c(seed, &tmp, size); + + return (~crc) & 0xFFFF; +} + +/* + * Verify a block group descriptor checksum. + * + * Returns 0 if the checksum is valid, or EINVAL if it doesn't match. + */ +int +ext4fs_bgd_csum_verify(struct m_ext4fs *fs, + struct ext4fs_block_group_descriptor *bgd, u_int32_t block_group_id) +{ + u_int16_t provided, calculated; + + if (!(fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; + + provided = letoh16(bgd->bgd_checksum); + calculated = ext4fs_bgd_csum(fs, bgd, block_group_id); + + if (provided != calculated) { + printf("ext4fs: bgd %u checksum mismatch: " + "stored=0x%04x calculated=0x%04x\n", + block_group_id, provided, calculated); + return EINVAL; + } + + return 0; +} + +/* + * Compute the CRC32C checksum of an inode. + * + * The checksum covers the inode number, generation, and the full + * 256-byte inode with checksum fields zeroed. 
+ */ +u_int32_t +ext4fs_inode_csum(struct m_ext4fs *fs, + struct ext4fs_dinode_256 *dp, u_int32_t ino) +{ + u_int32_t crc; + u_int32_t seed; + u_int32_t ino_le; + struct ext4fs_dinode_256 tmp; + + if (!(fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; + + seed = ext4fs_csum_seed(fs); + + ino_le = htole32(ino); + crc = ext4fs_crc32c(seed, &ino_le, sizeof(ino_le)); + crc = ext4fs_crc32c(crc, &dp->dinode.i_nfs_generation, + sizeof(dp->dinode.i_nfs_generation)); + + tmp = *dp; + tmp.dinode.i_checksum_lo = 0; + tmp.dinode.i_checksum_hi = 0; + crc = ext4fs_crc32c(crc, &tmp, sizeof(tmp)); + + return ~crc; +} + +/* + * Verify an inode checksum. + * + * Returns 0 if the checksum is valid, or EINVAL if it doesn't match. + */ +int +ext4fs_inode_csum_verify(struct m_ext4fs *fs, + struct ext4fs_dinode_256 *dp, u_int32_t ino) +{ + u_int32_t provided, calculated; + + if (!(fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; + + provided = letoh16(dp->dinode.i_checksum_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + provided |= (u_int32_t)letoh16(dp->dinode.i_checksum_hi) << 16; + calculated = ext4fs_inode_csum(fs, dp, ino); + + if (provided != calculated) { + printf("ext4fs: inode %u checksum mismatch: " + "stored=0x%08x calculated=0x%08x\n", + ino, provided, calculated); + return EINVAL; + } + + return 0; +} + +u_int32_t +ext4fs_bitmap_csum(struct m_ext4fs *fs, u_int32_t group, + void *bitmap, size_t size) +{ + u_int32_t crc, seed; + + if (!(fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; + + seed = ext4fs_csum_seed(fs); + crc = ext4fs_crc32c(seed, bitmap, size); + + return ~crc; +} + +/* + * Write the checksum tail at the end of a directory block. + * + * The tail is a 12-byte structure placed at block_size - 12. + * Checksum covers: UUID seed, inode number, inode generation, block data. 
+ */ +void +ext4fs_dir_set_csum(struct m_ext4fs *fs, u_int32_t ino, u_int32_t gen_le, + void *buf) +{ + struct ext4fs_directory_tail *tail; + u_int32_t crc, seed, ino_le; + + if (!(fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + tail = (struct ext4fs_directory_tail *) + ((char *)buf + fs->m_block_size - EXT4FS_DIR_TAIL_SIZE); + tail->det_reserved_zero1 = 0; + tail->det_rec_len = htole16(EXT4FS_DIR_TAIL_SIZE); + tail->det_reserved_zero2 = 0; + tail->det_reserved_ft = EXT4FS_DIR_TAIL_FT; + tail->det_checksum = 0; + + seed = ext4fs_csum_seed(fs); + ino_le = htole32(ino); + crc = ext4fs_crc32c(seed, &ino_le, sizeof(ino_le)); + crc = ext4fs_crc32c(crc, &gen_le, sizeof(gen_le)); + crc = ext4fs_crc32c(crc, buf, fs->m_block_size - EXT4FS_DIR_TAIL_SIZE); + tail->det_checksum = htole32(~crc); +} + +/* + * Verify the superblock checksum. + * + * Returns 0 if the checksum is valid, or EINVAL if it doesn't match. + * If metadata checksums are not enabled, always returns 0. + */ +int +ext4fs_sb_csum_verify(struct ext4fs *sb) +{ + u_int32_t provided, calculated; + + /* Check if metadata checksums are enabled */ + if (!(letoh32(sb->sb_feature_ro_compat) & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; + + provided = letoh32(sb->sb_checksum); + calculated = ext4fs_sb_csum(sb); + + if (provided != calculated) { + printf("ext4fs: superblock checksum mismatch: " + "stored=0x%08x calculated=0x%08x\n", + provided, calculated); + return EINVAL; + } + + return 0; +} + +/* + * Write the checksum tail of an extent tree block. + * + * The tail is a 4-byte le32 checksum placed right after eh_max entries. + * Checksum covers: UUID seed, inode number, inode generation, + * then the block data up to and including the zeroed tail. 
+ */ +void +ext4fs_extent_block_csum_set(struct m_ext4fs *fs, u_int32_t ino, + u_int32_t gen_le, void *buf) +{ + u_int32_t crc, seed, ino_le; + u_int32_t *tail; + struct ext4fs_extent_header *eh; + size_t tail_offset; + + if (!(fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + eh = (struct ext4fs_extent_header *)buf; + /* Tail is right after eh_max entries */ + tail_offset = sizeof(struct ext4fs_extent_header) + + (size_t)letoh16(eh->eh_max) * sizeof(struct ext4fs_extent); + tail = (u_int32_t *)((char *)buf + tail_offset); + + seed = ext4fs_csum_seed(fs); + ino_le = htole32(ino); + crc = ext4fs_crc32c(seed, &ino_le, sizeof(ino_le)); + crc = ext4fs_crc32c(crc, &gen_le, sizeof(gen_le)); + *tail = 0; + crc = ext4fs_crc32c(crc, buf, tail_offset); + *tail = htole32(~crc); +} diff --git a/sys/ufs/ext4fs/ext4fs_crc32c.h b/sys/ufs/ext4fs/ext4fs_crc32c.h new file mode 100644 index 000000000..2208a3d4d --- /dev/null +++ b/sys/ufs/ext4fs/ext4fs_crc32c.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2025 kmx.io. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef _EXT4FS_CRC32C_H_ +#define _EXT4FS_CRC32C_H_ + +#include + +/* + * CRC32C uses the Castagnoli polynomial: 0x1EDC6F41 + * This is different from the standard CRC32 (ISO 3309) polynomial. 
+ * + * ext4 stores checksums as the bitwise inverse of the CRC32C value. + */ + +/* Compute CRC32C of a buffer, starting from an initial CRC value */ +u_int32_t ext4fs_crc32c(u_int32_t crc, const void *buf, size_t len); + +/* Compute CRC32C with initial value of ~0, then invert result (ext4 style) */ +u_int32_t ext4fs_crc32c_le(u_int32_t crc, const void *buf, size_t len); + +struct m_ext4fs; + +/* Compute block or inode bitmap checksum (group number + bitmap data) */ +u_int32_t ext4fs_bitmap_csum(struct m_ext4fs *fs, u_int32_t group, + void *bitmap, size_t size); + +/* + * Write a directory block checksum tail at the end of buf. + * ino: directory inode number, gen_le: i_nfs_generation (already LE). + * No-op if METADATA_CSUM is not enabled. + */ +void ext4fs_dir_set_csum(struct m_ext4fs *fs, u_int32_t ino, + u_int32_t gen_le, void *buf); + +/* + * Write the extent tree block checksum tail. + * ino: inode number, gen_le: i_nfs_generation (already LE on disk). + * No-op if METADATA_CSUM is not enabled. + */ +void ext4fs_extent_block_csum_set(struct m_ext4fs *fs, u_int32_t ino, + u_int32_t gen_le, void *buf); + +#endif /* _EXT4FS_CRC32C_H_ */ diff --git a/sys/ufs/ext4fs/ext4fs_dinode.h b/sys/ufs/ext4fs/ext4fs_dinode.h new file mode 100644 index 000000000..9ca795cb0 --- /dev/null +++ b/sys/ufs/ext4fs/ext4fs_dinode.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2025 kmx.io. + * Copyright (c) 1997 Manuel Bouyer. + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Modified for ext4fs by kmx.io. 
+ */ + +#include + +#define EXT4FS_EXTENT_HEADER_MAGIC 0xF30A + +struct ext4fs_extent_header { + u_int16_t eh_magic; + u_int16_t eh_entries; + u_int16_t eh_max; + u_int16_t eh_depth; + u_int32_t eh_generation; +} __attribute__((packed)); + +struct ext4fs_extent { + u_int32_t e_block; + u_int16_t e_len; + u_int16_t e_start_hi; + u_int32_t e_start_lo; +} __attribute__((packed)); + +struct ext4fs_extent_idx { + u_int32_t ei_block; + u_int32_t ei_leaf_lo; + u_int16_t ei_leaf_hi; + u_int16_t ei_unused; +} __attribute__((packed)); + +struct ext4fs_dinode { + u_int16_t i_mode; + u_int16_t i_uid_lo; + u_int32_t i_size_lo; + u_int32_t i_atime; + u_int32_t i_ctime; + /* 0x10 */ + u_int32_t i_mtime; + u_int32_t i_dtime; + u_int16_t i_gid_lo; + u_int16_t i_links_count; + u_int32_t i_blocks_lo; + /* 0x20 */ + u_int32_t i_flags; + u_int32_t i_version; + union { + u_int32_t i_block[15]; + struct { + struct ext4fs_extent_header i_extent_header; + union { + struct ext4fs_extent i_extent[4]; + struct ext4fs_extent_idx i_extent_idx[4]; + }; + }; + }; + u_int32_t i_nfs_generation; + u_int32_t i_extended_attributes_lo; + u_int32_t i_size_hi; + /* 0x70 */ + u_int32_t i_fragment_address; + u_int16_t i_blocks_hi; + u_int16_t i_extended_attributes_hi; + u_int16_t i_uid_hi; + u_int16_t i_gid_hi; + u_int16_t i_checksum_lo; + u_int16_t i_reserved_7e; + /* 0x80 */ + u_int16_t i_extra_isize; + u_int16_t i_checksum_hi; + u_int32_t i_ctime_extra; + u_int32_t i_mtime_extra; + u_int32_t i_atime_extra; + /* 0x90 */ + u_int32_t i_crtime; + u_int32_t i_crtime_extra; + u_int32_t i_version_hi; + u_int32_t i_project_id; + /* 0xA0 */ +} __attribute__((packed)); + +struct ext4fs_dinode_256 { + struct ext4fs_dinode dinode; + u_int8_t extended_attributes[256 - sizeof(struct ext4fs_dinode)]; +}; diff --git a/sys/ufs/ext4fs/ext4fs_extern.h b/sys/ufs/ext4fs/ext4fs_extern.h new file mode 100644 index 000000000..783e7f3bf --- /dev/null +++ b/sys/ufs/ext4fs/ext4fs_extern.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 
2025 kmx.io. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include + +extern const struct vops ext4fs_vops; + +#define IS_EXT4_VNODE(vp) ((vp)->v_tag == VT_EXT4FS) diff --git a/sys/ufs/ext4fs/ext4fs_journal.c b/sys/ufs/ext4fs/ext4fs_journal.c new file mode 100644 index 000000000..c109d3c3b --- /dev/null +++ b/sys/ufs/ext4fs/ext4fs_journal.c @@ -0,0 +1,944 @@ +/* + * Copyright (c) 2025 kmx.io. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * JBD2 journal replay for ext4fs. + * + * Implements the standard three-pass replay algorithm: + * 1. 
SCAN - walk the journal to find valid transactions + * 2. REVOKE - collect revoked blocks + * 3. REPLAY - write surviving data blocks to the filesystem + * + * All JBD2 on-disk fields are big-endian. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +/* + * Build the journal block map from sb_jnl_blocks[0..14]. + * + * sb_jnl_blocks[0..14] is a copy of inode 8's i_block[0..14], which + * contains an extent tree in the same format as regular file inodes. + * The values are little-endian (copied from the inode). + * + * sb_jnl_blocks[15] = i_size_lo, sb_jnl_blocks[16] = i_size_hi. + */ +static int +jbd2_build_blockmap(struct jbd2_replay_ctx *ctx) +{ + struct m_ext4fs *fs = ctx->rc_fs; + struct ext4fs *sble = &fs->m_sble; + u_int32_t *iblock = sble->sb_jnl_blocks; + struct ext4fs_extent_header *eh; + struct ext4fs_extent *ext; + struct ext4fs_extent_idx *idx; + u_int16_t depth, entries, i; + u_int32_t jblock, maxblocks; + u_int64_t pblock; + u_int32_t len; + + maxblocks = ctx->rc_maxlen; + ctx->rc_blockmap = mallocarray(maxblocks, + sizeof(struct jbd2_blockmap_entry), M_TEMP, M_WAITOK | M_ZERO); + ctx->rc_blockmap_count = maxblocks; + + /* Parse extent header from i_block[0..2] (first 12 bytes) */ + eh = (struct ext4fs_extent_header *)iblock; + if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC) { + printf("ext4fs: journal inode has bad extent magic 0x%x\n", + letoh16(eh->eh_magic)); + return (EINVAL); + } + + depth = letoh16(eh->eh_depth); + entries = letoh16(eh->eh_entries); + + if (depth == 0) { + /* Leaf extents follow the header directly */ + ext = (struct ext4fs_extent *)(eh + 1); + for (i = 0; i < entries; i++) { + u_int32_t lblk = letoh32(ext[i].e_block); + len = letoh16(ext[i].e_len); + pblock = (u_int64_t)letoh16(ext[i].e_start_hi) << 32 | + letoh32(ext[i].e_start_lo); + + for (jblock = 0; jblock < len; jblock++) { + u_int32_t j = lblk + jblock; + if (j < maxblocks) + 
ctx->rc_blockmap[j].jb_fsblock = + pblock + jblock; + } + } + } else { + /* Depth > 0: index nodes, need to read leaf blocks */ + idx = (struct ext4fs_extent_idx *)(eh + 1); + for (i = 0; i < entries; i++) { + struct buf *bp; + struct ext4fs_extent_header *leh; + struct ext4fs_extent *lext; + u_int16_t lentries, j; + u_int64_t leaf_block; + int error; + + leaf_block = + (u_int64_t)letoh16(idx[i].ei_leaf_hi) << 32 | + letoh32(idx[i].ei_leaf_lo); + + error = bread(ctx->rc_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, leaf_block), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + printf("ext4fs: journal blockmap: " + "can't read index block\n"); + return (error); + } + + leh = (struct ext4fs_extent_header *)bp->b_data; + if (letoh16(leh->eh_magic) != + EXT4FS_EXTENT_HEADER_MAGIC) { + brelse(bp); + printf("ext4fs: journal blockmap: " + "bad leaf magic\n"); + return (EINVAL); + } + if (letoh16(leh->eh_depth) != 0) { + brelse(bp); + printf("ext4fs: journal blockmap: " + "depth > 1 not supported\n"); + return (EINVAL); + } + + lentries = letoh16(leh->eh_entries); + lext = (struct ext4fs_extent *)(leh + 1); + + for (j = 0; j < lentries; j++) { + u_int32_t lblk = letoh32(lext[j].e_block); + len = letoh16(lext[j].e_len); + pblock = + (u_int64_t)letoh16(lext[j].e_start_hi) + << 32 | letoh32(lext[j].e_start_lo); + + for (jblock = 0; jblock < len; jblock++) { + u_int32_t k = lblk + jblock; + if (k < maxblocks) + ctx->rc_blockmap[k]. + jb_fsblock = + pblock + jblock; + } + } + brelse(bp); + } + } + + return (0); +} + +/* + * Read a journal block by journal-relative block number. 
+ */ +static int +jbd2_read_block(struct jbd2_replay_ctx *ctx, u_int32_t jblock, + struct buf **bpp) +{ + struct m_ext4fs *fs = ctx->rc_fs; + u_int64_t fsblock; + + if (jblock >= ctx->rc_blockmap_count) { + printf("ext4fs: journal block %u out of range (%u)\n", + jblock, ctx->rc_blockmap_count); + return (EIO); + } + + fsblock = ctx->rc_blockmap[jblock].jb_fsblock; + if (fsblock == 0) { + printf("ext4fs: journal block %u not mapped\n", jblock); + return (EIO); + } + + return bread(ctx->rc_devvp, (daddr_t)EXT4FS_FSBTODB(fs, fsblock), + fs->m_block_size, bpp); +} + +/* + * Wrap journal block number circularly. + */ +static u_int32_t +jbd2_next_block(struct jbd2_replay_ctx *ctx, u_int32_t block) +{ + block++; + if (block >= ctx->rc_maxlen) + block = ctx->rc_first; + return block; +} + +/* + * Parse one descriptor tag from a descriptor block. + * + * Returns 0 on success, sets *target to the filesystem block, + * *flags to the tag flags, and advances *offset past the tag. + */ +static int +jbd2_parse_tag(struct jbd2_replay_ctx *ctx, char *buf, u_int32_t bufsize, + u_int32_t *offset, u_int64_t *target, u_int32_t *flags) +{ + int has_csum_v3, has_64bit; + u_int32_t tag_size; + + has_csum_v3 = ctx->rc_features_incompat & + JBD2_FEATURE_INCOMPAT_CSUM_V3; + has_64bit = ctx->rc_features_incompat & + JBD2_FEATURE_INCOMPAT_64BIT; + + if (has_csum_v3) { + struct jbd2_block_tag3 *tag3; + + tag_size = sizeof(struct jbd2_block_tag3); + if (*offset + tag_size > bufsize) + return (EINVAL); + + tag3 = (struct jbd2_block_tag3 *)(buf + *offset); + *target = betoh32(tag3->t_blocknr); + *flags = betoh32(tag3->t_flags); + if (has_64bit) + *target |= (u_int64_t)betoh32(tag3->t_blocknr_high) + << 32; + + *offset += tag_size; + if (!(*flags & JBD2_FLAG_SAME_UUID)) + *offset += 16; /* skip UUID */ + } else { + struct jbd2_block_tag *tag; + + tag_size = 8; /* minimum: blocknr + checksum + flags */ + if (*offset + tag_size > bufsize) + return (EINVAL); + + tag = (struct jbd2_block_tag *)(buf + 
*offset); + *target = betoh32(tag->t_blocknr); + *flags = betoh16(tag->t_flags); + + *offset += tag_size; + if (has_64bit) { + if (*offset + 4 > bufsize) + return (EINVAL); + *target |= (u_int64_t)betoh32(tag->t_blocknr_high) + << 32; + *offset += 4; + } + if (!(*flags & JBD2_FLAG_SAME_UUID)) + *offset += 16; + } + + return (0); +} + +/* + * Add a block to the revocation table. + */ +static void +jbd2_revoke_add(struct jbd2_replay_ctx *ctx, u_int64_t block, + u_int32_t sequence) +{ + u_int32_t i; + + /* Update existing entry if present */ + for (i = 0; i < ctx->rc_revoke_count; i++) { + if (ctx->rc_revoke[i].re_block == block) { + if (sequence > ctx->rc_revoke[i].re_sequence || + (sequence < 0x10000 && + ctx->rc_revoke[i].re_sequence > 0xFFFF0000)) + ctx->rc_revoke[i].re_sequence = sequence; + return; + } + } + + /* Grow table if needed */ + if (ctx->rc_revoke_count >= ctx->rc_revoke_alloc) { + struct jbd2_revoke_entry *newrev; + u_int32_t newalloc; + + newalloc = ctx->rc_revoke_alloc ? ctx->rc_revoke_alloc * 2 + : 64; + newrev = mallocarray(newalloc, + sizeof(struct jbd2_revoke_entry), M_TEMP, + M_WAITOK | M_ZERO); + if (ctx->rc_revoke_count > 0) + memcpy(newrev, ctx->rc_revoke, + ctx->rc_revoke_count * + sizeof(struct jbd2_revoke_entry)); + if (ctx->rc_revoke != NULL) + free(ctx->rc_revoke, M_TEMP, + ctx->rc_revoke_alloc * + sizeof(struct jbd2_revoke_entry)); + ctx->rc_revoke = newrev; + ctx->rc_revoke_alloc = newalloc; + } + + ctx->rc_revoke[ctx->rc_revoke_count].re_block = block; + ctx->rc_revoke[ctx->rc_revoke_count].re_sequence = sequence; + ctx->rc_revoke_count++; +} + +/* + * Check if a block is revoked at or after the given sequence. 
+ */ +static int +jbd2_revoke_check(struct jbd2_replay_ctx *ctx, u_int64_t block, + u_int32_t sequence) +{ + u_int32_t i; + + for (i = 0; i < ctx->rc_revoke_count; i++) { + if (ctx->rc_revoke[i].re_block == block && + (ctx->rc_revoke[i].re_sequence >= sequence || + (ctx->rc_revoke[i].re_sequence < 0x10000 && + sequence > 0xFFFF0000))) + return (1); + } + return (0); +} + +/* + * Check if a block looks like a valid journal header. + */ +static int +jbd2_check_header(struct buf *bp, u_int32_t expected_seq, u_int32_t type) +{ + struct jbd2_header *hdr; + + hdr = (struct jbd2_header *)bp->b_data; + if (betoh32(hdr->h_magic) != JBD2_MAGIC) + return (0); + if (betoh32(hdr->h_sequence) != expected_seq) + return (0); + if (type != 0 && betoh32(hdr->h_blocktype) != type) + return (0); + return (1); +} + +/* + * Count data blocks described by a descriptor block's tags. + */ +static int +jbd2_count_tags(struct jbd2_replay_ctx *ctx, struct buf *bp, + u_int32_t *count) +{ + char *buf; + u_int32_t offset, bufsize, flags; + u_int64_t target; + int error; + + buf = (char *)bp->b_data; + bufsize = ctx->rc_blocksize; + offset = sizeof(struct jbd2_header); + *count = 0; + + while (offset < bufsize) { + error = jbd2_parse_tag(ctx, buf, bufsize, &offset, + &target, &flags); + if (error) + break; + (*count)++; + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + return (0); +} + +/* + * Pass 1: SCAN + * + * Walk the journal from s_start/s_sequence, verify each transaction + * has a matching DESCRIPTOR and COMMIT block, and find the end of + * the valid journal. 
+ */ +static int +jbd2_pass_scan(struct jbd2_replay_ctx *ctx) +{ + u_int32_t block, seq, next_seq; + u_int32_t tag_count; + struct buf *bp; + struct jbd2_header *hdr; + int error; + + block = ctx->rc_start; + seq = ctx->rc_sequence; + ctx->rc_end_sequence = seq; + + printf("ext4fs: journal scan: start block %u sequence %u\n", + block, seq); + + while (1) { + error = jbd2_read_block(ctx, block, &bp); + if (error) { + printf("ext4fs: journal scan: read error at " + "block %u\n", block); + break; + } + + hdr = (struct jbd2_header *)bp->b_data; + if (betoh32(hdr->h_magic) != JBD2_MAGIC) { + brelse(bp); + break; + } + if (betoh32(hdr->h_sequence) != seq) { + brelse(bp); + break; + } + + switch (betoh32(hdr->h_blocktype)) { + case JBD2_DESCRIPTOR_BLOCK: + /* Count tags to know how many data blocks follow */ + error = jbd2_count_tags(ctx, bp, &tag_count); + brelse(bp); + if (error) + goto done; + + /* Skip over data blocks */ + { + u_int32_t i; + for (i = 0; i < tag_count; i++) + block = jbd2_next_block(ctx, block); + } + + /* Next block should be commit */ + block = jbd2_next_block(ctx, block); + error = jbd2_read_block(ctx, block, &bp); + if (error) { + printf("ext4fs: journal scan: missing commit " + "for seq %u\n", seq); + goto done; + } + + if (!jbd2_check_header(bp, seq, + JBD2_COMMIT_BLOCK)) { + brelse(bp); + printf("ext4fs: journal scan: bad commit " + "for seq %u\n", seq); + goto done; + } + brelse(bp); + + /* Valid transaction */ + next_seq = seq + 1; + ctx->rc_end_sequence = next_seq; + seq = next_seq; + block = jbd2_next_block(ctx, block); + break; + + case JBD2_REVOKE_BLOCK: + /* + * A revoke block by itself is part of a transaction. + * There may be multiple revoke blocks before the + * commit. 
+ */ + brelse(bp); + block = jbd2_next_block(ctx, block); + break; + + case JBD2_COMMIT_BLOCK: + /* Unexpected standalone commit — end of journal */ + brelse(bp); + goto done; + + default: + brelse(bp); + goto done; + } + } + +done: + printf("ext4fs: journal scan: end sequence %u (%u transactions)\n", + ctx->rc_end_sequence, + ctx->rc_end_sequence - ctx->rc_sequence); + return (0); +} + +/* + * Pass 2: REVOKE + * + * Walk the journal again, collecting revoked blocks. + */ +static int +jbd2_pass_revoke(struct jbd2_replay_ctx *ctx) +{ + u_int32_t block, seq; + u_int32_t tag_count; + struct buf *bp; + struct jbd2_header *hdr; + struct jbd2_revoke_header *rh; + int has_64bit; + int error; + + has_64bit = ctx->rc_features_incompat & + JBD2_FEATURE_INCOMPAT_64BIT; + + block = ctx->rc_start; + seq = ctx->rc_sequence; + + while (seq < ctx->rc_end_sequence) { + error = jbd2_read_block(ctx, block, &bp); + if (error) + return (error); + + hdr = (struct jbd2_header *)bp->b_data; + if (betoh32(hdr->h_magic) != JBD2_MAGIC || + betoh32(hdr->h_sequence) != seq) { + brelse(bp); + break; + } + + switch (betoh32(hdr->h_blocktype)) { + case JBD2_DESCRIPTOR_BLOCK: + error = jbd2_count_tags(ctx, bp, &tag_count); + brelse(bp); + if (error) + return (error); + { + u_int32_t i; + for (i = 0; i < tag_count; i++) + block = jbd2_next_block(ctx, block); + } + /* Skip commit block */ + block = jbd2_next_block(ctx, block); + block = jbd2_next_block(ctx, block); + seq++; + break; + + case JBD2_REVOKE_BLOCK: + rh = (struct jbd2_revoke_header *)bp->b_data; + { + u_int32_t rcount, off; + rcount = betoh32(rh->r_count); + off = sizeof(struct jbd2_revoke_header); + while (off < rcount) { + u_int64_t revblk; + if (has_64bit) { + if (off + 8 > rcount) + break; + revblk = + (u_int64_t)betoh32( + *(u_int32_t *) + ((char *)bp->b_data + + off)) << 32 | + betoh32( + *(u_int32_t *) + ((char *)bp->b_data + + off + 4)); + off += 8; + } else { + if (off + 4 > rcount) + break; + revblk = betoh32( + *(u_int32_t *) 
+ ((char *)bp->b_data + + off)); + off += 4; + } + jbd2_revoke_add(ctx, revblk, seq); + } + } + brelse(bp); + block = jbd2_next_block(ctx, block); + break; + + case JBD2_COMMIT_BLOCK: + brelse(bp); + block = jbd2_next_block(ctx, block); + seq++; + break; + + default: + brelse(bp); + block = jbd2_next_block(ctx, block); + break; + } + } + + if (ctx->rc_revoke_count > 0) + printf("ext4fs: journal revoke: %u blocks revoked\n", + ctx->rc_revoke_count); + + return (0); +} + +/* + * Pass 3: REPLAY + * + * Walk the journal a third time, writing data blocks to the filesystem + * that have not been revoked. + */ +static int +jbd2_pass_replay(struct jbd2_replay_ctx *ctx) +{ + struct m_ext4fs *fs = ctx->rc_fs; + u_int32_t block, seq; + u_int32_t replayed = 0; + struct buf *bp, *dbp, *wbp; + struct jbd2_header *hdr; + char *buf; + u_int32_t offset, bufsize, flags; + u_int64_t target; + int error; + + block = ctx->rc_start; + seq = ctx->rc_sequence; + + while (seq < ctx->rc_end_sequence) { + error = jbd2_read_block(ctx, block, &bp); + if (error) + return (error); + + hdr = (struct jbd2_header *)bp->b_data; + if (betoh32(hdr->h_magic) != JBD2_MAGIC || + betoh32(hdr->h_sequence) != seq) { + brelse(bp); + break; + } + + switch (betoh32(hdr->h_blocktype)) { + case JBD2_DESCRIPTOR_BLOCK: + buf = (char *)bp->b_data; + bufsize = ctx->rc_blocksize; + offset = sizeof(struct jbd2_header); + + /* Iterate over tags, each followed by a data block */ + while (offset < bufsize) { + error = jbd2_parse_tag(ctx, buf, bufsize, + &offset, &target, &flags); + if (error) + break; + + /* Advance to data block */ + block = jbd2_next_block(ctx, block); + + /* Skip if revoked */ + if (jbd2_revoke_check(ctx, target, seq)) { + if (flags & JBD2_FLAG_LAST_TAG) + break; + continue; + } + + /* Read the journal data block */ + error = jbd2_read_block(ctx, block, &dbp); + if (error) { + printf("ext4fs: journal replay: " + "read error block %u\n", block); + if (flags & JBD2_FLAG_LAST_TAG) + break; + continue; + 
} + + /* Read the target filesystem block */ + error = bread(ctx->rc_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, target), + fs->m_block_size, &wbp); + if (error) { + brelse(dbp); + printf("ext4fs: journal replay: " + "can't read target %llu\n", + (unsigned long long)target); + if (flags & JBD2_FLAG_LAST_TAG) + break; + continue; + } + + /* Copy data */ + memcpy(wbp->b_data, dbp->b_data, + fs->m_block_size); + brelse(dbp); + + /* Un-escape: restore JBD2 magic if needed */ + if (flags & JBD2_FLAG_ESCAPE) { + u_int32_t magic = htobe32(JBD2_MAGIC); + memcpy(wbp->b_data, &magic, 4); + } + + /* Write to filesystem */ + error = bwrite(wbp); + if (error) { + printf("ext4fs: journal replay: " + "write error target %llu\n", + (unsigned long long)target); + } else { + replayed++; + } + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + brelse(bp); + + /* Skip to commit block and past it */ + block = jbd2_next_block(ctx, block); + /* Skip the commit block */ + block = jbd2_next_block(ctx, block); + seq++; + break; + + case JBD2_REVOKE_BLOCK: + brelse(bp); + block = jbd2_next_block(ctx, block); + break; + + case JBD2_COMMIT_BLOCK: + brelse(bp); + block = jbd2_next_block(ctx, block); + seq++; + break; + + default: + brelse(bp); + block = jbd2_next_block(ctx, block); + break; + } + } + + ctx->rc_replay_count = replayed; + printf("ext4fs: journal replay: %u blocks replayed\n", replayed); + + return (0); +} + +/* + * Main entry point: replay the ext4 journal. + * + * Called during mount when the RECOVER incompat flag is set. + * Reads the journal superblock backup from sb_jnl_blocks, + * runs the three-pass replay, then clears the journal. + */ +int +ext4fs_journal_replay(struct vnode *devvp, struct m_ext4fs *fs) +{ + struct jbd2_replay_ctx ctx; + struct jbd2_superblock *jsb; + struct buf *bp; + u_int64_t jblock0; + int error; + + memset(&ctx, 0, sizeof(ctx)); + ctx.rc_devvp = devvp; + ctx.rc_fs = fs; + + /* + * Locate journal block 0 from the block map. 
+ * We need to build the map first. + */ + + /* Read journal superblock: first build the blockmap, + * then read journal block 0 */ + + /* Temporarily set rc_maxlen from sb_jnl_blocks. + * Journal size = sb_jnl_blocks[15] | sb_jnl_blocks[16] << 32, + * in bytes. Divide by blocksize for blocks. */ + { + u_int64_t jsize; + jsize = letoh32(fs->m_sble.sb_jnl_blocks[15]) | + (u_int64_t)letoh32(fs->m_sble.sb_jnl_blocks[16]) << 32; + ctx.rc_maxlen = jsize / fs->m_block_size; + } + + if (ctx.rc_maxlen == 0) { + printf("ext4fs: journal has zero size\n"); + return (EINVAL); + } + + /* Build journal block → filesystem block mapping */ + error = jbd2_build_blockmap(&ctx); + if (error) + goto out; + + /* Read journal superblock (journal block 0) */ + jblock0 = ctx.rc_blockmap[0].jb_fsblock; + if (jblock0 == 0) { + printf("ext4fs: journal block 0 not mapped\n"); + error = EINVAL; + goto out; + } + + error = bread(devvp, (daddr_t)EXT4FS_FSBTODB(fs, jblock0), + fs->m_block_size, &bp); + if (error) { + printf("ext4fs: can't read journal superblock\n"); + goto out; + } + + jsb = (struct jbd2_superblock *)bp->b_data; + + /* Validate journal superblock */ + if (betoh32(jsb->s_header.h_magic) != JBD2_MAGIC) { + printf("ext4fs: bad journal magic 0x%x\n", + betoh32(jsb->s_header.h_magic)); + brelse(bp); + error = EINVAL; + goto out; + } + { + u_int32_t btype = betoh32(jsb->s_header.h_blocktype); + if (btype != JBD2_SUPERBLOCK_V1 && + btype != JBD2_SUPERBLOCK_V2) { + printf("ext4fs: bad journal superblock version %u\n", + btype); + brelse(bp); + error = EINVAL; + goto out; + } + } + + ctx.rc_blocksize = betoh32(jsb->s_blocksize); + ctx.rc_maxlen = betoh32(jsb->s_maxlen); + ctx.rc_first = betoh32(jsb->s_first); + ctx.rc_sequence = betoh32(jsb->s_sequence); + ctx.rc_start = betoh32(jsb->s_start); + + if (betoh32(jsb->s_header.h_blocktype) == JBD2_SUPERBLOCK_V2) + ctx.rc_features_incompat = betoh32(jsb->s_feature_incompat); + else + ctx.rc_features_incompat = 0; + + brelse(bp); + + if 
(ctx.rc_blocksize != fs->m_block_size) { + printf("ext4fs: journal blocksize %u != fs blocksize %llu\n", + ctx.rc_blocksize, (unsigned long long)fs->m_block_size); + error = EINVAL; + goto out; + } + + /* If s_start == 0, journal is clean — nothing to replay */ + if (ctx.rc_start == 0) { + printf("ext4fs: journal is clean, no replay needed\n"); + error = 0; + goto out; + } + + /* Rebuild blockmap with correct maxlen from journal superblock */ + free(ctx.rc_blockmap, M_TEMP, + ctx.rc_blockmap_count * sizeof(struct jbd2_blockmap_entry)); + ctx.rc_blockmap = NULL; + ctx.rc_blockmap_count = 0; + error = jbd2_build_blockmap(&ctx); + if (error) + goto out; + + printf("ext4fs: replaying journal (sequence %u, start block %u, " + "%u journal blocks)\n", + ctx.rc_sequence, ctx.rc_start, ctx.rc_maxlen); + + /* Pass 1: SCAN */ + error = jbd2_pass_scan(&ctx); + if (error) + goto out; + + if (ctx.rc_end_sequence == ctx.rc_sequence) { + printf("ext4fs: journal has no valid transactions\n"); + goto clear; + } + + /* Pass 2: REVOKE */ + error = jbd2_pass_revoke(&ctx); + if (error) + goto out; + + /* Pass 3: REPLAY */ + error = jbd2_pass_replay(&ctx); + if (error) + goto out; + +clear: + /* + * Mark journal clean: set s_start=0 in journal superblock. + */ + error = bread(devvp, (daddr_t)EXT4FS_FSBTODB(fs, jblock0), + fs->m_block_size, &bp); + if (error) { + printf("ext4fs: can't reread journal superblock\n"); + goto out; + } + + jsb = (struct jbd2_superblock *)bp->b_data; + jsb->s_start = htobe32(0); + /* Advance sequence past what we replayed */ + jsb->s_sequence = htobe32(ctx.rc_end_sequence); + error = bwrite(bp); + if (error) { + printf("ext4fs: can't write journal superblock\n"); + goto out; + } + + /* + * Clear RECOVER flag and set STATE_VALID in ext4 superblock. 
+ */ + { + u_int32_t incompat; + + incompat = letoh32(fs->m_sble.sb_feature_incompat); + incompat &= ~EXT4FS_FEATURE_INCOMPAT_RECOVER; + fs->m_sble.sb_feature_incompat = htole32(incompat); + fs->m_feature_incompat = incompat; + + fs->m_sble.sb_state = htole16(EXT4FS_STATE_VALID); + fs->m_state = EXT4FS_STATE_VALID; + + /* Recompute superblock checksum */ + fs->m_sble.sb_checksum = + htole32(ext4fs_sb_csum(&fs->m_sble)); + + /* Write superblock to disk */ + error = bread(devvp, + (daddr_t)(EXT4FS_SUPER_BLOCK_OFFSET / DEV_BSIZE), + EXT4FS_SUPER_BLOCK_SIZE, &bp); + if (error) { + printf("ext4fs: can't read superblock for update\n"); + goto out; + } + memcpy(bp->b_data, &fs->m_sble, sizeof(struct ext4fs)); + error = bwrite(bp); + if (error) { + printf("ext4fs: can't write superblock\n"); + goto out; + } + } + + printf("ext4fs: journal replay complete\n"); + +out: + if (ctx.rc_blockmap != NULL) + free(ctx.rc_blockmap, M_TEMP, + ctx.rc_blockmap_count * + sizeof(struct jbd2_blockmap_entry)); + if (ctx.rc_revoke != NULL) + free(ctx.rc_revoke, M_TEMP, + ctx.rc_revoke_alloc * + sizeof(struct jbd2_revoke_entry)); + + return (error); +} diff --git a/sys/ufs/ext4fs/ext4fs_journal.h b/sys/ufs/ext4fs/ext4fs_journal.h new file mode 100644 index 000000000..318e757e2 --- /dev/null +++ b/sys/ufs/ext4fs/ext4fs_journal.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2025 kmx.io. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _EXT4FS_JOURNAL_H_ +#define _EXT4FS_JOURNAL_H_ + +/* + * JBD2 journal on-disk structures. + * All JBD2 fields are big-endian. + */ + +#define JBD2_MAGIC 0xC03B3998 + +/* Block types */ +#define JBD2_DESCRIPTOR_BLOCK 1 +#define JBD2_COMMIT_BLOCK 2 +#define JBD2_SUPERBLOCK_V1 3 +#define JBD2_SUPERBLOCK_V2 4 +#define JBD2_REVOKE_BLOCK 5 + +/* Descriptor tag flags */ +#define JBD2_FLAG_ESCAPE 0x01 +#define JBD2_FLAG_SAME_UUID 0x02 +#define JBD2_FLAG_DELETED 0x04 +#define JBD2_FLAG_LAST_TAG 0x08 + +/* Journal feature flags (in journal superblock) */ +#define JBD2_FEATURE_COMPAT_CHECKSUM 0x01 + +#define JBD2_FEATURE_INCOMPAT_REVOKE 0x01 +#define JBD2_FEATURE_INCOMPAT_64BIT 0x02 +#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x04 +#define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x08 +#define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x10 + +/* Common block header (12 bytes) */ +struct jbd2_header { + u_int32_t h_magic; + u_int32_t h_blocktype; + u_int32_t h_sequence; +} __attribute__((packed)); + +/* Journal superblock */ +struct jbd2_superblock { + struct jbd2_header s_header; + /* 0x0C */ + u_int32_t s_blocksize; + u_int32_t s_maxlen; + u_int32_t s_first; + /* 0x18 */ + u_int32_t s_sequence; + u_int32_t s_start; + /* 0x20 */ + u_int32_t s_errno; + /* V2+ fields */ + u_int32_t s_feature_compat; + u_int32_t s_feature_incompat; + u_int32_t s_feature_ro_compat; + /* 0x30 */ + u_int8_t s_uuid[16]; + /* 0x40 */ + u_int32_t s_nr_users; + u_int32_t s_dynsuper; + /* 0x48 */ + u_int32_t s_max_transaction; + u_int32_t s_max_trans_data; + /* 0x50 */ + u_int8_t s_checksum_type; + u_int8_t s_padding2[3]; + /* 0x54 */ + u_int8_t s_padding[168]; + /* 0xFC */ + 
u_int32_t s_checksum; + /* 0x100 */ + u_int8_t s_users[16 * 48]; +} __attribute__((packed)); + +/* Descriptor block tag v3 (CSUM_V3, 16 bytes without UUID) */ +struct jbd2_block_tag3 { + u_int32_t t_blocknr; + u_int32_t t_flags; + u_int32_t t_blocknr_high; + u_int32_t t_checksum; +} __attribute__((packed)); + +/* Descriptor block tag v2 (no CSUM_V3) */ +struct jbd2_block_tag { + u_int32_t t_blocknr; + u_int16_t t_checksum; + u_int16_t t_flags; + u_int32_t t_blocknr_high; /* only if 64BIT */ +} __attribute__((packed)); + +/* Revoke block header */ +struct jbd2_revoke_header { + struct jbd2_header r_header; + u_int32_t r_count; /* bytes used in this block */ +} __attribute__((packed)); + +/* Revocation table entry */ +struct jbd2_revoke_entry { + u_int64_t re_block; + u_int32_t re_sequence; +}; + +/* Block map entry: journal block -> filesystem block */ +struct jbd2_blockmap_entry { + u_int64_t jb_fsblock; /* filesystem block number */ +}; + +/* In-memory replay context */ +struct jbd2_replay_ctx { + struct vnode *rc_devvp; + struct m_ext4fs *rc_fs; + + /* Journal geometry (from journal superblock, host order) */ + u_int32_t rc_blocksize; + u_int32_t rc_maxlen; + u_int32_t rc_first; + u_int32_t rc_sequence; /* starting sequence */ + u_int32_t rc_start; /* starting block */ + + /* Journal feature flags */ + u_int32_t rc_features_incompat; + + /* Block map: journal block number -> filesystem block */ + struct jbd2_blockmap_entry *rc_blockmap; + u_int32_t rc_blockmap_count; + + /* Revocation table */ + struct jbd2_revoke_entry *rc_revoke; + u_int32_t rc_revoke_count; + u_int32_t rc_revoke_alloc; + + /* Scan result */ + u_int32_t rc_end_sequence; + u_int32_t rc_replay_count; +}; + +int ext4fs_journal_replay(struct vnode *, struct m_ext4fs *); + +#endif /* _EXT4FS_JOURNAL_H_ */ diff --git a/sys/ufs/ext4fs/ext4fs_vfsops.c b/sys/ufs/ext4fs/ext4fs_vfsops.c new file mode 100644 index 000000000..5440f9898 --- /dev/null +++ b/sys/ufs/ext4fs/ext4fs_vfsops.c @@ -0,0 +1,1460 @@ 
+/* + * Copyright (c) 2025 kmx.io. + * Copyright (c) 1997 Manuel Bouyer. + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Modified for ext4fs by kmx.io. 
+ */ +#include +#include +#include +#include +#include +#include +//#include +#include +#include +#include +//#include +#include +//#include +//#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +//#include +#include + +#include +#include +#include + +struct pool ext4fs_inode_pool; +struct pool ext4fs_dinode_pool; + +#define PRINTF_FEATURES(mask, features) \ + for (i = 0; i < nitems(features); i++) \ + if ((mask) & (features)[i].f_mask) \ + printf("%s ", (features)[i].f_name) + +int ext4fs_block_group_has_super_block(int); +int ext4fs_mountfs(struct vnode *, struct mount *, struct proc *); +int ext4fs_sbcheck(struct ext4fs *, int); +void ext4fs_sbload(struct ext4fs *, struct m_ext4fs *); +int ext4fs_sbfill(struct vnode *, struct m_ext4fs *); + +const struct vfsops ext4fs_vfsops = { + .vfs_mount = ext4fs_mount, + .vfs_start = ufs_start, + .vfs_unmount = ext4fs_unmount, + .vfs_root = ufs_root, + .vfs_quotactl = ufs_quotactl, + .vfs_statfs = ext4fs_statfs, + .vfs_sync = ext4fs_sync, + .vfs_vget = ext4fs_vget, + .vfs_fhtovp = ext4fs_fhtovp, + .vfs_vptofh = ext4fs_vptofh, + .vfs_init = ext4fs_init, + .vfs_sysctl = ext4fs_sysctl, + .vfs_checkexp = ufs_check_export, +}; + +struct pool ext4fs_inode_pool; + +int +ext4fs_block_group_has_super_block(int group) +{ + int a3, a5, a7; + + if (group == 0 || group == 1) + return 1; + for (a3 = 3, a5 = 5, a7 = 7; + a3 <= group || a5 <= group || a7 <= group; + a3 *= 3, a5 *= 5, a7 *= 7) + if (group == a3 || group == a5 || group == a7) + return 1; + return 0; +} + +int +ext4fs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) +{ + (void)mp; + (void)fhp; + (void)vpp; + printf("ext4fs_fhtovp: not implemented\n"); + return (EOPNOTSUPP); +} + +/* + * Flush out all the files in a filesystem. + */ +int +ext4fs_flushfiles(struct mount *mp, int flags, struct proc *p) +{ + struct ufsmount *ump; + int error; + + ump = VFSTOUFS(mp); + /* + * Flush all the files. 
+ */ + if ((error = vflush(mp, NULL, flags)) != 0) + return (error); + /* + * Flush filesystem metadata. + */ + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(ump->um_devvp, p->p_ucred, MNT_WAIT, p); + VOP_UNLOCK(ump->um_devvp); + return (error); +} + +int +ext4fs_init(struct vfsconf *vfsp) +{ + int result; + (void)vfsp; + pool_init(&ext4fs_inode_pool, sizeof(struct inode), 0, + IPL_NONE, PR_WAITOK, "ext4inopl", NULL); + pool_init(&ext4fs_dinode_pool, sizeof(struct ext4fs_dinode_256), 0, + IPL_NONE, PR_WAITOK, "ext4dinopl", NULL); + if ((result = ufs_init(vfsp))) { + return result; + } + return (0); +} + +int +ext4fs_mount(struct mount *mp, const char *path, void *data, + struct nameidata *ndp, struct proc *p) +{ + struct ufs_args *args; + struct vnode *devvp; + int error; + struct m_ext4fs *mfs; + char fname[MNAMELEN]; + char fspec[MNAMELEN]; + struct ufsmount *ump = NULL; + + args = data; + error = copyinstr(args->fspec, fspec, sizeof(fspec), NULL); + if (error) + goto error; + + if (disk_map(fspec, fname, MNAMELEN, DM_OPENBLCK) == -1) + memcpy(fname, fspec, sizeof(fname)); + + NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fname, p); + if ((error = namei(ndp)) != 0) + goto error; + devvp = ndp->ni_vp; + + if (devvp->v_type != VBLK) { + error = ENOTBLK; + goto error_devvp; + } + if (major(devvp->v_rdev) >= nblkdev) { + error = ENXIO; + goto error_devvp; + } + if ((mp->mnt_flag & MNT_UPDATE) == 0) { + error = ext4fs_mountfs(devvp, mp, p); + } else { + ump = VFSTOUFS(mp); + if (devvp != ump->um_devvp) + error = EINVAL; /* XXX needs translation */ + else + vrele(devvp); + } + if (error) + goto error_devvp; + ump = VFSTOUFS(mp); + mfs = ump->um_e4fs; + + strlcpy(mp->mnt_stat.f_mntfromname, fname, + sizeof(mp->mnt_stat.f_mntfromname)); + strlcpy(mp->mnt_stat.f_mntonname, path, + sizeof(mp->mnt_stat.f_mntonname)); + + goto success; + +error_devvp: + vrele(devvp); + +error: +success: + return (error); +} + +/* + * Common code for mount and mountroot + 
*/ +int +ext4fs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p) +{ + struct ufsmount *ump; + struct buf *bp; + struct ext4fs *sble; + struct m_ext4fs *mfs; + dev_t dev; + int error, ronly; + struct ucred *cred; + + dev = devvp->v_rdev; + cred = p ? p->p_ucred : NOCRED; + /* + * Disallow multiple mounts of the same device. + * Disallow mounting of a device that is currently in use + * (except for root, which might share swap device for miniroot). + * Flush out any old buffers remaining from a previous use. + */ + if ((error = vfs_mountedon(devvp)) != 0) + return (error); + if (vcount(devvp) > 1 && devvp != rootvp) + return (EBUSY); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = vinvalbuf(devvp, V_SAVE, cred, p, 0, INFSLP); + VOP_UNLOCK(devvp); + if (error != 0) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); + if (error) + return (error); + + bp = NULL; + ump = NULL; + + /* + * Read the superblock from disk. + */ + error = bread(devvp, (daddr_t)(EXT4FS_SUPER_BLOCK_OFFSET / + DEV_BSIZE), + EXT4FS_SUPER_BLOCK_SIZE, &bp); + if (error) + goto out; + sble = (struct ext4fs *)bp->b_data; + error = ext4fs_sbcheck(sble, ronly); + if (error) + goto out; + + ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); + mfs = ump->um_e4fs = malloc(sizeof(struct m_ext4fs), M_UFSMNT, + M_WAITOK | M_ZERO); + + /* + * Copy in the superblock, compute in-memory values + * and load group descriptors. + */ + ext4fs_sbload(sble, mfs); + if ((error = ext4fs_sbfill(devvp, mfs)) != 0) + goto out; + brelse(bp); + bp = NULL; + sble = &mfs->m_sble; + + /* + * If the filesystem needs journal recovery, replay it now. + * For r/o mounts, we temporarily reopen the device r/w. 
+ */ + if ((mfs->m_feature_compat & EXT4FS_FEATURE_COMPAT_HAS_JOURNAL) && + (mfs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_RECOVER)) { + int reopen_ro = 0; + + if (ronly) { + /* Reopen device r/w for replay */ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + VOP_CLOSE(devvp, FREAD, cred, p); + VOP_UNLOCK(devvp); + error = VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, p); + if (error) { + printf("ext4fs: can't reopen device r/w " + "for journal replay\n"); + goto out; + } + reopen_ro = 1; + } + + error = ext4fs_journal_replay(devvp, mfs); + if (error) { + printf("ext4fs: journal replay failed: %d\n", error); + printf("ext4fs: use Linux e2fsck to repair\n"); + /* Leave RECOVER set, fail the mount */ + if (reopen_ro) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + VOP_CLOSE(devvp, FREAD | FWRITE, cred, p); + VOP_UNLOCK(devvp); + VOP_OPEN(devvp, FREAD, FSCRED, p); + } + goto out; + } + + /* Reopen device r/o if it was a r/o mount */ + if (reopen_ro) { + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + VOP_CLOSE(devvp, FREAD | FWRITE, cred, p); + VOP_UNLOCK(devvp); + error = VOP_OPEN(devvp, FREAD, FSCRED, p); + if (error) { + printf("ext4fs: can't reopen device r/o\n"); + goto out; + } + } + + /* + * Replay may have changed group descriptors and + * superblock counters. Reload them. 
+ */ + if (mfs->m_gd != NULL) { + size_t gd_size = mfs->m_block_group_count * + sizeof(struct ext4fs_block_group_descriptor); + free(mfs->m_gd, M_UFSMNT, gd_size); + mfs->m_gd = NULL; + } + /* Re-read superblock from disk */ + error = bread(devvp, + (daddr_t)(EXT4FS_SUPER_BLOCK_OFFSET / DEV_BSIZE), + EXT4FS_SUPER_BLOCK_SIZE, &bp); + if (error) + goto out; + ext4fs_sbload((struct ext4fs *)bp->b_data, mfs); + brelse(bp); + bp = NULL; + + error = ext4fs_sbfill(devvp, mfs); + if (error) + goto out; + sble = &mfs->m_sble; + } + + ump->um_e4fs->m_read_only = ronly; + ump->um_fstype = UM_EXT4FS; + + if (ronly == 0) { + if (mfs->m_state == EXT4FS_STATE_VALID) + mfs->m_state = 0; + else + mfs->m_state = EXT4FS_STATE_ERROR; + mfs->m_fs_was_modified = 1; + } + + mp->mnt_data = ump; + mp->mnt_stat.f_fsid.val[0] = (long)dev; + mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; + mp->mnt_stat.f_namemax = MAXNAMLEN; + mp->mnt_flag |= MNT_LOCAL; + ump->um_mountp = mp; + + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_nindir = EXT4FS_NINDIR(mfs); + ump->um_bptrtodb = mfs->m_fs_block_to_disk_block; + ump->um_seqinc = 1; /* no frags */ + ump->um_maxsymlinklen = EXT4FS_SYMLINK_LEN_MAX; + devvp->v_specmountpoint = mp; + + if (ronly == 0) + ext4fs_sbwrite(mp); + + return (0); +out: + if (devvp->v_specinfo) + devvp->v_specmountpoint = NULL; + if (bp) + brelse(bp); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(devvp, ronly ? 
FREAD : FREAD|FWRITE, cred, p); + VOP_UNLOCK(devvp); + if (ump) { + if (mfs && mfs->m_gd != NULL) { + size_t gd_size = mfs->m_block_group_count * + sizeof(struct ext4fs_block_group_descriptor); + free(mfs->m_gd, M_UFSMNT, gd_size); + } + free(mfs, M_UFSMNT, sizeof *mfs); + free(ump, M_UFSMNT, sizeof *ump); + mp->mnt_data = NULL; + } + return (error); +} + +int +ext4fs_sbcheck(struct ext4fs *sble, int ronly) +{ + u_int32_t mask, tmp; + int i; + + tmp = letoh16(sble->sb_magic); + if (tmp != EXT4FS_MAGIC) { + printf("ext2fs: wrong magic number 0x%x\n", tmp); + return (EIO); /* XXX needs translation */ + } + + if (ext4fs_sb_csum_verify(sble) != 0) { + printf("ext4fs: superblock checksum verification failed\n"); + return (EINVAL); + } + + tmp = letoh32(sble->sb_log_block_size); + if (tmp > 2) { + /* skewed log(block size): 1024 -> 0 | 2048 -> 1 | 4096 -> 2 */ + tmp += 10; + printf("ext2fs: wrong log2(block size) %d\n", tmp); + return (EIO); /* XXX needs translation */ + } + + if (letoh32(sble->sb_blocks_per_group) == 0) { + printf("ext4fs: zero blocks per group\n"); + return (EIO); + } + + if (letoh32(sble->sb_inodes_per_group) == 0) { + printf("ext4fs: zero inodes per group\n"); + return (EIO); + } + + tmp = letoh32(sble->sb_revision_level); + if (tmp != EXT4FS_REV_DYNAMIC) { + printf("ext2fs: wrong revision number 0x%x\n", tmp); + return (EIO); /* XXX needs translation */ + } + + tmp = letoh16(sble->sb_inode_size); + if (tmp != 256) { + printf("ext4fs: unsupported inode size: %d\n", tmp); + return (EINVAL); + } + + tmp = letoh32(sble->sb_first_non_reserved_inode); + if (tmp != EXT4FS_INODE_FIRST) { + printf("ext4fs: first inode at 0x%x\n", tmp); + return (EINVAL); /* XXX needs translation */ + } + + tmp = letoh32(sble->sb_block_group_descriptor_size); + if (tmp != sizeof(struct ext4fs_block_group_descriptor)) { + printf("ext4fs: block group descriptor size is 0x%x\n", + tmp); + return (EINVAL); + } + + tmp = letoh32(sble->sb_feature_incompat); + mask = tmp & 
~EXT4FS_FEATURE_INCOMPAT_SUPPORTED; + if (mask) { + printf("ext4fs: unsupported incompat features: "); + PRINTF_FEATURES(mask, ext4fs_feature_incompat); + printf("\n"); + return (EINVAL); /* XXX needs translation */ + } + + if (tmp & EXT4FS_FEATURE_INCOMPAT_RECOVER) { + printf("ext4fs: file system needs journal recovery\n"); + if (!(letoh32(sble->sb_feature_compat) & + EXT4FS_FEATURE_COMPAT_HAS_JOURNAL)) { + printf("ext4fs: RECOVER set but no journal\n"); + return (EINVAL); + } + /* Allow mount to proceed; replay happens in mountfs */ + } + + tmp = letoh32(sble->sb_feature_ro_compat) & + ~EXT4FS_FEATURE_RO_COMPAT_SUPPORTED; + if (!ronly && tmp) { + printf("ext4fs: unsupported R/O compat features: "); + PRINTF_FEATURES(tmp, ext4fs_feature_ro_compat); + printf("\n"); + return (EROFS); + } + + if (!ronly && + !(letoh32(sble->sb_feature_incompat) & + EXT4FS_FEATURE_INCOMPAT_RECOVER) && + !(letoh16(sble->sb_state) & EXT4FS_STATE_VALID)) { + printf("ext4fs: file system not clean, run e2fsck\n"); + return (EROFS); + } + + return (0); +} + +int +ext4fs_sbfill(struct vnode *devvp, struct m_ext4fs *mfs) +{ + struct ext4fs_dinode *rdp; + struct buf *bp; + daddr_t dblk; + u_int64_t ritb, rblk; + u_int32_t rgroup, rindex, roff; + size_t gd_size; + int error, i; + + mfs->m_block_group_count = howmany(mfs->m_blocks_count - + mfs->m_first_data_block, + mfs->m_blocks_per_group); + + mfs->m_block_size_shift = EXT4FS_LOG_MIN_BLOCK_SIZE + + mfs->m_log_block_size; + mfs->m_block_size = 1 << mfs->m_block_size_shift; + mfs->m_block_group_descriptor_blocks_count = + howmany(mfs->m_block_group_count, + mfs->m_block_size / + sizeof(struct ext4fs_block_group_descriptor)); + mfs->m_fs_block_to_disk_block = mfs->m_log_block_size + 1; + mfs->m_inodes_per_block = mfs->m_block_size / mfs->m_inode_size; + mfs->m_inode_table_blocks_per_group = mfs->m_inodes_per_group / + mfs->m_inodes_per_block; + + gd_size = mfs->m_block_group_count * sizeof(struct ext4fs_block_group_descriptor); + mfs->m_gd = 
malloc(gd_size, M_UFSMNT, M_WAITOK); + + dblk = (mfs->m_first_data_block + 1) << mfs->m_fs_block_to_disk_block; + for (i = 0; i < mfs->m_block_group_descriptor_blocks_count; i++) { + size_t off = (size_t)i * mfs->m_block_size; + size_t n = mfs->m_block_size; + + /* Don't copy past end of m_gd allocation */ + if (off + n > gd_size) + n = gd_size - off; + + error = bread(devvp, dblk + (i << mfs->m_fs_block_to_disk_block), + mfs->m_block_size, &bp); + if (error) { + printf("ext4fs_sbfill: failed to read block group descriptors: %d\n", error); + free(mfs->m_gd, M_UFSMNT, gd_size); + mfs->m_gd = NULL; + return (error); + } + memcpy((char *)mfs->m_gd + off, bp->b_data, n); + brelse(bp); + } + + /* Verify block group descriptor checksums */ + for (i = 0; i < mfs->m_block_group_count; i++) { + if ((error = ext4fs_bgd_csum_verify(mfs, &mfs->m_gd[i], + i)) != 0) { + printf("ext4fs_sbfill: block group %d checksum " + "verification failed\n", i); + free(mfs->m_gd, M_UFSMNT, gd_size); + mfs->m_gd = NULL; + return (error); + } + } + + /* + * Read the resize inode (inode 7) to get its doubly-indirect + * block pointer. Needed for BLOCK_UNINIT bitmap reconstruction. 
+ */ + mfs->m_resize_dind_block = 0; + rgroup = (7 - 1) / mfs->m_inodes_per_group; + rindex = (7 - 1) % mfs->m_inodes_per_group; + ritb = letoh32(mfs->m_gd[rgroup].bgd_inode_table_block_lo); + rblk = ritb + (rindex * mfs->m_inode_size) / mfs->m_block_size; + roff = (rindex * mfs->m_inode_size) % mfs->m_block_size; + error = bread(devvp, (daddr_t)EXT4FS_FSBTODB(mfs, rblk), + mfs->m_block_size, &bp); + if (error) { + brelse(bp); + } else { + rdp = (struct ext4fs_dinode *) + ((char *)bp->b_data + roff); + mfs->m_resize_dind_block = letoh32(rdp->i_block[13]); + brelse(bp); + } + + return (0); +} + +void +ext4fs_sbload(struct ext4fs *sble, struct m_ext4fs *dest) +{ + int feature_incompat_64bit; + feature_incompat_64bit = letoh32(sble->sb_feature_incompat) & + EXT4FS_FEATURE_INCOMPAT_64BIT; + /* Keep a copy of the raw little-endian superblock */ + memcpy(&dest->m_sble, sble, sizeof(dest->m_sble)); + dest->m_inodes_count = letoh32(sble->sb_inodes_count); + dest->m_blocks_count = letoh32(sble->sb_blocks_count_lo); + dest->m_reserved_blocks_count = + letoh32(sble->sb_reserved_blocks_count_lo); + dest->m_free_blocks_count = + letoh32(sble->sb_free_blocks_count_lo); + dest->m_free_inodes_count = letoh32(sble->sb_free_inodes_count); + dest->m_first_data_block = letoh32(sble->sb_first_data_block); + dest->m_log_block_size = letoh32(sble->sb_log_block_size); + dest->m_log_cluster_size = letoh32(sble->sb_log_cluster_size); + dest->m_blocks_per_group = letoh32(sble->sb_blocks_per_group); + dest->m_clusters_per_group = letoh32(sble->sb_clusters_per_group); + dest->m_inodes_per_group = letoh32(sble->sb_inodes_per_group); + dest->m_mount_time = letoh32(sble->sb_mount_time_lo); + dest->m_write_time = letoh32(sble->sb_write_time_lo); + dest->m_mount_count = letoh16(sble->sb_mount_count); + dest->m_max_mount_count_before_fsck = (int16_t)letoh16(sble->sb_max_mount_count_before_fsck); + dest->m_state = letoh16(sble->sb_state); + dest->m_errors = letoh16(sble->sb_errors); + 
dest->m_revision_level_minor = letoh16(sble->sb_revision_level_minor); + dest->m_check_time = letoh32(sble->sb_check_time_lo); + dest->m_check_interval = letoh32(sble->sb_check_interval); + dest->m_creator_os = letoh32(sble->sb_creator_os); + dest->m_revision_level = letoh32(sble->sb_revision_level); + dest->m_default_reserved_uid = letoh16(sble->sb_default_reserved_uid); + dest->m_default_reserved_gid = letoh16(sble->sb_default_reserved_gid); + dest->m_first_non_reserved_inode = letoh32(sble->sb_first_non_reserved_inode); + dest->m_inode_size = letoh16(sble->sb_inode_size); + dest->m_block_group_id = letoh16(sble->sb_block_group_id); + dest->m_feature_compat = letoh32(sble->sb_feature_compat); + dest->m_feature_incompat = letoh32(sble->sb_feature_incompat); + dest->m_feature_ro_compat = letoh32(sble->sb_feature_ro_compat); + dest->m_algorithm_usage_bitmap = letoh32(sble->sb_algorithm_usage_bitmap); + dest->m_reserved_bgdt_blocks = letoh16(sble->sb_reserved_bgdt_blocks); + dest->m_journal_inode_number = letoh32(sble->sb_journal_inode_number); + dest->m_journal_device_number = letoh32(sble->sb_journal_device_number); + dest->m_last_orphan = letoh32(sble->sb_last_orphan); + dest->m_block_group_descriptor_size = letoh16(sble->sb_block_group_descriptor_size); + dest->m_default_mount_opts = letoh32(sble->sb_default_mount_opts); + dest->m_first_meta_block_group = letoh32(sble->sb_first_meta_block_group); + dest->m_newfs_time = letoh32(sble->sb_newfs_time_lo); + dest->m_inode_size_extra_min = letoh16(sble->sb_inode_size_extra_min); + dest->m_inode_size_extra_want = letoh16(sble->sb_inode_size_extra_want); + dest->m_flags = letoh32(sble->sb_flags); + dest->m_raid_stride_block_count = letoh16(sble->sb_raid_stride_block_count); + dest->m_mmp_interval = letoh16(sble->sb_mmp_interval); + dest->m_mmp_block = letoh64(sble->sb_mmp_block); + dest->m_raid_stripe_width_block_count = letoh32(sble->sb_raid_stripe_width_block_count); + dest->m_kilobytes_written = 
letoh64(sble->sb_kilobytes_written); + dest->m_error_count = letoh32(sble->sb_error_count); + dest->m_first_error_time = letoh32(sble->sb_first_error_time_lo); + dest->m_first_error_inode = letoh32(sble->sb_first_error_inode); + dest->m_first_error_block = letoh64(sble->sb_first_error_block); + dest->m_first_error_line = letoh32(sble->sb_first_error_line); + dest->m_last_error_time = letoh32(sble->sb_last_error_time_lo); + dest->m_last_error_inode = letoh32(sble->sb_last_error_inode); + dest->m_last_error_line = letoh32(sble->sb_last_error_line); + dest->m_last_error_block = letoh64(sble->sb_last_error_block); + dest->m_user_quota_inode = letoh32(sble->sb_user_quota_inode); + dest->m_group_quota_inode = letoh32(sble->sb_group_quota_inode); + dest->m_overhead_clusters = letoh32(sble->sb_overhead_clusters); + dest->m_backup_block_groups[0] = letoh32(sble->sb_backup_block_groups[0]); + dest->m_backup_block_groups[1] = letoh32(sble->sb_backup_block_groups[1]); + dest->m_lost_and_found_inode = letoh32(sble->sb_lost_and_found_inode); + dest->m_project_quota_inode = letoh32(sble->sb_project_quota_inode); + dest->m_checksum_seed = letoh32(sble->sb_checksum_seed); + dest->m_encoding = letoh16(sble->sb_encoding); + dest->m_encoding_flags = letoh16(sble->sb_encoding_flags); + dest->m_orphan_file_inode = letoh16(sble->sb_orphan_file_inode); + if (feature_incompat_64bit) { + dest->m_blocks_count |= (u_int64_t) + letoh32(sble->sb_blocks_count_hi) << 32; + dest->m_reserved_blocks_count |= (u_int64_t) + letoh32(sble->sb_reserved_blocks_count_hi) + << 32; + dest->m_free_blocks_count |= (u_int64_t) + letoh32(sble->sb_free_blocks_count_hi) << 32; + dest->m_mount_time |= (u_int64_t) + letoh32(sble->sb_mount_time_hi) << 32; + dest->m_check_time |= (u_int64_t) + letoh32(sble->sb_check_time_hi) << 32; + dest->m_newfs_time |= (u_int64_t) + letoh32(sble->sb_newfs_time_hi) << 32; + dest->m_first_error_time |= (u_int64_t) + letoh32(sble->sb_first_error_time_hi) << 32; + 
dest->m_last_error_time |= (u_int64_t) + letoh32(sble->sb_last_error_time_hi) << 32; + } +} + +int +ext4fs_statfs(struct mount *mp, struct statfs *sbp, struct proc *p) +{ + struct ufsmount *ump; + struct m_ext4fs *mfs; + const u_int32_t overhead_per_group_block_bitmap = 1; + const u_int32_t overhead_per_group_inode_bitmap = 1; + u_int32_t overhead, overhead_per_group; + int ngroups; + + (void)p; + ump = VFSTOUFS(mp); + mfs = ump->um_e4fs; + + overhead_per_group = overhead_per_group_block_bitmap + + overhead_per_group_inode_bitmap + + mfs->m_inode_table_blocks_per_group; + overhead = mfs->m_first_data_block + + mfs->m_block_group_count * overhead_per_group; + if (mfs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER) { + int i; + for (i = 0, ngroups = 0; i < mfs->m_block_group_count; i++) { + if (ext4fs_block_group_has_super_block(i)) + ngroups++; + } + } else { + ngroups = mfs->m_block_group_count; + } + overhead += ngroups * + (1 + mfs->m_block_group_descriptor_blocks_count); + + sbp->f_bsize = mfs->m_block_size; + sbp->f_iosize = mfs->m_block_size; + sbp->f_blocks = mfs->m_blocks_count - overhead; + sbp->f_bfree = mfs->m_free_blocks_count; + if (sbp->f_bfree > mfs->m_reserved_blocks_count) + sbp->f_bavail = sbp->f_bfree - mfs->m_reserved_blocks_count; + else + sbp->f_bavail = 0; + sbp->f_files = mfs->m_inodes_count; + sbp->f_favail = sbp->f_ffree = mfs->m_free_inodes_count; + copy_statfs_info(sbp, mp); + + return (0); +} + +/* + * Write a block group descriptor back to disk with updated checksum. 
+ */
+int
+ext4fs_bgd_write(struct m_ext4fs *fs, struct vnode *devvp, u_int32_t group)
+{
+	struct ext4fs_block_group_descriptor *desc = &fs->m_gd[group];
+	struct buf *bp;
+	u_int32_t per_block, table_block, byte_off;
+	daddr_t blkno;
+	int error;
+
+	/* Locate the descriptor inside the on-disk descriptor table. */
+	per_block = fs->m_block_size / sizeof(*desc);
+	table_block = group / per_block;
+	byte_off = (group % per_block) * sizeof(*desc);
+
+	/* The table begins in the block following the superblock. */
+	blkno = (fs->m_first_data_block + 1 + table_block) <<
+	    fs->m_fs_block_to_disk_block;
+
+	if ((error = bread(devvp, blkno, fs->m_block_size, &bp)) != 0) {
+		brelse(bp);
+		return (error);
+	}
+
+	/* Refresh the checksum of the in-memory copy first ... */
+	desc->bgd_checksum = htole16(ext4fs_bgd_csum(fs, desc, group));
+
+	/* ... then patch it into the table block as a delayed write. */
+	memcpy((char *)bp->b_data + byte_off, desc, sizeof(*desc));
+	bdwrite(bp);
+
+	return (0);
+}
+
+/*
+ * Flush the in-memory superblock to its fixed location on the device.
+ * The free block and inode counters, the write time and the state are
+ * refreshed in the raw copy and the checksum is recomputed before the
+ * synchronous write.  Returns the bread() or bwrite() error, if any.
+ */
+int
+ext4fs_sbwrite(struct mount *mp)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct m_ext4fs *fs = ump->um_e4fs;
+	struct ext4fs *raw = &fs->m_sble;
+	struct timespec now;
+	struct buf *bp;
+	int error;
+
+	/* Counters, split into low and high halves. */
+	raw->sb_free_blocks_count_lo =
+	    htole32((u_int32_t)fs->m_free_blocks_count);
+	raw->sb_free_blocks_count_hi =
+	    htole32((u_int32_t)(fs->m_free_blocks_count >> 32));
+	raw->sb_free_inodes_count = htole32(fs->m_free_inodes_count);
+
+	/* Time of this write. */
+	getnanotime(&now);
+	raw->sb_write_time_lo = htole32((u_int32_t)now.tv_sec);
+
+	raw->sb_state = htole16(fs->m_state);
+
+	/* The checksum must be computed last, over the final contents. */
+	raw->sb_checksum = htole32(ext4fs_sb_csum(raw));
+
+	error = bread(ump->um_devvp,
+	    (daddr_t)(EXT4FS_SUPER_BLOCK_OFFSET / DEV_BSIZE),
+	    EXT4FS_SUPER_BLOCK_SIZE, &bp);
+	if (error != 0) {
+		brelse(bp);
+		return (error);
+	}
+
+	memcpy(bp->b_data, raw, sizeof(struct ext4fs));
+	error = bwrite(bp);
+	return (error);
+}
+
+static u_long ext4fs_gennumber;
+
+/*
+ * Return the free inode count of a group descriptor, including the
+ * high 16 bits when the 64BIT feature is enabled.
+ */
+static u_int32_t
+ext4fs_gd_free_inodes(struct m_ext4fs *fs,
+    struct ext4fs_block_group_descriptor *gd)
+{
+	u_int32_t n;
+
+	n = letoh16(gd->bgd_free_inodes_count_lo);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		n |= (u_int32_t)letoh16(gd->bgd_free_inodes_count_hi) << 16;
+	return (n);
+}
+
+/*
+ * Give back an inode bitmap bit that was set by ext4fs_inode_alloc()
+ * before any counter was adjusted.  Only the bitmap and its checksum
+ * are touched.  Errors are ignored: this is a best-effort rollback.
+ */
+static void
+ext4fs_inode_bitmap_release(struct m_ext4fs *fs, struct vnode *devvp,
+    u_int32_t group, u_int32_t ino_in_group)
+{
+	struct ext4fs_block_group_descriptor *gd = &fs->m_gd[group];
+	struct buf *bp;
+	u_int64_t bitmap_blk;
+	u_int32_t icsum;
+	char *ibp;
+
+	bitmap_blk = letoh32(gd->bgd_inode_bitmap_block_lo);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		bitmap_blk |= (u_int64_t)
+		    letoh32(gd->bgd_inode_bitmap_block_hi) << 32;
+
+	if (bread(devvp, (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk),
+	    fs->m_block_size, &bp) != 0) {
+		brelse(bp);
+		return;
+	}
+
+	ibp = (char *)bp->b_data;
+	clrbit(ibp, ino_in_group);
+
+	icsum = ext4fs_bitmap_csum(fs, group, ibp,
+	    fs->m_inodes_per_group / 8);
+	gd->bgd_inode_bitmap_checksum_lo = htole16(icsum & 0xFFFF);
+	if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+		gd->bgd_inode_bitmap_checksum_hi =
+		    htole16((icsum >> 16) & 0xFFFF);
+
+	(void)bwrite(bp);
+}
+
+/*
+ * Allocate an inode in the file system.
+ *
+ * pip:  inode of the parent directory; supplies the mount and the
+ *       preferred group for non-directories.
+ * mode: file mode of the new inode; directories go to the group with
+ *       the most free inodes.
+ * cred: unused.
+ * vpp:  receives the locked vnode of the new inode, or NULL on error.
+ *
+ * The bitmap bit is written first, then the vnode is obtained, and only
+ * then are the group descriptor and superblock counters updated.  If
+ * obtaining the vnode fails, only the bitmap bit is rolled back, since
+ * no counter has been modified at that point.
+ *
+ * Returns 0, ENOSPC when no inode is free, or an I/O or vget error.
+ */
+int
+ext4fs_inode_alloc(struct inode *pip, mode_t mode, struct ucred *cred,
+    struct vnode **vpp)
+{
+	struct m_ext4fs *fs = pip->i_e4fs;
+	struct vnode *pvp = ITOV(pip);
+	struct ext4fs_block_group_descriptor *gd;
+	struct buf *bp, *tbp;
+	struct inode *ip;
+	u_int32_t group, ngroups, ino_in_group, pbit, tb, it_blocks;
+	u_int32_t best, best_free, fi, g, free_inodes, icsum, i;
+	u_int32_t itu, first_unused, dirs;
+	u_int64_t bitmap_blk, itb;
+	ufsino_t ino;
+	char *ibp;
+	int error;
+
+	*vpp = NULL;
+
+	if (fs->m_free_inodes_count == 0)
+		return (ENOSPC);
+
+	ngroups = fs->m_block_group_count;
+
+	/* Pick starting group */
+	if ((mode & S_IFMT) == S_IFDIR) {
+		best = 0;
+		best_free = 0;
+		for (i = 0; i < ngroups; i++) {
+			fi = ext4fs_gd_free_inodes(fs, &fs->m_gd[i]);
+			if (fi > best_free) {
+				best_free = fi;
+				best = i;
+			}
+		}
+		group = best;
+	} else {
+		group = (pip->i_number - 1) / fs->m_inodes_per_group;
+	}
+
+	/* Scan groups starting from preferred */
+	for (i = 0; i < ngroups; i++) {
+		g = (group + i) % ngroups;
+		gd = &fs->m_gd[g];
+		free_inodes = ext4fs_gd_free_inodes(fs, gd);
+		if (free_inodes == 0)
+			continue;
+
+		/* Read inode bitmap */
+		bitmap_blk = letoh32(gd->bgd_inode_bitmap_block_lo);
+		if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+			bitmap_blk |= (u_int64_t)
+			    letoh32(gd->bgd_inode_bitmap_block_hi) << 32;
+
+		error = bread(pip->i_devvp,
+		    (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk),
+		    fs->m_block_size, &bp);
+		if (error) {
+			brelse(bp);
+			continue;
+		}
+
+		ibp = (char *)bp->b_data;
+
+		/*
+		 * If INODE_UNINIT, the bitmap is stale.  Zero it,
+		 * set padding bits, and zero all inode table blocks.
+		 */
+		if (letoh16(gd->bgd_flags) &
+		    EXT4FS_BGD_FLAG_INODE_UNINIT) {
+			memset(ibp, 0, fs->m_block_size);
+			for (pbit = fs->m_inodes_per_group;
+			    pbit < fs->m_block_size * 8; pbit++)
+				setbit(ibp, pbit);
+
+			itb = letoh32(gd->bgd_inode_table_block_lo);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				itb |= (u_int64_t)letoh32(
+				    gd->bgd_inode_table_block_hi) << 32;
+			it_blocks = fs->m_inode_table_blocks_per_group;
+			for (tb = 0; tb < it_blocks; tb++) {
+				error = bread(pip->i_devvp,
+				    (daddr_t)EXT4FS_FSBTODB(fs,
+				    itb + tb),
+				    fs->m_block_size, &tbp);
+				if (error) {
+					brelse(tbp);
+					continue;
+				}
+				memset(tbp->b_data, 0,
+				    fs->m_block_size);
+				/* NOTE(review): write errors are ignored. */
+				error = bwrite(tbp);
+			}
+		}
+
+		/* Find free inode bit */
+		for (ino_in_group = 0;
+		    ino_in_group < fs->m_inodes_per_group; ino_in_group++) {
+			if (isclr(ibp, ino_in_group))
+				break;
+		}
+		if (ino_in_group >= fs->m_inodes_per_group) {
+			/* Descriptor claimed free inodes but none found. */
+			brelse(bp);
+			continue;
+		}
+
+		setbit(ibp, ino_in_group);
+
+		icsum = ext4fs_bitmap_csum(fs, g, ibp,
+		    fs->m_inodes_per_group / 8);
+		gd->bgd_inode_bitmap_checksum_lo = htole16(icsum & 0xFFFF);
+		if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+			gd->bgd_inode_bitmap_checksum_hi =
+			    htole16((icsum >> 16) & 0xFFFF);
+
+		error = bwrite(bp);
+		if (error)
+			return (error);
+
+		/* Compute inode number (1-based) */
+		ino = g * fs->m_inodes_per_group + ino_in_group + 1;
+
+		/*
+		 * Get vnode for new inode.  The descriptor flags and
+		 * unused count are still untouched here, so the vget
+		 * path can tell that the slot was never initialized.
+		 */
+		error = VFS_VGET(pvp->v_mount, ino, vpp);
+		if (error) {
+			/*
+			 * No counter has been changed yet: undo only
+			 * the bitmap bit.  Calling ext4fs_inode_free()
+			 * here would bump the free counts (and drop the
+			 * directory count) without a matching decrement.
+			 */
+			ext4fs_inode_bitmap_release(fs, pip->i_devvp, g,
+			    ino_in_group);
+			*vpp = NULL;
+			return (error);
+		}
+
+		/* Clear INODE_UNINIT flag if set */
+		gd->bgd_flags = htole16(letoh16(gd->bgd_flags) &
+		    ~EXT4FS_BGD_FLAG_INODE_UNINIT);
+
+		/* Update BGD free count */
+		free_inodes--;
+		gd->bgd_free_inodes_count_lo = htole16(free_inodes & 0xFFFF);
+		if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+			gd->bgd_free_inodes_count_hi =
+			    htole16((free_inodes >> 16) & 0xFFFF);
+
+		if ((mode & S_IFMT) == S_IFDIR) {
+			dirs = letoh16(gd->bgd_used_dirs_count_lo);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				dirs |= (u_int32_t)letoh16(
+				    gd->bgd_used_dirs_count_hi) << 16;
+			dirs++;
+			gd->bgd_used_dirs_count_lo = htole16(dirs & 0xFFFF);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				gd->bgd_used_dirs_count_hi =
+				    htole16((dirs >> 16) & 0xFFFF);
+		}
+
+		/* Shrink the never-used tail of the inode table if needed. */
+		itu = letoh16(gd->bgd_inode_table_unused_lo);
+		if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT)
+			itu |= (u_int32_t)
+			    letoh16(gd->bgd_inode_table_unused_hi) << 16;
+		first_unused = fs->m_inodes_per_group - itu;
+		if (ino_in_group >= first_unused) {
+			itu = fs->m_inodes_per_group - ino_in_group - 1;
+			gd->bgd_inode_table_unused_lo =
+			    htole16(itu & 0xFFFF);
+			if (fs->m_feature_incompat &
+			    EXT4FS_FEATURE_INCOMPAT_64BIT)
+				gd->bgd_inode_table_unused_hi =
+				    htole16((itu >> 16) & 0xFFFF);
+		}
+
+		ext4fs_bgd_write(fs, pip->i_devvp, g);
+
+		/* Update superblock counters */
+		fs->m_free_inodes_count--;
+		fs->m_sble.sb_free_inodes_count =
+		    htole32(fs->m_free_inodes_count);
+		fs->m_fs_was_modified = 1;
+
+		ip = VTOI(*vpp);
+
+		/* Zero the dinode */
+		memset(ip->i_e4din, 0, sizeof(struct ext4fs_dinode_256));
+
+		/* Initialize extent header */
+		ip->i_e4din->dinode.i_extent_header.eh_magic =
+		    htole16(EXT4FS_EXTENT_HEADER_MAGIC);
+		ip->i_e4din->dinode.i_extent_header.eh_entries = htole16(0);
+		ip->i_e4din->dinode.i_extent_header.eh_max = htole16(4);
+		ip->i_e4din->dinode.i_extent_header.eh_depth = htole16(0);
+		ip->i_e4din->dinode.i_flags =
+		    htole32(EXTFS_INODE_FLAG_EXTENTS);
+
+		/* Set extra_isize */
+		ip->i_e4din->dinode.i_extra_isize =
+		    htole16(sizeof(struct ext4fs_dinode) - 128);
+
+		/* Set generation number */
+		if (++ext4fs_gennumber < (u_long)gettime())
+			ext4fs_gennumber = gettime();
+		ip->i_e4din->dinode.i_nfs_generation =
+		    htole32(ext4fs_gennumber);
+
+		return (0);
+	}
+
+	return (ENOSPC);
+}
+
+/*
+ * Free an inode.
+ */ +void +ext4fs_inode_free(struct inode *pip, ufsino_t ino, mode_t mode) +{ + struct m_ext4fs *fs = pip->i_e4fs; + struct ext4fs_block_group_descriptor *gd; + struct buf *bp; + u_int64_t bitmap_blk; + u_int32_t group, ino_in_group, free_inodes, icsum; + char *ibp; + int error; + + group = (ino - 1) / fs->m_inodes_per_group; + ino_in_group = (ino - 1) % fs->m_inodes_per_group; + gd = &fs->m_gd[group]; + + bitmap_blk = letoh32(gd->bgd_inode_bitmap_block_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + bitmap_blk |= + (u_int64_t)letoh32(gd->bgd_inode_bitmap_block_hi) << 32; + + error = bread(pip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + return; + } + + ibp = (char *)bp->b_data; + clrbit(ibp, ino_in_group); + + icsum = ext4fs_bitmap_csum(fs, group, ibp, + fs->m_inodes_per_group / 8); + gd->bgd_inode_bitmap_checksum_lo = htole16(icsum & 0xFFFF); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + gd->bgd_inode_bitmap_checksum_hi = + htole16((icsum >> 16) & 0xFFFF); + + error = bwrite(bp); + if (error) + return; + + /* Update BGD */ + free_inodes = letoh16(gd->bgd_free_inodes_count_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + free_inodes |= + (u_int32_t)letoh16(gd->bgd_free_inodes_count_hi) << 16; + free_inodes++; + gd->bgd_free_inodes_count_lo = htole16(free_inodes & 0xFFFF); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + gd->bgd_free_inodes_count_hi = + htole16((free_inodes >> 16) & 0xFFFF); + + if ((mode & S_IFMT) == S_IFDIR) { + u_int32_t dirs; + dirs = letoh16(gd->bgd_used_dirs_count_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + dirs |= (u_int32_t) + letoh16(gd->bgd_used_dirs_count_hi) << 16; + dirs--; + gd->bgd_used_dirs_count_lo = htole16(dirs & 0xFFFF); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + gd->bgd_used_dirs_count_hi = + htole16((dirs >> 16) & 0xFFFF); + } + + 
ext4fs_bgd_write(fs, pip->i_devvp, group); + + /* Update superblock counters */ + fs->m_free_inodes_count++; + fs->m_sble.sb_free_inodes_count = + htole32(fs->m_free_inodes_count); + fs->m_fs_was_modified = 1; +} + +static int +ext4fs_sync_vnode(struct vnode *vp, void *arg) +{ + struct ext4fs_sync_args *esa = arg; + struct inode *ip; + int error, s, skip; + + if (vp->v_type == VNON) + return (0); + + ip = VTOI(vp); + if (ip == NULL || ip->i_e4din == NULL) + return (0); + + s = splbio(); + skip = (ip->i_flag & + (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && + LIST_EMPTY(&vp->v_dirtyblkhd); + splx(s); + + if (skip) + return (0); + + if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT)) + return (0); + + if ((error = VOP_FSYNC(vp, esa->cred, esa->waitfor, esa->p)) != 0) + esa->allerror = error; + + vput(vp); + return (0); +} + +int +ext4fs_sync(struct mount *mp, int waitfor, int stall, + struct ucred *cred, struct proc *p) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct m_ext4fs *fs = ump->um_e4fs; + struct ext4fs_sync_args esa; + int error; + + if (fs->m_read_only) + return (0); + + esa.p = p; + esa.cred = cred; + esa.allerror = 0; + esa.waitfor = waitfor; + + vfs_mount_foreach_vnode(mp, ext4fs_sync_vnode, &esa); + + if (waitfor != MNT_LAZY) { + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p))) + esa.allerror = error; + VOP_UNLOCK(ump->um_devvp); + } + + /* Write superblock if modified */ + if (fs->m_fs_was_modified) { + if ((error = ext4fs_sbwrite(mp))) + esa.allerror = error; + } + + return (esa.allerror); +} + +int +ext4fs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen, struct proc *p) +{ + (void)name; + (void)namelen; + (void)oldp; + (void)oldlenp; + (void)newp; + (void)newlen; + (void)p; + printf("ext4fs_sysctl: not implemented\n"); + return (EOPNOTSUPP); +} + +int +ext4fs_unmount(struct mount *mp, int mntflags, struct proc *p) +{ + struct ufsmount *ump; + 
struct m_ext4fs *mfs; + int error, flags; + + flags = 0; + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + if ((error = ext4fs_flushfiles(mp, flags, p)) != 0) + return (error); + ump = VFSTOUFS(mp); + mfs = ump->um_e4fs; + + if (!mfs->m_read_only && mfs->m_fs_was_modified) { + mfs->m_state = EXT4FS_STATE_VALID; + ext4fs_sbwrite(mp); + } + + if (ump->um_devvp->v_type != VBAD) + ump->um_devvp->v_specmountpoint = NULL; + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + (void)VOP_CLOSE(ump->um_devvp, mfs->m_read_only ? FREAD : + FREAD|FWRITE, NOCRED, p); + vput(ump->um_devvp); + if (mfs->m_gd != NULL) { + size_t gd_size = mfs->m_block_group_count * + sizeof(struct ext4fs_block_group_descriptor); + free(mfs->m_gd, M_UFSMNT, gd_size); + } + free(mfs, M_UFSMNT, sizeof *mfs); + free(ump, M_UFSMNT, sizeof *ump); + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + return (0); +} + +int +ext4fs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) +{ + struct m_ext4fs *fs; + struct inode *ip; + struct ufsmount *ump; + struct buf *bp; + struct vnode *vp; + struct ext4fs_block_group_descriptor *gd; + struct ext4fs_dinode *dp; + dev_t dev; + daddr_t disk_block; + u_int64_t inode_table_block; + u_int32_t inode_group, inode_index, block_in_table, offset_in_block; + u_int32_t itable_unused; + u_int16_t bgd_flags, imode; + int error; + + if (ino > (ufsino_t)-1) + panic("ext4fs_vget: alien ino_t %llu", + (unsigned long long)ino); + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + fs = ump->um_e4fs; + + retry: + if ((*vpp = ufs_ihashget(dev, ino)) != NULL) { + return (0); + } + + /* Allocate a new vnode/inode. 
*/ + if ((error = getnewvnode(VT_EXT4FS, mp, &ext4fs_vops, &vp)) != 0) { + *vpp = NULL; + return (error); + } + ip = pool_get(&ext4fs_inode_pool, PR_WAITOK|PR_ZERO); + rrw_init_flags(&ip->i_lock, "inode", RWL_DUPOK | RWL_IS_VNODE); + vp->v_data = ip; + ip->i_vnode = vp; + ip->i_ump = ump; + ip->i_e4fs = fs; + ip->i_dev = dev; + ip->i_number = ino; + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + error = ufs_ihashins(ip); + + if (error) { + /* + * The vnode was locked by ufs_ihashins, then unlocked on error. + * We need to properly clean up the inode and vnode. + * vrele will trigger reclaim which will free the inode. + */ + vrele(vp); + + if (error == EEXIST) + goto retry; + + return (error); + } + + vref(ip->i_devvp); + + /* Calculate inode location on disk */ + if (ino == 0 || ino > fs->m_inodes_count) { + vput(vp); + *vpp = NULL; + return (ESTALE); + } + inode_group = (ino - 1) / fs->m_inodes_per_group; + if (inode_group >= fs->m_block_group_count) { + vput(vp); + *vpp = NULL; + return (ESTALE); + } + inode_index = (ino - 1) % fs->m_inodes_per_group; + block_in_table = inode_index / fs->m_inodes_per_block; + offset_in_block = (inode_index % fs->m_inodes_per_block) * + fs->m_inode_size; + + gd = &fs->m_gd[inode_group]; + inode_table_block = letoh32(gd->bgd_inode_table_block_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + inode_table_block |= (u_int64_t) + letoh32(gd->bgd_inode_table_block_hi) << 32; + + /* Read the block containing this inode */ + disk_block = (inode_table_block + block_in_table) << + fs->m_fs_block_to_disk_block; + error = bread(ump->um_devvp, disk_block, fs->m_block_size, &bp); + if (error) { + vput(vp); + brelse(bp); + *vpp = NULL; + return (error); + } + + dp = (struct ext4fs_dinode *)((char *)bp->b_data + 
offset_in_block); + + /* Allocate space for on-disk inode */ + ip->i_e4din = pool_get(&ext4fs_dinode_pool, PR_WAITOK|PR_ZERO); + + /* + * If the group has INODE_UNINIT set, or the inode is in the + * unused portion of the inode table, the on-disk data is + * garbage. Keep the zeroed buffer and skip checksum verification. + */ + bgd_flags = letoh16(gd->bgd_flags); + itable_unused = letoh16(gd->bgd_inode_table_unused_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + itable_unused |= (u_int32_t) + letoh16(gd->bgd_inode_table_unused_hi) << 16; + if ((bgd_flags & EXT4FS_BGD_FLAG_INODE_UNINIT) || + inode_index >= fs->m_inodes_per_group - itable_unused) { + memset(dp, 0, fs->m_inode_size); + error = bwrite(bp); + if (error) { + pool_put(&ext4fs_dinode_pool, ip->i_e4din); + ip->i_e4din = NULL; + vput(vp); + *vpp = NULL; + return (error); + } + } else { + memcpy(ip->i_e4din, dp, fs->m_inode_size); + brelse(bp); + + /* Verify inode checksum for initialized slots */ + if (letoh16(ip->i_e4din->dinode.i_mode) != 0 || + letoh16(ip->i_e4din->dinode.i_links_count) != 0 || + letoh32(ip->i_e4din->dinode.i_dtime) != 0) { + error = ext4fs_inode_csum_verify(fs, + ip->i_e4din, ino); + if (error) { + pool_put(&ext4fs_dinode_pool, + ip->i_e4din); + ip->i_e4din = NULL; + vput(vp); + *vpp = NULL; + return (error); + } + } + } + + /* Set vnode type based on inode mode */ + imode = letoh16(ip->i_e4din->dinode.i_mode); + switch (imode & S_IFMT) { + case S_IFDIR: + vp->v_type = VDIR; + break; + case S_IFREG: + vp->v_type = VREG; + break; + case S_IFLNK: + vp->v_type = VLNK; + break; + case S_IFBLK: + vp->v_type = VBLK; + break; + case S_IFCHR: + vp->v_type = VCHR; + break; + case S_IFIFO: + vp->v_type = VFIFO; + break; + case S_IFSOCK: + vp->v_type = VSOCK; + break; + default: + vp->v_type = VNON; + break; + } + + /* Set effective link count */ + ip->i_effnlink = letoh16(ip->i_e4din->dinode.i_links_count); + + /* Set VROOT flag for root inode */ + if (ip->i_number == 
EXT4FS_INODE_ROOT_DIR) + vp->v_flag |= VROOT; + + /* If the inode was deleted, reset all fields */ + if (letoh32(ip->i_e4din->dinode.i_dtime) != 0) { + vp->v_type = VNON; + ip->i_effnlink = 0; + } + + *vpp = vp; + return (0); +} + +int +ext4fs_vptofh(struct vnode *vp, struct fid *fhp) +{ + (void)vp; + (void)fhp; + printf("ext4fs_vptofh: not implemented\n"); + return (EOPNOTSUPP); +} diff --git a/sys/ufs/ext4fs/ext4fs_vnops.c b/sys/ufs/ext4fs/ext4fs_vnops.c new file mode 100644 index 000000000..4968514a8 --- /dev/null +++ b/sys/ufs/ext4fs/ext4fs_vnops.c @@ -0,0 +1,3842 @@ +/* + * Copyright (c) 2025 kmx.io. + * Copyright (c) 1997 Manuel Bouyer. + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Modified for ext4fs by kmx.io. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* Convert ext4 directory entry file type to BSD dirent type */ +static const u_int8_t ext4fs_type_to_dt[EXT4FS_FT_MAX] = { + [EXT4FS_FT_UNKNOWN] = DT_UNKNOWN, + [EXT4FS_FT_REG_FILE] = DT_REG, + [EXT4FS_FT_DIR] = DT_DIR, + [EXT4FS_FT_CHRDEV] = DT_CHR, + [EXT4FS_FT_BLKDEV] = DT_BLK, + [EXT4FS_FT_FIFO] = DT_FIFO, + [EXT4FS_FT_SOCK] = DT_SOCK, + [EXT4FS_FT_SYMLINK] = DT_LNK, +}; + +/* + * Look up the physical block number for a given logical block number + * using the extent tree in the inode. + * Returns 0 on success with the physical block stored in *pblk. 
+ */ +static int +ext4fs_extent_pblk(struct inode *ip, u_int64_t lbn, u_int64_t *pblk, + u_int64_t *ncontig) +{ + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct ext4fs_extent_header *eh; + struct m_ext4fs *fs = ip->i_e4fs; + struct buf *bp = NULL; + u_int16_t entries, depth; + int error, found, i; + + /* Start with the extent header in the inode */ + eh = &din->i_extent_header; + if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC) + return (EIO); + + depth = letoh16(eh->eh_depth); + entries = letoh16(eh->eh_entries); + + /* Walk down the extent tree */ + while (depth > 0) { + struct ext4fs_extent_idx *idx; + u_int64_t child_blk; + + /* Index node: find the child that covers lbn */ + idx = (struct ext4fs_extent_idx *)(eh + 1); + found = -1; + for (i = 0; i < (int)entries; i++) { + if (letoh32(idx[i].ei_block) <= lbn) + found = i; + else + break; + } + if (found < 0) { + if (bp != NULL) + brelse(bp); + return (EIO); + } + + /* Read the child node block */ + child_blk = letoh32(idx[found].ei_leaf_lo); + child_blk |= (u_int64_t)letoh16(idx[found].ei_leaf_hi) << 32; + + if (bp != NULL) + brelse(bp); + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, child_blk), + fs->m_block_size, &bp); + if (error) { + if (bp != NULL) + brelse(bp); + return (error); + } + + eh = (struct ext4fs_extent_header *)bp->b_data; + if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC) { + brelse(bp); + return (EIO); + } + depth = letoh16(eh->eh_depth); + entries = letoh16(eh->eh_entries); + } + + /* Leaf node: search for the extent containing lbn */ + { + struct ext4fs_extent *ext; + ext = (struct ext4fs_extent *)(eh + 1); + for (i = 0; i < (int)entries; i++) { + u_int32_t e_block = letoh32(ext[i].e_block); + u_int16_t e_len = letoh16(ext[i].e_len); + + /* High bit of e_len marks uninitialized extents */ + if (e_len > 32768) + e_len -= 32768; + + if (lbn >= e_block && lbn < e_block + e_len) { + u_int64_t start = letoh32(ext[i].e_start_lo); + start |= + 
(u_int64_t)letoh16(ext[i].e_start_hi) << 32; + *pblk = start + (lbn - e_block); + if (ncontig != NULL) + *ncontig = e_len - (lbn - e_block); + if (bp != NULL) + brelse(bp); + return (0); + } + } + } + + if (bp != NULL) + brelse(bp); + + /* Block not covered by any extent — hole */ + *pblk = 0; + if (ncontig != NULL) + *ncontig = 1; + return (0); +} + +/* + * Write inode back to disk with checksum update. + */ +int +ext4fs_update(struct inode *ip, int waitfor) +{ + struct m_ext4fs *fs = ip->i_e4fs; + struct buf *bp; + u_int32_t inode_group, inode_index, block_in_table, offset_in_block; + struct ext4fs_block_group_descriptor *gd; + u_int64_t inode_table_block; + daddr_t disk_block; + u_int32_t csum; + int error; + + if (ITOV(ip)->v_mount->mnt_flag & MNT_RDONLY) + return (0); + + EXT4FS_ITIMES(ip); + + if ((ip->i_flag & IN_MODIFIED) == 0) { + return (0); + } + + ip->i_flag &= ~IN_MODIFIED; + + /* Locate inode on disk */ + inode_group = (ip->i_number - 1) / fs->m_inodes_per_group; + inode_index = (ip->i_number - 1) % fs->m_inodes_per_group; + block_in_table = inode_index / fs->m_inodes_per_block; + offset_in_block = (inode_index % fs->m_inodes_per_block) * + fs->m_inode_size; + + gd = &fs->m_gd[inode_group]; + inode_table_block = letoh32(gd->bgd_inode_table_block_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + inode_table_block |= + (u_int64_t)letoh32(gd->bgd_inode_table_block_hi) << 32; + + disk_block = (inode_table_block + block_in_table) << + fs->m_fs_block_to_disk_block; + + error = bread(ip->i_devvp, disk_block, fs->m_block_size, &bp); + if (error) { + brelse(bp); + return (error); + } + + /* + * Verify extent header integrity before writing. + * If the inode uses extents (not a fast symlink), the magic + * must be valid. Refuse to persist corruption. 
+ */ + { + u_int16_t wr_mode = letoh16(ip->i_e4din->dinode.i_mode); + u_int32_t wr_flags = letoh32(ip->i_e4din->dinode.i_flags); + u_int16_t wr_magic = + letoh16(ip->i_e4din->dinode.i_extent_header.eh_magic); + if (wr_mode != 0 && + (wr_flags & EXTFS_INODE_FLAG_EXTENTS) && + wr_magic != EXT4FS_EXTENT_HEADER_MAGIC) { + printf("ext4fs_update: REFUSING to write ino=%u " + "with corrupt extent header! " + "magic=0x%x mode=0%o flags=0x%x\n", + ip->i_number, wr_magic, wr_mode, wr_flags); + brelse(bp); + return (EIO); + } + } + + /* Recompute inode checksum */ + csum = ext4fs_inode_csum(fs, ip->i_e4din, ip->i_number); + ip->i_e4din->dinode.i_checksum_lo = htole16(csum & 0xFFFF); + ip->i_e4din->dinode.i_checksum_hi = htole16((csum >> 16) & 0xFFFF); + + /* Copy inode to buffer */ + memcpy((char *)bp->b_data + offset_in_block, ip->i_e4din, + fs->m_inode_size); + + if (waitfor) { + error = bwrite(bp); + return (error); + } + + bdwrite(bp); + return (0); +} + +/* + * Set inode size (both low and high 32-bit fields). + */ +void +ext4fs_setsize(struct inode *ip, u_int64_t size) +{ + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + + din->i_size_lo = htole32((u_int32_t)size); + din->i_size_hi = htole32((u_int32_t)(size >> 32)); +} + +/* + * Allocate a filesystem block. + * Tries the group of the goal block first, then scans all groups. 
+ */ +int +ext4fs_blkalloc(struct inode *ip, u_int64_t goal, u_int32_t count, + u_int64_t *bnp, u_int32_t *countp) +{ + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_block_group_descriptor *gd; + struct buf *bp, *dbp; + u_int64_t bitmap_blk, grp_start, bb, ib, itb; + u_int32_t group, ngroups, g, blk_in_group, free_blocks; + u_int32_t it_blocks, mb, pbit, rb, bcsum; + u_int32_t start_bit, nalloced, k; + u_int32_t *dind; + char *bbp; + int error, i, j, has_sb; + + *bnp = 0; + *countp = 0; + + if (count == 0) + count = 1; + + if (fs->m_free_blocks_count == 0) + return (ENOSPC); + + ngroups = fs->m_block_group_count; + + /* Pick starting group from goal */ + if (goal >= fs->m_first_data_block && goal < fs->m_blocks_count) + group = (goal - fs->m_first_data_block) / + fs->m_blocks_per_group; + else + group = (ip->i_number - 1) / fs->m_inodes_per_group; + + for (i = 0; i < ngroups; i++) { + g = (group + i) % ngroups; + gd = &fs->m_gd[g]; + + free_blocks = letoh16(gd->bgd_free_blocks_count_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + free_blocks |= (u_int32_t) + letoh16(gd->bgd_free_blocks_count_hi) << 16; + if (free_blocks == 0) + continue; + + /* Read block bitmap */ + bitmap_blk = letoh32(gd->bgd_block_bitmap_block_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + bitmap_blk |= (u_int64_t) + letoh32(gd->bgd_block_bitmap_block_hi) << 32; + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + continue; + } + bbp = (char *)bp->b_data; + + /* + * If BLOCK_UNINIT is set, the on-disk bitmap block + * may contain garbage. Zero it and mark metadata + * blocks (bitmaps, inode table) as used. 
+ */ + if (letoh16(gd->bgd_flags) & + EXT4FS_BGD_FLAG_BLOCK_UNINIT) { + grp_start = (u_int64_t)g * + fs->m_blocks_per_group + + fs->m_first_data_block; + memset(bbp, 0, fs->m_block_size); + /* + * Mark superblock, GDT, and reserved + * GDT blocks for groups that have them. + */ + has_sb = 0; + if (!(fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_SPARSE_SUPER)) + has_sb = 1; + else if (g == 0 || g == 1) + has_sb = 1; + else { + u_int64_t n; + for (n = 3; n <= g; n *= 3) + if (n == g) has_sb = 1; + for (n = 5; n <= g; n *= 5) + if (n == g) has_sb = 1; + for (n = 7; n <= g; n *= 7) + if (n == g) has_sb = 1; + } + if (has_sb) { + u_int32_t overhead = 1 + + fs->m_block_group_descriptor_blocks_count + + fs->m_reserved_bgdt_blocks; + for (mb = 0; mb < overhead; mb++) + setbit(bbp, mb); + } + /* Block bitmap */ + bb = letoh32(gd->bgd_block_bitmap_block_lo); + if (bb >= grp_start && + bb < grp_start + fs->m_blocks_per_group) + setbit(bbp, bb - grp_start); + /* Inode bitmap */ + ib = letoh32(gd->bgd_inode_bitmap_block_lo); + if (ib >= grp_start && + ib < grp_start + fs->m_blocks_per_group) + setbit(bbp, ib - grp_start); + /* Inode table */ + itb = letoh32(gd->bgd_inode_table_block_lo); + it_blocks = (fs->m_inodes_per_group * + fs->m_inode_size + fs->m_block_size - 1) / + fs->m_block_size; + for (mb = 0; mb < it_blocks; mb++) { + u_int64_t b = itb + mb; + if (b >= grp_start && + b < grp_start + + fs->m_blocks_per_group) + setbit(bbp, b - grp_start); + } + for (pbit = fs->m_blocks_per_group; + pbit < fs->m_block_size * 8; pbit++) + setbit(bbp, pbit); + /* Mark resize inode (inode 7) blocks */ + if (fs->m_resize_dind_block != 0) { + if (fs->m_resize_dind_block >= grp_start && + fs->m_resize_dind_block < + grp_start + fs->m_blocks_per_group) + setbit(bbp, + fs->m_resize_dind_block - + grp_start); + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, + fs->m_resize_dind_block), + fs->m_block_size, &dbp); + if (!error) { + dind = (u_int32_t *)dbp->b_data; + for (j = 
0; + j < fs->m_block_size / 4; + j++) { + rb = letoh32(dind[j]); + if (rb == 0) + continue; + if (rb >= grp_start && + rb < grp_start + + fs->m_blocks_per_group) + setbit(bbp, + rb - grp_start); + } + brelse(dbp); + } else { + brelse(dbp); + } + } + gd->bgd_flags = htole16(letoh16( + gd->bgd_flags) & + ~EXT4FS_BGD_FLAG_BLOCK_UNINIT); + ext4fs_bgd_write(fs, ip->i_devvp, g); + } + + /* Start scan from goal bit if goal is in this group */ + start_bit = 0; + if (goal >= fs->m_first_data_block && + goal < fs->m_blocks_count) { + u_int32_t goal_group = (goal - fs->m_first_data_block) / + fs->m_blocks_per_group; + if (goal_group == g) + start_bit = (goal - fs->m_first_data_block) % + fs->m_blocks_per_group; + } + + /* Scan bitmap for free block(s) */ + for (blk_in_group = start_bit; + blk_in_group < fs->m_blocks_per_group; + blk_in_group++) { + if (isclr(bbp, blk_in_group)) { + /* Found first free bit; grab contiguous run */ + nalloced = 1; + setbit(bbp, blk_in_group); + for (k = 1; k < count && + blk_in_group + k < fs->m_blocks_per_group && + isclr(bbp, blk_in_group + k); k++) { + setbit(bbp, blk_in_group + k); + nalloced++; + } + + bcsum = ext4fs_bitmap_csum(fs, g, bbp, + fs->m_block_size); + gd->bgd_block_bitmap_checksum_lo = + htole16(bcsum & 0xFFFF); + if (fs->m_feature_incompat & + EXT4FS_FEATURE_INCOMPAT_64BIT) + gd->bgd_block_bitmap_checksum_hi + = htole16( + (bcsum >> 16) & 0xFFFF); + + bdwrite(bp); + + /* Update BGD */ + free_blocks -= nalloced; + gd->bgd_free_blocks_count_lo = + htole16(free_blocks & 0xFFFF); + if (fs->m_feature_incompat & + EXT4FS_FEATURE_INCOMPAT_64BIT) + gd->bgd_free_blocks_count_hi = + htole16((free_blocks >> 16) & + 0xFFFF); + + ext4fs_bgd_write(fs, ip->i_devvp, g); + + /* Update superblock counters */ + fs->m_free_blocks_count -= nalloced; + fs->m_sble.sb_free_blocks_count_lo = + htole32((u_int32_t) + fs->m_free_blocks_count); + fs->m_sble.sb_free_blocks_count_hi = + htole32((u_int32_t) + (fs->m_free_blocks_count >> 32)); + 
fs->m_fs_was_modified = 1; + + *bnp = (u_int64_t)g * fs->m_blocks_per_group + + blk_in_group + fs->m_first_data_block; + *countp = nalloced; + + return (0); + } + } + + brelse(bp); + } + + return (ENOSPC); +} + +/* + * Free a filesystem block. + */ +void +ext4fs_blkfree(struct inode *ip, u_int64_t bno) +{ + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_block_group_descriptor *gd; + struct buf *bp; + u_int64_t bitmap_blk; + u_int32_t group, blk_in_group, free_blocks; + char *bbp; + int error; + + if (bno < fs->m_first_data_block || bno >= fs->m_blocks_count) + return; + + group = (bno - fs->m_first_data_block) / fs->m_blocks_per_group; + if (group >= fs->m_block_group_count) + return; + blk_in_group = (bno - fs->m_first_data_block) % + fs->m_blocks_per_group; + gd = &fs->m_gd[group]; + + /* Read block bitmap */ + bitmap_blk = letoh32(gd->bgd_block_bitmap_block_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + bitmap_blk |= + (u_int64_t)letoh32(gd->bgd_block_bitmap_block_hi) << 32; + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + return; + } + + bbp = (char *)bp->b_data; + clrbit(bbp, blk_in_group); + + /* Update block bitmap checksum in BGD */ + { + u_int32_t bcsum = ext4fs_bitmap_csum(fs, group, bbp, + fs->m_block_size); + gd->bgd_block_bitmap_checksum_lo = htole16(bcsum & 0xFFFF); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + gd->bgd_block_bitmap_checksum_hi = + htole16((bcsum >> 16) & 0xFFFF); + } + + bdwrite(bp); + + /* Update BGD */ + free_blocks = letoh16(gd->bgd_free_blocks_count_lo); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + free_blocks |= + (u_int32_t)letoh16(gd->bgd_free_blocks_count_hi) << 16; + free_blocks++; + gd->bgd_free_blocks_count_lo = htole16(free_blocks & 0xFFFF); + if (fs->m_feature_incompat & EXT4FS_FEATURE_INCOMPAT_64BIT) + gd->bgd_free_blocks_count_hi = + htole16((free_blocks >> 16) & 0xFFFF); + + 
ext4fs_bgd_write(fs, ip->i_devvp, group); + + /* Update superblock counters */ + fs->m_free_blocks_count++; + fs->m_sble.sb_free_blocks_count_lo = + htole32((u_int32_t)fs->m_free_blocks_count); + fs->m_sble.sb_free_blocks_count_hi = + htole32((u_int32_t)(fs->m_free_blocks_count >> 32)); + fs->m_fs_was_modified = 1; +} + +/* + * Promote a depth-0 extent tree to depth 1. + * Called when the inline extent array is full (4/4 entries). + * Allocates a leaf block, copies the 4 inline extents into it, + * and converts the inode root to an index node with one entry. + */ +static int +ext4fs_extent_grow_tree(struct inode *ip) +{ + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct ext4fs_extent_header *eh = &din->i_extent_header; + struct ext4fs_extent_header *leaf_eh; + struct buf *bp; + u_int64_t leaf_blk; + u_int16_t maxleaf; + u_int64_t i_blocks; + u_int32_t got; + int error; + + if (letoh16(eh->eh_depth) != 0) + return (EIO); + if (letoh16(eh->eh_entries) != 4) + return (EIO); + + /* Allocate a block for the leaf node */ + error = ext4fs_blkalloc(ip, 0, 1, &leaf_blk, &got); + if (error) + return (error); + + /* Get buffer for the new leaf block */ + bp = getblk(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, leaf_blk), + fs->m_block_size, 0, INFSLP); + clrbuf(bp); + + /* Initialize leaf block header */ + maxleaf = (fs->m_block_size - sizeof(struct ext4fs_extent_header)) / + sizeof(struct ext4fs_extent); + leaf_eh = (struct ext4fs_extent_header *)bp->b_data; + leaf_eh->eh_magic = htole16(EXT4FS_EXTENT_HEADER_MAGIC); + leaf_eh->eh_entries = htole16(4); + leaf_eh->eh_max = htole16(maxleaf); + leaf_eh->eh_depth = htole16(0); + leaf_eh->eh_generation = htole32(0); + + /* Copy 4 inline extents into the leaf block */ + memcpy((char *)bp->b_data + sizeof(struct ext4fs_extent_header), + din->i_extent, 4 * sizeof(struct ext4fs_extent)); + + ext4fs_extent_block_csum_set(fs, ip->i_number, din->i_nfs_generation, bp->b_data); + bdwrite(bp); + + 
/* Convert inode root to index node with depth=1 */ + eh->eh_depth = htole16(1); + eh->eh_entries = htole16(1); + /* eh_max stays 4 (4 index entries fit in the inline area) */ + + /* Set up the single index entry pointing to the leaf block */ + { + struct ext4fs_extent_idx *idx = din->i_extent_idx; + idx[0].ei_block = din->i_extent[0].e_block; /* first lbn */ + idx[0].ei_leaf_lo = htole32((u_int32_t)leaf_blk); + idx[0].ei_leaf_hi = htole16((u_int16_t)(leaf_blk >> 32)); + idx[0].ei_unused = 0; + /* Zero remaining index slots */ + memset(&idx[1], 0, 3 * sizeof(struct ext4fs_extent_idx)); + } + + /* Update inode block count for the leaf block */ + i_blocks = letoh32(din->i_blocks_lo) | + ((u_int64_t)letoh16(din->i_blocks_hi) << 32); + i_blocks += fs->m_block_size / DEV_BSIZE; + din->i_blocks_lo = htole32((u_int32_t)i_blocks); + din->i_blocks_hi = htole16((u_int16_t)(i_blocks >> 32)); + + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + return (0); +} + +/* + * Split a full leaf block into two. + * The old leaf keeps the first half, a new leaf gets the second half. + * A new index entry is added to the parent (the inode root). + * Returns ENOSPC if the parent index is also full (depth 2+ needed). 
+ */ +static int +ext4fs_leaf_split(struct inode *ip, struct buf *old_bp, + struct ext4fs_extent_header *old_eh) +{ + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct ext4fs_extent_header *root_eh = &din->i_extent_header; + struct ext4fs_extent_idx *root_idx = din->i_extent_idx; + struct ext4fs_extent_header *new_eh; + struct ext4fs_extent *old_ext, *new_ext; + struct buf *new_bp; + u_int64_t new_blk; + u_int32_t new_first_block; + u_int16_t old_entries, new_entries, maxleaf; + u_int16_t root_entries, root_max; + u_int64_t i_blocks; + u_int32_t got; + int error, i; + + old_entries = letoh16(old_eh->eh_entries); + maxleaf = letoh16(old_eh->eh_max); + old_ext = (struct ext4fs_extent *)(old_eh + 1); + /* Check parent has room for new index entry */ + root_entries = letoh16(root_eh->eh_entries); + root_max = letoh16(root_eh->eh_max); + if (root_entries >= root_max) { + brelse(old_bp); + return (ENOSPC); /* Would need depth 2+, phase 4 */ + } + + /* Allocate block for new leaf */ + error = ext4fs_blkalloc(ip, 0, 1, &new_blk, &got); + if (error) { + brelse(old_bp); + return (error); + } + + /* Split ~50/50 */ + new_entries = old_entries / 2; + old_entries = old_entries - new_entries; + + /* + * Save the first logical block of the new (second) half + * BEFORE we write any buffers, since new_ext will point + * into new_bp->b_data which is consumed by bwrite. 
+ */ + new_first_block = letoh32(old_ext[old_entries].e_block); + + new_bp = getblk(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, new_blk), + fs->m_block_size, 0, INFSLP); + clrbuf(new_bp); + + /* Initialize new leaf */ + new_eh = (struct ext4fs_extent_header *)new_bp->b_data; + new_eh->eh_magic = htole16(EXT4FS_EXTENT_HEADER_MAGIC); + new_eh->eh_entries = htole16(new_entries); + new_eh->eh_max = htole16(maxleaf); + new_eh->eh_depth = htole16(0); + new_eh->eh_generation = htole32(0); + + new_ext = (struct ext4fs_extent *)(new_eh + 1); + memcpy(new_ext, &old_ext[old_entries], + new_entries * sizeof(struct ext4fs_extent)); + + ext4fs_extent_block_csum_set(fs, ip->i_number, din->i_nfs_generation, new_bp->b_data); + bdwrite(new_bp); + + /* Update old leaf */ + old_eh->eh_entries = htole16(old_entries); + ext4fs_extent_block_csum_set(fs, ip->i_number, din->i_nfs_generation, old_bp->b_data); + bdwrite(old_bp); + + /* Add new index entry in parent root (keep sorted by ei_block) */ + { + struct ext4fs_extent_idx entry; + + entry.ei_block = htole32(new_first_block); + entry.ei_leaf_lo = htole32((u_int32_t)new_blk); + entry.ei_leaf_hi = htole16((u_int16_t)(new_blk >> 32)); + entry.ei_unused = 0; + + /* Find insertion point */ + for (i = 0; i < root_entries; i++) { + if (letoh32(root_idx[i].ei_block) > new_first_block) + break; + } + if (i < root_entries) + memmove(&root_idx[i + 1], &root_idx[i], + (root_entries - i) * + sizeof(struct ext4fs_extent_idx)); + root_idx[i] = entry; + root_eh->eh_entries = htole16(root_entries + 1); + } + + /* Update inode block count for the new leaf block */ + i_blocks = letoh32(din->i_blocks_lo) | + ((u_int64_t)letoh16(din->i_blocks_hi) << 32); + i_blocks += fs->m_block_size / DEV_BSIZE; + din->i_blocks_lo = htole32((u_int32_t)i_blocks); + din->i_blocks_hi = htole16((u_int16_t)(i_blocks >> 32)); + + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + return (0); +} + +/* + * Insert an extent into a depth > 0 extent tree. 
+ * Walks the index to find the correct leaf, tries merge, + * inserts if room, splits leaf if full. + */ +static int +ext4fs_extent_insert_depth(struct inode *ip, u_int32_t lbn, u_int64_t pblk, + u_int16_t len) +{ + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct ext4fs_extent_header *root_eh = &din->i_extent_header; + struct ext4fs_extent_idx *idx; + struct ext4fs_extent_header *leaf_eh; + struct ext4fs_extent *ext; + struct buf *bp; + u_int64_t leaf_blk; + u_int16_t root_entries, leaf_entries, leaf_max; + int error, found, i; + + root_entries = letoh16(root_eh->eh_entries); + if (root_entries == 0) + return (EIO); + + /* Find the index entry whose subtree covers lbn */ + idx = din->i_extent_idx; + found = 0; + for (i = 0; i < root_entries; i++) { + if (letoh32(idx[i].ei_block) <= lbn) + found = i; + else + break; + } + + /* Read the leaf block */ + leaf_blk = letoh32(idx[found].ei_leaf_lo) | + ((u_int64_t)letoh16(idx[found].ei_leaf_hi) << 32); + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, leaf_blk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + return (error); + } + + leaf_eh = (struct ext4fs_extent_header *)bp->b_data; + if (letoh16(leaf_eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC) { + brelse(bp); + return (EIO); + } + + leaf_entries = letoh16(leaf_eh->eh_entries); + leaf_max = letoh16(leaf_eh->eh_max); + ext = (struct ext4fs_extent *)(leaf_eh + 1); + + /* Try to merge with last extent in this leaf */ + if (leaf_entries > 0) { + struct ext4fs_extent *last = &ext[leaf_entries - 1]; + u_int32_t last_block = letoh32(last->e_block); + u_int16_t last_len = letoh16(last->e_len); + u_int64_t last_start = letoh32(last->e_start_lo) | + ((u_int64_t)letoh16(last->e_start_hi) << 32); + + if (last_block + last_len == lbn && + last_start + last_len == pblk && + last_len + len <= 32768) { + last->e_len = htole16(last_len + len); + ext4fs_extent_block_csum_set(fs, ip->i_number, din->i_nfs_generation, 
bp->b_data); + bdwrite(bp); + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + return (0); + } + } + + /* Room in leaf? */ + if (leaf_entries < leaf_max) { + /* Find insertion point (keep sorted) */ + for (i = 0; i < leaf_entries; i++) { + if (letoh32(ext[i].e_block) > lbn) + break; + } + if (i < leaf_entries) + memmove(&ext[i + 1], &ext[i], + (leaf_entries - i) * + sizeof(struct ext4fs_extent)); + + ext[i].e_block = htole32(lbn); + ext[i].e_len = htole16(len); + ext[i].e_start_lo = htole32((u_int32_t)pblk); + ext[i].e_start_hi = htole16((u_int16_t)(pblk >> 32)); + + leaf_eh->eh_entries = htole16(leaf_entries + 1); + ext4fs_extent_block_csum_set(fs, ip->i_number, din->i_nfs_generation, bp->b_data); + bdwrite(bp); + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + return (0); + } + + /* Leaf is full - need to split */ + error = ext4fs_leaf_split(ip, bp, leaf_eh); + if (error) + return (error); + + /* bp was consumed by leaf_split (bwrite'd). Retry the insert. */ + return (ext4fs_extent_insert_depth(ip, lbn, pblk, len)); +} + +/* + * Insert an extent into the inode's extent tree. + * Handles depth 0 (inline) and depth > 0 (tree) cases. + * Tries to merge with the last extent if contiguous. 
+ */ +static int +ext4fs_extent_insert(struct inode *ip, u_int32_t lbn, u_int64_t pblk, + u_int16_t len) +{ + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct ext4fs_extent_header *eh = &din->i_extent_header; + struct ext4fs_extent *ext = din->i_extent; + u_int16_t entries, maxe, depth; + int error, i; + + if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC) + return (EIO); + + depth = letoh16(eh->eh_depth); + + /* Depth > 0: delegate to tree insert */ + if (depth > 0) + return (ext4fs_extent_insert_depth(ip, lbn, pblk, len)); + + /* Depth 0: inline extents */ + entries = letoh16(eh->eh_entries); + maxe = letoh16(eh->eh_max); + + /* Try to merge with last extent */ + if (entries > 0) { + struct ext4fs_extent *last = &ext[entries - 1]; + u_int32_t last_block = letoh32(last->e_block); + u_int16_t last_len = letoh16(last->e_len); + u_int64_t last_start = letoh32(last->e_start_lo) | + ((u_int64_t)letoh16(last->e_start_hi) << 32); + + if (last_block + last_len == lbn && + last_start + last_len == pblk && + last_len + len <= 32768) { + last->e_len = htole16(last_len + len); + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + return (0); + } + } + + /* Room for a new inline entry? 
*/ + if (entries < maxe) { + /* Find insertion point (keep sorted by lbn) */ + for (i = 0; i < entries; i++) { + if (letoh32(ext[i].e_block) > lbn) + break; + } + + /* Shift entries to make room */ + if (i < entries) + memmove(&ext[i + 1], &ext[i], + (entries - i) * sizeof(struct ext4fs_extent)); + + /* Insert new extent */ + ext[i].e_block = htole32(lbn); + ext[i].e_len = htole16(len); + ext[i].e_start_lo = htole32((u_int32_t)pblk); + ext[i].e_start_hi = htole16((u_int16_t)(pblk >> 32)); + + eh->eh_entries = htole16(entries + 1); + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + + return (0); + } + + /* Inline full - grow tree to depth 1, then insert */ + error = ext4fs_extent_grow_tree(ip); + if (error) + return (error); + + return (ext4fs_extent_insert_depth(ip, lbn, pblk, len)); +} + +/* + * Allocate a buffer for a logical block. + * If the block is already mapped, just read it. + * Otherwise, allocate a new physical block and insert extent. + */ +static int +ext4fs_buf_alloc(struct inode *ip, u_int64_t lbn, int size, + struct ucred *cred, struct buf **bpp, int flags) +{ + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + u_int64_t pblk, goal, ncontig, i_blocks; + int error; + + /* Check if already mapped */ + error = ext4fs_extent_pblk(ip, lbn, &pblk, &ncontig); + if (error == 0 && pblk != 0) { + /* Already mapped, just read */ + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, bpp); + if (error) + brelse(*bpp); + return (error); + } + error = 0; + + /* Not mapped - allocate a new block */ + /* Goal: try to be contiguous with last extent */ + goal = 0; + if (letoh16(din->i_extent_header.eh_entries) > 0) { + u_int16_t depth = letoh16(din->i_extent_header.eh_depth); + + if (depth == 0) { + u_int16_t ent = letoh16(din->i_extent_header.eh_entries); + struct ext4fs_extent *last = &din->i_extent[ent - 1]; + u_int64_t last_start = letoh32(last->e_start_lo) | + ((u_int64_t)letoh16(last->e_start_hi) << 
32); + goal = last_start + letoh16(last->e_len); + } else { + /* Walk to last leaf to find last extent */ + u_int16_t ent = letoh16(din->i_extent_header.eh_entries); + struct ext4fs_extent_idx *idx = din->i_extent_idx; + u_int64_t leaf_blk; + struct buf *gbp; + + leaf_blk = letoh32(idx[ent - 1].ei_leaf_lo) | + ((u_int64_t)letoh16(idx[ent - 1].ei_leaf_hi) << 32); + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, leaf_blk), + fs->m_block_size, &gbp); + if (error == 0) { + struct ext4fs_extent_header *leh = + (struct ext4fs_extent_header *)gbp->b_data; + u_int16_t lent = letoh16(leh->eh_entries); + if (lent > 0 && letoh16(leh->eh_magic) == + EXT4FS_EXTENT_HEADER_MAGIC) { + struct ext4fs_extent *le = + (struct ext4fs_extent *)(leh + 1); + u_int64_t ls = + letoh32(le[lent - 1].e_start_lo) | + ((u_int64_t)letoh16( + le[lent - 1].e_start_hi) << 32); + goal = ls + letoh16(le[lent - 1].e_len); + } + brelse(gbp); + } else { + brelse(gbp); + } + } + } + + { + u_int32_t got; + + error = ext4fs_blkalloc(ip, goal, 1, &pblk, &got); + if (error) + return (error); + error = ext4fs_extent_insert(ip, lbn, pblk, 1); + if (error) { + ext4fs_blkfree(ip, pblk); + return (error); + } + /* Update inode block count (i_blocks is in 512-byte sectors) */ + i_blocks = letoh32(din->i_blocks_lo) | + ((u_int64_t)letoh16(din->i_blocks_hi) << 32); + i_blocks += fs->m_block_size / DEV_BSIZE; + din->i_blocks_lo = htole32((u_int32_t)i_blocks); + din->i_blocks_hi = htole16((u_int16_t)(i_blocks >> 32)); + } + + /* Set extents flag */ + din->i_flags |= htole32(EXTFS_INODE_FLAG_EXTENTS); + + ip->i_flag |= IN_CHANGE | IN_UPDATE; + + /* Get buffer for the new block */ + *bpp = getblk(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, 0, INFSLP); + if (flags & B_CLRBUF) + clrbuf(*bpp); + + return (0); +} + +/* + * Free all blocks described by an array of extents. + * Batches frees by block group for efficiency. 
+ */ +static void +ext4fs_free_extents(struct inode *ip, struct ext4fs_extent *ext, + u_int16_t entries) +{ + struct m_ext4fs *fs = ip->i_e4fs; + int i; + + for (i = 0; i < entries; i++) { + u_int64_t start = letoh32(ext[i].e_start_lo) | + ((u_int64_t)letoh16(ext[i].e_start_hi) << 32); + u_int32_t len = letoh16(ext[i].e_len); + u_int32_t freed = 0; + + if (len > 32768) + len -= 32768; + + while (freed < len) { + u_int64_t bno = start + freed; + u_int32_t group, blk_in_group; + struct ext4fs_block_group_descriptor *gd; + + if (bno < fs->m_first_data_block || + bno >= fs->m_blocks_count) { + freed++; + continue; + } + group = (bno - fs->m_first_data_block) / + fs->m_blocks_per_group; + if (group >= fs->m_block_group_count) { + freed++; + continue; + } + blk_in_group = (bno - fs->m_first_data_block) % + fs->m_blocks_per_group; + gd = &fs->m_gd[group]; + u_int64_t bitmap_blk; + struct buf *bbp; + u_int32_t n, k, free_blocks; + int berr; + + /* How many blocks fall in this group? */ + n = fs->m_blocks_per_group - blk_in_group; + if (n > len - freed) + n = len - freed; + + bitmap_blk = letoh32(gd->bgd_block_bitmap_block_lo); + if (fs->m_feature_incompat & + EXT4FS_FEATURE_INCOMPAT_64BIT) + bitmap_blk |= (u_int64_t)letoh32( + gd->bgd_block_bitmap_block_hi) << 32; + + berr = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, bitmap_blk), + fs->m_block_size, &bbp); + if (berr) { + brelse(bbp); + freed += n; + continue; + } + + for (k = 0; k < n; k++) + clrbit((char *)bbp->b_data, blk_in_group + k); + + { + u_int32_t bcsum = ext4fs_bitmap_csum(fs, group, + bbp->b_data, fs->m_block_size); + gd->bgd_block_bitmap_checksum_lo = + htole16(bcsum & 0xFFFF); + if (fs->m_feature_incompat & + EXT4FS_FEATURE_INCOMPAT_64BIT) + gd->bgd_block_bitmap_checksum_hi = + htole16((bcsum >> 16) & 0xFFFF); + } + bdwrite(bbp); + + free_blocks = letoh16(gd->bgd_free_blocks_count_lo); + if (fs->m_feature_incompat & + EXT4FS_FEATURE_INCOMPAT_64BIT) + free_blocks |= (u_int32_t)letoh16( + 
gd->bgd_free_blocks_count_hi) << 16; + free_blocks += n; + gd->bgd_free_blocks_count_lo = + htole16(free_blocks & 0xFFFF); + if (fs->m_feature_incompat & + EXT4FS_FEATURE_INCOMPAT_64BIT) + gd->bgd_free_blocks_count_hi = + htole16((free_blocks >> 16) & 0xFFFF); + ext4fs_bgd_write(fs, ip->i_devvp, group); + + fs->m_free_blocks_count += n; + fs->m_sble.sb_free_blocks_count_lo = + htole32((u_int32_t)fs->m_free_blocks_count); + fs->m_sble.sb_free_blocks_count_hi = + htole32((u_int32_t)(fs->m_free_blocks_count >> 32)); + fs->m_fs_was_modified = 1; + + freed += n; + } + } +} + +/* + * Trim extents: free blocks beyond new_nblocks, trim straddling extents. + * Returns number of filesystem blocks freed. + */ +static u_int64_t +ext4fs_trim_extents(struct inode *ip, struct ext4fs_extent *ext, + u_int16_t *entries_p, u_int32_t new_nblocks) +{ + u_int16_t entries = *entries_p; + u_int64_t blocks_freed = 0; + u_int16_t new_count = 0; + int i; + + for (i = 0; i < entries; i++) { + u_int32_t eblk = letoh32(ext[i].e_block); + u_int16_t raw_len = letoh16(ext[i].e_len); + u_int16_t elen = raw_len; + u_int64_t estart; + int uninit = 0; + + if (elen > 32768) { + elen -= 32768; + uninit = 1; + } + estart = letoh32(ext[i].e_start_lo) | + ((u_int64_t)letoh16(ext[i].e_start_hi) << 32); + + if (eblk >= new_nblocks) { + /* Entirely past boundary — free all */ + struct ext4fs_extent tmp = ext[i]; + tmp.e_len = htole16(elen); + ext4fs_free_extents(ip, &tmp, 1); + blocks_freed += elen; + } else if (eblk + elen > new_nblocks) { + /* Straddles boundary — trim */ + u_int32_t keep = new_nblocks - eblk; + u_int32_t discard = elen - keep; + struct ext4fs_extent tmp; + + /* Free the tail */ + tmp.e_block = htole32(eblk + keep); + tmp.e_start_lo = htole32( + (u_int32_t)(estart + keep)); + tmp.e_start_hi = htole16( + (u_int16_t)((estart + keep) >> 32)); + tmp.e_len = htole16(discard); + ext4fs_free_extents(ip, &tmp, 1); + blocks_freed += discard; + + /* Keep the trimmed extent */ + ext[new_count] = 
ext[i]; + ext[new_count].e_len = htole16( + keep | (uninit ? 32768 : 0)); + new_count++; + } else { + /* Entirely before boundary — keep */ + if (new_count != i) + ext[new_count] = ext[i]; + new_count++; + } + } + + *entries_p = new_count; + return (blocks_freed); +} + +/* + * Truncate inode to given length. + * Handles grow (extend with hole), shrink to 0, and shrink to non-zero. + * Supports both depth-0 (inline) and depth > 0 (tree) extent trees. + */ +int +ext4fs_truncate(struct inode *ip, off_t length, int flags, struct ucred *cred) +{ + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct ext4fs_extent_header *eh = &din->i_extent_header; + struct vnode *vp = ITOV(ip); + off_t cursize; + u_int16_t entries, depth; + u_int64_t blocks_freed; + + cursize = (off_t)letoh32(din->i_size_lo) | + ((off_t)letoh32(din->i_size_hi) << 32); + + if (length == cursize) + return (0); + + if (length < 0) + return (EINVAL); + + if (letoh16(eh->eh_magic) != EXT4FS_EXTENT_HEADER_MAGIC) + return (EIO); + + depth = letoh16(eh->eh_depth); + entries = letoh16(eh->eh_entries); + + if (length > cursize) { + /* Grow: just update size. Gap becomes hole. 
*/ + ext4fs_setsize(ip, length); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + uvm_vnp_setsize(vp, length); + return (ext4fs_update(ip, 1)); + } + + /* Shrink */ + blocks_freed = 0; + + if (length == 0) { + /* Truncate to 0: free everything */ + if (depth == 0) { + ext4fs_free_extents(ip, din->i_extent, entries); + } else { + struct ext4fs_extent_idx *idx = din->i_extent_idx; + int i; + + for (i = 0; i < entries; i++) { + u_int64_t leaf_blk; + struct ext4fs_extent_header *leaf_eh; + struct ext4fs_extent *leaf_ext; + struct buf *bp; + u_int16_t leaf_entries; + int error; + + leaf_blk = letoh32(idx[i].ei_leaf_lo) | + ((u_int64_t)letoh16( + idx[i].ei_leaf_hi) << 32); + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, leaf_blk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + continue; + } + + leaf_eh = (struct ext4fs_extent_header *) + bp->b_data; + if (letoh16(leaf_eh->eh_magic) != + EXT4FS_EXTENT_HEADER_MAGIC) { + brelse(bp); + continue; + } + + leaf_entries = letoh16(leaf_eh->eh_entries); + leaf_ext = (struct ext4fs_extent *) + (leaf_eh + 1); + + ext4fs_free_extents(ip, leaf_ext, + leaf_entries); + brelse(bp); + + ext4fs_blkfree(ip, leaf_blk); + } + } + + /* Reset inode root to depth 0, 0 entries */ + memset(din->i_extent, 0, + 4 * sizeof(struct ext4fs_extent)); + eh->eh_entries = htole16(0); + eh->eh_depth = htole16(0); + + /* Zero block count */ + din->i_blocks_lo = htole32(0); + din->i_blocks_hi = htole16(0); + } else { + /* Truncate to non-zero length */ + u_int32_t new_nblocks; + u_int64_t i_blocks, freed_512; + + new_nblocks = (length + fs->m_block_size - 1) / + fs->m_block_size; + + /* Zero out partial block tail */ + if (length % fs->m_block_size != 0) { + u_int32_t offset = length % fs->m_block_size; + u_int64_t pblk; + + if (ext4fs_extent_pblk(ip, new_nblocks - 1, + &pblk, NULL) == 0 && pblk != 0) { + struct buf *bp; + int error; + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); + if (error) { + 
brelse(bp); + return (error); + } + memset((char *)bp->b_data + offset, 0, + fs->m_block_size - offset); + bdwrite(bp); + } + } + + /* Free/trim extents past new_nblocks */ + if (depth == 0) { + blocks_freed = ext4fs_trim_extents(ip, + din->i_extent, &entries, new_nblocks); + eh->eh_entries = htole16(entries); + } else { + struct ext4fs_extent_idx *idx = din->i_extent_idx; + u_int16_t new_idx_count = 0; + int i; + + for (i = 0; i < entries; i++) { + u_int64_t leaf_blk; + struct ext4fs_extent_header *leaf_eh; + struct ext4fs_extent *leaf_ext; + struct buf *bp; + u_int16_t leaf_entries; + int error; + + leaf_blk = letoh32(idx[i].ei_leaf_lo) | + ((u_int64_t)letoh16( + idx[i].ei_leaf_hi) << 32); + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, leaf_blk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + continue; + } + + leaf_eh = (struct ext4fs_extent_header *) + bp->b_data; + if (letoh16(leaf_eh->eh_magic) != + EXT4FS_EXTENT_HEADER_MAGIC) { + brelse(bp); + continue; + } + + leaf_entries = letoh16(leaf_eh->eh_entries); + leaf_ext = (struct ext4fs_extent *) + (leaf_eh + 1); + + blocks_freed += ext4fs_trim_extents(ip, + leaf_ext, &leaf_entries, new_nblocks); + + if (leaf_entries == 0) { + /* Leaf now empty — free it */ + brelse(bp); + ext4fs_blkfree(ip, leaf_blk); + } else { + leaf_eh->eh_entries = + htole16(leaf_entries); + ext4fs_extent_block_csum_set(fs, + ip->i_number, + din->i_nfs_generation, + bp->b_data); + bdwrite(bp); + if (new_idx_count != i) + idx[new_idx_count] = idx[i]; + new_idx_count++; + } + } + + entries = new_idx_count; + eh->eh_entries = htole16(entries); + + /* If all index entries gone, collapse to depth 0 */ + if (entries == 0) { + memset(din->i_extent, 0, + 4 * sizeof(struct ext4fs_extent)); + eh->eh_depth = htole16(0); + } + } + + /* Update i_blocks */ + i_blocks = letoh32(din->i_blocks_lo); + if (fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_HUGE_FILE) + i_blocks |= + (u_int64_t)letoh16(din->i_blocks_hi) << 32; + 
freed_512 = blocks_freed * + (fs->m_block_size / DEV_BSIZE); + if (i_blocks >= freed_512) + i_blocks -= freed_512; + else + i_blocks = 0; + din->i_blocks_lo = htole32((u_int32_t)i_blocks); + din->i_blocks_hi = htole16((u_int16_t)(i_blocks >> 32)); + } + + /* Update size */ + ext4fs_setsize(ip, length); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + + /* Purge cached data */ + uvm_vnp_setsize(vp, length); + vinvalbuf(vp, 0, NOCRED, curproc, 0, INFSLP); + + return (ext4fs_update(ip, 1)); +} + +/* Forward declarations */ + +int ext4fs_access(void *); +int ext4fs_advlock(void *); +int ext4fs_bmap(void *); +int ext4fs_chmod(struct vnode *, mode_t, struct ucred *); +int ext4fs_chown(struct vnode *, uid_t, gid_t, struct ucred *); +int ext4fs_create(void *); +int ext4fs_fsync(void *); +int ext4fs_getattr(void *); +int ext4fs_inactive(void *); +int ext4fs_link(void *); +int ext4fs_lookup(void *); +int ext4fs_mkdir(void *); +int ext4fs_mknod(void *); +int ext4fs_open(void *); +int ext4fs_pathconf(void *); +int ext4fs_print(void *); +int ext4fs_read(void *); +int ext4fs_readdir(void *); +int ext4fs_readlink(void *); +int ext4fs_reclaim(void *); +int ext4fs_remove(void *); +int ext4fs_rename(void *); +int ext4fs_rmdir(void *); +int ext4fs_setattr(void *); +int ext4fs_strategy(void *); +int ext4fs_symlink(void *); +int ext4fs_write(void *); + +const struct vops ext4fs_vops = { + .vop_lookup = ext4fs_lookup, + .vop_create = ext4fs_create, + .vop_mknod = ext4fs_mknod, + .vop_open = ext4fs_open, + .vop_close = ufs_close, + .vop_access = ext4fs_access, + .vop_getattr = ext4fs_getattr, + .vop_setattr = ext4fs_setattr, + .vop_read = ext4fs_read, + .vop_write = ext4fs_write, + .vop_ioctl = ufs_ioctl, + .vop_kqfilter = ufs_kqfilter, + .vop_revoke = NULL, + .vop_fsync = ext4fs_fsync, + .vop_remove = ext4fs_remove, + .vop_link = ext4fs_link, + .vop_rename = ext4fs_rename, + .vop_mkdir = ext4fs_mkdir, + .vop_rmdir = ext4fs_rmdir, + .vop_symlink = ext4fs_symlink, + .vop_readdir = ext4fs_readdir, 
+ .vop_readlink = ext4fs_readlink, + .vop_abortop = NULL, + .vop_inactive = ext4fs_inactive, + .vop_reclaim = ext4fs_reclaim, + .vop_lock = ufs_lock, + .vop_unlock = ufs_unlock, + .vop_bmap = ext4fs_bmap, + .vop_strategy = ext4fs_strategy, + .vop_print = ext4fs_print, + .vop_pathconf = ext4fs_pathconf, + .vop_advlock = ext4fs_advlock, + .vop_bwrite = NULL, +}; + +/* Stub implementations */ + +int +ext4fs_lookup(void *v) +{ + struct vop_lookup_args *ap = v; + struct vnode *vdp = ap->a_dvp; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct inode *dp = VTOI(vdp); + struct m_ext4fs *fs = dp->i_e4fs; + struct ext4fs_dinode *din = &dp->i_e4din->dinode; + struct ext4fs_directory *ep; + struct vnode *tdp; + struct buf *bp; + int flags = cnp->cn_flags; + int nameiop = cnp->cn_nameiop; + int lockparent = flags & LOCKPARENT; + ino_t foundino = 0; + off_t off, filesz; + u_int64_t lbn, pblk, blkoff; + u_int16_t reclen; + int error; + + /* For CREATE: track free slot info */ + int slotneeded = 0; + int slotsize = 0; + off_t slotoffset = -1; + off_t prevoff = -1; + + *vpp = NULL; + + /* Check accessibility of directory */ + if ((error = VOP_ACCESS(vdp, VEXEC, cnp->cn_cred, cnp->cn_proc)) != 0) { + return (error); + } + + if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && + (nameiop == DELETE || nameiop == RENAME)) + return (EROFS); + + /* Check the name cache */ + if ((error = cache_lookup(vdp, vpp, cnp)) >= 0) + return (error); + + /* Search directory for the name */ + filesz = (off_t)letoh32(din->i_size_lo) | + ((off_t)letoh32(din->i_size_hi) << 32); + + if (nameiop == CREATE || nameiop == RENAME) + slotneeded = EXT4FS_DIRSIZ(cnp->cn_namelen); + + for (off = 0; off < filesz; ) { + lbn = EXT4FS_LBLKNO(fs, off); + + error = ext4fs_extent_pblk(dp, lbn, &pblk, NULL); + if (error || pblk == 0) { + return (error ? 
error : EIO); + } + + error = bread(dp->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + return (error); + } + + blkoff = EXT4FS_BLKOFF(fs, off); + prevoff = -1; + + while (blkoff < fs->m_block_size && off < filesz) { + ep = (struct ext4fs_directory *) + ((char *)bp->b_data + blkoff); + reclen = letoh16(ep->e4d_reclen); + + if (reclen == 0) { + brelse(bp); + return (EIO); + } + + /* Skip directory checksum tail entry */ + if (letoh32(ep->e4d_ino) == 0 && + ep->e4d_namlen == 0 && + ep->e4d_type == EXT4FS_DIR_TAIL_FT && + reclen == EXT4FS_DIR_TAIL_SIZE) { + off += reclen; + blkoff += reclen; + continue; + } + + /* Track free space for CREATE/RENAME */ + if ((nameiop == CREATE || nameiop == RENAME) && + slotoffset == -1) { + int freespace; + + if (letoh32(ep->e4d_ino) == 0) { + freespace = reclen; + } else { + freespace = reclen - + EXT4FS_DIRSIZ(ep->e4d_namlen); + } + if (freespace >= slotneeded) { + slotoffset = off; + slotsize = reclen; + } + } + + if (letoh32(ep->e4d_ino) != 0 && + ep->e4d_namlen == cnp->cn_namelen && + memcmp(cnp->cn_nameptr, ep->e4d_name, + cnp->cn_namelen) == 0) { + foundino = letoh32(ep->e4d_ino); + dp->i_ino = foundino; + dp->i_reclen = reclen; + dp->i_offset = off; + /* For DELETE: count = prev entry to this */ + if (nameiop == DELETE && prevoff != -1) + dp->i_count = off - prevoff; + else + dp->i_count = 0; + brelse(bp); + goto found; + } + + prevoff = off; + off += reclen; + blkoff += reclen; + } + + brelse(bp); + } + + /* Not found */ + if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) { + if (vdp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, + cnp->cn_proc)) != 0) + return (error); + /* Save free slot info for direnter */ + if (slotoffset == -1) { + dp->i_offset = filesz; + dp->i_count = 0; + } else { + dp->i_offset = slotoffset; + dp->i_count = slotsize; + } + cnp->cn_flags |= SAVENAME; + if (!lockparent) { + 
VOP_UNLOCK(vdp); + cnp->cn_flags |= PDIRUNLOCK; + } + return (EJUSTRETURN); + } + + if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(vdp, *vpp, cnp); + return (ENOENT); + +found: + if ((flags & ISLASTCN) && nameiop == LOOKUP) + dp->i_diroff = EXT4FS_LBLKNO(fs, dp->i_offset) * + fs->m_block_size; + + /* + * If deleting, and at end of pathname, return parameters + * which can be used to remove file. If the wantparent flag + * isn't set, we return only the directory (in ndp->ni_dvp), + * otherwise we go on and lock the inode, being careful with ".". + */ + if (nameiop == DELETE && (flags & ISLASTCN)) { + if ((error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, + cnp->cn_proc)) != 0) + return (error); + if (dp->i_number == foundino) { + vref(vdp); + *vpp = vdp; + return (0); + } + if ((error = VFS_VGET(vdp->v_mount, foundino, &tdp)) != 0) + return (error); + *vpp = tdp; + if (!lockparent) { + VOP_UNLOCK(vdp); + cnp->cn_flags |= PDIRUNLOCK; + } + return (0); + } + + /* + * If rewriting (RENAME), return the inode and the + * information required to rewrite the present directory + * Must get inode of directory entry to verify it's a + * regular file, or empty directory. + */ + if (nameiop == RENAME && (flags & ISLASTCN)) { + if ((error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, + cnp->cn_proc)) != 0) + return (error); + if (dp->i_number == foundino) + return (EISDIR); + if ((error = VFS_VGET(vdp->v_mount, foundino, &tdp)) != 0) + return (error); + *vpp = tdp; + cnp->cn_flags |= SAVENAME; + if (!lockparent) { + VOP_UNLOCK(vdp); + cnp->cn_flags |= PDIRUNLOCK; + } + return (0); + } + + if (flags & ISDOTDOT) { + /* ".." 
- unlock parent, get child, optionally relock */ + VOP_UNLOCK(vdp); + cnp->cn_flags |= PDIRUNLOCK; + error = VFS_VGET(vdp->v_mount, foundino, &tdp); + if (error) { + if (vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY) == 0) + cnp->cn_flags &= ~PDIRUNLOCK; + return (error); + } + if (lockparent && (flags & ISLASTCN)) { + if ((error = vn_lock(vdp, LK_EXCLUSIVE)) != 0) { + vput(tdp); + return (error); + } + cnp->cn_flags &= ~PDIRUNLOCK; + } + *vpp = tdp; + } else if (dp->i_number == foundino) { + /* "." - return same vnode */ + vref(vdp); + *vpp = vdp; + } else { + if ((error = VFS_VGET(vdp->v_mount, foundino, &tdp)) != 0) + return (error); + if (!lockparent || !(flags & ISLASTCN)) { + VOP_UNLOCK(vdp); + cnp->cn_flags |= PDIRUNLOCK; + } + *vpp = tdp; + } + + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + return (0); +} + +/* + * Common code to create a new inode and enter it in a directory. + */ +static int +ext4fs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp) +{ + struct inode *ip, *pdir; + struct vnode *tvp; + struct ext4fs_dinode *din; + int error; + + pdir = VTOI(dvp); + + *vpp = NULL; + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + + + error = ext4fs_inode_alloc(pdir, mode, cnp->cn_cred, &tvp); + if (error) { + pool_put(&namei_pool, cnp->cn_pnbuf); + return (error); + } + + ip = VTOI(tvp); + din = &ip->i_e4din->dinode; + /* Set owner from cred and parent */ + din->i_uid_lo = htole16(cnp->cn_cred->cr_uid & 0xFFFF); + din->i_uid_hi = htole16((cnp->cn_cred->cr_uid >> 16) & 0xFFFF); + { + gid_t gid = letoh16(pdir->i_e4din->dinode.i_gid_lo) | + ((gid_t)letoh16(pdir->i_e4din->dinode.i_gid_hi) << 16); + din->i_gid_lo = htole16(gid & 0xFFFF); + din->i_gid_hi = htole16((gid >> 16) & 0xFFFF); + } + + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + din->i_mode = htole16(mode); + tvp->v_type = IFTOVT(mode); + ip->i_effnlink = 1; + din->i_links_count = htole16(1); + + /* Clear SGID if not group member */ + if ((mode & 
ISGID) && + !groupmember(letoh16(din->i_gid_lo) | + ((gid_t)letoh16(din->i_gid_hi) << 16), cnp->cn_cred) && + suser_ucred(cnp->cn_cred)) + din->i_mode = htole16(letoh16(din->i_mode) & ~ISGID); + + /* Write inode to disk before directory entry */ + if ((error = ext4fs_update(ip, 1)) != 0) + goto bad; + error = ext4fs_direnter(ip, dvp, cnp); + if (error != 0) + goto bad; + + if ((cnp->cn_flags & SAVESTART) == 0) + pool_put(&namei_pool, cnp->cn_pnbuf); + *vpp = tvp; + return (0); + +bad: + pool_put(&namei_pool, cnp->cn_pnbuf); + ip->i_effnlink = 0; + din->i_links_count = htole16(0); + ip->i_flag |= IN_CHANGE; + tvp->v_type = VNON; + vput(tvp); + return (error); +} + +int +ext4fs_create(void *v) +{ + struct vop_create_args *ap = v; + return (ext4fs_makeinode( + MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + ap->a_dvp, ap->a_vpp, ap->a_cnp)); +} + +int +ext4fs_mknod(void *v) +{ + struct vop_mknod_args *ap = v; + struct vnode **vpp = ap->a_vpp; + struct vnode *tvp; + struct inode *ip; + int error; + + error = ext4fs_makeinode( + MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + ap->a_dvp, &tvp, ap->a_cnp); + if (error) + return (error); + + ip = VTOI(tvp); + + /* Store device number */ + if (ap->a_vap->va_rdev != VNOVAL) { + /* Old format in i_block[0], new format in i_block[1] */ + ip->i_e4din->dinode.i_block[0] = + htole32(ap->a_vap->va_rdev); + ip->i_e4din->dinode.i_block[1] = + htole32(ap->a_vap->va_rdev); + } + + ip->i_flag |= IN_CHANGE | IN_UPDATE; + ext4fs_update(ip, 1); + + *vpp = tvp; + return (0); +} + +int +ext4fs_open(void *v) +{ + struct vop_open_args *ap = v; + struct inode *ip = VTOI(ap->a_vp); + u_int32_t iflags = letoh32(ip->i_e4din->dinode.i_flags); + + /* Deny write access to immutable files, non-append to append-only */ + if ((iflags & EXTFS_INODE_FLAG_IMMUTABLE) && + (ap->a_mode & (FWRITE | O_TRUNC))) + return (EPERM); + if ((iflags & EXTFS_INODE_FLAG_APPEND) && + (ap->a_mode & (FWRITE | O_TRUNC)) && + !(ap->a_mode & O_APPEND)) + return 
(EPERM); + + return (0); +} + +int +ext4fs_access(void *v) +{ + struct vop_access_args *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + u_int32_t iflags; + mode_t mode; + uid_t uid; + gid_t gid; + + /* Deny write access to immutable files */ + iflags = letoh32(din->i_flags); + if ((ap->a_mode & VWRITE) && (iflags & EXTFS_INODE_FLAG_IMMUTABLE)) + return (EPERM); + + mode = letoh16(din->i_mode); + uid = letoh16(din->i_uid_lo) | + ((uid_t)letoh16(din->i_uid_hi) << 16); + gid = letoh16(din->i_gid_lo) | + ((gid_t)letoh16(din->i_gid_hi) << 16); + + return (vaccess(vp->v_type, mode, uid, gid, ap->a_mode, ap->a_cred)); +} + +int +ext4fs_getattr(void *v) +{ + struct vop_getattr_args *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + + struct ext4fs_dinode_256 *din = ip->i_e4din; + struct vattr *vap = ap->a_vap; + + /* Copy from inode table */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = letoh16(din->dinode.i_mode) & ALLPERMS; + vap->va_nlink = letoh16(din->dinode.i_links_count); + vap->va_uid = letoh16(din->dinode.i_uid_lo); + vap->va_uid |= (uid_t)letoh16(din->dinode.i_uid_hi) << 16; + vap->va_gid = letoh16(din->dinode.i_gid_lo); + vap->va_gid |= (gid_t)letoh16(din->dinode.i_gid_hi) << 16; + vap->va_rdev = 0; + vap->va_size = letoh32(din->dinode.i_size_lo); + vap->va_size |= (off_t)letoh32(din->dinode.i_size_hi) << 32; + + /* Convert timestamps with nanosecond precision */ + vap->va_atime.tv_sec = letoh32(din->dinode.i_atime); + vap->va_atime.tv_nsec = letoh32(din->dinode.i_atime_extra) >> 2; + vap->va_mtime.tv_sec = letoh32(din->dinode.i_mtime); + vap->va_mtime.tv_nsec = letoh32(din->dinode.i_mtime_extra) >> 2; + vap->va_ctime.tv_sec = letoh32(din->dinode.i_ctime); + vap->va_ctime.tv_nsec = letoh32(din->dinode.i_ctime_extra) >> 2; + + vap->va_flags = 0; + vap->va_gen = letoh32(din->dinode.i_nfs_generation); + + /* Set appropriate block size */ 
+ if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + + vap->va_bytes = letoh32(din->dinode.i_blocks_lo); + vap->va_bytes |= (off_t)letoh16(din->dinode.i_blocks_hi) << 32; + vap->va_bytes *= DEV_BSIZE; + vap->va_type = vp->v_type; + vap->va_filerev = 0; + + return (0); +} + +int +ext4fs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred) +{ + struct inode *ip = VTOI(vp); + + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + uid_t uid; + gid_t gid; + u_int16_t cur_mode; + int error; + + uid = letoh16(din->i_uid_lo) | + ((uid_t)letoh16(din->i_uid_hi) << 16); + gid = letoh16(din->i_gid_lo) | + ((gid_t)letoh16(din->i_gid_hi) << 16); + + if (cred->cr_uid != uid && (error = suser_ucred(cred))) + return (error); + if (cred->cr_uid) { + if (vp->v_type != VDIR && (mode & S_ISTXT)) + return (EFTYPE); + if (!groupmember(gid, cred) && (mode & ISGID)) + return (EPERM); + } + + cur_mode = letoh16(din->i_mode); + cur_mode &= ~ALLPERMS; + cur_mode |= (mode & ALLPERMS); + din->i_mode = htole16(cur_mode); + ip->i_flag |= IN_CHANGE; + + if ((vp->v_flag & VTEXT) && (cur_mode & S_ISTXT) == 0) + (void)uvm_vnp_uncache(vp); + + return (0); +} + +int +ext4fs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred) +{ + struct inode *ip = VTOI(vp); + + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + uid_t ouid; + gid_t ogid; + u_int16_t mode; + int error; + + ouid = letoh16(din->i_uid_lo) | + ((uid_t)letoh16(din->i_uid_hi) << 16); + ogid = letoh16(din->i_gid_lo) | + ((gid_t)letoh16(din->i_gid_hi) << 16); + + if (uid == (uid_t)VNOVAL) + uid = ouid; + if (gid == (gid_t)VNOVAL) + gid = ogid; + + if ((cred->cr_uid != ouid || uid != ouid || + (gid != ogid && !groupmember(gid, cred))) && + (error = suser_ucred(cred))) + return (error); + + din->i_uid_lo = htole16(uid & 0xFFFF); + din->i_uid_hi = htole16((uid >> 16) & 0xFFFF); + din->i_gid_lo = 
htole16(gid & 0xFFFF); + din->i_gid_hi = htole16((gid >> 16) & 0xFFFF); + + if (ouid != uid || ogid != gid) + ip->i_flag |= IN_CHANGE; + if (ouid != uid && cred->cr_uid != 0) { + mode = letoh16(din->i_mode); + mode &= ~S_ISUID; + din->i_mode = htole16(mode); + } + if (ogid != gid && cred->cr_uid != 0) { + mode = letoh16(din->i_mode); + mode &= ~S_ISGID; + din->i_mode = htole16(mode); + } + + return (0); +} + +int +ext4fs_setattr(void *v) +{ + struct vop_setattr_args *ap = v; + struct vattr *vap = ap->a_vap; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct ucred *cred = ap->a_cred; + + int error = 0; + + if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) + return (EINVAL); + + if (vap->va_flags != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((error = suser_ucred(cred))) + return (error); + u_int32_t iflags = letoh32(din->i_flags); + iflags &= ~(EXTFS_INODE_FLAG_APPEND | + EXTFS_INODE_FLAG_IMMUTABLE); + iflags |= (vap->va_flags & SF_APPEND) ? + EXTFS_INODE_FLAG_APPEND : 0; + iflags |= (vap->va_flags & SF_IMMUTABLE) ? 
+ EXTFS_INODE_FLAG_IMMUTABLE : 0; + din->i_flags = htole32(iflags); + ip->i_flag |= IN_CHANGE; + } + + if (vap->va_uid != (uid_t)VNOVAL || + vap->va_gid != (gid_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + error = ext4fs_chown(vp, vap->va_uid, vap->va_gid, cred); + if (error) + return (error); + } + + if (vap->va_size != VNOVAL) { + switch (vp->v_type) { + case VDIR: + return (EISDIR); + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + break; + default: + break; + } + error = ext4fs_truncate(ip, vap->va_size, 0, cred); + if (error) + return (error); + } + + if ((vap->va_vaflags & VA_UTIMES_CHANGE) || + vap->va_atime.tv_nsec != VNOVAL || + vap->va_mtime.tv_nsec != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + uid_t uid = letoh16(din->i_uid_lo) | + ((uid_t)letoh16(din->i_uid_hi) << 16); + if (cred->cr_uid != uid && + (error = suser_ucred(cred)) && + ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || + (error = VOP_ACCESS(vp, VWRITE, cred, ap->a_p)))) + return (error); + if (vap->va_mtime.tv_nsec != VNOVAL) + ip->i_flag |= IN_CHANGE | IN_UPDATE; + else if (vap->va_vaflags & VA_UTIMES_CHANGE) + ip->i_flag |= IN_CHANGE; + if (vap->va_atime.tv_nsec != VNOVAL) + ip->i_flag |= IN_ACCESS; + EXT4FS_ITIMES(ip); + if (vap->va_mtime.tv_nsec != VNOVAL) { + din->i_mtime = + htole32((u_int32_t)vap->va_mtime.tv_sec); + din->i_mtime_extra = + htole32(vap->va_mtime.tv_nsec << 2); + } + if (vap->va_atime.tv_nsec != VNOVAL) { + din->i_atime = + htole32((u_int32_t)vap->va_atime.tv_sec); + din->i_atime_extra = + htole32(vap->va_atime.tv_nsec << 2); + } + ip->i_flag |= IN_MODIFIED; + error = ext4fs_update(ip, 1); + if (error) + return (error); + } + + if (vap->va_mode != (mode_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + error = ext4fs_chmod(vp, vap->va_mode, cred); + } + + return (error); +} + +int +ext4fs_read(void *v) +{ + struct vop_read_args *ap = v; + struct vnode *vp = 
ap->a_vp; + struct inode *ip = VTOI(vp); + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct uio *uio = ap->a_uio; + struct buf *bp; + off_t filesz, bytesinfile; + daddr_t lbn, nextlbn; + int error, blkoffset, xfersize, size; + + if (vp->v_type == VDIR) + return (EISDIR); + if (uio->uio_offset < 0) + return (EINVAL); + if (uio->uio_resid == 0) + return (0); + + filesz = (off_t)letoh32(din->i_size_lo) | + ((off_t)letoh32(din->i_size_hi) << 32); + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + bytesinfile = filesz - uio->uio_offset; + if (bytesinfile <= 0) + break; + + lbn = EXT4FS_LBLKNO(fs, uio->uio_offset); + nextlbn = lbn + 1; + blkoffset = EXT4FS_BLKOFF(fs, uio->uio_offset); + size = fs->m_block_size; + + xfersize = size - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + + if ((u_int64_t)nextlbn * fs->m_block_size >= filesz) + error = bread(vp, lbn, size, &bp); + else if (lbn - 1 == ip->i_ci.ci_lastr || + uio->uio_resid > xfersize) + error = bread_cluster(vp, lbn, size, &bp); + else + error = bread(vp, lbn, size, &bp); + if (error) + break; + ip->i_ci.ci_lastr = lbn; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. 
+ */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + if (error) + break; + brelse(bp); + } + if (bp != NULL) + brelse(bp); + + if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) + ip->i_flag |= IN_ACCESS; + + return (error); +} + +int +ext4fs_write(void *v) +{ + struct vop_write_args *ap = v; + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + struct inode *ip = VTOI(vp); + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct buf *bp; + off_t filesz; + u_int64_t lbn, pblk, ncontig, prealloc_start; + u_int32_t prealloc_count, prealloc_got, prealloc_i; + u_int64_t i_blocks; + int ioflag = ap->a_ioflag; + int blkoffset, xfersize; + int error; + size_t resid; + ssize_t overrun; + + if (uio->uio_resid == 0) + return (0); + + switch (vp->v_type) { + case VREG: + break; + case VLNK: + break; + case VDIR: + return (EOPNOTSUPP); + default: + panic("ext4fs_write: type"); + } + + filesz = (off_t)letoh32(din->i_size_lo) | + ((off_t)letoh32(din->i_size_hi) << 32); + + if (ioflag & IO_APPEND) + uio->uio_offset = filesz; + + if (uio->uio_offset < 0) + return (EINVAL); + + if ((error = vn_fsizechk(vp, uio, ioflag, &overrun))) + return (error); + + resid = uio->uio_resid; + + for (error = 0; uio->uio_resid > 0; ) { + lbn = EXT4FS_LBLKNO(fs, uio->uio_offset); + blkoffset = EXT4FS_BLKOFF(fs, uio->uio_offset); + xfersize = fs->m_block_size - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + + /* + * For full-block writes past EOF, batch-allocate + * contiguous blocks for the remaining write. 
+ */ + if (blkoffset == 0 && xfersize == fs->m_block_size && + uio->uio_offset >= filesz && + (ext4fs_extent_pblk(ip, lbn, &pblk, &ncontig) != 0 || + pblk == 0)) { + /* Count full blocks remaining in this write */ + prealloc_count = uio->uio_resid / fs->m_block_size; + if (prealloc_count > 32768) + prealloc_count = 32768; + if (prealloc_count == 0) + prealloc_count = 1; + /* Goal: contiguous with last extent */ + pblk = 0; + if (letoh16(din->i_extent_header.eh_entries) > 0) { + u_int64_t dummy; + u_int64_t nc; + /* Use lbn-1 to find last mapped block */ + if (lbn > 0 && ext4fs_extent_pblk(ip, + lbn - 1, &dummy, &nc) == 0 && + dummy != 0) + pblk = dummy + 1; + } + error = ext4fs_blkalloc(ip, pblk, prealloc_count, + &prealloc_start, &prealloc_got); + if (error) + break; + error = ext4fs_extent_insert(ip, lbn, + prealloc_start, prealloc_got); + if (error) { + for (prealloc_i = 0; + prealloc_i < prealloc_got; + prealloc_i++) + ext4fs_blkfree(ip, + prealloc_start + prealloc_i); + break; + } + i_blocks = letoh32(din->i_blocks_lo) | + ((u_int64_t)letoh16(din->i_blocks_hi) << 32); + i_blocks += (u_int64_t)prealloc_got * + (fs->m_block_size / DEV_BSIZE); + din->i_blocks_lo = htole32((u_int32_t)i_blocks); + din->i_blocks_hi = + htole16((u_int16_t)(i_blocks >> 32)); + din->i_flags |= + htole32(EXTFS_INODE_FLAG_EXTENTS); + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + /* Now use the first allocated block */ + pblk = prealloc_start; + } else if (ext4fs_extent_pblk(ip, lbn, &pblk, + &ncontig) == 0 && pblk != 0) { + /* Already mapped */ + } else { + /* Partial block or not past EOF: single alloc */ + error = ext4fs_buf_alloc(ip, lbn, fs->m_block_size, + ap->a_cred, &bp, B_CLRBUF); + if (error) + break; + goto do_io; + } + + /* Full block: getblk without read; partial: bread */ + if (blkoffset == 0 && xfersize == fs->m_block_size) { + bp = getblk(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, 0, INFSLP); + } else { + error = bread(ip->i_devvp, + 
(daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + break; + } + } +do_io: + error = uiomove((char *)bp->b_data + blkoffset, xfersize, + uio); + if (error) { + brelse(bp); + break; + } + + if (ioflag & IO_SYNC) + (void)bwrite(bp); + else if (xfersize + blkoffset == fs->m_block_size) + bawrite(bp); + else + bdwrite(bp); + (void)uvm_vnp_uncache(vp); + if (xfersize == 0) + break; + + /* Update file size if we wrote past end */ + if (uio->uio_offset > filesz) { + ext4fs_setsize(ip, uio->uio_offset); + filesz = uio->uio_offset; + uvm_vnp_setsize(vp, filesz); + } + + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + + /* Clear setuid/setgid bits on write by non-root */ + if (resid > uio->uio_resid && ap->a_cred && + ap->a_cred->cr_uid != 0) { + u_int16_t mode = letoh16(din->i_mode); + mode &= ~(S_ISUID | S_ISGID); + din->i_mode = htole16(mode); + } + + if (error == 0 && resid > uio->uio_resid && (ioflag & IO_SYNC)) + error = ext4fs_update(ip, 1); + + uio->uio_resid += overrun; + return (error); +} + +int +ext4fs_fsync(void *v) +{ + struct vop_fsync_args *ap = v; + struct vnode *vp = ap->a_vp; + + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (0); + + vflushbuf(vp, ap->a_waitfor == MNT_WAIT); + return (ext4fs_update(VTOI(vp), ap->a_waitfor == MNT_WAIT)); +} + +int +ext4fs_remove(void *v) +{ + struct vop_remove_args *ap = v; + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + struct inode *ip = VTOI(vp); + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + u_int16_t nlink; + int error; + + if (vp->v_type == VDIR) { + error = EPERM; + goto out; + } + + /* Cannot remove immutable or append-only files */ + if (letoh32(din->i_flags) & + (EXTFS_INODE_FLAG_IMMUTABLE | EXTFS_INODE_FLAG_APPEND)) { + error = EPERM; + goto out; + } + + error = ext4fs_dirremove(dvp, ap->a_cnp); + if (error) + goto out; + + nlink = letoh16(din->i_links_count); + if (nlink > 0) + nlink--; + din->i_links_count = htole16(nlink); + ip->i_effnlink = 
nlink; + ip->i_flag |= IN_CHANGE; + +out: + return (error); +} + +int +ext4fs_link(void *v) +{ + struct vop_link_args *ap = v; + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = ap->a_vp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip = VTOI(vp); + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + u_int16_t nlink; + int error; + + if (vp->v_type == VDIR) { + error = EPERM; + goto out2; + } + if (dvp->v_mount != vp->v_mount) { + error = EXDEV; + goto out2; + } + + nlink = letoh16(din->i_links_count); + if (nlink >= EXT4FS_LINK_MAX) { + error = EMLINK; + goto out2; + } + + if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0) + goto out2; + + nlink++; + din->i_links_count = htole16(nlink); + ip->i_effnlink = nlink; + ip->i_flag |= IN_CHANGE; + error = ext4fs_update(ip, 1); + if (error) + goto out1; + + error = ext4fs_direnter(ip, dvp, cnp); + if (error) { + nlink--; + din->i_links_count = htole16(nlink); + ip->i_effnlink = nlink; + ip->i_flag |= IN_CHANGE; + } + +out1: + if (dvp != vp) + VOP_UNLOCK(vp); +out2: + vput(dvp); + return (error); +} + +/* + * Check if source is an ancestor of target in the directory hierarchy. + * Prevents creating directory loops via rename. + * target vnode must be locked on entry and will be vput on exit. + */ +static int +ext4fs_checkpath(struct inode *source, struct inode *target, struct ucred *cred) +{ + struct vnode *vp; + struct m_ext4fs *fs = source->i_e4fs; + u_int32_t ino; + int error = 0; + + vp = ITOV(target); + if (target->i_number == source->i_number) { + error = EEXIST; + goto out; + } + if (target->i_number == ROOTINO) + goto out; + + for (;;) { + struct inode *ip = VTOI(vp); + struct buf *bp; + struct ext4fs_directory *dot, *dotdot; + u_int64_t pblk; + + if (vp->v_type != VDIR) { + error = ENOTDIR; + break; + } + + /* Read ".." 
from first directory block */ + error = ext4fs_extent_pblk(ip, 0, &pblk, NULL); + if (error || pblk == 0) { + if (!error) error = EIO; + break; + } + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + break; + } + + /* ".." is the second entry after "." */ + dot = (struct ext4fs_directory *)bp->b_data; + dotdot = (struct ext4fs_directory *) + ((char *)bp->b_data + letoh16(dot->e4d_reclen)); + if (dotdot->e4d_namlen != 2 || + dotdot->e4d_name[0] != '.' || + dotdot->e4d_name[1] != '.') { + brelse(bp); + error = ENOTDIR; + break; + } + ino = letoh32(dotdot->e4d_ino); + brelse(bp); + + if (ino == source->i_number) { + error = EINVAL; + break; + } + if (ino == ROOTINO) + break; + + VOP_UNLOCK(vp); + error = VFS_VGET(vp->v_mount, ino, &vp); + if (error) { + vp = NULL; + break; + } + } + +out: + if (error == ENOTDIR) + printf("ext4fs_checkpath: .. not a directory\n"); + if (vp != NULL) + vput(vp); + return (error); +} + +int +ext4fs_rename(void *v) +{ + struct vop_rename_args *ap = v; + struct vnode *tvp = ap->a_tvp; + struct vnode *tdvp = ap->a_tdvp; + struct vnode *fvp = ap->a_fvp; + struct vnode *fdvp = ap->a_fdvp; + struct componentname *tcnp = ap->a_tcnp; + struct componentname *fcnp = ap->a_fcnp; + struct inode *ip, *xp = NULL, *dp; + struct ext4fs_dinode *din; + int doingdirectory = 0, oldparent = 0, newparent = 0; + int error = 0; + u_int16_t nlink; + + /* Check for cross-device rename */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; +abortit: + VOP_ABORTOP(tdvp, tcnp); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fdvp, fcnp); + vrele(fdvp); + vrele(fvp); + return (error); + } + + /* Lock source */ + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) + goto abortit; + + dp = VTOI(fdvp); + ip = VTOI(fvp); + din = &ip->i_e4din->dinode; + + nlink = letoh16(din->i_links_count); + if 
((letoh32(din->i_flags) & + (EXTFS_INODE_FLAG_IMMUTABLE | EXTFS_INODE_FLAG_APPEND))) { + VOP_UNLOCK(fvp); + error = EPERM; + goto abortit; + } + + if ((letoh16(din->i_mode) & S_IFMT) == S_IFDIR) { + doingdirectory = 1; + oldparent = dp->i_number; + } + + /* Bump link count temporarily for crash safety */ + nlink++; + din->i_links_count = htole16(nlink); + ip->i_effnlink = nlink; + ip->i_flag |= IN_CHANGE; + if ((error = ext4fs_update(ip, 1)) != 0) { + VOP_UNLOCK(fvp); + goto abortit; + } + + /* Check write access for changing ".." */ + if (doingdirectory) + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, + tcnp->cn_proc); + VOP_UNLOCK(fvp); + vrele(fdvp); + + /* + * If ".." must be changed (ie the directory gets a new parent) + * then the source directory must not be in the directory + * hierarchy above the target. + */ + dp = VTOI(tdvp); + if (oldparent != dp->i_number) + newparent = dp->i_number; + + if (doingdirectory && newparent) { + if (error) /* write access check above */ + goto bad; + if (tvp) + vput(tvp); + /* checkpath vput's tdvp, compensate */ + vref(tdvp); + error = ext4fs_checkpath(ip, dp, tcnp->cn_cred); + if (error) { + vrele(tdvp); + goto out; + } + if ((tcnp->cn_flags & SAVESTART) == 0) + panic("ext4fs_rename: lost to startdir"); + error = vfs_relookup(tdvp, &tvp, tcnp); + if (error) { + vrele(tdvp); + goto out; + } + vrele(tdvp); + dp = VTOI(tdvp); + } + + xp = NULL; + if (tvp) + xp = VTOI(tvp); + + /* + * 2) If target doesn't exist, link the target to the source + * and unlink the source. Otherwise, rewrite the target + * directory entry to reference the source inode. + */ + if (xp == NULL) { + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't fool with the link count. 
+ */ + if (doingdirectory && newparent) { + u_int16_t pnlink = letoh16( + dp->i_e4din->dinode.i_links_count); + pnlink++; + dp->i_e4din->dinode.i_links_count = htole16(pnlink); + dp->i_effnlink = pnlink; + dp->i_flag |= IN_CHANGE; + if ((error = ext4fs_update(dp, 1)) != 0) + goto bad; + } + error = ext4fs_direnter(ip, tdvp, tcnp); + if (error) { + if (doingdirectory && newparent) { + u_int16_t pnlink = letoh16( + dp->i_e4din->dinode.i_links_count); + if (pnlink > 1) + pnlink--; + dp->i_e4din->dinode.i_links_count = + htole16(pnlink); + dp->i_effnlink = pnlink; + dp->i_flag |= IN_CHANGE; + (void)ext4fs_update(dp, 1); + } + goto bad; + } + vput(tdvp); + } else { + /* + * Target exists. If replacing a directory, + * check that it is empty BEFORE rewriting + * the directory entry. + */ + if (doingdirectory) { + if (!ext4fs_dirempty(xp, dp->i_number, + tcnp->cn_cred)) { + error = ENOTEMPTY; + goto bad; + } + } + + /* Rewrite the entry to point to source inode */ + error = ext4fs_dirrewrite(dp, ip, tcnp); + if (error) + goto bad; + + /* + * If the target directory is in the same + * directory as the source directory, + * decrement the link count on the parent + * of the target directory. + */ + if (doingdirectory && !newparent) { + u_int16_t pnlink = letoh16( + dp->i_e4din->dinode.i_links_count); + if (pnlink > 1) + pnlink--; + dp->i_e4din->dinode.i_links_count = htole16(pnlink); + dp->i_effnlink = pnlink; + dp->i_flag |= IN_CHANGE; + } + vput(tdvp); + + /* + * Adjust the link count of the target to + * reflect the dirrewrite above. + */ + { + u_int16_t xnlink = + letoh16(xp->i_e4din->dinode.i_links_count); + if (xnlink > 0) + xnlink--; + if (doingdirectory) { + if (xnlink > 0) + xnlink--; + error = ext4fs_truncate(xp, 0, 0, + tcnp->cn_cred); + } + xp->i_e4din->dinode.i_links_count = htole16(xnlink); + xp->i_effnlink = xnlink; + xp->i_flag |= IN_CHANGE; + } + vput(tvp); + xp = NULL; + } + + /* + * 3) Unlink the source. 
+ * Re-lookup the source entry to get correct i_offset/i_count, + * since the target lookup overwrites them (especially when + * fdvp == tdvp, i.e., same-directory rename). + */ + fcnp->cn_flags &= ~MODMASK; + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + if ((fcnp->cn_flags & SAVESTART) == 0) + panic("ext4fs_rename: lost from startdir"); + (void) vfs_relookup(fdvp, &fvp, fcnp); + if (fvp != NULL) { + xp = VTOI(fvp); + dp = VTOI(fdvp); + } else { + if (doingdirectory) + panic("ext4fs_rename: lost dir entry"); + vrele(ap->a_fvp); + return (0); + } + + if (xp != ip) { + if (doingdirectory) + panic("ext4fs_rename: lost dir entry"); + } else { + /* If directory moved to new parent, update ".." */ + if (doingdirectory && newparent) { + struct buf *dbp; + struct ext4fs_directory *dotdot; + u_int64_t dpblk; + + dp->i_e4din->dinode.i_links_count = htole16( + letoh16(dp->i_e4din->dinode.i_links_count) - 1); + dp->i_effnlink--; + dp->i_flag |= IN_CHANGE; + + error = ext4fs_extent_pblk(ip, 0, &dpblk, NULL); + if (error == 0 && dpblk != 0) { + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(ip->i_e4fs, + dpblk), ip->i_e4fs->m_block_size, &dbp); + if (error == 0) { + dotdot = (struct ext4fs_directory *) + ((char *)dbp->b_data + + letoh16(((struct ext4fs_directory *) + dbp->b_data)->e4d_reclen)); + dotdot->e4d_ino = htole32(newparent); + ext4fs_dir_set_csum(ip->i_e4fs, + ip->i_number, + ip->i_e4din->dinode. 
+ i_nfs_generation, + dbp->b_data); + bwrite(dbp); + } else + brelse(dbp); + } + } + + error = ext4fs_dirremove(fdvp, fcnp); + if (!error) { + nlink = letoh16( + xp->i_e4din->dinode.i_links_count); + if (nlink > 0) + nlink--; + xp->i_e4din->dinode.i_links_count = htole16(nlink); + xp->i_effnlink = nlink; + xp->i_flag |= IN_CHANGE; + } + } + if (dp) + vput(fdvp); + if (xp) + vput(fvp); + vrele(ap->a_fvp); + return (error); + +bad: + if (xp) + vput(ITOV(xp)); + vput(ITOV(dp)); +out: + if (doingdirectory) + ip->i_flag &= ~IN_RENAME; + if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { + nlink = letoh16(ip->i_e4din->dinode.i_links_count); + if (nlink > 0) + nlink--; + ip->i_e4din->dinode.i_links_count = htole16(nlink); + ip->i_effnlink = nlink; + ip->i_flag |= IN_CHANGE; + vput(fvp); + } else + vrele(fvp); + return (error); +} + +int +ext4fs_mkdir(void *v) +{ + struct vop_mkdir_args *ap = v; + struct vnode *dvp = ap->a_dvp; + struct vattr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct inode *dp = VTOI(dvp); + struct inode *ip; + struct vnode *tvp; + struct buf *bp; + struct ext4fs_directory *dirp; + struct ext4fs_dinode *din; + struct m_ext4fs *fs = dp->i_e4fs; + int error; + u_int16_t nlink; + + nlink = letoh16(dp->i_e4din->dinode.i_links_count); + if (nlink >= EXT4FS_LINK_MAX) { + error = EMLINK; + pool_put(&namei_pool, cnp->cn_pnbuf); + goto out; + } + + /* Allocate inode for new directory */ + error = ext4fs_inode_alloc(dp, S_IFDIR | vap->va_mode, + cnp->cn_cred, &tvp); + if (error) { + pool_put(&namei_pool, cnp->cn_pnbuf); + goto out; + } + + ip = VTOI(tvp); + din = &ip->i_e4din->dinode; + + /* Set owner */ + din->i_uid_lo = htole16(cnp->cn_cred->cr_uid & 0xFFFF); + din->i_uid_hi = htole16((cnp->cn_cred->cr_uid >> 16) & 0xFFFF); + { + gid_t gid = letoh16(dp->i_e4din->dinode.i_gid_lo) | + ((gid_t)letoh16(dp->i_e4din->dinode.i_gid_hi) << 16); + din->i_gid_lo = htole16(gid & 0xFFFF); + din->i_gid_hi = htole16((gid >> 16) & 0xFFFF); + } + + ip->i_flag |= 
IN_ACCESS | IN_CHANGE | IN_UPDATE; + din->i_mode = htole16(S_IFDIR | vap->va_mode); + tvp->v_type = VDIR; + ip->i_effnlink = 2; + din->i_links_count = htole16(2); + + /* Allocate first block for "." and ".." */ + error = ext4fs_buf_alloc(ip, 0, fs->m_block_size, cnp->cn_cred, + &bp, B_CLRBUF); + if (error) + goto bad; + + /* Write "." entry */ + dirp = (struct ext4fs_directory *)bp->b_data; + dirp->e4d_ino = htole32((u_int32_t)ip->i_number); + dirp->e4d_reclen = htole16(12); + dirp->e4d_namlen = 1; + dirp->e4d_type = EXT4FS_FT_DIR; + dirp->e4d_name[0] = '.'; + + /* Write ".." entry */ + dirp = (struct ext4fs_directory *)((char *)bp->b_data + 12); + dirp->e4d_ino = htole32((u_int32_t)dp->i_number); + dirp->e4d_reclen = htole16(fs->m_block_size - 12 - + ((fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) ? EXT4FS_DIR_TAIL_SIZE : 0)); + dirp->e4d_namlen = 2; + dirp->e4d_type = EXT4FS_FT_DIR; + dirp->e4d_name[0] = '.'; + dirp->e4d_name[1] = '.'; + + ext4fs_dir_set_csum(fs, ip->i_number, + ip->i_e4din->dinode.i_nfs_generation, bp->b_data); + error = bwrite(bp); + if (error) + goto bad; + + /* Set directory size */ + ext4fs_setsize(ip, fs->m_block_size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + + /* Write inode before directory entry */ + if ((error = ext4fs_update(ip, 1)) != 0) + goto bad; + + /* Increment parent's link count for ".." 
*/ + nlink++; + dp->i_e4din->dinode.i_links_count = htole16(nlink); + dp->i_effnlink = nlink; + dp->i_flag |= IN_CHANGE; + if ((error = ext4fs_update(dp, 1)) != 0) + goto bad; + + /* Enter new directory in parent */ + error = ext4fs_direnter(ip, dvp, cnp); + if (error) { + /* Undo parent nlink */ + nlink--; + dp->i_e4din->dinode.i_links_count = htole16(nlink); + dp->i_effnlink = nlink; + dp->i_flag |= IN_CHANGE; + goto bad; + } + + if ((cnp->cn_flags & SAVESTART) == 0) + pool_put(&namei_pool, cnp->cn_pnbuf); + *ap->a_vpp = tvp; + + vput(dvp); + return (0); + +bad: + pool_put(&namei_pool, cnp->cn_pnbuf); + ip->i_effnlink = 0; + din->i_links_count = htole16(0); + ip->i_flag |= IN_CHANGE; + tvp->v_type = VNON; + vput(tvp); +out: + vput(dvp); + return (error); +} + +int +ext4fs_rmdir(void *v) +{ + struct vop_rmdir_args *ap = v; + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip = VTOI(vp); + struct inode *dp = VTOI(dvp); + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + u_int16_t nlink; + int error; + + /* Directory must be empty */ + if (!ext4fs_dirempty(ip, dp->i_number, cnp->cn_cred)) { + error = ENOTEMPTY; + goto out; + } + + /* Remove entry from parent */ + error = ext4fs_dirremove(dvp, cnp); + if (error) + goto out; + + /* Decrement parent's link count ("..") */ + nlink = letoh16(dp->i_e4din->dinode.i_links_count); + if (nlink > 1) + nlink--; + dp->i_e4din->dinode.i_links_count = htole16(nlink); + dp->i_effnlink = nlink; + dp->i_flag |= IN_CHANGE; + + cache_purge(dvp); + + /* Set target link count to 0 */ + din->i_links_count = htole16(0); + ip->i_effnlink = 0; + ip->i_flag |= IN_CHANGE; + + /* Truncate directory contents */ + error = ext4fs_truncate(ip, 0, 0, cnp->cn_cred); + + cache_purge(vp); + +out: + if (dvp == vp) + vrele(vp); + else + vput(vp); + vput(dvp); + return (error); +} + +int +ext4fs_symlink(void *v) +{ + struct vop_symlink_args *ap = v; + struct vnode *dvp = ap->a_dvp; + 
struct vattr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct vnode **vpp = ap->a_vpp; + struct inode *ip; + int error, len; + + error = ext4fs_makeinode(S_IFLNK | vap->va_mode, dvp, vpp, cnp); + if (error) { + vput(dvp); + return (error); + } + + ip = VTOI(*vpp); + len = strlen(ap->a_target); + + if (len <= EXT4FS_SYMLINK_LEN_MAX) { + /* Fast symlink: store inline in i_block[] */ + memcpy(ip->i_e4din->dinode.i_block, ap->a_target, len); + ext4fs_setsize(ip, len); + /* Clear EXTENTS flag for fast symlinks */ + ip->i_e4din->dinode.i_flags &= + ~htole32(EXTFS_INODE_FLAG_EXTENTS); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + error = ext4fs_update(ip, 1); + } else { + /* Slow symlink: write to data blocks */ + struct uio auio; + struct iovec aiov; + + aiov.iov_base = ap->a_target; + aiov.iov_len = len; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_procp = NULL; + auio.uio_resid = len; + error = VOP_WRITE(*vpp, &auio, IO_NODELOCKED, ap->a_cnp->cn_cred); + } + + vput(*vpp); + vput(dvp); + return (error); +} + +int +ext4fs_readdir(void *v) +{ + struct vop_readdir_args *ap = v; + struct uio *uio = ap->a_uio; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct ext4fs_directory *ep; + struct dirent dstd; + struct buf *bp; + off_t off, filesz; + u_int64_t lbn, pblk, blkoff; + u_int16_t reclen; + int error = 0; + + if (vp->v_type != VDIR) + return (ENOTDIR); + + filesz = (off_t)letoh32(din->i_size_lo) | + ((off_t)letoh32(din->i_size_hi) << 32); + off = uio->uio_offset; + + while (off < filesz && uio->uio_resid > 0) { + lbn = EXT4FS_LBLKNO(fs, off); + + error = ext4fs_extent_pblk(ip, lbn, &pblk, NULL); + if (error || pblk == 0) { + if (!error) error = EIO; + break; + } + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); 
+ if (error) { + brelse(bp); + break; + } + + blkoff = EXT4FS_BLKOFF(fs, off); + + while (blkoff < fs->m_block_size && off < filesz) { + ep = (struct ext4fs_directory *) + ((char *)bp->b_data + blkoff); + reclen = letoh16(ep->e4d_reclen); + + if (reclen < 8 || reclen > fs->m_block_size || + blkoff + reclen > fs->m_block_size) { + error = EIO; + brelse(bp); + goto done; + } + + if (letoh32(ep->e4d_ino) != 0) { + u_int8_t namlen = ep->e4d_namlen; + + if (namlen > reclen - 8 || + namlen > MAXNAMLEN) { + error = EIO; + brelse(bp); + goto done; + } + + memset(&dstd, 0, sizeof(dstd)); + dstd.d_fileno = letoh32(ep->e4d_ino); + dstd.d_namlen = namlen; + + if (ep->e4d_type < EXT4FS_FT_MAX) + dstd.d_type = + ext4fs_type_to_dt[ep->e4d_type]; + else + dstd.d_type = DT_UNKNOWN; + + memcpy(dstd.d_name, ep->e4d_name, + namlen); + dstd.d_name[dstd.d_namlen] = '\0'; + dstd.d_reclen = DIRENT_SIZE(&dstd); + dstd.d_off = off + reclen; + + if (dstd.d_reclen > uio->uio_resid) { + brelse(bp); + goto done; + } + + error = uiomove(&dstd, dstd.d_reclen, uio); + if (error) { + brelse(bp); + goto done; + } + } + + off += reclen; + blkoff += reclen; + } + + brelse(bp); + } + +done: + uio->uio_offset = off; + *ap->a_eofflag = (off >= filesz); + return (error); +} + +int +ext4fs_readlink(void *v) +{ + struct vop_readlink_args *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + u_int64_t filesz; + + filesz = (u_int64_t)letoh32(din->i_size_lo) | + ((u_int64_t)letoh32(din->i_size_hi) << 32); + + /* Fast symlink: target stored inline in i_block[] area */ + if (filesz <= EXT4FS_SYMLINK_LEN_MAX && + !(letoh32(din->i_flags) & EXTFS_INODE_FLAG_EXTENTS)) { + return (uiomove((char *)din->i_block, filesz, ap->a_uio)); + } + + /* Slow symlink: target stored in data blocks */ + return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Enter a directory entry for inode ip into directory dvp. 
+ */ +int +ext4fs_direnter(struct inode *ip, struct vnode *dvp, + struct componentname *cnp) +{ + struct inode *dp = VTOI(dvp); + struct m_ext4fs *fs = dp->i_e4fs; + struct ext4fs_dinode *ddin = &dp->i_e4din->dinode; + struct ext4fs_directory *ep, *nep; + struct buf *bp; + u_int64_t pblk; + off_t filesz; + int entrysize, error, loc; + u_int16_t reclen, mode; + + entrysize = EXT4FS_DIRSIZ(cnp->cn_namelen); + mode = letoh16(ip->i_e4din->dinode.i_mode); + + filesz = (off_t)letoh32(ddin->i_size_lo) | + ((off_t)letoh32(ddin->i_size_hi) << 32); + + if (dp->i_count == 0) { + /* + * No free slot found - append at end of directory. + * Allocate a new block if needed. + */ + u_int64_t lbn = EXT4FS_LBLKNO(fs, filesz); + u_int64_t blkoff = EXT4FS_BLKOFF(fs, filesz); + + if (blkoff == 0) { + /* Need a new block */ + error = ext4fs_buf_alloc(dp, lbn, fs->m_block_size, + cnp->cn_cred, &bp, B_CLRBUF); + if (error) + return (error); + } else { + error = ext4fs_extent_pblk(dp, lbn, &pblk, NULL); + if (error || pblk == 0) + return (error ? error : EIO); + error = bread(dp->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + return (error); + } + } + + /* Write entry at end */ + ep = (struct ext4fs_directory *) + ((char *)bp->b_data + blkoff); + ep->e4d_ino = htole32((u_int32_t)ip->i_number); + { + int tail = (fs->m_feature_ro_compat & + EXT4FS_FEATURE_RO_COMPAT_METADATA_CSUM) ? 
+ EXT4FS_DIR_TAIL_SIZE : 0; + if (blkoff == 0) + ep->e4d_reclen = + htole16(fs->m_block_size - tail); + else + ep->e4d_reclen = + htole16(fs->m_block_size - blkoff - tail); + } + ep->e4d_namlen = cnp->cn_namelen; + ep->e4d_type = ext4fs_mode_to_ft(mode); + memcpy(ep->e4d_name, cnp->cn_nameptr, cnp->cn_namelen); + + ext4fs_dir_set_csum(fs, dp->i_number, + dp->i_e4din->dinode.i_nfs_generation, bp->b_data); + error = bwrite(bp); + if (error) + return (error); + + /* Update directory size */ + if (blkoff == 0) + ext4fs_setsize(dp, filesz + fs->m_block_size); + else + ext4fs_setsize(dp, filesz + entrysize); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (ext4fs_update(dp, 1)); + } + + /* + * Found a free slot at dp->i_offset with dp->i_count bytes. + * Read the block and compact entries to make room. + */ + { + u_int64_t lbn = EXT4FS_LBLKNO(fs, dp->i_offset); + + error = ext4fs_extent_pblk(dp, lbn, &pblk, NULL); + if (error || pblk == 0) + return (error ? error : EIO); + + error = bread(dp->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + return (error); + } + } + + loc = EXT4FS_BLKOFF(fs, dp->i_offset); + ep = (struct ext4fs_directory *)((char *)bp->b_data + loc); + reclen = letoh16(ep->e4d_reclen); + + if (letoh32(ep->e4d_ino) == 0) { + /* Unused entry - just overwrite */ + ep->e4d_ino = htole32((u_int32_t)ip->i_number); + /* Keep reclen as is */ + ep->e4d_namlen = cnp->cn_namelen; + ep->e4d_type = ext4fs_mode_to_ft(mode); + memcpy(ep->e4d_name, cnp->cn_nameptr, cnp->cn_namelen); + } else { + /* Compact: shrink current entry, add new one after it */ + int oldentsz = EXT4FS_DIRSIZ(ep->e4d_namlen); + + nep = (struct ext4fs_directory *) + ((char *)ep + oldentsz); + nep->e4d_ino = htole32((u_int32_t)ip->i_number); + nep->e4d_reclen = htole16(reclen - oldentsz); + nep->e4d_namlen = cnp->cn_namelen; + nep->e4d_type = ext4fs_mode_to_ft(mode); + memcpy(nep->e4d_name, cnp->cn_nameptr, cnp->cn_namelen); + ep->e4d_reclen = 
htole16(oldentsz); + } + + ext4fs_dir_set_csum(fs, dp->i_number, + dp->i_e4din->dinode.i_nfs_generation, bp->b_data); + error = bwrite(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + if (error == 0) + error = ext4fs_update(dp, 1); + return (error); +} + +/* + * Remove a directory entry. + */ +int +ext4fs_dirremove(struct vnode *dvp, struct componentname *cnp) +{ + struct inode *dp = VTOI(dvp); + struct m_ext4fs *fs = dp->i_e4fs; + struct ext4fs_directory *ep, *prevep; + struct buf *bp; + u_int64_t lbn, pblk; + int error, loc; + + lbn = EXT4FS_LBLKNO(fs, dp->i_offset); + + error = ext4fs_extent_pblk(dp, lbn, &pblk, NULL); + if (error || pblk == 0) + return (error ? error : EIO); + + error = bread(dp->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + return (error); + } + + loc = EXT4FS_BLKOFF(fs, dp->i_offset); + ep = (struct ext4fs_directory *)((char *)bp->b_data + loc); + + if (dp->i_count == 0) { + /* First entry in block: just zero the inode field */ + ep->e4d_ino = 0; + } else { + /* Merge with previous entry */ + int prevloc = EXT4FS_BLKOFF(fs, dp->i_offset - dp->i_count); + prevep = (struct ext4fs_directory *) + ((char *)bp->b_data + prevloc); + prevep->e4d_reclen = htole16( + letoh16(prevep->e4d_reclen) + letoh16(ep->e4d_reclen)); + } + + ext4fs_dir_set_csum(fs, dp->i_number, + dp->i_e4din->dinode.i_nfs_generation, bp->b_data); + error = bwrite(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Check if a directory is empty (contains only "." and ".."). 
+ */ +int +ext4fs_dirempty(struct inode *ip, ufsino_t parentino, struct ucred *cred) +{ + struct m_ext4fs *fs = ip->i_e4fs; + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + struct ext4fs_directory *ep; + struct buf *bp; + off_t off, filesz; + u_int64_t lbn, pblk, blkoff; + u_int16_t reclen; + int error; + + filesz = (off_t)letoh32(din->i_size_lo) | + ((off_t)letoh32(din->i_size_hi) << 32); + + for (off = 0; off < filesz; ) { + lbn = EXT4FS_LBLKNO(fs, off); + + error = ext4fs_extent_pblk(ip, lbn, &pblk, NULL); + if (error || pblk == 0) + return (0); + + error = bread(ip->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + return (0); + } + + blkoff = EXT4FS_BLKOFF(fs, off); + + while (blkoff < fs->m_block_size && off < filesz) { + ep = (struct ext4fs_directory *) + ((char *)bp->b_data + blkoff); + reclen = letoh16(ep->e4d_reclen); + + if (reclen < 8 || reclen > fs->m_block_size || + blkoff + reclen > fs->m_block_size) { + brelse(bp); + return (0); + } + + if (letoh32(ep->e4d_ino) != 0) { + if (ep->e4d_namlen > 2) { + brelse(bp); + return (0); + } + if (ep->e4d_name[0] != '.') { + brelse(bp); + return (0); + } + if (ep->e4d_namlen == 1) { + /* "." - ok */ + } else if (ep->e4d_name[1] == '.') { + /* ".." - ok */ + } else { + brelse(bp); + return (0); + } + } + + off += reclen; + blkoff += reclen; + } + + brelse(bp); + } + + return (1); +} + +/* + * Rewrite an existing directory entry to point to a new inode. + */ +int +ext4fs_dirrewrite(struct inode *dp, struct inode *ip, + struct componentname *cnp) +{ + struct m_ext4fs *fs = dp->i_e4fs; + struct ext4fs_directory *ep; + struct buf *bp; + u_int64_t lbn, pblk; + u_int16_t mode; + int error, loc; + + lbn = EXT4FS_LBLKNO(fs, dp->i_offset); + + error = ext4fs_extent_pblk(dp, lbn, &pblk, NULL); + if (error || pblk == 0) + return (error ? 
error : EIO); + + error = bread(dp->i_devvp, + (daddr_t)EXT4FS_FSBTODB(fs, pblk), + fs->m_block_size, &bp); + if (error) { + brelse(bp); + return (error); + } + + loc = EXT4FS_BLKOFF(fs, dp->i_offset); + ep = (struct ext4fs_directory *)((char *)bp->b_data + loc); + ep->e4d_ino = htole32((u_int32_t)ip->i_number); + mode = letoh16(ip->i_e4din->dinode.i_mode); + ep->e4d_type = ext4fs_mode_to_ft(mode); + + ext4fs_dir_set_csum(fs, dp->i_number, + dp->i_e4din->dinode.i_nfs_generation, bp->b_data); + error = bwrite(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +int +ext4fs_inactive(void *v) +{ + struct vop_inactive_args *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + u_int16_t mode, nlink = 1; + int error = 0; +#ifdef DIAGNOSTIC + extern int prtactive; + + if (prtactive && vp->v_usecount != 0) + vprint("ext4fs_inactive: pushing active", vp); +#endif + + /* + * Ignore inodes related to stale file handles. + */ + if (ip->i_e4din == NULL) { + goto out; + } + + mode = letoh16(ip->i_e4din->dinode.i_mode); + if (mode == 0) { + goto out; + } + + /* + * If the inode was deleted (dtime != 0), skip further processing. + */ + if (letoh32(ip->i_e4din->dinode.i_dtime) != 0) { + goto out; + } + + nlink = letoh16(ip->i_e4din->dinode.i_links_count); + + /* + * Handle file deletion: if nlink == 0, truncate data, + * free inode, and mark as deleted. + */ + if (nlink == 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + struct timespec ts; + + (void)ext4fs_truncate(ip, 0, 0, NOCRED); + + mode = letoh16(ip->i_e4din->dinode.i_mode); + ip->i_e4din->dinode.i_mode = htole16(0); + + ext4fs_inode_free(ip, ip->i_number, mode); + + getnanotime(&ts); + ip->i_e4din->dinode.i_dtime = + htole32((u_int32_t)ts.tv_sec); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + + if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) { + ext4fs_update(ip, nlink == 0 ? 
1 : 0); + } + +out: + VOP_UNLOCK(vp); + + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + * NOTE: after vrecycle, ip is freed (use-after-free danger). + */ + if (ip->i_e4din == NULL || + letoh16(ip->i_e4din->dinode.i_mode) == 0) + vrecycle(vp, ap->a_p); + + return (error); +} + +int +ext4fs_reclaim(void *v) +{ + struct vop_reclaim_args *ap = v; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + int error; + + if ((error = ufs_reclaim(vp)) != 0) + return (error); + + if (ip->i_e4din != NULL) + pool_put(&ext4fs_dinode_pool, ip->i_e4din); + + pool_put(&ext4fs_inode_pool, ip); + + vp->v_data = NULL; + + return (0); +} + +int +ext4fs_bmap(void *v) +{ + struct vop_bmap_args *ap = v; + struct inode *ip = VTOI(ap->a_vp); + struct m_ext4fs *fs = ip->i_e4fs; + u_int64_t pblk, ncontig; + int error; + + if (ap->a_vpp != NULL) + *ap->a_vpp = ip->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + error = ext4fs_extent_pblk(ip, (u_int64_t)ap->a_bn, &pblk, &ncontig); + if (error) { + *ap->a_bnp = -1; + return (error); + } + + if (pblk == 0) { + /* Hole — no physical block allocated */ + *ap->a_bnp = -1; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + return (0); + } + + *ap->a_bnp = (daddr_t)EXT4FS_FSBTODB(fs, pblk); + + if (ap->a_runp != NULL) { + int maxrun = MAXBSIZE / fs->m_block_size - 1; + *ap->a_runp = MIN((int)(ncontig - 1), maxrun); + if (*ap->a_runp < 0) + *ap->a_runp = 0; + } + + return (0); +} + +int +ext4fs_strategy(void *v) +{ + struct vop_strategy_args *ap = v; + struct buf *bp = ap->a_bp; + struct vnode *vp = bp->b_vp; + struct inode *ip; + int error; + int s; + + ip = VTOI(vp); + if (vp->v_type == VBLK || vp->v_type == VCHR) + panic("ext4fs_strategy: spec"); + + if (bp->b_blkno == bp->b_lblkno) { + error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL); + if (error) { + bp->b_error = error; + bp->b_flags |= B_ERROR; + s = splbio(); + biodone(bp); + splx(s); + return (error); + } + if (bp->b_blkno == 
-1) + clrbuf(bp); + } + if (bp->b_blkno == -1) { + s = splbio(); + biodone(bp); + splx(s); + return (0); + } + vp = ip->i_devvp; + bp->b_dev = vp->v_rdev; + VOP_STRATEGY(vp, bp); + return (0); +} + +int +ext4fs_print(void *v) +{ + struct vop_print_args *ap = v; + struct inode *ip = VTOI(ap->a_vp); + + printf("tag VT_EXT4FS, ino %llu, on dev %d, %d", + (unsigned long long)ip->i_number, + major(ip->i_dev), minor(ip->i_dev)); + printf(" flags 0x%x, effnlink %d\n", + ip->i_flag, ip->i_effnlink); + return (0); +} + +int +ext4fs_pathconf(void *v) +{ + struct vop_pathconf_args *ap = v; + + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = EXT4FS_LINK_MAX; + break; + case _PC_NAME_MAX: + *ap->a_retval = EXT4FS_MAXNAMLEN; + break; + case _PC_PATH_MAX: + *ap->a_retval = PATH_MAX; + break; + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + break; + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + break; + case _PC_NO_TRUNC: + *ap->a_retval = 1; + break; + case _PC_TIMESTAMP_RESOLUTION: + *ap->a_retval = 1; + break; + default: + return (EINVAL); + } + return (0); +} + +int +ext4fs_advlock(void *v) +{ + struct vop_advlock_args *ap = v; + struct inode *ip = VTOI(ap->a_vp); + + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + off_t filesz; + + filesz = (off_t)letoh32(din->i_size_lo) | + ((off_t)letoh32(din->i_size_hi) << 32); + return (lf_advlock(&ip->i_lockf, filesz, ap->a_id, ap->a_op, + ap->a_fl, ap->a_flags)); +} diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index 16aaac8b7..2d9224f0f 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -43,6 +43,7 @@ #include #include #include +#include /* @@ -76,11 +77,13 @@ struct inode { union { /* Associated filesystem. 
*/ struct fs *fs; /* FFS */ - struct m_ext2fs *e2fs; /* EXT2FS */ + struct m_ext2fs *e2fs; /* EXT2FS */ + struct m_ext4fs *e4fs; /* EXT4FS */ } inode_u; #define i_fs inode_u.fs #define i_e2fs inode_u.e2fs +#define i_e4fs inode_u.e4fs struct cluster_info i_ci; struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ @@ -117,14 +120,16 @@ struct inode { * The on-disk dinode itself. */ union { - struct ufs1_dinode *ffs1_din; - struct ufs2_dinode *ffs2_din; - struct ext2fs_dinode *e2fs_din; + struct ufs1_dinode *ffs1_din; + struct ufs2_dinode *ffs2_din; + struct ext2fs_dinode *e2fs_din; + struct ext4fs_dinode_256 *e4fs_din; } dinode_u; #define i_din1 dinode_u.ffs1_din #define i_din2 dinode_u.ffs2_din #define i_e2din dinode_u.e2fs_din +#define i_e4din dinode_u.e4fs_din struct inode_vtbl *i_vtbl; }; @@ -224,6 +229,8 @@ struct inode_vtbl { #define i_uid i_din1->di_uid #endif /* _KERNEL */ +#define i_e4fs_nlink i_e4din->dinode.i_links_count + #define i_e2fs_mode i_e2din->e2di_mode #define i_e2fs_size i_e2din->e2di_size #define i_e2fs_atime i_e2din->e2di_atime diff --git a/sys/ufs/ufs/ufs_ihash.c b/sys/ufs/ufs/ufs_ihash.c index f470326e5..a192c9a0b 100644 --- a/sys/ufs/ufs/ufs_ihash.c +++ b/sys/ufs/ufs/ufs_ihash.c @@ -43,6 +43,7 @@ #include #include #include +#include #include @@ -119,6 +120,11 @@ loop: */ IS_EXT2_VNODE(vp) ? ip->i_e2fs_nlink <= 0 : #endif + /* + * XXX DIP does not cover ext4fs either; + * use i_e4din directly like ext2fs uses i_e2din. + */ + vp->v_tag == VT_EXT4FS ? 
ip->i_e4fs_nlink <= 0 : DIP(ip, nlink) <= 0) && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)) { /* diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 80f403861..8c2345021 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -68,6 +68,9 @@ #include #endif #include +#ifdef EXT4FS +#include +#endif #include @@ -108,6 +111,12 @@ ufs_itimes(struct vnode *vp) goto out; } #endif +#ifdef EXT4FS + if (vp->v_tag == VT_EXT4FS) { + EXT4FS_ITIMES(ip); + goto out; + } +#endif if ((vp->v_type == VBLK || vp->v_type == VCHR)) ip->i_flag |= IN_LAZYMOD; @@ -1864,6 +1873,14 @@ filt_ufsread(struct knote *kn, long hint) if (IS_EXT2_VNODE(ip->i_vnode)) kn->kn_data = ext2fs_size(ip) - foffset(kn->kn_fp); else +#endif +#ifdef EXT4FS + if (ip->i_vnode->v_tag == VT_EXT4FS) { + struct ext4fs_dinode *din = &ip->i_e4din->dinode; + kn->kn_data = ((off_t)letoh32(din->i_size_lo) | + ((off_t)letoh32(din->i_size_hi) << 32)) - + foffset(kn->kn_fp); + } else #endif kn->kn_data = DIP(ip, size) - foffset(kn->kn_fp); if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) { diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h index 3ad8938e4..40ea54993 100644 --- a/sys/ufs/ufs/ufsmount.h +++ b/sys/ufs/ufs/ufsmount.h @@ -51,10 +51,13 @@ struct ufsmount { union { /* pointer to superblock */ struct fs *fs; /* FFS */ struct m_ext2fs *e2fs; /* EXT2FS */ + struct m_ext4fs *e4fs; /* EXT4FS */ } ufsmount_u; #define um_fs ufsmount_u.fs #define um_e2fs ufsmount_u.e2fs #define um_e2fsb ufsmount_u.e2fs->s_es +#define um_e4fs ufsmount_u.e4fs +//#define um_e4fsb ufsmount_u.e4fs->s_es struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */ struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */ @@ -72,9 +75,10 @@ struct ufsmount { /* * Filesystem types */ -#define UM_UFS1 1 -#define UM_UFS2 2 +#define UM_UFS1 1 +#define UM_UFS2 2 #define UM_EXT2FS 3 +#define UM_EXT4FS 4 /* * Flags describing the state of quotas.