Download raw body.
ksh(1) vi mode bug in UTF-8 handling by the 'e' command
Below I'm attaching a patch authored by Lucas Gabriel Vuotto in Apr
2025, which got mixed up with another bug and was ultimately forgotten.
When the cursor is positioned on, or lands on, a multibyte UTF-8
character, the "e" command in ksh(1) vi mode gets stuck on that character
and won't advance no matter how many times you press the "e" key. This is
because the endword() function (file vi.c) doesn't recognize and skip
UTF-8 continuation bytes.
Index: bin/ksh/vi.c
===================================================================
RCS file: /cvs/src/bin/ksh/vi.c,v
diff -u -p -u -p -r1.67 vi.c
--- bin/ksh/vi.c 20 Jul 2025 21:24:07 -0000 1.67
+++ bin/ksh/vi.c 27 Jan 2026 14:29:47 -0000
@@ -1590,15 +1590,18 @@ backword(int argcnt)
static int
endword(int argcnt)
{
- int ncursor, skip_space, want_letnum;
+ int ncursor, skip_space, skip_utf8_cont, want_letnum;
unsigned char uc;
ncursor = es->cursor;
while (ncursor < es->linelen && argcnt--) {
- skip_space = 1;
+ skip_space = skip_utf8_cont = 1;
want_letnum = -1;
while (++ncursor < es->linelen) {
uc = es->cbuf[ncursor];
+ if (skip_utf8_cont && isu8cont(uc))
+ continue;
+ skip_utf8_cont = 0;
if (isspace(uc)) {
if (skip_space)
continue;
@@ -1662,6 +1665,9 @@ Endword(int argcnt)
ncursor = es->cursor;
while (ncursor < es->linelen && argcnt--) {
+ while (++ncursor < es->linelen &&
+ isu8cont((unsigned char)es->cbuf[ncursor]))
+ ;
while (++ncursor < es->linelen &&
isspace((unsigned char)es->cbuf[ncursor]))
;
Index: regress/bin/ksh/edit/vi.sh
===================================================================
RCS file: /cvs/src/regress/bin/ksh/edit/vi.sh,v
diff -u -p -u -p -r1.13 vi.sh
--- regress/bin/ksh/edit/vi.sh 19 May 2025 14:36:03 -0000 1.13
+++ regress/bin/ksh/edit/vi.sh 27 Jan 2026 14:29:56 -0000
@@ -93,6 +93,15 @@ testseq "1.00 two\00330ED" " # 1.00 two\
# e: Move to end of word.
testseq "onex two\00330eD" " # onex two\b\r # one \b\b\b\b\b\b"
+# No infinite loop moving to end of {,big} word for non-ASCII UTF-8-ending
+# words.
+# EURO SIGN U+20AC is encoded as bytes 0xe2 0x82 0xac = \0342\0202\0254
+euro='\0342\0202\0254'
+testseq "1.00$euro 2.00 three\00330EED" \
+ " # 1.00$euro 2.00 three\b\r # 1.00$euro 2.0 \b\b\b\b\b\b\b\b"
+testseq "one$euro twox three\00330eeD" \
+ " # one$euro twox three\b\r # one$euro two \b\b\b\b\b\b\b\b"
+
# F: Find character backward.
# ;: Repeat last search.
# ,: Repeat last search in opposite direction.
--
Walter
ksh(1) vi mode bug in UTF-8 handling by the 'e' command