Mailing List Archive

From:: William Goodspeed <goodspeed@posteo.net>
Subject:: Re: [REPOST] ksh: utf8 full width character support for emacs.c
To:: tech@openbsd.org
Date:: Fri, 04 Apr 2025 15:52:07 +0000
Download raw body.
Thread
- 2025-04-08 06:39 Christian Schulte:
  [REPOST] ksh: utf8 full width character support for emacs.c
- - 2025-04-13 14:30 William Goodspeed:
    [REPOST] ksh: utf8 full width character support for emacs.c
2025-04-04 15:52 William Goodspeed:
[REPOST] ksh: utf8 full width character support for emacs.c
Hi list,

Thank everyone for the attention on the issue and the discussion on
unicode.

To make it partially work in the simpliest way imho would be replacing
is_fullwidth and u8_to_cpt with wcwidth(3) and u8_to_wc (with mbtowc).
To be clear, that doesn't mean it will refactor the entire shell to
wchar,
it uses wchar routines only if an unicode occurred to calculate its
width.
Therefore, there won't be an performance issue of calling mbtowc(3)
repeatedly as it only calls it on unicode instead of regular ascii
chars.
After testing, that works pretty good for me.

However there are pros and cons on that solution, and here are some of
my thoughts:

1. setlocale(3) call can't be avoided. It would be
   `setlocale(LC_ALL, "")' to make it default to the encoding in
   the user envirnoment. And it's quiet clear that's a must if one
   wants any i18n from the C library.

2. It behaves differently across locales. If C/POSIX (ASCII)
   or encodings other than utf-8 are choosen, unicode character
   correctly. [But if we fallback wcwidth of something failed to
   one column, it will matche the current (broken) behaviour.] But,
   one would find it parsing utf-8 anyways in other locales.
   Unfortunately, the issue also exists in the current ksh.
   (Since OpenBSD only supports ASCII and UTF8, I don't think
   that's really big of a problem.)

3. Also an user-perceived character may not be one rune. But
   like emojis, in a terminal, they aren't combined together (at least
   on urxvt with Unifont). So I think it's already really ambiguous
   to determine the actuall width of a cluster character.

So in conclusion, I suggest switching to wcwidth(3) and mbtowc(3)
despite the (maybe) disadvantage of calling setlocale(3). It solves the
problem without introducing too much complexity and other defects.
   
Any thoughts?

---
 bin/ksh/Makefile  |   2 +-
 bin/ksh/emacs.c   | 187 +++++++++++++++++++++++++++++++++++++++++-----
 bin/ksh/main.c    |   8 ++
 bin/ksh/unicode.c |  33 ++++++++
 bin/ksh/unicode.h |   6 ++
 5 files changed, 216 insertions(+), 20 deletions(-)
 create mode 100644 bin/ksh/unicode.c
 create mode 100644 bin/ksh/unicode.h

diff --git a/bin/ksh/Makefile b/bin/ksh/Makefile
index 621c38bcf..d800154e4 100644
--- a/bin/ksh/Makefile
+++ b/bin/ksh/Makefile
@@ -7,7 +7,7 @@ LDADD+= -lcurses
 SRCS=  alloc.c c_ksh.c c_sh.c c_test.c c_ulimit.c edit.c emacs.c
eval.c \
        exec.c expr.c history.c io.c jobs.c lex.c mail.c main.c \
        misc.c path.c shf.c syn.c table.c trap.c tree.c tty.c var.c \
-       version.c vi.c
+       version.c vi.c unicode.c
 
 WARNINGS=yes
 DEFS=  -DEMACS -DVI
diff --git a/bin/ksh/emacs.c b/bin/ksh/emacs.c
index 3c9b0074d..813bc8e5a 100644
--- a/bin/ksh/emacs.c
+++ b/bin/ksh/emacs.c
@@ -29,6 +29,11 @@
 #include "sh.h"
 #include "edit.h"
 
+#ifndef SMALL
+#include "unicode.h"
+#else
+#define x_size_rev x_size
+#endif
 static Area    aedit;
 #define        AEDIT   &aedit          /* area for kill ring and
macro defns */
 
@@ -126,6 +131,7 @@ static int  x_fword(void);
 static void    x_goto(char *);
 static void    x_bs(int);
 static int     x_size_str(char *);
+static int     x_size_rev(int);
 static int     x_size(int);
 static void    x_zots(char *);
 static void    x_zotc(int);
@@ -459,7 +465,7 @@ x_ins(char *s)
        if (adj == x_adj_done) {        /* has x_adjust() been called?
*/
                /* no */
                for (cp = xlp; cp > xcp; )
-                       x_bs(*--cp);
+                       x_bs((unsigned char)*--cp);
        }
 
        x_adj_ok = 1;
@@ -653,7 +659,7 @@ x_bs(int c)
 {
        int i;
 
-       i = x_size(c);
+       i = x_size_rev(c);
        while (i--)
                x_e_putc('\b');
 }
@@ -663,20 +669,89 @@ x_size_str(char *cp)
 {
        int size = 0;
        while (*cp)
-               size += x_size(*cp++);
+               size += x_size((unsigned char)*cp++);
        return size;
 }
 
+#ifndef SMALL
+static int
+x_size_rev(int c)
+{
+       static unsigned char ch[5] = { 0 };
+       static int cnt = 3;
+       int w;
+
+       if (c=='\t')
+               return 4;       /* Kludge, tabs are always four
spaces. */
+       if (iscntrl(c))         /* control char */
+               return 2;
+
+       if (!isu8cont(c)) {
+               if (c <= 0x7f) {
+                       cnt = 3;
+                       return 1;
+               }
+
+               ch[cnt] = c;
+               w = u8width(ch + cnt);
+
+               cnt = 3;
+               memset(ch, 0, 4);
+               return w;
+       } else {
+               if (cnt <= 0)
+                       return 0;
+               ch[cnt] = c;
+               cnt--;
+       }
+
+       return 0;
+}
+#endif
+
 static int
 x_size(int c)
 {
+#ifndef SMALL
+       static unsigned char ch[5] = { 0 };
+       static int len = 0, cnt = 0;
+#endif
        if (c=='\t')
                return 4;       /* Kludge, tabs are always four
spaces. */
        if (iscntrl(c))         /* control char */
                return 2;
+#ifdef SMALL
        if (isu8cont(c))
                return 0;
        return 1;
+#else
+       if (!isu8cont(c)) {
+               if (c <= 0x7f) {
+                       len = 0;
+                       return 1;
+               }
+
+               if ((c & 0xf8) == 0xf0 && c < 0xf5)
+                       len = 3;
+               else if ((c & 0xf0) == 0xe0)
+                       len = 2;
+               else if ((c & 0xe0) == 0xc0 && c > 0xc1)
+                       len = 1;
+               else {
+                       len = 0;
+                       return 0;
+               }
+
+               cnt = 0;
+               memset(ch, 0, 5);
+               ch[cnt++] = c;
+       } else {
+               ch[cnt++] = c;
+               if (cnt > len)
+                       return u8width(ch);
+       }
+       return 0;
+#endif
 }
 
 static void
@@ -1099,6 +1174,8 @@ static int
 x_transpose(int c)
 {
        char    tmp;
+       char    rune1[4], rune2[4];
+       char    *p1, *p2, *p;
 
        /* What transpose is meant to do seems to be up for debate.
This
         * is a general summary of the options; the text is abcd with
the
@@ -1124,25 +1201,55 @@ x_transpose(int c)
                /* Gosling/Unipress emacs style: Swap two characters
before the
                 * cursor, do not change cursor position
                 */
-               x_bs(xcp[-1]);
-               x_bs(xcp[-2]);
-               x_zotc(xcp[-1]);
-               x_zotc(xcp[-2]);
-               tmp = xcp[-1];
-               xcp[-1] = xcp[-2];
-               xcp[-2] = tmp;
+               p1 = xcp;
+               do {
+                       x_bs((unsigned char) *--p1);
+               } while (xbuf < p1 && (xcp - p1) < 5 &&
isu8cont(*p1));
+
+               if (p1 == xbuf) {
+                       x_e_putc(BEL);
+                       return KSTD;
+               }
+
+               p2 = p1;
+               do {
+                       x_bs((unsigned char) *--p2);
+               } while   (xbuf <= p2 && (p1 - p2) < 5 &&
isu8cont(*p2));
+
+               for (p = p1; p < xcp; p++)
+                       x_zotc(*p);
+               for (p = p2; p < p1; p++)
+                       x_zotc(*p);
+
+               memcpy(rune1, p1, xcp - p1);
+               memcpy(rune2, p2, p1 - p2);
+               memcpy(p2, rune1, xcp - p1);
+               memcpy(p2 + (xcp - p1), rune2, p1 - p2);
        } else {
                /* GNU emacs style: Swap the characters before and
under the
                 * cursor, move cursor position along one.
                 */
-               x_bs(xcp[-1]);
-               x_zotc(xcp[0]);
-               x_zotc(xcp[-1]);
-               tmp = xcp[-1];
-               xcp[-1] = xcp[0];
-               xcp[0] = tmp;
-               x_bs(xcp[0]);
-               x_goto(xcp + 1);
+               p1 = xcp + 1;
+               while (p1 < xep && *p1 && (p1 - xcp) <= 5 &&
isu8cont(*p1))
+                       p1++;
+
+               p2 = xcp;
+               do {
+                       x_bs((unsigned char) *--p2);
+               } while (xbuf <= p2 && (xcp - p2) < 5 &&
isu8cont(*p2));
+
+               for (p = xcp; p < p1; p++)
+                       x_zotc(*p);
+               for (p = p2; p < xcp; p++)
+                       x_zotc(*p);
+
+               memcpy(rune1, xcp, p1 - xcp);
+               memcpy(rune2, p2, xcp - p2);
+               memcpy(p2, rune1, p1 - xcp);
+               memcpy(p2 + (p1 - xcp), rune2, xcp - p2);
+
+               xcp = p1;
+               x_goto(p1);
        }
        return KSTD;
 }
@@ -1804,6 +1911,11 @@ x_adjust(void)
         */
        if ((xbp = xcp - (x_displen / 2)) < xbuf)
                xbp = xbuf;
+       else {
+               /* rewind to the last valid codepoint */
+               while (xbp > xbuf && isu8cont((unsigned char) *xbp))
+                       xbp--;
+       }
        xlp_valid = false;
        x_redraw(xx_cols);
        x_flush();
@@ -1882,8 +1994,15 @@ x_e_getu8(char *buf, int off)
 }
 
 static void
-x_e_putc(int c)
+x_e_putc(int sc)
 {
+#ifndef SMALL
+       static unsigned char ch[5] = { 0 };
+       static int len = 0, cnt = 0;
+#endif
+       unsigned char c;
+
+       c = sc;
        if (c == '\r' || c == '\n')
                x_col = 0;
        if (x_col < xx_cols) {
@@ -1898,9 +2017,39 @@ x_e_putc(int c)
                        x_col--;
                        break;
                default:
+#ifdef SMALL
                        if (!isu8cont(c))
                                x_col++;
                        break;
+#else
+                       if (!isu8cont(c)) {
+                               if (c <= 0x7f) {
+                                       x_col++;
+                                       len = 0;
+                                       break;
+                               }
+
+                               if ((c & 0xf8) == 0xf0 && c < 0xf5)
+                                       len = 3;
+                               else if ((c & 0xf0) == 0xe0)
+                                       len = 2;
+                               else if ((c & 0xe0) == 0xc0 && c >
0xc1)
+                                       len = 1;
+                               else {
+                                       len = 0;
+                                       break;
+                               }
+
+                               cnt = 0;
+                               memset(ch, 0, 5);
+                               ch[cnt++] = c;
+                       } else {
+                               ch[cnt++] = c;
+                               if (cnt > len)
+                                       x_col += u8width(ch);
+                       }
+                       break;
+#endif
                }
        }
        if (x_adj_ok && (x_col < 0 || x_col >= (xx_cols - 2)))
diff --git a/bin/ksh/main.c b/bin/ksh/main.c
index b011b22bf..a269c5364 100644
--- a/bin/ksh/main.c
+++ b/bin/ksh/main.c
@@ -15,6 +15,10 @@
 #include <string.h>
 #include <unistd.h>
 
+#ifndef SMALL
+#include <locale.h>
+#endif
+
 #include "sh.h"
 
 extern char **environ;
@@ -152,6 +156,10 @@ main(int argc, char *argv[])
        struct env env;
        pid_t ppid;
 
+#ifndef SMALL
+       setlocale(LC_ALL, "");
+#endif
+
        kshname = argv[0];
 
        if (issetugid()) { /* could later drop privileges */
diff --git a/bin/ksh/unicode.c b/bin/ksh/unicode.c
new file mode 100644
index 000000000..a1b3c4a39
--- /dev/null
+++ b/bin/ksh/unicode.c
@@ -0,0 +1,33 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#include "unicode.h"
+
+#ifndef SMALL
+
+int
+u8width(const char *buf) {
+       const unsigned char *ubuf = buf;
+       wchar_t wc;
+       int u8len;
+
+       u8len = 0;
+       if (ubuf[0] <= 0x7F)
+               u8len = 1;
+       else if ((ubuf[0] & 0xE0) == 0xC0)
+               u8len = 2;
+       else if ((ubuf[0] & 0xF0) == 0xE0)
+               u8len = 3;
+       else if ((ubuf[0] & 0xF8) == 0xF0)
+               u8len = 4;
+
+       if (mbtowc(&wc, buf, u8len) == u8len)
+               return wcwidth(wc);
+       else {
+               mbtowc(&wc, NULL, 0);
+               return 1;
+       }
+}
+
+#endif
diff --git a/bin/ksh/unicode.h b/bin/ksh/unicode.h
new file mode 100644
index 000000000..61ac18cce
--- /dev/null
+++ b/bin/ksh/unicode.h
@@ -0,0 +1,6 @@
+#ifndef UNICODE_H
+#define UNICODE_H
+
+int u8width(const char *);
+
+#endif /* UNICODE_H */
- 2025-04-08 06:39 Christian Schulte:
  [REPOST] ksh: utf8 full width character support for emacs.c
- - 2025-04-13 14:30 William Goodspeed:
    [REPOST] ksh: utf8 full width character support for emacs.c
2025-04-04 15:52 William Goodspeed:
[REPOST] ksh: utf8 full width character support for emacs.c