Index | Thread | Search

From:
William Goodspeed <gongzl@stu.hebust.edu.cn>
Subject:
Re: [REPOST] ksh: utf8 full width character support for emacs.c
To:
tech@openbsd.org
Date:
Fri, 04 Apr 2025 23:36:39 +0800

Download raw body.

Thread
  • Ingo Schwarze:

    [REPOST] ksh: utf8 full width character support for emacs.c

  • Hi list,
    
    Thank everyone for the attention on the issue and the discussion on
    unicode.
    
    To make it partially work in the simpliest way imho would be replacing
    is_fullwidth and u8_to_cpt with wcwidth(3) and u8_to_wc (with mbtowc).
    To be clear, that doesn't mean it will refactor the entire shell to
    wchar,
    it uses wchar routines only if an unicode occurred to calculate its
    width.
    Therefore, there won't be an performance issue of calling mbtowc(3)
    repeatedly as it only calls it on unicode instead of regular ascii
    chars.
    After testing, that works pretty good for me.
    
    However there are pros and cons on that solution, and here are some of
    my thoughts:
    
    1. setlocale(3) call can't be avoided. It would be
       `setlocale(LC_ALL, "")' to make it default to the encoding in
       the user envirnoment. And it's quiet clear that's a must if one
       wants any i18n from the C library.
    
    2. It behaves differently across locales. If C/POSIX (ASCII)
       or encodings other than utf-8 are choosen, unicode character
       correctly. [But if we fallback wcwidth of something failed to
       one column, it will matche the current (broken) behaviour.] But,
       one would find it parsing utf-8 anyways in other locales.
       Unfortunately, the issue also exists in the current ksh.
       (Since OpenBSD only supports ASCII and UTF8, I don't think
       that's really big of a problem.)
    
    3. Also an user-perceived character may not be one rune. But
       like emojis, in a terminal, they aren't combined together (at least
       on urxvt with Unifont). So I think it's already really ambiguous
       to determine the actuall width of a cluster character.
    
    So in conclusion, I suggest switching to wcwidth(3) and mbtowc(3)
    despite the (maybe) disadvantage of calling setlocale(3). It solves the
    problem without introducing too much complexity and other defects.
       
    Any thoughts?
    
    ---
     bin/ksh/Makefile  |   2 +-
     bin/ksh/emacs.c   | 187 +++++++++++++++++++++++++++++++++++++++++-----
     bin/ksh/main.c    |   8 ++
     bin/ksh/unicode.c |  33 ++++++++
     bin/ksh/unicode.h |   6 ++
     5 files changed, 216 insertions(+), 20 deletions(-)
     create mode 100644 bin/ksh/unicode.c
     create mode 100644 bin/ksh/unicode.h
    
    diff --git a/bin/ksh/Makefile b/bin/ksh/Makefile
    index 621c38bcf..d800154e4 100644
    --- a/bin/ksh/Makefile
    +++ b/bin/ksh/Makefile
    @@ -7,7 +7,7 @@ LDADD+=	-lcurses
     SRCS=	alloc.c c_ksh.c c_sh.c c_test.c c_ulimit.c edit.c emacs.c
    eval.c \
     	exec.c expr.c history.c io.c jobs.c lex.c mail.c main.c \
     	misc.c path.c shf.c syn.c table.c trap.c tree.c tty.c var.c \
    -	version.c vi.c
    +	version.c vi.c unicode.c
     
     WARNINGS=yes
     DEFS=	-DEMACS -DVI
    diff --git a/bin/ksh/emacs.c b/bin/ksh/emacs.c
    index 3c9b0074d..813bc8e5a 100644
    --- a/bin/ksh/emacs.c
    +++ b/bin/ksh/emacs.c
    @@ -29,6 +29,11 @@
     #include "sh.h"
     #include "edit.h"
     
    +#ifndef SMALL
    +#include "unicode.h"
    +#else
    +#define x_size_rev x_size
    +#endif
     static	Area	aedit;
     #define	AEDIT	&aedit		/* area for kill ring and
    macro defns */
     
    @@ -126,6 +131,7 @@ static int	x_fword(void);
     static void	x_goto(char *);
     static void	x_bs(int);
     static int	x_size_str(char *);
    +static int	x_size_rev(int);
     static int	x_size(int);
     static void	x_zots(char *);
     static void	x_zotc(int);
    @@ -459,7 +465,7 @@ x_ins(char *s)
     	if (adj == x_adj_done) {	/* has x_adjust() been called?
    */
     		/* no */
     		for (cp = xlp; cp > xcp; )
    -			x_bs(*--cp);
    +			x_bs((unsigned char)*--cp);
     	}
     
     	x_adj_ok = 1;
    @@ -653,7 +659,7 @@ x_bs(int c)
     {
     	int i;
     
    -	i = x_size(c);
    +	i = x_size_rev(c);
     	while (i--)
     		x_e_putc('\b');
     }
    @@ -663,20 +669,89 @@ x_size_str(char *cp)
     {
     	int size = 0;
     	while (*cp)
    -		size += x_size(*cp++);
    +		size += x_size((unsigned char)*cp++);
     	return size;
     }
     
    +#ifndef SMALL
    +static int
    +x_size_rev(int c)
    +{
    +	static unsigned char ch[5] = { 0 };
    +	static int cnt = 3;
    +	int w;
    +
    +	if (c=='\t')
    +		return 4;	/* Kludge, tabs are always four
    spaces. */
    +	if (iscntrl(c))		/* control char */
    +		return 2;
    +
    +	if (!isu8cont(c)) {
    +		if (c <= 0x7f) {
    +			cnt = 3;
    +			return 1;
    +		}
    +
    +		ch[cnt] = c;
    +		w = u8width(ch + cnt);
    +
    +		cnt = 3;
    +		memset(ch, 0, 4);
    +		return w;
    +	} else {
    +		if (cnt <= 0)
    +			return 0;
    +		ch[cnt] = c;
    +		cnt--;
    +	}
    +
    +	return 0;
    +}
    +#endif
    +
     static int
     x_size(int c)
     {
    +#ifndef SMALL
    +	static unsigned char ch[5] = { 0 };
    +	static int len = 0, cnt = 0;
    +#endif
     	if (c=='\t')
     		return 4;	/* Kludge, tabs are always four
    spaces. */
     	if (iscntrl(c))		/* control char */
     		return 2;
    +#ifdef SMALL
     	if (isu8cont(c))
     		return 0;
     	return 1;
    +#else
    +	if (!isu8cont(c)) {
    +		if (c <= 0x7f) {
    +			len = 0;
    +			return 1;
    +		}
    +
    +		if ((c & 0xf8) == 0xf0 && c < 0xf5)
    +			len = 3;
    +		else if ((c & 0xf0) == 0xe0)
    +			len = 2;
    +		else if ((c & 0xe0) == 0xc0 && c > 0xc1)
    +			len = 1;
    +		else {
    +			len = 0;
    +			return 0;
    +		}
    +
    +		cnt = 0;
    +		memset(ch, 0, 5);
    +		ch[cnt++] = c;
    +	} else {
    +		ch[cnt++] = c;
    +		if (cnt > len)
    +			return u8width(ch);
    +	}
    +	return 0;
    +#endif
     }
     
     static void
    @@ -1099,6 +1174,8 @@ static int
     x_transpose(int c)
     {
     	char	tmp;
    +	char	rune1[4], rune2[4];
    +	char	*p1, *p2, *p;
     
     	/* What transpose is meant to do seems to be up for debate.
    This
     	 * is a general summary of the options; the text is abcd with
    the
    @@ -1124,25 +1201,55 @@ x_transpose(int c)
     		/* Gosling/Unipress emacs style: Swap two characters
    before the
     		 * cursor, do not change cursor position
     		 */
    -		x_bs(xcp[-1]);
    -		x_bs(xcp[-2]);
    -		x_zotc(xcp[-1]);
    -		x_zotc(xcp[-2]);
    -		tmp = xcp[-1];
    -		xcp[-1] = xcp[-2];
    -		xcp[-2] = tmp;
    +		p1 = xcp;
    +		do {
    +			x_bs((unsigned char) *--p1);
    +		} while (xbuf < p1 && (xcp - p1) < 5 &&
    isu8cont(*p1));
    +
    +		if (p1 == xbuf) {
    +			x_e_putc(BEL);
    +			return KSTD;
    +		}
    +
    +		p2 = p1;
    +		do {
    +			x_bs((unsigned char) *--p2);
    +		} while   (xbuf <= p2 && (p1 - p2) < 5 &&
    isu8cont(*p2));
    +
    +		for (p = p1; p < xcp; p++)
    +			x_zotc(*p);
    +		for (p = p2; p < p1; p++)
    +			x_zotc(*p);
    +
    +		memcpy(rune1, p1, xcp - p1);
    +		memcpy(rune2, p2, p1 - p2);
    +		memcpy(p2, rune1, xcp - p1);
    +		memcpy(p2 + (xcp - p1), rune2, p1 - p2);
     	} else {
     		/* GNU emacs style: Swap the characters before and
    under the
     		 * cursor, move cursor position along one.
     		 */
    -		x_bs(xcp[-1]);
    -		x_zotc(xcp[0]);
    -		x_zotc(xcp[-1]);
    -		tmp = xcp[-1];
    -		xcp[-1] = xcp[0];
    -		xcp[0] = tmp;
    -		x_bs(xcp[0]);
    -		x_goto(xcp + 1);
    +		p1 = xcp + 1;
    +		while (p1 < xep && *p1 && (p1 - xcp) <= 5 &&
    isu8cont(*p1))
    +			p1++;
    +
    +		p2 = xcp;
    +		do {
    +			x_bs((unsigned char) *--p2);
    +		} while (xbuf <= p2 && (xcp - p2) < 5 &&
    isu8cont(*p2));
    +
    +		for (p = xcp; p < p1; p++)
    +			x_zotc(*p);
    +		for (p = p2; p < xcp; p++)
    +			x_zotc(*p);
    +
    +		memcpy(rune1, xcp, p1 - xcp);
    +		memcpy(rune2, p2, xcp - p2);
    +		memcpy(p2, rune1, p1 - xcp);
    +		memcpy(p2 + (p1 - xcp), rune2, xcp - p2);
    +
    +		xcp = p1;
    +		x_goto(p1);
     	}
     	return KSTD;
     }
    @@ -1804,6 +1911,11 @@ x_adjust(void)
     	 */
     	if ((xbp = xcp - (x_displen / 2)) < xbuf)
     		xbp = xbuf;
    +	else {
    +		/* rewind to the last valid codepoint */
    +		while (xbp > xbuf && isu8cont((unsigned char) *xbp))
    +			xbp--;
    +	}
     	xlp_valid = false;
     	x_redraw(xx_cols);
     	x_flush();
    @@ -1882,8 +1994,15 @@ x_e_getu8(char *buf, int off)
     }
     
     static void
    -x_e_putc(int c)
    +x_e_putc(int sc)
     {
    +#ifndef SMALL
    +	static unsigned char ch[5] = { 0 };
    +	static int len = 0, cnt = 0;
    +#endif
    +	unsigned char c;
    +
    +	c = sc;
     	if (c == '\r' || c == '\n')
     		x_col = 0;
     	if (x_col < xx_cols) {
    @@ -1898,9 +2017,39 @@ x_e_putc(int c)
     			x_col--;
     			break;
     		default:
    +#ifdef SMALL
     			if (!isu8cont(c))
     				x_col++;
     			break;
    +#else
    +			if (!isu8cont(c)) {
    +				if (c <= 0x7f) {
    +					x_col++;
    +					len = 0;
    +					break;
    +				}
    +
    +				if ((c & 0xf8) == 0xf0 && c < 0xf5)
    +					len = 3;
    +				else if ((c & 0xf0) == 0xe0)
    +					len = 2;
    +				else if ((c & 0xe0) == 0xc0 && c >
    0xc1)
    +					len = 1;
    +				else {
    +					len = 0;
    +					break;
    +				}
    +
    +				cnt = 0;
    +				memset(ch, 0, 5);
    +				ch[cnt++] = c;
    +			} else {
    +				ch[cnt++] = c;
    +				if (cnt > len)
    +					x_col += u8width(ch);
    +			}
    +			break;
    +#endif
     		}
     	}
     	if (x_adj_ok && (x_col < 0 || x_col >= (xx_cols - 2)))
    diff --git a/bin/ksh/main.c b/bin/ksh/main.c
    index b011b22bf..a269c5364 100644
    --- a/bin/ksh/main.c
    +++ b/bin/ksh/main.c
    @@ -15,6 +15,10 @@
     #include <string.h>
     #include <unistd.h>
     
    +#ifndef SMALL
    +#include <locale.h>
    +#endif
    +
     #include "sh.h"
     
     extern char **environ;
    @@ -152,6 +156,10 @@ main(int argc, char *argv[])
     	struct env env;
     	pid_t ppid;
     
    +#ifndef SMALL
    +	setlocale(LC_ALL, "");
    +#endif
    +
     	kshname = argv[0];
     
     	if (issetugid()) { /* could later drop privileges */
    diff --git a/bin/ksh/unicode.c b/bin/ksh/unicode.c
    new file mode 100644
    index 000000000..a1b3c4a39
    --- /dev/null
    +++ b/bin/ksh/unicode.c
    @@ -0,0 +1,33 @@
    +#include <stddef.h>
    +#include <stdlib.h>
    +#include <wchar.h>
    +
    +#include "unicode.h"
    +
    +#ifndef SMALL
    +
    +int
    +u8width(const char *buf) {
    +	const unsigned char *ubuf = buf;
    +	wchar_t wc;
    +	int u8len;
    +
    +	u8len = 0;
    +	if (ubuf[0] <= 0x7F)
    +		u8len = 1;
    +	else if ((ubuf[0] & 0xE0) == 0xC0)
    +		u8len = 2;
    +	else if ((ubuf[0] & 0xF0) == 0xE0)
    +		u8len = 3;
    +	else if ((ubuf[0] & 0xF8) == 0xF0)
    +		u8len = 4;
    +
    +	if (mbtowc(&wc, buf, u8len) == u8len)
    +		return wcwidth(wc);
    +	else {
    +		mbtowc(&wc, NULL, 0);
    +		return 1;
    +	}
    +}
    +
    +#endif
    diff --git a/bin/ksh/unicode.h b/bin/ksh/unicode.h
    new file mode 100644
    index 000000000..61ac18cce
    --- /dev/null
    +++ b/bin/ksh/unicode.h
    @@ -0,0 +1,6 @@
    +#ifndef UNICODE_H
    +#define UNICODE_H
    +
    +int u8width(const char *);
    +
    +#endif	/* UNICODE_H */
    -- 
    2.46.1
    
    ---
    Gong Zhile (William Goodspeed)
    
    
  • Ingo Schwarze:

    [REPOST] ksh: utf8 full width character support for emacs.c