Index | Thread | Search

From:
Gong Zhile <gongzl@stu.hebust.edu.cn>
Subject:
Re: ksh: utf8 full width character support for emacs.c
To:
tech@openbsd.org
Date:
Sat, 28 Dec 2024 20:50:12 +0800

Download raw body.

Thread
Hi list,

Here are some sample texts for testing:

   Chinese: “你好世界。”
   Japanese (Kanji and Hiragana): 「こんにちは世界。」
   Korean (Hangul): 「안녕하세요 세계。」
   Capitalized Fullwidth Roman Characters: 「ＨＥＬＬＯ　ＷＯＲＬＤ。」

Please review and happy holidays!

---
goodspeed

On Tue, 2024-12-17 at 13:53 +0800, Gong Zhile wrote:
> Full-width characters are commonly used in East Asian writing systems such as
> Chinese, Japanese and Korean. Each of them occupies twice the width of a
> normal ASCII character, but x_size counts it as a single column. When
> navigating between such characters in emacs line-editing mode, the cursor
> loses track of its position and garbles the line, making input really
> difficult.
> 
> I tried to make x_size count correctly by keeping static variables inside the
> function and looking code points up in a table generated from
> ‘EastAsianWidth.txt’. The characters that count as width 2 are mainly: Kanji,
> Katakana, Hiragana, Hangul, full-width Roman characters, emojis, etc.
> 
> Expected behavior (after patching): the cursor should land correctly while
> navigating between full-width characters, and line-editing commands (like
> x_transpose) should work correctly.
> 
> Known issue: when a line with full-width characters runs off the edge of the
> screen, the angle bracket is not placed correctly. (This is not easy to handle
> when a full-width character straddles xx_cols.)
> 
> Tested on: rxvt-unicode, xterm
> 
> Index: bin/ksh/Makefile
> ===================================================================
> RCS file: /cvs/src/bin/ksh/Makefile,v
> diff -u -r1.39 Makefile
> --- bin/ksh/Makefile	18 Jun 2018 17:03:58 -0000	1.39
> +++ bin/ksh/Makefile	17 Dec 2024 05:26:43 -0000
> @@ -7,7 +7,7 @@
>  SRCS=	alloc.c c_ksh.c c_sh.c c_test.c c_ulimit.c edit.c emacs.c eval.c \
>  	exec.c expr.c history.c io.c jobs.c lex.c mail.c main.c \
>  	misc.c path.c shf.c syn.c table.c trap.c tree.c tty.c var.c \
> -	version.c vi.c
> +	version.c vi.c unicode.c
>  
>  WARNINGS=yes
>  DEFS=	-DEMACS -DVI
> Index: bin/ksh/emacs.c
> ===================================================================
> RCS file: /cvs/src/bin/ksh/emacs.c,v
> diff -u -r1.90 emacs.c
> --- bin/ksh/emacs.c	21 Jun 2023 22:22:08 -0000	1.90
> +++ bin/ksh/emacs.c	17 Dec 2024 05:26:43 -0000
> @@ -29,6 +29,11 @@
>  #include "sh.h"
>  #include "edit.h"
>  
> +#ifndef SMALL
> +#include "unicode.h"
> +#else
> +#define x_size_rev x_size
> +#endif
>  static	Area	aedit;
>  #define	AEDIT	&aedit		/* area for kill ring and macro defns */
>  
> @@ -126,6 +131,7 @@
>  static void	x_goto(char *);
>  static void	x_bs(int);
>  static int	x_size_str(char *);
> +static int	x_size_rev(int);
>  static int	x_size(int);
>  static void	x_zots(char *);
>  static void	x_zotc(int);
> @@ -459,7 +465,7 @@
>  	if (adj == x_adj_done) {	/* has x_adjust() been called? */
>  		/* no */
>  		for (cp = xlp; cp > xcp; )
> -			x_bs(*--cp);
> +			x_bs((unsigned char)*--cp);
>  	}
>  
>  	x_adj_ok = 1;
> @@ -552,7 +558,7 @@
>  	x_adj_ok = 1;
>  	xlp_valid = false;
>  	for (cp = x_lastcp(); cp > xcp; )
> -		x_bs(*--cp);
> +		x_bs((unsigned char)*--cp);
>  
>  	return;
>  }
> @@ -653,7 +659,7 @@
>  {
>  	int i;
>  
> -	i = x_size(c);
> +	i = x_size_rev(c);
>  	while (i--)
>  		x_e_putc('\b');
>  }
> @@ -663,20 +669,93 @@
>  {
>  	int size = 0;
>  	while (*cp)
> -		size += x_size(*cp++);
> +		size += x_size((unsigned char)*cp++);
>  	return size;
>  }
> +#ifndef SMALL
> +static int
> +x_size_rev(int c)
> +{
> +	static unsigned char ch[5] = { 0 };
> +	static int cnt = 3;
> +	unsigned long cpt;
> +	int w;
> +
> +	if (c=='\t')
> +		return 4;	/* Kludge, tabs are always four spaces. */
> +	if (iscntrl(c))		/* control char */
> +		return 2;
> +
> +	if (!isu8cont(c)) {
> +		if (c <= 0x7f) {
> +			cnt = 3;
> +			return 1;
> +		}
> +
> +		ch[cnt] = c;
> +		u8_to_cpt(ch + cnt, &cpt);
> +		w = is_fullwidth(cpt) ? 2 : 1;
> +
> +		cnt = 3;
> +		memset(ch, 0, 4);
> +		return w;
> +	} else {
> +		if (cnt <= 0)
> +			return 0;
> +		ch[cnt] = c;
> +		cnt--;
> +	}
> +
> +	return 0;
> +}
> +#endif
>  
>  static int
>  x_size(int c)
>  {
> +#ifndef SMALL
> +	static unsigned char ch[5] = { 0 };
> +	static int len = 0, cnt = 0;
> +	unsigned long cpt;
> +#endif
>  	if (c=='\t')
>  		return 4;	/* Kludge, tabs are always four spaces. */
>  	if (iscntrl(c))		/* control char */
>  		return 2;
> +#ifdef SMALL
>  	if (isu8cont(c))
>  		return 0;
>  	return 1;
> +#else
> +	if (!isu8cont(c)) {
> +		if (c <= 0x7f) {
> +			len = 0;
> +			return 1;
> +		}
> +
> +		if ((c & 0xf8) == 0xf0 && c < 0xf5)
> +			len = 3;
> +		else if ((c & 0xf0) == 0xe0)
> +			len = 2;
> +		else if ((c & 0xe0) == 0xc0 && c > 0xc1)
> +			len = 1;
> +		else {
> +			len = 0;
> +			return 0;
> +		}
> +
> +		cnt = 0;
> +		memset(ch, 0, 5);
> +		ch[cnt++] = c;
> +	} else {
> +		ch[cnt++] = c;
> +		if (cnt > len) {
> +			u8_to_cpt(ch, &cpt);
> +			return is_fullwidth(cpt) ? 2 : 1;
> +		}
> +	}
> +	return 0;
> +#endif
>  }
>  
>  static void
> @@ -1098,7 +1177,8 @@
>  static int
>  x_transpose(int c)
>  {
> -	char	tmp;
> +	char	rune1[4], rune2[4];
> +	char	*p1, *p2, *p;
>  
>  	/* What transpose is meant to do seems to be up for debate. This
>  	 * is a general summary of the options; the text is abcd with the
> @@ -1124,25 +1204,55 @@
>  		/* Gosling/Unipress emacs style: Swap two characters before the
>  		 * cursor, do not change cursor position
>  		 */
> -		x_bs(xcp[-1]);
> -		x_bs(xcp[-2]);
> -		x_zotc(xcp[-1]);
> -		x_zotc(xcp[-2]);
> -		tmp = xcp[-1];
> -		xcp[-1] = xcp[-2];
> -		xcp[-2] = tmp;
> +		p1 = xcp;
> +		do {
> +			x_bs((unsigned char) *--p1);
> +		} while (xbuf < p1 && (xcp - p1) < 5 && isu8cont(*p1));
> +
> +		if (p1 == xbuf) {
> +			x_e_putc(BEL);
> +			return KSTD;
> +		}
> +
> +		p2 = p1;
> +		do {
> +			x_bs((unsigned char) *--p2);
> +		} while  (xbuf <= p2 && (p1 - p2) < 5 && isu8cont(*p2));
> +
> +		for (p = p1; p < xcp; p++)
> +			x_zotc(*p);
> +		for (p = p2; p < p1; p++)
> +			x_zotc(*p);
> +
> +		memcpy(rune1, p1, xcp - p1);
> +		memcpy(rune2, p2, p1 - p2);
> +		memcpy(p2, rune1, xcp - p1);
> +		memcpy(p2 + (xcp - p1), rune2, p1 - p2);
>  	} else {
>  		/* GNU emacs style: Swap the characters before and under the
>  		 * cursor, move cursor position along one.
>  		 */
> -		x_bs(xcp[-1]);
> -		x_zotc(xcp[0]);
> -		x_zotc(xcp[-1]);
> -		tmp = xcp[-1];
> -		xcp[-1] = xcp[0];
> -		xcp[0] = tmp;
> -		x_bs(xcp[0]);
> -		x_goto(xcp + 1);
> +		p1 = xcp + 1;
> +		while (p1 < xep && *p1 && (p1 - xcp) <= 5 && isu8cont(*p1))
> +			p1++;
> +
> +		p2 = xcp;
> +		do {
> +			x_bs((unsigned char) *--p2);
> +		} while (xbuf <= p2 && (xcp - p2) < 5 && isu8cont(*p2));
> +
> +		for (p = xcp; p < p1; p++)
> +			x_zotc(*p);
> +		for (p = p2; p < xcp; p++)
> +			x_zotc(*p);
> +
> +		memcpy(rune1, xcp, p1 - xcp);
> +		memcpy(rune2, p2, xcp - p2);
> +		memcpy(p2, rune1, p1 - xcp);
> +		memcpy(p2 + (p1 - xcp), rune2, xcp - p2);
> +
> +		xcp = p1;
> +		x_goto(p1);
>  	}
>  	return KSTD;
>  }
> @@ -1804,6 +1914,11 @@
>  	 */
>  	if ((xbp = xcp - (x_displen / 2)) < xbuf)
>  		xbp = xbuf;
> +	else {
> +		/* rewind to the last valid codepoint */
> +		while (xbp > xbuf && isu8cont((unsigned char) *xbp))
> +			xbp--;
> +	}
>  	xlp_valid = false;
>  	x_redraw(xx_cols);
>  	x_flush();
> @@ -1882,8 +1997,16 @@
>  }
>  
>  static void
> -x_e_putc(int c)
> +x_e_putc(int sc)
>  {
> +#ifndef SMALL
> +	static unsigned char ch[5] = { 0 };
> +	static int len = 0, cnt = 0;
> +	unsigned long cpt;
> +#endif
> +	unsigned char c;
> +
> +	c = sc;
>  	if (c == '\r' || c == '\n')
>  		x_col = 0;
>  	if (x_col < xx_cols) {
> @@ -1898,9 +2021,43 @@
>  			x_col--;
>  			break;
>  		default:
> +#ifdef SMALL
>  			if (!isu8cont(c))
>  				x_col++;
>  			break;
> +#else
> +			if (!isu8cont(c)) {
> +				if (c <= 0x7f) {
> +					x_col++;
> +					len = 0;
> +					break;
> +				}
> +
> +				if ((c & 0xf8) == 0xf0 && c < 0xf5)
> +					len = 3;
> +				else if ((c & 0xf0) == 0xe0)
> +					len = 2;
> +				else if ((c & 0xe0) == 0xc0 && c > 0xc1)
> +					len = 1;
> +				else {
> +					len = 0;
> +					break;
> +				}
> +
> +				cnt = 0;
> +				memset(ch, 0, 5);
> +				ch[cnt++] = c;
> +			} else {
> +				ch[cnt++] = c;
> +				if (cnt > len) {
> +					x_col++;
> +					u8_to_cpt(ch, &cpt);
> +					if (is_fullwidth(cpt))
> +						x_col++;
> +				}
> +			}
> +			break;
> +#endif
>  		}
>  	}
>  	if (x_adj_ok && (x_col < 0 || x_col >= (xx_cols - 2)))
> Index: bin/ksh/unicode.c
> ===================================================================
> --- bin/ksh/unicode.c    (new file)
> +++ bin/ksh/unicode.c    (working copy)
> --- /dev/null	2024-12-17 11:54:03.396000088 +0800
> +++ bin/ksh/unicode.c	2024-12-17 13:25:38.050258915 +0800
> @@ -0,0 +1,162 @@
> +#include "unicode.h"
> +
> +#ifndef SMALL
> +
> +/* The following code was generated from EastAsianWidth.txt (Flag: W&F)
> + * Reference: https://www.unicode.org/reports/tr11/tr11-6.html
> + */
> +
> +int is_fullwidth(unsigned long cpt) {
> +	if ((0x1100 <= cpt && cpt <= 0x115f)
> +	    || (0x231a <= cpt && cpt <= 0x231b)
> +	    || (0x2329 <= cpt && cpt <= 0x232a)
> +	    || (0x23e9 <= cpt && cpt <= 0x23ec)
> +	    || (cpt == 0x23f0)
> +	    || (cpt == 0x23f3)
> +	    || (0x25fd <= cpt && cpt <= 0x25fe)
> +	    || (0x2614 <= cpt && cpt <= 0x2615)
> +	    || (0x2630 <= cpt && cpt <= 0x2637)
> +	    || (0x2648 <= cpt && cpt <= 0x2653)
> +	    || (cpt == 0x267f)
> +	    || (0x268a <= cpt && cpt <= 0x268f)
> +	    || (cpt == 0x2693)
> +	    || (cpt == 0x26a1)
> +	    || (0x26aa <= cpt && cpt <= 0x26ab)
> +	    || (0x26bd <= cpt && cpt <= 0x26be)
> +	    || (0x26c4 <= cpt && cpt <= 0x26c5)
> +	    || (cpt == 0x26ce)
> +	    || (cpt == 0x26d4)
> +	    || (cpt == 0x26ea)
> +	    || (0x26f2 <= cpt && cpt <= 0x26f3)
> +	    || (cpt == 0x26f5)
> +	    || (cpt == 0x26fa)
> +	    || (cpt == 0x26fd)
> +	    || (cpt == 0x2705)
> +	    || (0x270a <= cpt && cpt <= 0x270b)
> +	    || (cpt == 0x2728)
> +	    || (cpt == 0x274c)
> +	    || (cpt == 0x274e)
> +	    || (0x2753 <= cpt && cpt <= 0x2755)
> +	    || (cpt == 0x2757)
> +	    || (0x2795 <= cpt && cpt <= 0x2797)
> +	    || (cpt == 0x27b0)
> +	    || (cpt == 0x27bf)
> +	    || (0x2b1b <= cpt && cpt <= 0x2b1c)
> +	    || (cpt == 0x2b50)
> +	    || (cpt == 0x2b55)
> +	    || (0x2e80 <= cpt && cpt <= 0x2e99)
> +	    || (0x2e9b <= cpt && cpt <= 0x2ef3)
> +	    || (0x2f00 <= cpt && cpt <= 0x2fd5)
> +	    || (0x2ff0 <= cpt && cpt <= 0x303e)
> +	    || (0x3041 <= cpt && cpt <= 0x3096)
> +	    || (0x3099 <= cpt && cpt <= 0x30ff)
> +	    || (0x3105 <= cpt && cpt <= 0x312f)
> +	    || (0x3131 <= cpt && cpt <= 0x318e)
> +	    || (0x3190 <= cpt && cpt <= 0x31e5)
> +	    || (0x31ef <= cpt && cpt <= 0x321e)
> +	    || (0x3220 <= cpt && cpt <= 0x3247)
> +	    || (0x3250 <= cpt && cpt <= 0xa48c)
> +	    || (0xa490 <= cpt && cpt <= 0xa4c6)
> +	    || (0xa960 <= cpt && cpt <= 0xa97c)
> +	    || (0xac00 <= cpt && cpt <= 0xd7a3)
> +	    || (0xf900 <= cpt && cpt <= 0xfaff)
> +	    || (0xfe10 <= cpt && cpt <= 0xfe19)
> +	    || (0xfe30 <= cpt && cpt <= 0xfe52)
> +	    || (0xfe54 <= cpt && cpt <= 0xfe66)
> +	    || (0xfe68 <= cpt && cpt <= 0xfe6b)
> +	    || (0xff01 <= cpt && cpt <= 0xff60)
> +	    || (0xffe0 <= cpt && cpt <= 0xffe6)
> +	    || (0x16fe0 <= cpt && cpt <= 0x16fe4)
> +	    || (0x16ff0 <= cpt && cpt <= 0x16ff1)
> +	    || (0x17000 <= cpt && cpt <= 0x187f7)
> +	    || (0x18800 <= cpt && cpt <= 0x18cd5)
> +	    || (0x18cff <= cpt && cpt <= 0x18d08)
> +	    || (0x1aff0 <= cpt && cpt <= 0x1aff3)
> +	    || (0x1aff5 <= cpt && cpt <= 0x1affb)
> +	    || (0x1affd <= cpt && cpt <= 0x1affe)
> +	    || (0x1b000 <= cpt && cpt <= 0x1b122)
> +	    || (cpt == 0x1b132)
> +	    || (0x1b150 <= cpt && cpt <= 0x1b152)
> +	    || (cpt == 0x1b155)
> +	    || (0x1b164 <= cpt && cpt <= 0x1b167)
> +	    || (0x1b170 <= cpt && cpt <= 0x1b2fb)
> +	    || (0x1d300 <= cpt && cpt <= 0x1d356)
> +	    || (0x1d360 <= cpt && cpt <= 0x1d376)
> +	    || (cpt == 0x1f004)
> +	    || (cpt == 0x1f0cf)
> +	    || (cpt == 0x1f18e)
> +	    || (0x1f191 <= cpt && cpt <= 0x1f19a)
> +	    || (0x1f200 <= cpt && cpt <= 0x1f202)
> +	    || (0x1f210 <= cpt && cpt <= 0x1f23b)
> +	    || (0x1f240 <= cpt && cpt <= 0x1f248)
> +	    || (0x1f250 <= cpt && cpt <= 0x1f251)
> +	    || (0x1f260 <= cpt && cpt <= 0x1f265)
> +	    || (0x1f300 <= cpt && cpt <= 0x1f320)
> +	    || (0x1f32d <= cpt && cpt <= 0x1f335)
> +	    || (0x1f337 <= cpt && cpt <= 0x1f37c)
> +	    || (0x1f37e <= cpt && cpt <= 0x1f393)
> +	    || (0x1f3a0 <= cpt && cpt <= 0x1f3ca)
> +	    || (0x1f3cf <= cpt && cpt <= 0x1f3d3)
> +	    || (0x1f3e0 <= cpt && cpt <= 0x1f3f0)
> +	    || (cpt == 0x1f3f4)
> +	    || (0x1f3f8 <= cpt && cpt <= 0x1f43e)
> +	    || (cpt == 0x1f440)
> +	    || (0x1f442 <= cpt && cpt <= 0x1f4fc)
> +	    || (0x1f4ff <= cpt && cpt <= 0x1f53d)
> +	    || (0x1f54b <= cpt && cpt <= 0x1f54e)
> +	    || (0x1f550 <= cpt && cpt <= 0x1f567)
> +	    || (cpt == 0x1f57a)
> +	    || (0x1f595 <= cpt && cpt <= 0x1f596)
> +	    || (cpt == 0x1f5a4)
> +	    || (0x1f5fb <= cpt && cpt <= 0x1f64f)
> +	    || (0x1f680 <= cpt && cpt <= 0x1f6c5)
> +	    || (cpt == 0x1f6cc)
> +	    || (0x1f6d0 <= cpt && cpt <= 0x1f6d2)
> +	    || (0x1f6d5 <= cpt && cpt <= 0x1f6d7)
> +	    || (0x1f6dc <= cpt && cpt <= 0x1f6df)
> +	    || (0x1f6eb <= cpt && cpt <= 0x1f6ec)
> +	    || (0x1f6f4 <= cpt && cpt <= 0x1f6fc)
> +	    || (0x1f7e0 <= cpt && cpt <= 0x1f7eb)
> +	    || (cpt == 0x1f7f0)
> +	    || (0x1f90c <= cpt && cpt <= 0x1f93a)
> +	    || (0x1f93c <= cpt && cpt <= 0x1f945)
> +	    || (0x1f947 <= cpt && cpt <= 0x1f9ff)
> +	    || (0x1fa70 <= cpt && cpt <= 0x1fa7c)
> +	    || (0x1fa80 <= cpt && cpt <= 0x1fa89)
> +	    || (0x1fa8f <= cpt && cpt <= 0x1fac6)
> +	    || (0x1face <= cpt && cpt <= 0x1fadc)
> +	    || (0x1fadf <= cpt && cpt <= 0x1fae9)
> +	    || (0x1faf0 <= cpt && cpt <= 0x1faf8)
> +	    || (0x20000 <= cpt && cpt <= 0x2fffd)
> +	    || (0x30000 <= cpt && cpt <= 0x3fffd))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +int u8_to_cpt(const char *buf, unsigned long *cpt) {
> +	const unsigned char *ubuf = buf;
> +
> +	if (ubuf[0] <= 0x7F) {
> +		*cpt = ubuf[0];
> +		return 1;
> +	} else if ((ubuf[0] & 0xE0) == 0xC0) {
> +		*cpt = ((ubuf[0] & 0x1F) << 6) | (ubuf[1] & 0x3F);
> +		return 2;
> +	} else if ((ubuf[0] & 0xF0) == 0xE0) {
> +		*cpt = ((ubuf[0] & 0x0F) << 12)
> +			| ((ubuf[1] & 0x3F) << 6)
> +			| (ubuf[2] & 0x3F);
> +		return 3;
> +	} else if ((ubuf[0] & 0xF8) == 0xF0) {
> +		*cpt = ((ubuf[0] & 0x07) << 18)
> +			| ((ubuf[1] & 0x3F) << 12)
> +			| ((ubuf[2] & 0x3F) << 6)
> +			| (ubuf[3] & 0x3F);
> +		return 4;
> +	}
> +
> +	return 0;
> +}
> +
> +#endif
> Index: bin/ksh/unicode.h
> ===================================================================
> --- bin/ksh/unicode.h    (new file)
> +++ bin/ksh/unicode.h    (working copy)
> --- /dev/null	2024-12-17 11:54:03.396000088 +0800
> +++ bin/ksh/unicode.h	2024-12-17 09:19:00.521730569 +0800
> @@ -0,0 +1,7 @@
> +#ifndef UNICODE_H
> +#define UNICODE_H
> +
> +int is_fullwidth(unsigned long);
> +int u8_to_cpt(const char *, unsigned long *);
> +
> +#endif	/* UNICODE_H */
> 
> 
> 
> 
> 
>