Index | Thread | Search

From:
Gong Zhile <gongzl@stu.hebust.edu.cn>
Subject:
[REPOST] ksh: utf8 full width character support for emacs.c
To:
tech@openbsd.org
Date:
Sun, 16 Mar 2025 22:49:14 +0800

Download raw body.

Thread
Full-width characters are commonly used in East Asian writing systems such as
Chinese, Japanese and Korean. These characters take up twice the width of a
normal ASCII character, but x_size counts each of them as a single column. When
navigating between those characters in emacs line editing mode, the cursor
loses track and messes up the line, making input really difficult.

I tried to make x_size count correctly by keeping static variables in the
function and looking code points up in a table generated from
‘EastAsianWidth.txt’. The characters counted with a width of 2 are mainly:
Kanji, Katakana, Hiragana, Hangul, Roman full-width characters, emojis, etc.

Expected behavior (after patching): the cursor should land correctly while
navigating between full-width characters, and line editing commands (like
x_transpose) should perform correctly.

Known issue: when the line runs off the screen with full-width characters, the
angle bracket is not placed correctly. (This is not easy to deal with when a
full-width character crosses xx_cols.)

Tested on: rxvt-unicode, xterm

Index: bin/ksh/Makefile
===================================================================
RCS file: /cvs/src/bin/ksh/Makefile,v
diff -u -r1.39 Makefile
--- bin/ksh/Makefile	18 Jun 2018 17:03:58 -0000	1.39
+++ bin/ksh/Makefile	17 Dec 2024 05:26:43 -0000
@@ -7,7 +7,7 @@
 SRCS=	alloc.c c_ksh.c c_sh.c c_test.c c_ulimit.c edit.c emacs.c eval.c \
 	exec.c expr.c history.c io.c jobs.c lex.c mail.c main.c \
 	misc.c path.c shf.c syn.c table.c trap.c tree.c tty.c var.c \
-	version.c vi.c
+	version.c vi.c unicode.c
 
 WARNINGS=yes
 DEFS=	-DEMACS -DVI
Index: bin/ksh/emacs.c
===================================================================
RCS file: /cvs/src/bin/ksh/emacs.c,v
diff -u -r1.90 emacs.c
--- bin/ksh/emacs.c	21 Jun 2023 22:22:08 -0000	1.90
+++ bin/ksh/emacs.c	17 Dec 2024 05:26:43 -0000
@@ -29,6 +29,11 @@
 #include "sh.h"
 #include "edit.h"
 
+#ifndef SMALL
+#include "unicode.h"
+#else
+#define x_size_rev x_size
+#endif
 static	Area	aedit;
 #define	AEDIT	&aedit		/* area for kill ring and macro defns */
 
@@ -126,6 +131,7 @@
 static void	x_goto(char *);
 static void	x_bs(int);
 static int	x_size_str(char *);
+static int	x_size_rev(int);
 static int	x_size(int);
 static void	x_zots(char *);
 static void	x_zotc(int);
@@ -459,7 +465,7 @@
 	if (adj == x_adj_done) {	/* has x_adjust() been called? */
 		/* no */
 		for (cp = xlp; cp > xcp; )
-			x_bs(*--cp);
+			x_bs((unsigned char)*--cp);
 	}
 
 	x_adj_ok = 1;
@@ -552,7 +558,7 @@
 	x_adj_ok = 1;
 	xlp_valid = false;
 	for (cp = x_lastcp(); cp > xcp; )
-		x_bs(*--cp);
+		x_bs((unsigned char)*--cp);
 
 	return;
 }
@@ -653,7 +659,7 @@
 {
 	int i;
 
-	i = x_size(c);
+	i = x_size_rev(c);
 	while (i--)
 		x_e_putc('\b');
 }
@@ -663,20 +669,93 @@
 {
 	int size = 0;
 	while (*cp)
-		size += x_size(*cp++);
+		size += x_size((unsigned char)*cp++);
 	return size;
 }
+#ifndef SMALL
+/*
+ * Width of c when bytes arrive last-to-first, as from x_bs(): continuation
+ * bytes are buffered and give 0, the lead byte gives the character's width.
+ */
+static int
+x_size_rev(int c)
+{
+	static unsigned char ch[5] = { 0 };	/* sequence, right-aligned */
+	static int cnt = 3;	/* next free slot in ch[], counts down */
+	unsigned long cpt;
+	int need, pos;
+
+	if (c == '\t' || iscntrl(c)) {
+		cnt = 3;	/* forget dangling continuation bytes */
+		return c == '\t' ? 4 : 2;	/* Kludge: tabs are four spaces */
+	}
+	if (isu8cont(c)) {
+		if (cnt > 0)	/* keep at most 3 trailing bytes */
+			ch[cnt--] = c;
+		return 0;
+	}
+	pos = cnt;	/* slot for the first byte of the sequence */
+	cnt = 3;	/* a non-continuation byte ends the pending run */
+	if (c <= 0x7f)
+		return 1;
+	need = ((c & 0xf8) == 0xf0 && c < 0xf5) ? 3 :
+	    ((c & 0xf0) == 0xe0) ? 2 :
+	    ((c & 0xe0) == 0xc0 && c > 0xc1) ? 1 : 4;
+	if (need > 3 - pos)
+		return 0;	/* invalid or truncated sequence: no width */
+	ch[pos] = c;	/* decode reads ch[pos..pos+need], all just stored */
+	if (u8_to_cpt((const char *)ch + pos, &cpt) > 0 && is_fullwidth(cpt))
+		return 2;
+	return 1;
+}
+#endif
 
+/* Display width contributed by byte c of a string fed first byte to last.
+ * Unless SMALL, a multibyte character is counted on its final byte (1 or 2).
+ */
 static int
 x_size(int c)
 {
+#ifndef SMALL
+	static unsigned char ch[5] = { 0 };	/* bytes of the pending sequence */
+	static int len = 0, cnt = 0;	/* trailing bytes wanted, bytes stored */
+	unsigned long cpt = 0;
+#endif
 	if (c=='\t')
 		return 4;	/* Kludge, tabs are always four spaces. */
 	if (iscntrl(c))		/* control char */
 		return 2;
+#ifdef SMALL
 	if (isu8cont(c))
 		return 0;
 	return 1;
+#else
+	if (!isu8cont(c)) {
+		if (c <= 0x7f) {
+			len = 0;
+			return 1;
+		}
+		if ((c & 0xf8) == 0xf0 && c < 0xf5)
+			len = 3;
+		else if ((c & 0xf0) == 0xe0)
+			len = 2;
+		else if ((c & 0xe0) == 0xc0 && c > 0xc1)
+			len = 1;
+		else {
+			len = 0;	/* invalid lead byte: nothing to collect */
+			return 0;
+		}
+		ch[0] = c;
+		cnt = 1;
+	} else if (len > 0 && cnt <= len) {	/* else: stray byte, ignore */
+		ch[cnt++] = c;
+		if (cnt > len) {
+			u8_to_cpt((const char *)ch, &cpt);
+			return is_fullwidth(cpt) ? 2 : 1;
+		}
+	}
+	return 0;
+#endif
 }
 
 static void
@@ -1098,7 +1177,8 @@
 static int
 x_transpose(int c)
 {
-	char	tmp;
+	char	rune1[4], rune2[4];
+	char	*p1, *p2, *p;
 
 	/* What transpose is meant to do seems to be up for debate. This
 	 * is a general summary of the options; the text is abcd with the
@@ -1124,25 +1204,55 @@
 		/* Gosling/Unipress emacs style: Swap two characters before the
 		 * cursor, do not change cursor position
 		 */
-		x_bs(xcp[-1]);
-		x_bs(xcp[-2]);
-		x_zotc(xcp[-1]);
-		x_zotc(xcp[-2]);
-		tmp = xcp[-1];
-		xcp[-1] = xcp[-2];
-		xcp[-2] = tmp;
+		p1 = xcp;
+		do {
+			x_bs((unsigned char) *--p1);
+		} while (xbuf < p1 && (xcp - p1) < 5 && isu8cont(*p1));
+
+		if (p1 == xbuf) {
+			x_e_putc(BEL);
+			return KSTD;
+		}
+
+		p2 = p1;
+		do {
+			x_bs((unsigned char) *--p2);
+		} while  (xbuf <= p2 && (p1 - p2) < 5 && isu8cont(*p2));
+
+		for (p = p1; p < xcp; p++)
+			x_zotc(*p);
+		for (p = p2; p < p1; p++)
+			x_zotc(*p);
+
+		memcpy(rune1, p1, xcp - p1);
+		memcpy(rune2, p2, p1 - p2);
+		memcpy(p2, rune1, xcp - p1);
+		memcpy(p2 + (xcp - p1), rune2, p1 - p2);
 	} else {
 		/* GNU emacs style: Swap the characters before and under the
 		 * cursor, move cursor position along one.
 		 */
-		x_bs(xcp[-1]);
-		x_zotc(xcp[0]);
-		x_zotc(xcp[-1]);
-		tmp = xcp[-1];
-		xcp[-1] = xcp[0];
-		xcp[0] = tmp;
-		x_bs(xcp[0]);
-		x_goto(xcp + 1);
+		p1 = xcp + 1;
+		while (p1 < xep && *p1 && (p1 - xcp) <= 5 && isu8cont(*p1))
+			p1++;
+
+		p2 = xcp;
+		do {
+			x_bs((unsigned char) *--p2);
+		} while (xbuf <= p2 && (xcp - p2) < 5 && isu8cont(*p2));
+
+		for (p = xcp; p < p1; p++)
+			x_zotc(*p);
+		for (p = p2; p < xcp; p++)
+			x_zotc(*p);
+
+		memcpy(rune1, xcp, p1 - xcp);
+		memcpy(rune2, p2, xcp - p2);
+		memcpy(p2, rune1, p1 - xcp);
+		memcpy(p2 + (p1 - xcp), rune2, xcp - p2);
+
+		xcp = p1;
+		x_goto(p1);
 	}
 	return KSTD;
 }
@@ -1804,6 +1914,11 @@
 	 */
 	if ((xbp = xcp - (x_displen / 2)) < xbuf)
 		xbp = xbuf;
+	else {
+		/* rewind to the last valid codepoint */
+		while (xbp > xbuf && isu8cont((unsigned char) *xbp))
+			xbp--;
+	}
 	xlp_valid = false;
 	x_redraw(xx_cols);
 	x_flush();
@@ -1882,8 +1997,16 @@
 }
 
 static void
-x_e_putc(int c)
+x_e_putc(int sc)
 {
+#ifndef SMALL
+	static unsigned char ch[5] = { 0 };
+	static int len = 0, cnt = 0;
+	unsigned long cpt;
+#endif
+	unsigned char c;
+
+	c = sc;
 	if (c == '\r' || c == '\n')
 		x_col = 0;
 	if (x_col < xx_cols) {
@@ -1898,9 +2021,43 @@
 			x_col--;
 			break;
 		default:
+#ifdef SMALL
 			if (!isu8cont(c))
 				x_col++;
 			break;
+#else
+			if (!isu8cont(c)) {
+				if (c <= 0x7f) {
+					x_col++;
+					len = 0;
+					break;
+				}
+
+				if ((c & 0xf8) == 0xf0 && c < 0xf5)
+					len = 3;
+				else if ((c & 0xf0) == 0xe0)
+					len = 2;
+				else if ((c & 0xe0) == 0xc0 && c > 0xc1)
+					len = 1;
+				else {
+					len = 0;
+					break;
+				}
+
+				cnt = 0;
+				memset(ch, 0, 5);
+				ch[cnt++] = c;
+			} else {
+				ch[cnt++] = c;
+				if (cnt > len) {
+					x_col++;
+					u8_to_cpt(ch, &cpt);
+					if (is_fullwidth(cpt))
+						x_col++;
+				}
+			}
+			break;
+#endif
 		}
 	}
 	if (x_adj_ok && (x_col < 0 || x_col >= (xx_cols - 2)))
Index: bin/ksh/unicode.c
===================================================================
--- bin/ksh/unicode.c    (new file)
+++ bin/ksh/unicode.c    (working copy)
--- /dev/null	2024-12-17 11:54:03.396000088 +0800
+++ bin/ksh/unicode.c	2024-12-17 13:25:38.050258915 +0800
@@ -0,0 +1,162 @@
+#include "unicode.h"
+
+#ifndef SMALL
+
+/* The following code was generated from EastAsianWidth.txt (Flag: W&F)
+ * Reference: https://www.unicode.org/reports/tr11/tr11-6.html
+ * is_fullwidth() returns 1 if code point cpt is in a listed range, else 0.
+ */
+int is_fullwidth(unsigned long cpt) {
+	if ((0x1100 <= cpt && cpt <= 0x115f)
+	    || (0x231a <= cpt && cpt <= 0x231b)
+	    || (0x2329 <= cpt && cpt <= 0x232a)
+	    || (0x23e9 <= cpt && cpt <= 0x23ec)
+	    || (cpt == 0x23f0)
+	    || (cpt == 0x23f3)
+	    || (0x25fd <= cpt && cpt <= 0x25fe)
+	    || (0x2614 <= cpt && cpt <= 0x2615)
+	    || (0x2630 <= cpt && cpt <= 0x2637)
+	    || (0x2648 <= cpt && cpt <= 0x2653)
+	    || (cpt == 0x267f)
+	    || (0x268a <= cpt && cpt <= 0x268f)
+	    || (cpt == 0x2693)
+	    || (cpt == 0x26a1)
+	    || (0x26aa <= cpt && cpt <= 0x26ab)
+	    || (0x26bd <= cpt && cpt <= 0x26be)
+	    || (0x26c4 <= cpt && cpt <= 0x26c5)
+	    || (cpt == 0x26ce)
+	    || (cpt == 0x26d4)
+	    || (cpt == 0x26ea)
+	    || (0x26f2 <= cpt && cpt <= 0x26f3)
+	    || (cpt == 0x26f5)
+	    || (cpt == 0x26fa)
+	    || (cpt == 0x26fd)
+	    || (cpt == 0x2705)
+	    || (0x270a <= cpt && cpt <= 0x270b)
+	    || (cpt == 0x2728)
+	    || (cpt == 0x274c)
+	    || (cpt == 0x274e)
+	    || (0x2753 <= cpt && cpt <= 0x2755)
+	    || (cpt == 0x2757)
+	    || (0x2795 <= cpt && cpt <= 0x2797)
+	    || (cpt == 0x27b0)
+	    || (cpt == 0x27bf)
+	    || (0x2b1b <= cpt && cpt <= 0x2b1c)
+	    || (cpt == 0x2b50)
+	    || (cpt == 0x2b55)
+	    || (0x2e80 <= cpt && cpt <= 0x2e99)
+	    || (0x2e9b <= cpt && cpt <= 0x2ef3)
+	    || (0x2f00 <= cpt && cpt <= 0x2fd5)
+	    || (0x2ff0 <= cpt && cpt <= 0x303e)
+	    || (0x3041 <= cpt && cpt <= 0x3096)
+	    || (0x3099 <= cpt && cpt <= 0x30ff)
+	    || (0x3105 <= cpt && cpt <= 0x312f)
+	    || (0x3131 <= cpt && cpt <= 0x318e)
+	    || (0x3190 <= cpt && cpt <= 0x31e5)
+	    || (0x31ef <= cpt && cpt <= 0x321e)
+	    || (0x3220 <= cpt && cpt <= 0x3247)
+	    || (0x3250 <= cpt && cpt <= 0xa48c)
+	    || (0xa490 <= cpt && cpt <= 0xa4c6)
+	    || (0xa960 <= cpt && cpt <= 0xa97c)
+	    || (0xac00 <= cpt && cpt <= 0xd7a3)
+	    || (0xf900 <= cpt && cpt <= 0xfaff)
+	    || (0xfe10 <= cpt && cpt <= 0xfe19)
+	    || (0xfe30 <= cpt && cpt <= 0xfe52)
+	    || (0xfe54 <= cpt && cpt <= 0xfe66)
+	    || (0xfe68 <= cpt && cpt <= 0xfe6b)
+	    || (0xff01 <= cpt && cpt <= 0xff60)
+	    || (0xffe0 <= cpt && cpt <= 0xffe6)
+	    || (0x16fe0 <= cpt && cpt <= 0x16fe4)
+	    || (0x16ff0 <= cpt && cpt <= 0x16ff1)
+	    || (0x17000 <= cpt && cpt <= 0x187f7)
+	    || (0x18800 <= cpt && cpt <= 0x18cd5)
+	    || (0x18cff <= cpt && cpt <= 0x18d08)
+	    || (0x1aff0 <= cpt && cpt <= 0x1aff3)
+	    || (0x1aff5 <= cpt && cpt <= 0x1affb)
+	    || (0x1affd <= cpt && cpt <= 0x1affe)
+	    || (0x1b000 <= cpt && cpt <= 0x1b122)
+	    || (cpt == 0x1b132)
+	    || (0x1b150 <= cpt && cpt <= 0x1b152)
+	    || (cpt == 0x1b155)
+	    || (0x1b164 <= cpt && cpt <= 0x1b167)
+	    || (0x1b170 <= cpt && cpt <= 0x1b2fb)
+	    || (0x1d300 <= cpt && cpt <= 0x1d356)
+	    || (0x1d360 <= cpt && cpt <= 0x1d376)
+	    || (cpt == 0x1f004)
+	    || (cpt == 0x1f0cf)
+	    || (cpt == 0x1f18e)
+	    || (0x1f191 <= cpt && cpt <= 0x1f19a)
+	    || (0x1f200 <= cpt && cpt <= 0x1f202)
+	    || (0x1f210 <= cpt && cpt <= 0x1f23b)
+	    || (0x1f240 <= cpt && cpt <= 0x1f248)
+	    || (0x1f250 <= cpt && cpt <= 0x1f251)
+	    || (0x1f260 <= cpt && cpt <= 0x1f265)
+	    || (0x1f300 <= cpt && cpt <= 0x1f320)
+	    || (0x1f32d <= cpt && cpt <= 0x1f335)
+	    || (0x1f337 <= cpt && cpt <= 0x1f37c)
+	    || (0x1f37e <= cpt && cpt <= 0x1f393)
+	    || (0x1f3a0 <= cpt && cpt <= 0x1f3ca)
+	    || (0x1f3cf <= cpt && cpt <= 0x1f3d3)
+	    || (0x1f3e0 <= cpt && cpt <= 0x1f3f0)
+	    || (cpt == 0x1f3f4)
+	    || (0x1f3f8 <= cpt && cpt <= 0x1f43e)
+	    || (cpt == 0x1f440)
+	    || (0x1f442 <= cpt && cpt <= 0x1f4fc)
+	    || (0x1f4ff <= cpt && cpt <= 0x1f53d)
+	    || (0x1f54b <= cpt && cpt <= 0x1f54e)
+	    || (0x1f550 <= cpt && cpt <= 0x1f567)
+	    || (cpt == 0x1f57a)
+	    || (0x1f595 <= cpt && cpt <= 0x1f596)
+	    || (cpt == 0x1f5a4)
+	    || (0x1f5fb <= cpt && cpt <= 0x1f64f)
+	    || (0x1f680 <= cpt && cpt <= 0x1f6c5)
+	    || (cpt == 0x1f6cc)
+	    || (0x1f6d0 <= cpt && cpt <= 0x1f6d2)
+	    || (0x1f6d5 <= cpt && cpt <= 0x1f6d7)
+	    || (0x1f6dc <= cpt && cpt <= 0x1f6df)
+	    || (0x1f6eb <= cpt && cpt <= 0x1f6ec)
+	    || (0x1f6f4 <= cpt && cpt <= 0x1f6fc)
+	    || (0x1f7e0 <= cpt && cpt <= 0x1f7eb)
+	    || (cpt == 0x1f7f0)
+	    || (0x1f90c <= cpt && cpt <= 0x1f93a)
+	    || (0x1f93c <= cpt && cpt <= 0x1f945)
+	    || (0x1f947 <= cpt && cpt <= 0x1f9ff)
+	    || (0x1fa70 <= cpt && cpt <= 0x1fa7c)
+	    || (0x1fa80 <= cpt && cpt <= 0x1fa89)
+	    || (0x1fa8f <= cpt && cpt <= 0x1fac6)
+	    || (0x1face <= cpt && cpt <= 0x1fadc)
+	    || (0x1fadf <= cpt && cpt <= 0x1fae9)
+	    || (0x1faf0 <= cpt && cpt <= 0x1faf8)
+	    || (0x20000 <= cpt && cpt <= 0x2fffd)
+	    || (0x30000 <= cpt && cpt <= 0x3fffd))
+		return 1;
+
+	return 0;
+}
+
+/* Decode the UTF-8 sequence at buf into *cpt, trusting its lead byte for the
+ * length; returns the bytes used (1-4), or 0 with *cpt = 0 on a bad lead.
+ */
+int u8_to_cpt(const char *buf, unsigned long *cpt) {
+	const unsigned char *ubuf = (const unsigned char *)buf;
+
+	*cpt = 0;	/* never leave the caller with an indeterminate value */
+	if (ubuf[0] <= 0x7F) {
+		*cpt = ubuf[0];
+		return 1;
+	} else if ((ubuf[0] & 0xE0) == 0xC0) {
+		*cpt = ((ubuf[0] & 0x1F) << 6) | (ubuf[1] & 0x3F);
+		return 2;
+	} else if ((ubuf[0] & 0xF0) == 0xE0) {
+		*cpt = ((ubuf[0] & 0x0F) << 12) | ((ubuf[1] & 0x3F) << 6)
+		    | (ubuf[2] & 0x3F);
+		return 3;
+	} else if ((ubuf[0] & 0xF8) == 0xF0) {
+		*cpt = ((ubuf[0] & 0x07) << 18) | ((ubuf[1] & 0x3F) << 12)
+		    | ((ubuf[2] & 0x3F) << 6) | (ubuf[3] & 0x3F);
+		return 4;
+	}
+	return 0;
+}
+
+#endif
Index: bin/ksh/unicode.h
===================================================================
--- bin/ksh/unicode.h    (new file)
+++ bin/ksh/unicode.h    (working copy)
--- /dev/null	2024-12-17 11:54:03.396000088 +0800
+++ bin/ksh/unicode.h	2024-12-17 09:19:00.521730569 +0800
@@ -0,0 +1,7 @@
+#ifndef UNICODE_H
+#define UNICODE_H
+
+int is_fullwidth(unsigned long);	/* 1 if code point is in the wide table */
+int u8_to_cpt(const char *, unsigned long *);	/* bytes decoded, 0 if none */
+
+#endif	/* UNICODE_H */