Download raw body.
[REPOST] ksh: utf8 full width character support for emacs.c
Full-width characters are commonly used in East Asian writing systems such as
Chinese, Japanese and Korean. Those characters take up twice the width of a
normal ASCII character, but x_size counts them as a single column. When
navigating between those characters in emacs line-editing mode, the cursor
loses track of its position and messes up the line, making input really difficult.
I tried to make x_size count correctly by keeping state in function-level static
variables and looking the code point up in a table generated from
‘EastAsianWidth.txt’. The characters counted as width 2 are mainly: Kanji,
Katakana, Hiragana, Hangul, Roman full-width characters, emojis, etc.
Expected behavior (after patching): the cursor should land correctly while
navigating between full-width characters, and line-editing commands (like
x_transpose) should work correctly.
Known issue: when the line runs off the screen with full-width characters, the
angle bracket is not placed correctly. (This is not easy to deal with when a
full-width character straddles xx_cols.)
Tested on: rxvt-unicode, xterm
Index: bin/ksh/Makefile
===================================================================
RCS file: /cvs/src/bin/ksh/Makefile,v
diff -u -r1.39 Makefile
--- bin/ksh/Makefile 18 Jun 2018 17:03:58 -0000 1.39
+++ bin/ksh/Makefile 17 Dec 2024 05:26:43 -0000
@@ -7,7 +7,7 @@
SRCS= alloc.c c_ksh.c c_sh.c c_test.c c_ulimit.c edit.c emacs.c eval.c \
exec.c expr.c history.c io.c jobs.c lex.c mail.c main.c \
misc.c path.c shf.c syn.c table.c trap.c tree.c tty.c var.c \
- version.c vi.c
+ version.c vi.c unicode.c
WARNINGS=yes
DEFS= -DEMACS -DVI
Index: bin/ksh/emacs.c
===================================================================
RCS file: /cvs/src/bin/ksh/emacs.c,v
diff -u -r1.90 emacs.c
--- bin/ksh/emacs.c 21 Jun 2023 22:22:08 -0000 1.90
+++ bin/ksh/emacs.c 17 Dec 2024 05:26:43 -0000
@@ -29,6 +29,11 @@
#include "sh.h"
#include "edit.h"
+#ifndef SMALL
+#include "unicode.h"
+#else
+#define x_size_rev x_size
+#endif
static Area aedit;
#define AEDIT &aedit /* area for kill ring and macro defns
*/
@@ -126,6 +131,7 @@
static void x_goto(char *);
static void x_bs(int);
static int x_size_str(char *);
+static int x_size_rev(int);
static int x_size(int);
static void x_zots(char *);
static void x_zotc(int);
@@ -459,7 +465,7 @@
if (adj == x_adj_done) { /* has x_adjust() been called? */
/* no */
for (cp = xlp; cp > xcp; )
- x_bs(*--cp);
+ x_bs((unsigned char)*--cp);
}
x_adj_ok = 1;
@@ -552,7 +558,7 @@
x_adj_ok = 1;
xlp_valid = false;
for (cp = x_lastcp(); cp > xcp; )
- x_bs(*--cp);
+ x_bs((unsigned char)*--cp);
return;
}
@@ -653,7 +659,7 @@
{
int i;
- i = x_size(c);
+ i = x_size_rev(c);
while (i--)
x_e_putc('\b');
}
@@ -663,20 +669,93 @@
{
int size = 0;
while (*cp)
- size += x_size(*cp++);
+ size += x_size((unsigned char)*cp++);
return size;
}
+#ifndef SMALL
+/*
+ * Column width of byte c when a line is walked backwards (see x_bs):
+ * trailing UTF-8 bytes are buffered and report 0, the lead byte that
+ * follows them reports the width of the whole character.
+ */
+static int
+x_size_rev(int c)
+{
+	/* Trailing bytes fill ch[3] downwards; ch[4..6] stay 0 as decode padding. */
+	static unsigned char ch[7] = { 0 };
+	static int cnt = 3;
+	unsigned long cpt = 0;
+	int w = 1;	/* undecodable lead bytes still take one column */
+
+	if (c == '\t' || iscntrl(c)) {
+		cnt = 3;	/* stray trailing bytes seen before it are dropped */
+		return c == '\t' ? 4 : 2;	/* Kludge, tabs are always four spaces. */
+	}
+
+	if (isu8cont(c)) {
+		if (cnt > 0)	/* keep at most three trailing bytes */
+			ch[cnt--] = c;
+		return 0;
+	}
+
+	if (c > 0x7f) {
+		ch[cnt] = c;
+		/* ch + cnt is the lead byte followed by the buffered bytes. */
+		if (u8_to_cpt((const char *)ch + cnt, &cpt) > 0 &&
+		    is_fullwidth(cpt))
+			w = 2;
+	}
+	cnt = 3;
+	return w;
+}
+#endif
static int
x_size(int c)
{
+#ifndef SMALL
+	static unsigned char ch[5] = { 0 };	/* sequence being collected */
+	static int len = 0, cnt = 0;	/* continuation bytes expected, bytes stored */
+	unsigned long cpt = 0;
+#endif
if (c=='\t')
return 4; /* Kludge, tabs are always four spaces. */
if (iscntrl(c)) /* control char */
return 2;
+#ifdef SMALL
if (isu8cont(c))
return 0;
return 1;
+#else
+	/*
+	 * Forward scan: a lead byte and its continuation bytes report 0 until
+	 * the last byte arrives, which reports the width of the character.
+	 */
+	if (!isu8cont(c)) {
+		len = 0;	/* abandons any unfinished sequence */
+		if (c <= 0x7f)
+			return 1;
+		if ((c & 0xf8) == 0xf0 && c < 0xf5)
+			len = 3;
+		else if ((c & 0xf0) == 0xe0)
+			len = 2;
+		else if ((c & 0xe0) == 0xc0 && c > 0xc1)
+			len = 1;
+		else
+			return 0;	/* not a valid lead byte */
+		cnt = 0;
+		memset(ch, 0, 5);
+		ch[cnt++] = c;
+	} else if (len > 0) {	/* only while a sequence is open, so cnt <= len */
+		ch[cnt++] = c;
+		if (cnt > len) {
+			len = 0;	/* done: further stray bytes count 0 */
+			u8_to_cpt((const char *)ch, &cpt);
+			return is_fullwidth(cpt) ? 2 : 1;
+		}
+	}
+	return 0;
+#endif
}
static void
@@ -1098,7 +1177,8 @@
static int
x_transpose(int c)
{
- char tmp;
+ char rune1[4], rune2[4];
+ char *p1, *p2, *p;
/* What transpose is meant to do seems to be up for debate. This
* is a general summary of the options; the text is abcd with the
@@ -1124,25 +1204,55 @@
/* Gosling/Unipress emacs style: Swap two characters before
the
* cursor, do not change cursor position
*/
- x_bs(xcp[-1]);
- x_bs(xcp[-2]);
- x_zotc(xcp[-1]);
- x_zotc(xcp[-2]);
- tmp = xcp[-1];
- xcp[-1] = xcp[-2];
- xcp[-2] = tmp;
+ p1 = xcp;
+ do {
+ x_bs((unsigned char) *--p1);
+ } while (xbuf < p1 && (xcp - p1) < 5 && isu8cont(*p1));
+
+ if (p1 == xbuf) {
+ x_e_putc(BEL);
+ return KSTD;
+ }
+
+ p2 = p1;
+ do {
+ x_bs((unsigned char) *--p2);
+ } while (xbuf <= p2 && (p1 - p2) < 5 && isu8cont(*p2));
+
+ for (p = p1; p < xcp; p++)
+ x_zotc(*p);
+ for (p = p2; p < p1; p++)
+ x_zotc(*p);
+
+ memcpy(rune1, p1, xcp - p1);
+ memcpy(rune2, p2, p1 - p2);
+ memcpy(p2, rune1, xcp - p1);
+ memcpy(p2 + (xcp - p1), rune2, p1 - p2);
} else {
/* GNU emacs style: Swap the characters before and under the
* cursor, move cursor position along one.
*/
- x_bs(xcp[-1]);
- x_zotc(xcp[0]);
- x_zotc(xcp[-1]);
- tmp = xcp[-1];
- xcp[-1] = xcp[0];
- xcp[0] = tmp;
- x_bs(xcp[0]);
- x_goto(xcp + 1);
+ p1 = xcp + 1;
+ while (p1 < xep && *p1 && (p1 - xcp) <= 5 && isu8cont(*p1))
+ p1++;
+
+ p2 = xcp;
+ do {
+ x_bs((unsigned char) *--p2);
+ } while (xbuf <= p2 && (xcp - p2) < 5 && isu8cont(*p2));
+
+ for (p = xcp; p < p1; p++)
+ x_zotc(*p);
+ for (p = p2; p < xcp; p++)
+ x_zotc(*p);
+
+ memcpy(rune1, xcp, p1 - xcp);
+ memcpy(rune2, p2, xcp - p2);
+ memcpy(p2, rune1, p1 - xcp);
+ memcpy(p2 + (p1 - xcp), rune2, xcp - p2);
+
+ xcp = p1;
+ x_goto(p1);
}
return KSTD;
}
@@ -1804,6 +1914,11 @@
*/
if ((xbp = xcp - (x_displen / 2)) < xbuf)
xbp = xbuf;
+ else {
+ /* rewind to the last valid codepoint */
+ while (xbp > xbuf && isu8cont((unsigned char) *xbp))
+ xbp--;
+ }
xlp_valid = false;
x_redraw(xx_cols);
x_flush();
@@ -1882,8 +1997,16 @@
}
static void
-x_e_putc(int c)
+x_e_putc(int sc)
{
+#ifndef SMALL
+ static unsigned char ch[5] = { 0 };
+ static int len = 0, cnt = 0;
+ unsigned long cpt;
+#endif
+ unsigned char c;
+
+ c = sc;
if (c == '\r' || c == '\n')
x_col = 0;
if (x_col < xx_cols) {
@@ -1898,9 +2021,43 @@
x_col--;
break;
default:
+#ifdef SMALL
if (!isu8cont(c))
x_col++;
break;
+#else
+ if (!isu8cont(c)) {
+ if (c <= 0x7f) {
+ x_col++;
+ len = 0;
+ break;
+ }
+
+ if ((c & 0xf8) == 0xf0 && c < 0xf5)
+ len = 3;
+ else if ((c & 0xf0) == 0xe0)
+ len = 2;
+ else if ((c & 0xe0) == 0xc0 && c > 0xc1)
+ len = 1;
+ else {
+ len = 0;
+ break;
+ }
+
+ cnt = 0;
+ memset(ch, 0, 5);
+ ch[cnt++] = c;
+ } else {
+ ch[cnt++] = c;
+ if (cnt > len) {
+ x_col++;
+ u8_to_cpt(ch, &cpt);
+ if (is_fullwidth(cpt))
+ x_col++;
+ }
+ }
+ break;
+#endif
}
}
if (x_adj_ok && (x_col < 0 || x_col >= (xx_cols - 2)))
Index: bin/ksh/unicode.c
===================================================================
--- bin/ksh/unicode.c (new file)
+++ bin/ksh/unicode.c (working copy)
--- /dev/null 2024-12-17 11:54:03.396000088 +0800
+++ bin/ksh/unicode.c 2024-12-17 13:25:38.050258915 +0800
@@ -0,0 +1,162 @@
+#include "unicode.h"
+
+#ifndef SMALL
+
+/* The following code was generated from EastAsianWidth.txt (Flag: W&F)
+ * Reference: https://www.unicode.org/reports/tr11/tr11-6.html
+ */
+
+int is_fullwidth(unsigned long cpt) { /* returns 1 if cpt lies in any range below, else 0 */
+ if ((0x1100 <= cpt && cpt <= 0x115f) /* Hangul Jamo leading consonants */
+ || (0x231a <= cpt && cpt <= 0x231b)
+ || (0x2329 <= cpt && cpt <= 0x232a)
+ || (0x23e9 <= cpt && cpt <= 0x23ec)
+ || (cpt == 0x23f0)
+ || (cpt == 0x23f3)
+ || (0x25fd <= cpt && cpt <= 0x25fe)
+ || (0x2614 <= cpt && cpt <= 0x2615)
+ || (0x2630 <= cpt && cpt <= 0x2637)
+ || (0x2648 <= cpt && cpt <= 0x2653)
+ || (cpt == 0x267f)
+ || (0x268a <= cpt && cpt <= 0x268f)
+ || (cpt == 0x2693)
+ || (cpt == 0x26a1)
+ || (0x26aa <= cpt && cpt <= 0x26ab)
+ || (0x26bd <= cpt && cpt <= 0x26be)
+ || (0x26c4 <= cpt && cpt <= 0x26c5)
+ || (cpt == 0x26ce)
+ || (cpt == 0x26d4)
+ || (cpt == 0x26ea)
+ || (0x26f2 <= cpt && cpt <= 0x26f3)
+ || (cpt == 0x26f5)
+ || (cpt == 0x26fa)
+ || (cpt == 0x26fd)
+ || (cpt == 0x2705)
+ || (0x270a <= cpt && cpt <= 0x270b)
+ || (cpt == 0x2728)
+ || (cpt == 0x274c)
+ || (cpt == 0x274e)
+ || (0x2753 <= cpt && cpt <= 0x2755)
+ || (cpt == 0x2757)
+ || (0x2795 <= cpt && cpt <= 0x2797)
+ || (cpt == 0x27b0)
+ || (cpt == 0x27bf)
+ || (0x2b1b <= cpt && cpt <= 0x2b1c)
+ || (cpt == 0x2b50)
+ || (cpt == 0x2b55)
+ || (0x2e80 <= cpt && cpt <= 0x2e99)
+ || (0x2e9b <= cpt && cpt <= 0x2ef3)
+ || (0x2f00 <= cpt && cpt <= 0x2fd5)
+ || (0x2ff0 <= cpt && cpt <= 0x303e)
+ || (0x3041 <= cpt && cpt <= 0x3096) /* Hiragana letters */
+ || (0x3099 <= cpt && cpt <= 0x30ff) /* kana sound marks and Katakana */
+ || (0x3105 <= cpt && cpt <= 0x312f)
+ || (0x3131 <= cpt && cpt <= 0x318e)
+ || (0x3190 <= cpt && cpt <= 0x31e5)
+ || (0x31ef <= cpt && cpt <= 0x321e)
+ || (0x3220 <= cpt && cpt <= 0x3247)
+ || (0x3250 <= cpt && cpt <= 0xa48c) /* span includes the CJK Unified Ideographs at 0x4e00-0x9fff */
+ || (0xa490 <= cpt && cpt <= 0xa4c6)
+ || (0xa960 <= cpt && cpt <= 0xa97c)
+ || (0xac00 <= cpt && cpt <= 0xd7a3) /* Hangul syllables */
+ || (0xf900 <= cpt && cpt <= 0xfaff)
+ || (0xfe10 <= cpt && cpt <= 0xfe19)
+ || (0xfe30 <= cpt && cpt <= 0xfe52)
+ || (0xfe54 <= cpt && cpt <= 0xfe66)
+ || (0xfe68 <= cpt && cpt <= 0xfe6b)
+ || (0xff01 <= cpt && cpt <= 0xff60) /* fullwidth forms of ASCII punctuation, digits and letters */
+ || (0xffe0 <= cpt && cpt <= 0xffe6)
+ || (0x16fe0 <= cpt && cpt <= 0x16fe4)
+ || (0x16ff0 <= cpt && cpt <= 0x16ff1)
+ || (0x17000 <= cpt && cpt <= 0x187f7)
+ || (0x18800 <= cpt && cpt <= 0x18cd5)
+ || (0x18cff <= cpt && cpt <= 0x18d08)
+ || (0x1aff0 <= cpt && cpt <= 0x1aff3)
+ || (0x1aff5 <= cpt && cpt <= 0x1affb)
+ || (0x1affd <= cpt && cpt <= 0x1affe)
+ || (0x1b000 <= cpt && cpt <= 0x1b122)
+ || (cpt == 0x1b132)
+ || (0x1b150 <= cpt && cpt <= 0x1b152)
+ || (cpt == 0x1b155)
+ || (0x1b164 <= cpt && cpt <= 0x1b167)
+ || (0x1b170 <= cpt && cpt <= 0x1b2fb)
+ || (0x1d300 <= cpt && cpt <= 0x1d356)
+ || (0x1d360 <= cpt && cpt <= 0x1d376)
+ || (cpt == 0x1f004)
+ || (cpt == 0x1f0cf)
+ || (cpt == 0x1f18e)
+ || (0x1f191 <= cpt && cpt <= 0x1f19a)
+ || (0x1f200 <= cpt && cpt <= 0x1f202)
+ || (0x1f210 <= cpt && cpt <= 0x1f23b)
+ || (0x1f240 <= cpt && cpt <= 0x1f248)
+ || (0x1f250 <= cpt && cpt <= 0x1f251)
+ || (0x1f260 <= cpt && cpt <= 0x1f265)
+ || (0x1f300 <= cpt && cpt <= 0x1f320) /* emoji pictographs start here */
+ || (0x1f32d <= cpt && cpt <= 0x1f335)
+ || (0x1f337 <= cpt && cpt <= 0x1f37c)
+ || (0x1f37e <= cpt && cpt <= 0x1f393)
+ || (0x1f3a0 <= cpt && cpt <= 0x1f3ca)
+ || (0x1f3cf <= cpt && cpt <= 0x1f3d3)
+ || (0x1f3e0 <= cpt && cpt <= 0x1f3f0)
+ || (cpt == 0x1f3f4)
+ || (0x1f3f8 <= cpt && cpt <= 0x1f43e)
+ || (cpt == 0x1f440)
+ || (0x1f442 <= cpt && cpt <= 0x1f4fc)
+ || (0x1f4ff <= cpt && cpt <= 0x1f53d)
+ || (0x1f54b <= cpt && cpt <= 0x1f54e)
+ || (0x1f550 <= cpt && cpt <= 0x1f567)
+ || (cpt == 0x1f57a)
+ || (0x1f595 <= cpt && cpt <= 0x1f596)
+ || (cpt == 0x1f5a4)
+ || (0x1f5fb <= cpt && cpt <= 0x1f64f)
+ || (0x1f680 <= cpt && cpt <= 0x1f6c5)
+ || (cpt == 0x1f6cc)
+ || (0x1f6d0 <= cpt && cpt <= 0x1f6d2)
+ || (0x1f6d5 <= cpt && cpt <= 0x1f6d7)
+ || (0x1f6dc <= cpt && cpt <= 0x1f6df)
+ || (0x1f6eb <= cpt && cpt <= 0x1f6ec)
+ || (0x1f6f4 <= cpt && cpt <= 0x1f6fc)
+ || (0x1f7e0 <= cpt && cpt <= 0x1f7eb)
+ || (cpt == 0x1f7f0)
+ || (0x1f90c <= cpt && cpt <= 0x1f93a)
+ || (0x1f93c <= cpt && cpt <= 0x1f945)
+ || (0x1f947 <= cpt && cpt <= 0x1f9ff)
+ || (0x1fa70 <= cpt && cpt <= 0x1fa7c)
+ || (0x1fa80 <= cpt && cpt <= 0x1fa89)
+ || (0x1fa8f <= cpt && cpt <= 0x1fac6)
+ || (0x1face <= cpt && cpt <= 0x1fadc)
+ || (0x1fadf <= cpt && cpt <= 0x1fae9)
+ || (0x1faf0 <= cpt && cpt <= 0x1faf8)
+ || (0x20000 <= cpt && cpt <= 0x2fffd) /* plane 2, minus its last two code points */
+ || (0x30000 <= cpt && cpt <= 0x3fffd)) /* plane 3, minus its last two code points */
+ return 1;
+
+ return 0; /* not in the table; the callers in emacs.c then count one column */
+}
+
+/*
+ * Decode the UTF-8 sequence at buf into *cpt.  Returns the number of bytes
+ * used (1-4), or 0 with *cpt set to 0 if buf does not start a sequence or a
+ * continuation byte is missing.  No byte after a mismatch is read.
+ */
+int u8_to_cpt(const char *buf, unsigned long *cpt) {
+	const unsigned char *ubuf = (const unsigned char *)buf;
+	unsigned long v = ubuf[0];
+	int i, n;
+	/* Sequence length implied by the lead byte; 0 marks an invalid lead. */
+	n = v <= 0x7F ? 1 : (v & 0xE0) == 0xC0 ? 2 :
+	    (v & 0xF0) == 0xE0 ? 3 : (v & 0xF8) == 0xF0 ? 4 : 0;
+	*cpt = 0;	/* defined even for callers that ignore a failure */
+	if (n == 0)
+		return 0;
+	v &= 0xFF >> (n > 1 ? n + 1 : 0);	/* payload bits of the lead byte */
+	for (i = 1; i < n; i++) {
+		if ((ubuf[i] & 0xC0) != 0x80)
+			return 0;
+		v = (v << 6) | (ubuf[i] & 0x3F);
+	}
+	*cpt = v;
+	return n;
+}
+
+#endif
Index: bin/ksh/unicode.h
===================================================================
--- bin/ksh/unicode.h (new file)
+++ bin/ksh/unicode.h (working copy)
--- /dev/null 2024-12-17 11:54:03.396000088 +0800
+++ bin/ksh/unicode.h 2024-12-17 09:19:00.521730569 +0800
@@ -0,0 +1,7 @@
+#ifndef UNICODE_H
+#define UNICODE_H
+
+int is_fullwidth(unsigned long); /* 1 if the code point is in the wide/fullwidth table, else 0 */
+int u8_to_cpt(const char *, unsigned long *); /* decodes one UTF-8 sequence; returns bytes used (1-4), or 0 on failure */
+
+#endif /* UNICODE_H */
[REPOST] ksh: utf8 full width character support for emacs.c