From: Gong Zhile Subject: Re: ksh: utf8 full width character support for emacs.c To: tech@openbsd.org Date: Sat, 28 Dec 2024 20:50:12 +0800 Hi list, Here are some sample texts for testing: Chinese: “你好世界。” Japanese (Kanji and Hiragana): 「こんにちは世界。」 Korean (Hangul): 「안녕하세요 세계。」 Capitalized Fullwidth Roman Characters: 「ＨＥＬＬＯ ＷＯＲＬＤ。」 Please review and happy holidays! --- goodspeed On Tue, 2024-12-17 at 13:53 +0800, Gong Zhile wrote: > Full width characters are commonly used in Asian language systems like > Chinese, > Japanese and Korean etc. Those characters take double the width of a normal > ASCII char but x_size only counts them as one unit. When navigating between > those characters in emacs line editing mode, the cursor would lose track and > mess up the line, making it really difficult to input. > > I tried to make x_size count correctly with static variables in the function and > by looking up in a table generated from ‘EastAsianWidth.txt’. Characters that mainly > count as a size of 2 are: Kanji, Katakana, Hiragana, Hangul, Roman Full- > Width > Characters, emojis etc. > > Expected behavior (After patching): cursor should land correctly while > navigating between full width characters, line editing commands (like > x_transpose) > perform correctly. > > Known issue: When heading off the screen with full width chars, it fails to > place the angle bracket correctly. 
(Not easy to deal with when full width > characters crossing xx_cols) > > Tested on: rxvt-unicode, xterm > > Index: bin/ksh/Makefile > =================================================================== > RCS file: /cvs/src/bin/ksh/Makefile,v > diff -u -r1.39 Makefile > --- bin/ksh/Makefile 18 Jun 2018 17:03:58 -0000 1.39 > +++ bin/ksh/Makefile 17 Dec 2024 05:26:43 -0000 > @@ -7,7 +7,7 @@ >  SRCS= alloc.c c_ksh.c c_sh.c c_test.c c_ulimit.c edit.c emacs.c eval.c \ >   exec.c expr.c history.c io.c jobs.c lex.c mail.c main.c \ >   misc.c path.c shf.c syn.c table.c trap.c tree.c tty.c var.c \ > - version.c vi.c > + version.c vi.c unicode.c >   >  WARNINGS=yes >  DEFS= -DEMACS -DVI > Index: bin/ksh/emacs.c > =================================================================== > RCS file: /cvs/src/bin/ksh/emacs.c,v > diff -u -r1.90 emacs.c > --- bin/ksh/emacs.c 21 Jun 2023 22:22:08 -0000 1.90 > +++ bin/ksh/emacs.c 17 Dec 2024 05:26:43 -0000 > @@ -29,6 +29,11 @@ >  #include "sh.h" >  #include "edit.h" >   > +#ifndef SMALL > +#include "unicode.h" > +#else > +#define x_size_rev x_size > +#endif >  static Area aedit; >  #define AEDIT &aedit /* area for kill ring and macro > defns */ >   > @@ -126,6 +131,7 @@ >  static void x_goto(char *); >  static void x_bs(int); >  static int x_size_str(char *); > +static int x_size_rev(int); >  static int x_size(int); >  static void x_zots(char *); >  static void x_zotc(int); > @@ -459,7 +465,7 @@ >   if (adj == x_adj_done) { /* has x_adjust() been called? 
*/ >   /* no */ >   for (cp = xlp; cp > xcp; ) > - x_bs(*--cp); > + x_bs((unsigned char)*--cp); >   } >   >   x_adj_ok = 1; > @@ -552,7 +558,7 @@ >   x_adj_ok = 1; >   xlp_valid = false; >   for (cp = x_lastcp(); cp > xcp; ) > - x_bs(*--cp); > + x_bs((unsigned char)*--cp); >   >   return; >  } > @@ -653,7 +659,7 @@ >  { >   int i; >   > - i = x_size(c); > + i = x_size_rev(c); >   while (i--) >   x_e_putc('\b'); >  } > @@ -663,20 +669,93 @@ >  { >   int size = 0; >   while (*cp) > - size += x_size(*cp++); > + size += x_size((unsigned char)*cp++); >   return size; >  } > +#ifndef SMALL > +static int > +x_size_rev(int c) > +{ > + static unsigned char ch[5] = { 0 }; > + static int cnt = 3; > + unsigned long cpt; > + int w; > + > + if (c=='\t') > + return 4; /* Kludge, tabs are always four spaces. */ > + if (iscntrl(c)) /* control char */ > + return 2; > + > + if (!isu8cont(c)) { > + if (c <= 0x7f) { > + cnt = 3; > + return 1; > + } > + > + ch[cnt] = c; > + u8_to_cpt(ch + cnt, &cpt); > + w = is_fullwidth(cpt) ? 2 : 1; > + > + cnt = 3; > + memset(ch, 0, 4); > + return w; > + } else { > + if (cnt <= 0) > + return 0; > + ch[cnt] = c; > + cnt--; > + } > + > + return 0; > +} > +#endif >   >  static int >  x_size(int c) >  { > +#ifndef SMALL > + static unsigned char ch[5] = { 0 }; > + static int len = 0, cnt = 0; > + unsigned long cpt; > +#endif >   if (c=='\t') >   return 4; /* Kludge, tabs are always four spaces. 
*/ >   if (iscntrl(c)) /* control char */ >   return 2; > +#ifdef SMALL >   if (isu8cont(c)) >   return 0; >   return 1; > +#else > + if (!isu8cont(c)) { > + if (c <= 0x7f) { > + len = 0; > + return 1; > + } > + > + if ((c & 0xf8) == 0xf0 && c < 0xf5) > + len = 3; > + else if ((c & 0xf0) == 0xe0) > + len = 2; > + else if ((c & 0xe0) == 0xc0 && c > 0xc1) > + len = 1; > + else { > + len = 0; > + return 0; > + } > + > + cnt = 0; > + memset(ch, 0, 5); > + ch[cnt++] = c; > + } else { > + ch[cnt++] = c; > + if (cnt > len) { > + u8_to_cpt(ch, &cpt); > + return is_fullwidth(cpt) ? 2 : 1; > + } > + } > + return 0; > +#endif >  } >   >  static void > @@ -1098,7 +1177,8 @@ >  static int >  x_transpose(int c) >  { > - char tmp; > + char rune1[4], rune2[4]; > + char *p1, *p2, *p; >   >   /* What transpose is meant to do seems to be up for debate. This >   * is a general summary of the options; the text is abcd with the > @@ -1124,25 +1204,55 @@ >   /* Gosling/Unipress emacs style: Swap two characters before > the >   * cursor, do not change cursor position >   */ > - x_bs(xcp[-1]); > - x_bs(xcp[-2]); > - x_zotc(xcp[-1]); > - x_zotc(xcp[-2]); > - tmp = xcp[-1]; > - xcp[-1] = xcp[-2]; > - xcp[-2] = tmp; > + p1 = xcp; > + do { > + x_bs((unsigned char) *--p1); > + } while (xbuf < p1 && (xcp - p1) < 5 && isu8cont(*p1)); > + > + if (p1 == xbuf) { > + x_e_putc(BEL); > + return KSTD; > + } > + > + p2 = p1; > + do { > + x_bs((unsigned char) *--p2); > + } while  (xbuf <= p2 && (p1 - p2) < 5 && isu8cont(*p2)); > + > + for (p = p1; p < xcp; p++) > + x_zotc(*p); > + for (p = p2; p < p1; p++) > + x_zotc(*p); > + > + memcpy(rune1, p1, xcp - p1); > + memcpy(rune2, p2, p1 - p2); > + memcpy(p2, rune1, xcp - p1); > + memcpy(p2 + (xcp - p1), rune2, p1 - p2); >   } else { >   /* GNU emacs style: Swap the characters before and under > the >   * cursor, move cursor position along one. 
>   */ > - x_bs(xcp[-1]); > - x_zotc(xcp[0]); > - x_zotc(xcp[-1]); > - tmp = xcp[-1]; > - xcp[-1] = xcp[0]; > - xcp[0] = tmp; > - x_bs(xcp[0]); > - x_goto(xcp + 1); > + p1 = xcp + 1; > + while (p1 < xep && *p1 && (p1 - xcp) <= 5 && isu8cont(*p1)) > + p1++; > + > + p2 = xcp; > + do { > + x_bs((unsigned char) *--p2); > + } while (xbuf <= p2 && (xcp - p2) < 5 && isu8cont(*p2)); > + > + for (p = xcp; p < p1; p++) > + x_zotc(*p); > + for (p = p2; p < xcp; p++) > + x_zotc(*p); > + > + memcpy(rune1, xcp, p1 - xcp); > + memcpy(rune2, p2, xcp - p2); > + memcpy(p2, rune1, p1 - xcp); > + memcpy(p2 + (p1 - xcp), rune2, xcp - p2); > + > + xcp = p1; > + x_goto(p1); >   } >   return KSTD; >  } > @@ -1804,6 +1914,11 @@ >   */ >   if ((xbp = xcp - (x_displen / 2)) < xbuf) >   xbp = xbuf; > + else { > + /* rewind to the last valid codepoint */ > + while (xbp > xbuf && isu8cont((unsigned char) *xbp)) > + xbp--; > + } >   xlp_valid = false; >   x_redraw(xx_cols); >   x_flush(); > @@ -1882,8 +1997,16 @@ >  } >   >  static void > -x_e_putc(int c) > +x_e_putc(int sc) >  { > +#ifndef SMALL > + static unsigned char ch[5] = { 0 }; > + static int len = 0, cnt = 0; > + unsigned long cpt; > +#endif > + unsigned char c; > + > + c = sc; >   if (c == '\r' || c == '\n') >   x_col = 0; >   if (x_col < xx_cols) { > @@ -1898,9 +2021,43 @@ >   x_col--; >   break; >   default: > +#ifdef SMALL >   if (!isu8cont(c)) >   x_col++; >   break; > +#else > + if (!isu8cont(c)) { > + if (c <= 0x7f) { > + x_col++; > + len = 0; > + break; > + } > + > + if ((c & 0xf8) == 0xf0 && c < 0xf5) > + len = 3; > + else if ((c & 0xf0) == 0xe0) > + len = 2; > + else if ((c & 0xe0) == 0xc0 && c > 0xc1) > + len = 1; > + else { > + len = 0; > + break; > + } > + > + cnt = 0; > + memset(ch, 0, 5); > + ch[cnt++] = c; > + } else { > + ch[cnt++] = c; > + if (cnt > len) { > + x_col++; > + u8_to_cpt(ch, &cpt); > + if (is_fullwidth(cpt)) > + x_col++; > + } > + } > + break; > +#endif >   } >   } >   if (x_adj_ok && (x_col < 0 || x_col >= 
(xx_cols - 2))) > Index: bin/ksh/unicode.c > =================================================================== > --- bin/ksh/unicode.c    (new file) > +++ bin/ksh/unicode.c    (working copy) > --- /dev/null 2024-12-17 11:54:03.396000088 +0800 > +++ bin/ksh/unicode.c 2024-12-17 13:25:38.050258915 +0800 > @@ -0,0 +1,162 @@ > +#include "unicode.h" > + > +#ifndef SMALL > + > +/* The following code was generated from EastAsianWidth.txt (Flag: W&F) > + * Reference: https://www.unicode.org/reports/tr11/tr11-6.html > + */ > + > +int is_fullwidth(unsigned long cpt) { > + if ((0x1100 <= cpt && cpt <= 0x115f) > +     || (0x231a <= cpt && cpt <= 0x231b) > +     || (0x2329 <= cpt && cpt <= 0x232a) > +     || (0x23e9 <= cpt && cpt <= 0x23ec) > +     || (cpt == 0x23f0) > +     || (cpt == 0x23f3) > +     || (0x25fd <= cpt && cpt <= 0x25fe) > +     || (0x2614 <= cpt && cpt <= 0x2615) > +     || (0x2630 <= cpt && cpt <= 0x2637) > +     || (0x2648 <= cpt && cpt <= 0x2653) > +     || (cpt == 0x267f) > +     || (0x268a <= cpt && cpt <= 0x268f) > +     || (cpt == 0x2693) > +     || (cpt == 0x26a1) > +     || (0x26aa <= cpt && cpt <= 0x26ab) > +     || (0x26bd <= cpt && cpt <= 0x26be) > +     || (0x26c4 <= cpt && cpt <= 0x26c5) > +     || (cpt == 0x26ce) > +     || (cpt == 0x26d4) > +     || (cpt == 0x26ea) > +     || (0x26f2 <= cpt && cpt <= 0x26f3) > +     || (cpt == 0x26f5) > +     || (cpt == 0x26fa) > +     || (cpt == 0x26fd) > +     || (cpt == 0x2705) > +     || (0x270a <= cpt && cpt <= 0x270b) > +     || (cpt == 0x2728) > +     || (cpt == 0x274c) > +     || (cpt == 0x274e) > +     || (0x2753 <= cpt && cpt <= 0x2755) > +     || (cpt == 0x2757) > +     || (0x2795 <= cpt && cpt <= 0x2797) > +     || (cpt == 0x27b0) > +     || (cpt == 0x27bf) > +     || (0x2b1b <= cpt && cpt <= 0x2b1c) > +     || (cpt == 0x2b50) > +     || (cpt == 0x2b55) > +     || (0x2e80 <= cpt && cpt <= 0x2e99) > +     || (0x2e9b <= cpt && cpt <= 0x2ef3) > +     || (0x2f00 <= cpt && cpt <= 0x2fd5) > +     || 
(0x2ff0 <= cpt && cpt <= 0x303e) > +     || (0x3041 <= cpt && cpt <= 0x3096) > +     || (0x3099 <= cpt && cpt <= 0x30ff) > +     || (0x3105 <= cpt && cpt <= 0x312f) > +     || (0x3131 <= cpt && cpt <= 0x318e) > +     || (0x3190 <= cpt && cpt <= 0x31e5) > +     || (0x31ef <= cpt && cpt <= 0x321e) > +     || (0x3220 <= cpt && cpt <= 0x3247) > +     || (0x3250 <= cpt && cpt <= 0xa48c) > +     || (0xa490 <= cpt && cpt <= 0xa4c6) > +     || (0xa960 <= cpt && cpt <= 0xa97c) > +     || (0xac00 <= cpt && cpt <= 0xd7a3) > +     || (0xf900 <= cpt && cpt <= 0xfaff) > +     || (0xfe10 <= cpt && cpt <= 0xfe19) > +     || (0xfe30 <= cpt && cpt <= 0xfe52) > +     || (0xfe54 <= cpt && cpt <= 0xfe66) > +     || (0xfe68 <= cpt && cpt <= 0xfe6b) > +     || (0xff01 <= cpt && cpt <= 0xff60) > +     || (0xffe0 <= cpt && cpt <= 0xffe6) > +     || (0x16fe0 <= cpt && cpt <= 0x16fe4) > +     || (0x16ff0 <= cpt && cpt <= 0x16ff1) > +     || (0x17000 <= cpt && cpt <= 0x187f7) > +     || (0x18800 <= cpt && cpt <= 0x18cd5) > +     || (0x18cff <= cpt && cpt <= 0x18d08) > +     || (0x1aff0 <= cpt && cpt <= 0x1aff3) > +     || (0x1aff5 <= cpt && cpt <= 0x1affb) > +     || (0x1affd <= cpt && cpt <= 0x1affe) > +     || (0x1b000 <= cpt && cpt <= 0x1b122) > +     || (cpt == 0x1b132) > +     || (0x1b150 <= cpt && cpt <= 0x1b152) > +     || (cpt == 0x1b155) > +     || (0x1b164 <= cpt && cpt <= 0x1b167) > +     || (0x1b170 <= cpt && cpt <= 0x1b2fb) > +     || (0x1d300 <= cpt && cpt <= 0x1d356) > +     || (0x1d360 <= cpt && cpt <= 0x1d376) > +     || (cpt == 0x1f004) > +     || (cpt == 0x1f0cf) > +     || (cpt == 0x1f18e) > +     || (0x1f191 <= cpt && cpt <= 0x1f19a) > +     || (0x1f200 <= cpt && cpt <= 0x1f202) > +     || (0x1f210 <= cpt && cpt <= 0x1f23b) > +     || (0x1f240 <= cpt && cpt <= 0x1f248) > +     || (0x1f250 <= cpt && cpt <= 0x1f251) > +     || (0x1f260 <= cpt && cpt <= 0x1f265) > +     || (0x1f300 <= cpt && cpt <= 0x1f320) > +     || (0x1f32d <= cpt && cpt <= 0x1f335) > +     || (0x1f337 <= 
cpt && cpt <= 0x1f37c) > +     || (0x1f37e <= cpt && cpt <= 0x1f393) > +     || (0x1f3a0 <= cpt && cpt <= 0x1f3ca) > +     || (0x1f3cf <= cpt && cpt <= 0x1f3d3) > +     || (0x1f3e0 <= cpt && cpt <= 0x1f3f0) > +     || (cpt == 0x1f3f4) > +     || (0x1f3f8 <= cpt && cpt <= 0x1f43e) > +     || (cpt == 0x1f440) > +     || (0x1f442 <= cpt && cpt <= 0x1f4fc) > +     || (0x1f4ff <= cpt && cpt <= 0x1f53d) > +     || (0x1f54b <= cpt && cpt <= 0x1f54e) > +     || (0x1f550 <= cpt && cpt <= 0x1f567) > +     || (cpt == 0x1f57a) > +     || (0x1f595 <= cpt && cpt <= 0x1f596) > +     || (cpt == 0x1f5a4) > +     || (0x1f5fb <= cpt && cpt <= 0x1f64f) > +     || (0x1f680 <= cpt && cpt <= 0x1f6c5) > +     || (cpt == 0x1f6cc) > +     || (0x1f6d0 <= cpt && cpt <= 0x1f6d2) > +     || (0x1f6d5 <= cpt && cpt <= 0x1f6d7) > +     || (0x1f6dc <= cpt && cpt <= 0x1f6df) > +     || (0x1f6eb <= cpt && cpt <= 0x1f6ec) > +     || (0x1f6f4 <= cpt && cpt <= 0x1f6fc) > +     || (0x1f7e0 <= cpt && cpt <= 0x1f7eb) > +     || (cpt == 0x1f7f0) > +     || (0x1f90c <= cpt && cpt <= 0x1f93a) > +     || (0x1f93c <= cpt && cpt <= 0x1f945) > +     || (0x1f947 <= cpt && cpt <= 0x1f9ff) > +     || (0x1fa70 <= cpt && cpt <= 0x1fa7c) > +     || (0x1fa80 <= cpt && cpt <= 0x1fa89) > +     || (0x1fa8f <= cpt && cpt <= 0x1fac6) > +     || (0x1face <= cpt && cpt <= 0x1fadc) > +     || (0x1fadf <= cpt && cpt <= 0x1fae9) > +     || (0x1faf0 <= cpt && cpt <= 0x1faf8) > +     || (0x20000 <= cpt && cpt <= 0x2fffd) > +     || (0x30000 <= cpt && cpt <= 0x3fffd)) > + return 1; > + > + return 0; > +} > + > +int u8_to_cpt(const char *buf, unsigned long *cpt) { > + const unsigned char *ubuf = buf; > + > + if (ubuf[0] <= 0x7F) { > + *cpt = ubuf[0]; > + return 1; > + } else if ((ubuf[0] & 0xE0) == 0xC0) { > + *cpt = ((ubuf[0] & 0x1F) << 6) | (ubuf[1] & 0x3F); > + return 2; > + } else if ((ubuf[0] & 0xF0) == 0xE0) { > + *cpt = ((ubuf[0] & 0x0F) << 12) > + | ((ubuf[1] & 0x3F) << 6) > + | (ubuf[2] & 0x3F); > + return 3; > + } else if 
((ubuf[0] & 0xF8) == 0xF0) { > + *cpt = ((ubuf[0] & 0x07) << 18) > + | ((ubuf[1] & 0x3F) << 12) > + | ((ubuf[2] & 0x3F) << 6) > + | (ubuf[3] & 0x3F); > + return 4; > + } > + > + return 0; > +} > + > +#endif > Index: bin/ksh/unicode.h > =================================================================== > --- bin/ksh/unicode.h    (new file) > +++ bin/ksh/unicode.h    (working copy) > --- /dev/null 2024-12-17 11:54:03.396000088 +0800 > +++ bin/ksh/unicode.h 2024-12-17 09:19:00.521730569 +0800 > @@ -0,0 +1,7 @@ > +#ifndef UNICODE_H > +#define UNICODE_H > + > +int is_fullwidth(unsigned long); > +int u8_to_cpt(const char *, unsigned long *); > + > +#endif /* UNICODE_H */ > > > > > >