From: Gong Zhile Subject: [REPOST] ksh: utf8 full width character support for emacs.c To: tech@openbsd.org Date: Sun, 16 Mar 2025 22:49:14 +0800 Full-width characters are commonly used in East Asian writing systems such as Chinese, Japanese and Korean. These characters take up twice the width of a normal ASCII character, but x_size counts them as only one column. When navigating between those characters in emacs line editing mode, the cursor loses track of its position and messes up the line, making input really difficult. I tried to make x_size count correctly by keeping static variables in the function and looking up the width in a table generated from ‘EastAsianWidth.txt’. Characters that mainly count as width 2 are: Kanji, Katakana, Hiragana, Hangul, Roman full-width characters, emojis, etc. Expected behavior (after patching): the cursor should land correctly while navigating between full-width characters, and line editing commands (like x_transpose) should perform correctly. Known issue: when heading off the screen with full-width chars, it fails to place the angle bracket correctly. 
(Not easy to deal with when full width characters crossing xx_cols) Tested on: rxvt-unicode, xterm Index: bin/ksh/Makefile =================================================================== RCS file: /cvs/src/bin/ksh/Makefile,v diff -u -r1.39 Makefile --- bin/ksh/Makefile 18 Jun 2018 17:03:58 -0000 1.39 +++ bin/ksh/Makefile 17 Dec 2024 05:26:43 -0000 @@ -7,7 +7,7 @@  SRCS= alloc.c c_ksh.c c_sh.c c_test.c c_ulimit.c edit.c emacs.c eval.c \   exec.c expr.c history.c io.c jobs.c lex.c mail.c main.c \   misc.c path.c shf.c syn.c table.c trap.c tree.c tty.c var.c \ - version.c vi.c + version.c vi.c unicode.c    WARNINGS=yes  DEFS= -DEMACS -DVI Index: bin/ksh/emacs.c =================================================================== RCS file: /cvs/src/bin/ksh/emacs.c,v diff -u -r1.90 emacs.c --- bin/ksh/emacs.c 21 Jun 2023 22:22:08 -0000 1.90 +++ bin/ksh/emacs.c 17 Dec 2024 05:26:43 -0000 @@ -29,6 +29,11 @@  #include "sh.h"  #include "edit.h"   +#ifndef SMALL +#include "unicode.h" +#else +#define x_size_rev x_size +#endif  static Area aedit;  #define AEDIT &aedit /* area for kill ring and macro defns */   @@ -126,6 +131,7 @@  static void x_goto(char *);  static void x_bs(int);  static int x_size_str(char *); +static int x_size_rev(int);  static int x_size(int);  static void x_zots(char *);  static void x_zotc(int); @@ -459,7 +465,7 @@   if (adj == x_adj_done) { /* has x_adjust() been called? 
*/   /* no */   for (cp = xlp; cp > xcp; ) - x_bs(*--cp); + x_bs((unsigned char)*--cp);   }     x_adj_ok = 1; @@ -552,7 +558,7 @@   x_adj_ok = 1;   xlp_valid = false;   for (cp = x_lastcp(); cp > xcp; ) - x_bs(*--cp); + x_bs((unsigned char)*--cp);     return;  } @@ -653,7 +659,7 @@  {   int i;   - i = x_size(c); + i = x_size_rev(c);   while (i--)   x_e_putc('\b');  } @@ -663,20 +669,93 @@  {   int size = 0;   while (*cp) - size += x_size(*cp++); + size += x_size((unsigned char)*cp++);   return size;  } +#ifndef SMALL +static int +x_size_rev(int c) +{ + static unsigned char ch[5] = { 0 }; + static int cnt = 3; + unsigned long cpt; + int w; + + if (c=='\t') + return 4; /* Kludge, tabs are always four spaces. */ + if (iscntrl(c)) /* control char */ + return 2; + + if (!isu8cont(c)) { + if (c <= 0x7f) { + cnt = 3; + return 1; + } + + ch[cnt] = c; + u8_to_cpt(ch + cnt, &cpt); + w = is_fullwidth(cpt) ? 2 : 1; + + cnt = 3; + memset(ch, 0, 4); + return w; + } else { + if (cnt <= 0) + return 0; + ch[cnt] = c; + cnt--; + } + + return 0; +} +#endif    static int  x_size(int c)  { +#ifndef SMALL + static unsigned char ch[5] = { 0 }; + static int len = 0, cnt = 0; + unsigned long cpt; +#endif   if (c=='\t')   return 4; /* Kludge, tabs are always four spaces. */   if (iscntrl(c)) /* control char */   return 2; +#ifdef SMALL   if (isu8cont(c))   return 0;   return 1; +#else + if (!isu8cont(c)) { + if (c <= 0x7f) { + len = 0; + return 1; + } + + if ((c & 0xf8) == 0xf0 && c < 0xf5) + len = 3; + else if ((c & 0xf0) == 0xe0) + len = 2; + else if ((c & 0xe0) == 0xc0 && c > 0xc1) + len = 1; + else { + len = 0; + return 0; + } + + cnt = 0; + memset(ch, 0, 5); + ch[cnt++] = c; + } else { + ch[cnt++] = c; + if (cnt > len) { + u8_to_cpt(ch, &cpt); + return is_fullwidth(cpt) ? 
2 : 1; + } + } + return 0; +#endif  }    static void @@ -1098,7 +1177,8 @@  static int  x_transpose(int c)  { - char tmp; + char rune1[4], rune2[4]; + char *p1, *p2, *p;     /* What transpose is meant to do seems to be up for debate. This   * is a general summary of the options; the text is abcd with the @@ -1124,25 +1204,55 @@   /* Gosling/Unipress emacs style: Swap two characters before the   * cursor, do not change cursor position   */ - x_bs(xcp[-1]); - x_bs(xcp[-2]); - x_zotc(xcp[-1]); - x_zotc(xcp[-2]); - tmp = xcp[-1]; - xcp[-1] = xcp[-2]; - xcp[-2] = tmp; + p1 = xcp; + do { + x_bs((unsigned char) *--p1); + } while (xbuf < p1 && (xcp - p1) < 5 && isu8cont(*p1)); + + if (p1 == xbuf) { + x_e_putc(BEL); + return KSTD; + } + + p2 = p1; + do { + x_bs((unsigned char) *--p2); + } while  (xbuf <= p2 && (p1 - p2) < 5 && isu8cont(*p2)); + + for (p = p1; p < xcp; p++) + x_zotc(*p); + for (p = p2; p < p1; p++) + x_zotc(*p); + + memcpy(rune1, p1, xcp - p1); + memcpy(rune2, p2, p1 - p2); + memcpy(p2, rune1, xcp - p1); + memcpy(p2 + (xcp - p1), rune2, p1 - p2);   } else {   /* GNU emacs style: Swap the characters before and under the   * cursor, move cursor position along one.   
*/ - x_bs(xcp[-1]); - x_zotc(xcp[0]); - x_zotc(xcp[-1]); - tmp = xcp[-1]; - xcp[-1] = xcp[0]; - xcp[0] = tmp; - x_bs(xcp[0]); - x_goto(xcp + 1); + p1 = xcp + 1; + while (p1 < xep && *p1 && (p1 - xcp) <= 5 && isu8cont(*p1)) + p1++; + + p2 = xcp; + do { + x_bs((unsigned char) *--p2); + } while (xbuf <= p2 && (xcp - p2) < 5 && isu8cont(*p2)); + + for (p = xcp; p < p1; p++) + x_zotc(*p); + for (p = p2; p < xcp; p++) + x_zotc(*p); + + memcpy(rune1, xcp, p1 - xcp); + memcpy(rune2, p2, xcp - p2); + memcpy(p2, rune1, p1 - xcp); + memcpy(p2 + (p1 - xcp), rune2, xcp - p2); + + xcp = p1; + x_goto(p1);   }   return KSTD;  } @@ -1804,6 +1914,11 @@   */   if ((xbp = xcp - (x_displen / 2)) < xbuf)   xbp = xbuf; + else { + /* rewind to the last valid codepoint */ + while (xbp > xbuf && isu8cont((unsigned char) *xbp)) + xbp--; + }   xlp_valid = false;   x_redraw(xx_cols);   x_flush(); @@ -1882,8 +1997,16 @@  }    static void -x_e_putc(int c) +x_e_putc(int sc)  { +#ifndef SMALL + static unsigned char ch[5] = { 0 }; + static int len = 0, cnt = 0; + unsigned long cpt; +#endif + unsigned char c; + + c = sc;   if (c == '\r' || c == '\n')   x_col = 0;   if (x_col < xx_cols) { @@ -1898,9 +2021,43 @@   x_col--;   break;   default: +#ifdef SMALL   if (!isu8cont(c))   x_col++;   break; +#else + if (!isu8cont(c)) { + if (c <= 0x7f) { + x_col++; + len = 0; + break; + } + + if ((c & 0xf8) == 0xf0 && c < 0xf5) + len = 3; + else if ((c & 0xf0) == 0xe0) + len = 2; + else if ((c & 0xe0) == 0xc0 && c > 0xc1) + len = 1; + else { + len = 0; + break; + } + + cnt = 0; + memset(ch, 0, 5); + ch[cnt++] = c; + } else { + ch[cnt++] = c; + if (cnt > len) { + x_col++; + u8_to_cpt(ch, &cpt); + if (is_fullwidth(cpt)) + x_col++; + } + } + break; +#endif   }   }   if (x_adj_ok && (x_col < 0 || x_col >= (xx_cols - 2))) Index: bin/ksh/unicode.c =================================================================== --- bin/ksh/unicode.c    (new file) +++ bin/ksh/unicode.c    (working copy) --- /dev/null 2024-12-17 
11:54:03.396000088 +0800 +++ bin/ksh/unicode.c 2024-12-17 13:25:38.050258915 +0800 @@ -0,0 +1,162 @@ +#include "unicode.h" + +#ifndef SMALL + +/* The following code was generated from EastAsianWidth.txt (Flag: W&F) + * Reference: https://www.unicode.org/reports/tr11/tr11-6.html + */ + +int is_fullwidth(unsigned long cpt) { + if ((0x1100 <= cpt && cpt <= 0x115f) +     || (0x231a <= cpt && cpt <= 0x231b) +     || (0x2329 <= cpt && cpt <= 0x232a) +     || (0x23e9 <= cpt && cpt <= 0x23ec) +     || (cpt == 0x23f0) +     || (cpt == 0x23f3) +     || (0x25fd <= cpt && cpt <= 0x25fe) +     || (0x2614 <= cpt && cpt <= 0x2615) +     || (0x2630 <= cpt && cpt <= 0x2637) +     || (0x2648 <= cpt && cpt <= 0x2653) +     || (cpt == 0x267f) +     || (0x268a <= cpt && cpt <= 0x268f) +     || (cpt == 0x2693) +     || (cpt == 0x26a1) +     || (0x26aa <= cpt && cpt <= 0x26ab) +     || (0x26bd <= cpt && cpt <= 0x26be) +     || (0x26c4 <= cpt && cpt <= 0x26c5) +     || (cpt == 0x26ce) +     || (cpt == 0x26d4) +     || (cpt == 0x26ea) +     || (0x26f2 <= cpt && cpt <= 0x26f3) +     || (cpt == 0x26f5) +     || (cpt == 0x26fa) +     || (cpt == 0x26fd) +     || (cpt == 0x2705) +     || (0x270a <= cpt && cpt <= 0x270b) +     || (cpt == 0x2728) +     || (cpt == 0x274c) +     || (cpt == 0x274e) +     || (0x2753 <= cpt && cpt <= 0x2755) +     || (cpt == 0x2757) +     || (0x2795 <= cpt && cpt <= 0x2797) +     || (cpt == 0x27b0) +     || (cpt == 0x27bf) +     || (0x2b1b <= cpt && cpt <= 0x2b1c) +     || (cpt == 0x2b50) +     || (cpt == 0x2b55) +     || (0x2e80 <= cpt && cpt <= 0x2e99) +     || (0x2e9b <= cpt && cpt <= 0x2ef3) +     || (0x2f00 <= cpt && cpt <= 0x2fd5) +     || (0x2ff0 <= cpt && cpt <= 0x303e) +     || (0x3041 <= cpt && cpt <= 0x3096) +     || (0x3099 <= cpt && cpt <= 0x30ff) +     || (0x3105 <= cpt && cpt <= 0x312f) +     || (0x3131 <= cpt && cpt <= 0x318e) +     || (0x3190 <= cpt && cpt <= 0x31e5) +     || (0x31ef <= cpt && cpt <= 0x321e) +     || (0x3220 <= cpt && cpt <= 0x3247) + 
    || (0x3250 <= cpt && cpt <= 0xa48c) +     || (0xa490 <= cpt && cpt <= 0xa4c6) +     || (0xa960 <= cpt && cpt <= 0xa97c) +     || (0xac00 <= cpt && cpt <= 0xd7a3) +     || (0xf900 <= cpt && cpt <= 0xfaff) +     || (0xfe10 <= cpt && cpt <= 0xfe19) +     || (0xfe30 <= cpt && cpt <= 0xfe52) +     || (0xfe54 <= cpt && cpt <= 0xfe66) +     || (0xfe68 <= cpt && cpt <= 0xfe6b) +     || (0xff01 <= cpt && cpt <= 0xff60) +     || (0xffe0 <= cpt && cpt <= 0xffe6) +     || (0x16fe0 <= cpt && cpt <= 0x16fe4) +     || (0x16ff0 <= cpt && cpt <= 0x16ff1) +     || (0x17000 <= cpt && cpt <= 0x187f7) +     || (0x18800 <= cpt && cpt <= 0x18cd5) +     || (0x18cff <= cpt && cpt <= 0x18d08) +     || (0x1aff0 <= cpt && cpt <= 0x1aff3) +     || (0x1aff5 <= cpt && cpt <= 0x1affb) +     || (0x1affd <= cpt && cpt <= 0x1affe) +     || (0x1b000 <= cpt && cpt <= 0x1b122) +     || (cpt == 0x1b132) +     || (0x1b150 <= cpt && cpt <= 0x1b152) +     || (cpt == 0x1b155) +     || (0x1b164 <= cpt && cpt <= 0x1b167) +     || (0x1b170 <= cpt && cpt <= 0x1b2fb) +     || (0x1d300 <= cpt && cpt <= 0x1d356) +     || (0x1d360 <= cpt && cpt <= 0x1d376) +     || (cpt == 0x1f004) +     || (cpt == 0x1f0cf) +     || (cpt == 0x1f18e) +     || (0x1f191 <= cpt && cpt <= 0x1f19a) +     || (0x1f200 <= cpt && cpt <= 0x1f202) +     || (0x1f210 <= cpt && cpt <= 0x1f23b) +     || (0x1f240 <= cpt && cpt <= 0x1f248) +     || (0x1f250 <= cpt && cpt <= 0x1f251) +     || (0x1f260 <= cpt && cpt <= 0x1f265) +     || (0x1f300 <= cpt && cpt <= 0x1f320) +     || (0x1f32d <= cpt && cpt <= 0x1f335) +     || (0x1f337 <= cpt && cpt <= 0x1f37c) +     || (0x1f37e <= cpt && cpt <= 0x1f393) +     || (0x1f3a0 <= cpt && cpt <= 0x1f3ca) +     || (0x1f3cf <= cpt && cpt <= 0x1f3d3) +     || (0x1f3e0 <= cpt && cpt <= 0x1f3f0) +     || (cpt == 0x1f3f4) +     || (0x1f3f8 <= cpt && cpt <= 0x1f43e) +     || (cpt == 0x1f440) +     || (0x1f442 <= cpt && cpt <= 0x1f4fc) +     || (0x1f4ff <= cpt && cpt <= 0x1f53d) +     || (0x1f54b <= cpt && cpt <= 
0x1f54e) +     || (0x1f550 <= cpt && cpt <= 0x1f567) +     || (cpt == 0x1f57a) +     || (0x1f595 <= cpt && cpt <= 0x1f596) +     || (cpt == 0x1f5a4) +     || (0x1f5fb <= cpt && cpt <= 0x1f64f) +     || (0x1f680 <= cpt && cpt <= 0x1f6c5) +     || (cpt == 0x1f6cc) +     || (0x1f6d0 <= cpt && cpt <= 0x1f6d2) +     || (0x1f6d5 <= cpt && cpt <= 0x1f6d7) +     || (0x1f6dc <= cpt && cpt <= 0x1f6df) +     || (0x1f6eb <= cpt && cpt <= 0x1f6ec) +     || (0x1f6f4 <= cpt && cpt <= 0x1f6fc) +     || (0x1f7e0 <= cpt && cpt <= 0x1f7eb) +     || (cpt == 0x1f7f0) +     || (0x1f90c <= cpt && cpt <= 0x1f93a) +     || (0x1f93c <= cpt && cpt <= 0x1f945) +     || (0x1f947 <= cpt && cpt <= 0x1f9ff) +     || (0x1fa70 <= cpt && cpt <= 0x1fa7c) +     || (0x1fa80 <= cpt && cpt <= 0x1fa89) +     || (0x1fa8f <= cpt && cpt <= 0x1fac6) +     || (0x1face <= cpt && cpt <= 0x1fadc) +     || (0x1fadf <= cpt && cpt <= 0x1fae9) +     || (0x1faf0 <= cpt && cpt <= 0x1faf8) +     || (0x20000 <= cpt && cpt <= 0x2fffd) +     || (0x30000 <= cpt && cpt <= 0x3fffd)) + return 1; + + return 0; +} + +int u8_to_cpt(const char *buf, unsigned long *cpt) { + const unsigned char *ubuf = buf; + + if (ubuf[0] <= 0x7F) { + *cpt = ubuf[0]; + return 1; + } else if ((ubuf[0] & 0xE0) == 0xC0) { + *cpt = ((ubuf[0] & 0x1F) << 6) | (ubuf[1] & 0x3F); + return 2; + } else if ((ubuf[0] & 0xF0) == 0xE0) { + *cpt = ((ubuf[0] & 0x0F) << 12) + | ((ubuf[1] & 0x3F) << 6) + | (ubuf[2] & 0x3F); + return 3; + } else if ((ubuf[0] & 0xF8) == 0xF0) { + *cpt = ((ubuf[0] & 0x07) << 18) + | ((ubuf[1] & 0x3F) << 12) + | ((ubuf[2] & 0x3F) << 6) + | (ubuf[3] & 0x3F); + return 4; + } + + return 0; +} + +#endif Index: bin/ksh/unicode.h =================================================================== --- bin/ksh/unicode.h    (new file) +++ bin/ksh/unicode.h    (working copy) --- /dev/null 2024-12-17 11:54:03.396000088 +0800 +++ bin/ksh/unicode.h 2024-12-17 09:19:00.521730569 +0800 @@ -0,0 +1,7 @@ +#ifndef UNICODE_H +#define UNICODE_H + +int 
is_fullwidth(unsigned long); +int u8_to_cpt(const char *, unsigned long *); + +#endif /* UNICODE_H */