Mailing List Archive

Hi list, Thank everyone for the attention on the issue and the discussion on unicode. To make it partially work in the simpliest way imho would be replacing is_fullwidth and u8_to_cpt with wcwidth(3) and u8_to_wc (with mbtowc). To be clear, that doesn't mean it will refactor the entire shell to wchar, it uses wchar routines only if an unicode occurred to calculate its width. Therefore, there won't be an performance issue of calling mbtowc(3) repeatedly as it only calls it on unicode instead of regular ascii chars. After testing, that works pretty good for me. However there are pros and cons on that solution, and here are some of my thoughts: 1. setlocale(3) call can't be avoided. It would be `setlocale(LC_ALL, "")' to make it default to the encoding in the user envirnoment. And it's quiet clear that's a must if one wants any i18n from the C library. 2. It behaves differently across locales. If C/POSIX (ASCII) or encodings other than utf-8 are choosen, unicode character correctly. [But if we fallback wcwidth of something failed to one column, it will matche the current (broken) behaviour.] But, one would find it parsing utf-8 anyways in other locales. Unfortunately, the issue also exists in the current ksh. (Since OpenBSD only supports ASCII and UTF8, I don't think that's really big of a problem.) 3. Also an user-perceived character may not be one rune. But like emojis, in a terminal, they aren't combined together (at least on urxvt with Unifont). So I think it's already really ambiguous to determine the actuall width of a cluster character. So in conclusion, I suggest switching to wcwidth(3) and mbtowc(3) despite the (maybe) disadvantage of calling setlocale(3). It solves the problem without introducing too much complexity and other defects. Any thoughts? --- bin/ksh/Makefile | 2 +- bin/ksh/emacs.c | 187 +++++++++++++++++++++++++++++++++++++++++----- bin/ksh/main.c | 8 ++ bin/ksh/unicode.c | 33 ++++++++ bin/ksh/unicode.h | 6 ++ 5 files changed, 216 insertions(+), 20 deletions(-) create mode 100644 bin/ksh/unicode.c create mode 100644 bin/ksh/unicode.h diff --git a/bin/ksh/Makefile b/bin/ksh/Makefile index 621c38bcf..d800154e4 100644 --- a/bin/ksh/Makefile +++ b/bin/ksh/Makefile @@ -7,7 +7,7 @@ LDADD+= -lcurses SRCS= alloc.c c_ksh.c c_sh.c c_test.c c_ulimit.c edit.c emacs.c eval.c \ exec.c expr.c history.c io.c jobs.c lex.c mail.c main.c \ misc.c path.c shf.c syn.c table.c trap.c tree.c tty.c var.c \ - version.c vi.c + version.c vi.c unicode.c WARNINGS=yes DEFS= -DEMACS -DVI diff --git a/bin/ksh/emacs.c b/bin/ksh/emacs.c index 3c9b0074d..813bc8e5a 100644 --- a/bin/ksh/emacs.c +++ b/bin/ksh/emacs.c @@ -29,6 +29,11 @@ #include "sh.h" #include "edit.h" +#ifndef SMALL +#include "unicode.h" +#else +#define x_size_rev x_size +#endif static Area aedit; #define AEDIT &aedit /* area for kill ring and macro defns */ @@ -126,6 +131,7 @@ static int x_fword(void); static void x_goto(char *); static void x_bs(int); static int x_size_str(char *); +static int x_size_rev(int); static int x_size(int); static void x_zots(char *); static void x_zotc(int); @@ -459,7 +465,7 @@ x_ins(char *s) if (adj == x_adj_done) { /* has x_adjust() been called? */ /* no */ for (cp = xlp; cp > xcp; ) - x_bs(*--cp); + x_bs((unsigned char)*--cp); } x_adj_ok = 1; @@ -653,7 +659,7 @@ x_bs(int c) { int i; - i = x_size(c); + i = x_size_rev(c); while (i--) x_e_putc('\b'); } @@ -663,20 +669,89 @@ x_size_str(char *cp) { int size = 0; while (*cp) - size += x_size(*cp++); + size += x_size((unsigned char)*cp++); return size; } +#ifndef SMALL +static int +x_size_rev(int c) +{ + static unsigned char ch[5] = { 0 }; + static int cnt = 3; + int w; + + if (c=='\t') + return 4; /* Kludge, tabs are always four spaces. */ + if (iscntrl(c)) /* control char */ + return 2; + + if (!isu8cont(c)) { + if (c <= 0x7f) { + cnt = 3; + return 1; + } + + ch[cnt] = c; + w = u8width(ch + cnt); + + cnt = 3; + memset(ch, 0, 4); + return w; + } else { + if (cnt <= 0) + return 0; + ch[cnt] = c; + cnt--; + } + + return 0; +} +#endif + static int x_size(int c) { +#ifndef SMALL + static unsigned char ch[5] = { 0 }; + static int len = 0, cnt = 0; +#endif if (c=='\t') return 4; /* Kludge, tabs are always four spaces. */ if (iscntrl(c)) /* control char */ return 2; +#ifdef SMALL if (isu8cont(c)) return 0; return 1; +#else + if (!isu8cont(c)) { + if (c <= 0x7f) { + len = 0; + return 1; + } + + if ((c & 0xf8) == 0xf0 && c < 0xf5) + len = 3; + else if ((c & 0xf0) == 0xe0) + len = 2; + else if ((c & 0xe0) == 0xc0 && c > 0xc1) + len = 1; + else { + len = 0; + return 0; + } + + cnt = 0; + memset(ch, 0, 5); + ch[cnt++] = c; + } else { + ch[cnt++] = c; + if (cnt > len) + return u8width(ch); + } + return 0; +#endif } static void @@ -1099,6 +1174,8 @@ static int x_transpose(int c) { char tmp; + char rune1[4], rune2[4]; + char *p1, *p2, *p; /* What transpose is meant to do seems to be up for debate. This * is a general summary of the options; the text is abcd with the @@ -1124,25 +1201,55 @@ x_transpose(int c) /* Gosling/Unipress emacs style: Swap two characters before the * cursor, do not change cursor position */ - x_bs(xcp[-1]); - x_bs(xcp[-2]); - x_zotc(xcp[-1]); - x_zotc(xcp[-2]); - tmp = xcp[-1]; - xcp[-1] = xcp[-2]; - xcp[-2] = tmp; + p1 = xcp; + do { + x_bs((unsigned char) *--p1); + } while (xbuf < p1 && (xcp - p1) < 5 && isu8cont(*p1)); + + if (p1 == xbuf) { + x_e_putc(BEL); + return KSTD; + } + + p2 = p1; + do { + x_bs((unsigned char) *--p2); + } while (xbuf <= p2 && (p1 - p2) < 5 && isu8cont(*p2)); + + for (p = p1; p < xcp; p++) + x_zotc(*p); + for (p = p2; p < p1; p++) + x_zotc(*p); + + memcpy(rune1, p1, xcp - p1); + memcpy(rune2, p2, p1 - p2); + memcpy(p2, rune1, xcp - p1); + memcpy(p2 + (xcp - p1), rune2, p1 - p2); } else { /* GNU emacs style: Swap the characters before and under the * cursor, move cursor position along one. */ - x_bs(xcp[-1]); - x_zotc(xcp[0]); - x_zotc(xcp[-1]); - tmp = xcp[-1]; - xcp[-1] = xcp[0]; - xcp[0] = tmp; - x_bs(xcp[0]); - x_goto(xcp + 1); + p1 = xcp + 1; + while (p1 < xep && *p1 && (p1 - xcp) <= 5 && isu8cont(*p1)) + p1++; + + p2 = xcp; + do { + x_bs((unsigned char) *--p2); + } while (xbuf <= p2 && (xcp - p2) < 5 && isu8cont(*p2)); + + for (p = xcp; p < p1; p++) + x_zotc(*p); + for (p = p2; p < xcp; p++) + x_zotc(*p); + + memcpy(rune1, xcp, p1 - xcp); + memcpy(rune2, p2, xcp - p2); + memcpy(p2, rune1, p1 - xcp); + memcpy(p2 + (p1 - xcp), rune2, xcp - p2); + + xcp = p1; + x_goto(p1); } return KSTD; } @@ -1804,6 +1911,11 @@ x_adjust(void) */ if ((xbp = xcp - (x_displen / 2)) < xbuf) xbp = xbuf; + else { + /* rewind to the last valid codepoint */ + while (xbp > xbuf && isu8cont((unsigned char) *xbp)) + xbp--; + } xlp_valid = false; x_redraw(xx_cols); x_flush(); @@ -1882,8 +1994,15 @@ x_e_getu8(char *buf, int off) } static void -x_e_putc(int c) +x_e_putc(int sc) { +#ifndef SMALL + static unsigned char ch[5] = { 0 }; + static int len = 0, cnt = 0; +#endif + unsigned char c; + + c = sc; if (c == '\r' || c == '\n') x_col = 0; if (x_col < xx_cols) { @@ -1898,9 +2017,39 @@ x_e_putc(int c) x_col--; break; default: +#ifdef SMALL if (!isu8cont(c)) x_col++; break; +#else + if (!isu8cont(c)) { + if (c <= 0x7f) { + x_col++; + len = 0; + break; + } + + if ((c & 0xf8) == 0xf0 && c < 0xf5) + len = 3; + else if ((c & 0xf0) == 0xe0) + len = 2; + else if ((c & 0xe0) == 0xc0 && c > 0xc1) + len = 1; + else { + len = 0; + break; + } + + cnt = 0; + memset(ch, 0, 5); + ch[cnt++] = c; + } else { + ch[cnt++] = c; + if (cnt > len) + x_col += u8width(ch); + } + break; +#endif } } if (x_adj_ok && (x_col < 0 || x_col >= (xx_cols - 2))) diff --git a/bin/ksh/main.c b/bin/ksh/main.c index b011b22bf..a269c5364 100644 --- a/bin/ksh/main.c +++ b/bin/ksh/main.c @@ -15,6 +15,10 @@ #include <string.h> #include <unistd.h> +#ifndef SMALL +#include <locale.h> +#endif + #include "sh.h" extern char **environ; @@ -152,6 +156,10 @@ main(int argc, char *argv[]) struct env env; pid_t ppid; +#ifndef SMALL + setlocale(LC_ALL, ""); +#endif + kshname = argv[0]; if (issetugid()) { /* could later drop privileges */ diff --git a/bin/ksh/unicode.c b/bin/ksh/unicode.c new file mode 100644 index 000000000..a1b3c4a39 --- /dev/null +++ b/bin/ksh/unicode.c @@ -0,0 +1,33 @@ +#include <stddef.h> +#include <stdlib.h> +#include <wchar.h> + +#include "unicode.h" + +#ifndef SMALL + +int +u8width(const char *buf) { + const unsigned char *ubuf = buf; + wchar_t wc; + int u8len; + + u8len = 0; + if (ubuf[0] <= 0x7F) + u8len = 1; + else if ((ubuf[0] & 0xE0) == 0xC0) + u8len = 2; + else if ((ubuf[0] & 0xF0) == 0xE0) + u8len = 3; + else if ((ubuf[0] & 0xF8) == 0xF0) + u8len = 4; + + if (mbtowc(&wc, buf, u8len) == u8len) + return wcwidth(wc); + else { + mbtowc(&wc, NULL, 0); + return 1; + } +} + +#endif diff --git a/bin/ksh/unicode.h b/bin/ksh/unicode.h new file mode 100644 index 000000000..61ac18cce --- /dev/null +++ b/bin/ksh/unicode.h @@ -0,0 +1,6 @@ +#ifndef UNICODE_H +#define UNICODE_H + +int u8width(const char *); + +#endif /* UNICODE_H */ -- 2.46.1 --- Gong Zhile (William Goodspeed)

- 2025-04-07 02:48 ori@eigenstate.org:
  [REPOST] ksh: utf8 full width character support for emacs.c

2025-03-21 10:44 Ingo Schwarze:

[REPOST] ksh: utf8 full width character support for emacs.c

2025-04-04 15:36 William Goodspeed:
[REPOST] ksh: utf8 full width character support for emacs.c
- 2025-04-08 06:39 Christian Schulte:
  [REPOST] ksh: utf8 full width character support for emacs.c
- - 2025-04-13 14:30 William Goodspeed:
    [REPOST] ksh: utf8 full width character support for emacs.c