/*
 *	Functions which make use of the unicode facts in unifacts.c
 *
 *	Copyright
 *		(C) 2015 Joseph H. Allen
 *
 *	This file is part of JOE (Joe's Own Editor)
 */

#include "types.h"

/* Case folding map: built by joe_iswinit() from fold_table / fold_repl.
 * A lookup yields either a small offset to add to the character, or
 * FOLDMAGIC plus an index into fold_repl for multi-character replacements. */

struct Rtree rtree_fold[1];

/* Convert UTF-32 string to lowercase for case folding
 *
 *   d    destination buffer
 *   len  capacity of d in elements; one element is reserved for the
 *        terminating 0, so len == 0 is a fatal error
 *   s    zero-terminated source string
 *
 * Output is silently truncated when the destination runs out of room.
 * The result is always zero-terminated.  Returns d.
 */

int *lowerize(int *d, ptrdiff_t len, const int *s)
{
	int *out = d;
	ptrdiff_t room;

	if (len == 0) {
		fprintf(stderr, "lowerize called with len == 0\n");
		exit(1);
	}

	/* Keep one slot back for the terminator */
	room = len - 1;

	while (room != 0 && *s != 0) {
		int ch = *s++;
		int fold = rmap_lookup(rtree_fold, ch, 0);

		if (fold < FOLDMAGIC) {
			/* Single character replacement: fold is an offset */
			*out++ = ch + fold;
			--room;
		} else {
			/* String replacement: up to three characters, a 0
			 * entry ends the replacement early */
			int which = fold - FOLDMAGIC;
			int k;

			*out++ = fold_repl[which][0];
			--room;
			for (k = 1; k <= 2 && room != 0 && fold_repl[which][k] != 0; ++k) {
				*out++ = fold_repl[which][k];
				--room;
			}
		}
	}

	*out = 0;
	return d;
}

/* Get a character class containing all characters matching a particular
 * unicode category or block.
 *
 * Classes are built on first request and cached in unicat_hash, keyed by a
 * copy of the category name.  Returns 0 when no entry of unicat[] matches;
 * such misses are not cached.
 */

HASH *unicat_hash;

struct Cclass *unicode(const char *cat)
{
	struct Cclass *cls;
	int i;

	if (!unicat_hash)
		unicat_hash = htmk(256);

	/* Already built? */
	cls = (struct Cclass *)htfind(unicat_hash, cat);
	if (cls)
		return cls;

	cls = (struct Cclass *)joe_malloc(SIZEOF(struct Cclass));
	cclass_init(cls);

	for (i = 0; unicat[i].name; ++i) {
		/* Exact category name... */
		int wanted = !zcmp(unicat[i].name, cat);

		/* ...or a one letter name selecting every two letter
		 * category that starts with it: 'L' matches 'Ll', 'Lu', etc. */
		if (!wanted)
			wanted = cat[0] == unicat[i].name[0] && !cat[1]
			         && unicat[i].name[1] && !unicat[i].name[2];

		if (wanted)
			cclass_merge(cls, unicat[i].intervals, unicat[i].len);
	}

	if (!cls->len) {
		/* Nothing matched */
		joe_free(cls);
		return 0;
	}

	cclass_opt(cls);
	htadd(unicat_hash, zdup(cat), cls);
	return cls;
}

/* iswxxx functions
 *
 * Each one is a membership test against the class of the same name, which
 * is filled in by joe_iswinit().  The charmap argument is not used.
 */

struct Cclass cclass_upper[1];

int joe_iswupper(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_upper, ch);
}

struct Cclass cclass_lower[1];

int joe_iswlower(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_lower, ch);
}

struct Cclass cclass_alpha[1];

int joe_iswalpha(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_alpha, ch);
}

/* Name start characters and their complement */

struct Cclass cclass_alpha_[1];
struct Cclass cclass_notalpha_[1];

int joe_iswalpha_(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_alpha_, ch);
}

struct Cclass cclass_alnum[1];

int joe_iswalnum(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_alnum, ch);
}

/* Name continuation characters and their complement */

struct Cclass cclass_alnum_[1];
struct Cclass cclass_notalnum_[1];

int joe_iswalnum_(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_alnum_, ch);
}

struct Cclass cclass_digit[1];

int joe_iswdigit(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_digit, ch);
}

struct Cclass cclass_notdigit[1];

struct Cclass cclass_xdigit[1];

int joe_iswxdigit(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_xdigit, ch);
}

struct Cclass cclass_punct[1];

int joe_iswpunct(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_punct, ch);
}

struct Cclass cclass_space[1];

int joe_iswspace(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_space, ch);
}

struct Cclass cclass_notspace[1];

struct Cclass cclass_blank[1];

int joe_iswblank(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_blank, ch);
}

struct Cclass cclass_ctrl[1];

int joe_iswctrl(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_ctrl, ch);
}

struct Cclass cclass_graph[1];

int joe_iswgraph(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_graph, ch);
}

struct Cclass cclass_print[1];

int joe_iswprint(struct charmap *foo, int ch)
{
	return cclass_lookup(cclass_print, ch);
}

struct Cclass cclass_word[1];
struct Cclass cclass_notword[1];

/* Case conversion: the maps hold the offset to add to a character; a
 * character with no entry gets the default offset 0 and is returned as is. */

struct Rtree rtree_tolower[1];

int joe_towlower(struct charmap *foo, int ch)
{
	return ch + rmap_lookup(rtree_tolower, ch, 0);
}

struct Rtree rtree_toupper[1];

int joe_towupper(struct charmap *foo, int ch)
{
	return ch + rmap_lookup(rtree_toupper, ch, 0);
}

/* Combining characters */

struct Cclass cclass_combining[1];

/* Double-width characters */

struct Cclass cclass_double[1];

/* This is how ASCII is classified in UNICODE:

	Cc: 00 - 1F, 7F
	Zs: 20
	Po: ! " # % & ' * , . / : ; ? @ \
	Sc: $
	Pd: -
	Ps: ( [ {
	Pe: ) ] }
	Sm: + < = > | ~
	Sk: ^ `
	Pc: _
	Nd: 0 - 9
	Lu: A - Z
	Ll: a - z

Notes:
	For "blank", you probably want Zs and tab
	For "whitespace", you probably want Zs, tab, newline, carriage return and form-feed
	For "identifier start", you probably want letters, Pc and maybe Sc
	For "identifier rest", you probably want letters, digits, Pc and maybe Sc

Convenient character classes:
	see http://www.w3.org/TR/xml11/#NT-NameStartChar
	see http://www.w3.org/TR/xmlschema11-2/#regexs

	cclass_digit:    \d  Digit: same as \p{Nd}
	cclass_notdigit: \D  opposite

	cclass_space:    \s  space, tab, newline, return [JOE also includes formfeed!]
	cclass_notspace: \S  opposite

	\i  NameStartChar : A-Z _ a-z C0-D6 D8-F6 F8-2FF 370-37D 37F-1FFF 200C-200D 2070-218F 2C00-2FEF 3001-D7FF F900-FDCF FDF0-FFFD 10000-EFFFF
	\I  opposite

	\c  NameChar \i - .
0-9 B7 0300-036F 203F-2040 \C opposite cclass_word: \w word character: [\x{0}-\x{10ffff}]-[\p{P}\p{Z}\p{C}] cclass_notword: \W opposite */ void joe_iswinit() { int x; /* Upper */ cclass_init(cclass_upper); cclass_union(cclass_upper, unicode("Lu")); cclass_opt(cclass_upper); /* Lower */ cclass_init(cclass_lower); cclass_union(cclass_lower, unicode("Ll")); cclass_opt(cclass_lower); /* Alphabetical */ cclass_init(cclass_alpha); cclass_union(cclass_alpha, unicode("L")); cclass_union(cclass_alpha, unicode("M")); cclass_opt(cclass_alpha); /* Alphabetical + underscores (name start character) */ /* Java has: isJavaIdentifierStart: \p{Nl} \p{L} \p{Pc} \p{Sc} */ /* Unicode has ID_Start: L + Nl + other_id_start - pattern_syntax - pattern_whitespace from DerivedCoreProperties.txt: see: http://unicode.org/reports/tr31/ and: http://unicode.org/reports/tr31/tr31-1.html#Pattern_Syntax */ /* Used for XML and \i */ cclass_init(cclass_alpha_); cclass_union(cclass_alpha_, unicode("L")); cclass_union(cclass_alpha_, unicode("Pc")); /* cclass_union(cclass_alpha_, unicode("Sc")); */ cclass_union(cclass_alpha_, unicode("Nl")); cclass_opt(cclass_alpha_); /* \I */ cclass_init(cclass_notalpha_); cclass_union(cclass_notalpha_, cclass_alpha_); cclass_inv(cclass_notalpha_); cclass_opt(cclass_notalpha_); /* Alphanumeric */ cclass_init(cclass_alnum); cclass_union(cclass_alnum, unicode("L")); cclass_union(cclass_alnum, unicode("M")); cclass_union(cclass_alnum, unicode("N")); cclass_opt(cclass_alnum); /* Alphanumeric + underscores (name continuation character) */ /* Java has: isJavaIdentifierPart: isJavaIdentifierStart \p{Mn} \p{Mc} \p{Nd} ignorable */ /* Ignorable: 0x00-0x08, 0x0e-0x1b, 0x7f-0x9F */ /* Unicode has ID_Continue: ID_Start + Mn + Mc + Nd + Pc + Other_ID_Continue - Pattern_syntax - Pattern_whitespace */ /* Used for XML */ /* \c */ cclass_init(cclass_alnum_); cclass_union(cclass_alnum_, unicode("L")); cclass_union(cclass_alnum_, unicode("Pc")); /* cclass_union(cclass_alpha_, 
unicode("Sc")); */ cclass_union(cclass_alpha_, unicode("Nl")); cclass_union(cclass_alnum_, unicode("Mn")); cclass_union(cclass_alnum_, unicode("Mc")); cclass_union(cclass_alnum_, unicode("Nd")); cclass_add(cclass_alnum_, 0x200c, 0x200d); cclass_opt(cclass_alnum_); /* \C */ cclass_init(cclass_notalnum_); cclass_union(cclass_notalnum_, cclass_alnum_); cclass_inv(cclass_notalnum_); cclass_opt(cclass_notalnum_); /* Digit */ cclass_init(cclass_digit); cclass_union(cclass_digit, unicode("Nd")); cclass_opt(cclass_digit); /* Not a digit */ cclass_init(cclass_notdigit); cclass_union(cclass_notdigit, cclass_digit); cclass_inv(cclass_notdigit); cclass_opt(cclass_notdigit); /* Hex digit */ cclass_init(cclass_xdigit); cclass_union(cclass_xdigit, unicode("Nd")); cclass_add(cclass_xdigit, 'a', 'f'); cclass_add(cclass_xdigit, 'A', 'F'); cclass_opt(cclass_xdigit); /* Punctuation */ cclass_init(cclass_punct); cclass_union(cclass_punct, unicode("P")); cclass_opt(cclass_punct); /* Whitespace */ cclass_init(cclass_space); cclass_add(cclass_space, '\t', '\t'); cclass_add(cclass_space, '\r', '\r'); cclass_add(cclass_space, '\n', '\n'); cclass_add(cclass_space, '\f', '\f'); cclass_union(cclass_space, unicode("Z")); cclass_opt(cclass_space); /* Not whitespace */ cclass_init(cclass_notspace); cclass_union(cclass_notspace, cclass_space); cclass_inv(cclass_notspace); cclass_opt(cclass_notspace); /* Blanks: tab included */ cclass_init(cclass_blank); cclass_add(cclass_blank, '\t', '\t'); cclass_union(cclass_blank, unicode("Zs")); cclass_opt(cclass_blank); /* Control characters */ cclass_init(cclass_ctrl); cclass_union(cclass_ctrl, unicode("C")); cclass_union(cclass_ctrl, unicode("Zl")); cclass_union(cclass_ctrl, unicode("Zp")); cclass_opt(cclass_ctrl); /* Printable characters (kind of inverse of control characters) */ cclass_init(cclass_print); cclass_union(cclass_print, unicode("L")); cclass_union(cclass_print, unicode("M")); cclass_union(cclass_print, unicode("S")); cclass_union(cclass_print, 
unicode("N")); cclass_union(cclass_print, unicode("P")); cclass_union(cclass_print, unicode("Zs")); cclass_opt(cclass_print); /* Graphical characters (no spaces) */ cclass_init(cclass_graph); cclass_union(cclass_graph, unicode("L")); cclass_union(cclass_graph, unicode("M")); cclass_union(cclass_graph, unicode("S")); cclass_union(cclass_graph, unicode("N")); cclass_union(cclass_graph, unicode("P")); cclass_opt(cclass_graph); /* Not word characters */ cclass_init(cclass_notword); cclass_union(cclass_notword, unicode("C")); cclass_union(cclass_notword, unicode("P")); cclass_union(cclass_notword, unicode("Z")); cclass_opt(cclass_notword); /* Word characters */ cclass_init(cclass_word); cclass_union(cclass_word, cclass_notword); cclass_inv(cclass_word); cclass_opt(cclass_word); /* Convert to uppercase */ rmap_init(rtree_toupper); for (x = 0; toupper_table[x].first; ++x) { rmap_add(rtree_toupper, toupper_table[x].first, toupper_table[x].last, toupper_cvt[x] - toupper_table[x].first, 0); } rmap_opt(rtree_toupper); /* Convert to lowercase */ rmap_init(rtree_tolower); for (x = 0; tolower_table[x].first; ++x) { rmap_add(rtree_tolower, tolower_table[x].first, tolower_table[x].last, tolower_cvt[x] - tolower_table[x].first, 0); } rmap_opt(rtree_tolower); /* Set up fold table */ rmap_init(rtree_fold); for (x = 0; fold_table[x].first; ++x) { if (fold_repl[x][1]) rmap_add(rtree_fold, fold_table[x].first, fold_table[x].last, FOLDMAGIC + x, 0); else rmap_add(rtree_fold, fold_table[x].first, fold_table[x].last, fold_repl[x][0] - fold_table[x].first, 0); } rmap_opt(rtree_fold); /* Combining characters: for JOE this means - we don't account for their width, they are merged with a start character - a start plus combining contribute to the appearance of the character, so we update the character (resend the whole sequence to the terminal) if any of them change */ cclass_init(cclass_combining); cclass_union(cclass_combining, unicode("Me")); cclass_union(cclass_combining, unicode("Mn")); 
cclass_add(cclass_combining, 0x1160, 0x11FF); /* These act like combining characters */ cclass_opt(cclass_combining); /* Double width characters */ cclass_init(cclass_double); for (x = 0; width_table[x].first; ++x) cclass_add(cclass_double, width_table[x].first, width_table[x].last); cclass_opt(cclass_double); } /* Digit value of any \p{Nd} digit */ /* Note that intervals in Nd table are not merged! */ static struct unicat *digtable = 0; int digval(int ch) { if (!digtable) { int x; for (x = 0; unicat[x].name; ++x) if (!zcmp(unicat[x].name, "Nd")) { digtable = &unicat[x]; break; } } if (digtable) { ptrdiff_t idx = interval_test(digtable->intervals, digtable->len, ch); if (idx != -1) { return ch - digtable->intervals[idx].first; } } return -1; } /* * This is an implementation of wcwidth() and wcswidth() (defined in * IEEE Std 1002.1-2001) for Unicode. * * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html * * In fixed-width output devices, Latin characters all occupy a single * "cell" position of equal width, whereas ideographic CJK characters * occupy two such cells. Interoperability between terminal-line * applications and (teletype-style) character terminals using the * UTF-8 encoding requires agreement on which character should advance * the cursor by how many cell positions. No established formal * standards exist at present on which Unicode character shall occupy * how many cell positions on character terminals. These routines are * a first attempt of defining such behavior based on simple rules * applied to data provided by the Unicode Consortium. * * For some graphical characters, the Unicode standard explicitly * defines a character-cell width via the definition of the East Asian * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. * In all these cases, there is no ambiguity about which width a * terminal shall use. 
For characters in the East Asian Ambiguous (A) * class, the width choice depends purely on a preference of backward * compatibility with either historic CJK or Western practice. * Choosing single-width for these characters is easy to justify as * the appropriate long-term solution, as the CJK practice of * displaying these characters as double-width comes from historic * implementation simplicity (8-bit encoded characters were displayed * single-width and 16-bit ones double-width, even for Greek, * Cyrillic, etc.) and not any typographic considerations. * * Much less clear is the choice of width for the Not East Asian * (Neutral) class. Existing practice does not dictate a width for any * of these characters. It would nevertheless make sense * typographically to allocate two character cells to characters such * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be * represented adequately with a single-width glyph. The following * routines at present merely assign a single-cell width to all * neutral characters, in the interest of simplicity. This is not * entirely satisfactory and should be reconsidered before * establishing a formal standard in this area. At the moment, the * decision which Not East Asian (Neutral) characters should be * represented by double-width glyphs cannot yet be answered by * applying a simple rule from the Unicode database content. Setting * up a proper standard for the behavior of UTF-8 character terminals * will require a careful analysis not only of each Unicode character, * but also of each presentation form, something the author of these * routines has avoided to do so far. * * http://www.unicode.org/unicode/reports/tr11/ * * Markus Kuhn -- 2007-05-26 (Unicode 5.0) * * Permission to use, copy, modify, and distribute this software * for any purpose and without fee is hereby granted. The author * disclaims all warranties with regard to this software. 
* * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c */ /* The following two functions define the column width of an ISO 10646 * character as follows: * * - The null character (U+0000) has a column width of 0. * * - Other C0/C1 control characters and DEL will lead to a return * value of -1. * * - Non-spacing and enclosing combining characters (general * category code Mn or Me in the Unicode database) have a * column width of 0. * * - SOFT HYPHEN (U+00AD) has a column width of 1. * * - Other format characters (general category code Cf in the Unicode * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. * * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) * have a column width of 0. * * - Spacing characters in the East Asian Wide (W) or East Asian * Full-width (F) category as defined in Unicode Technical * Report #11 have a column width of 2. * * - All remaining characters (including all printable * ISO 8859-1 and WGL4 characters, Unicode control characters, * etc.) have a column width of 1. * * This implementation assumes that wchar_t characters are encoded * in ISO 10646. 
*/ /* Modified for JOE: returns printed width of control and other non-printable characters */ int joe_wcwidth(int wide,int ucs) { /* If ANSI color sequences exist (ansi mode), they are 0 width */ if (ucs & ANSI_BIT) return 0; /* If terminal is not UTF-8 or file is not UTF-8: width is 1 */ /* FIXME */ if (!locale_map->type || !wide) return 1; /* Negative characters are characters in range 128 - 255 converted from signed char to int */ if (ucs < 0) ucs += 256; /* Printed width of non-printable characters */ if (!cclass_lookup(cclass_print, ucs)) { if (ucs < 0x80) /* Ctrl-A is printed as underlined A in JOE */ return 1; else if (ucs < 0x100) /* */ return 4; else if (ucs < 0x1000) /* */ return 5; else if (ucs < 0x10000) /* */ return 6; else if (ucs < 0x100000) /* */ return 7; else if (ucs < 0x1000000) /* */ return 8; else if (ucs < 0x10000000) /* */ return 9; else /* */ return 10; } /* Combining characters are merged with their start character so they have no width */ if (cclass_lookup(cclass_combining, ucs)) return 0; /* Some characters are double-width */ if (cclass_lookup(cclass_double, ucs)) return 2; return 1; } /* Width of a string: was in qw.c. Do we need both this and txtwidth? This one does not account for tabs. */ ptrdiff_t joe_wcswidth(struct charmap *map,const char *s, ptrdiff_t len) { if (!map->type) { return len; } else { int width = 0; while (len) { int c = utf8_decode_fwrd(&s, &len); if (c >= 0) { width += joe_wcwidth(1, c); } else ++width; } return width; } } /* Return true if c is a control character which should not be sent directly * to the terminal, but should instead be displayed like <2028>. joe_wcwidth gives * the displayed width of these control characters. 
*/ int unictrl(int ucs) { return !cclass_lookup(cclass_print, ucs); } /* Copy character from one string to another */ void copy_c(char **d, const char **s) { if (locale_map->type) { *d += utf8_encode(*d, utf8_decode_fwrd(s, NULL)); } else if (**s) { **d = **s; (*s)++; (*d)++; } } /* Get next character from string and advance it, locale dependent */ int fwrd_c(struct charmap *map, const char **s) { if (map->type) return utf8_decode_fwrd(s, NULL); else { int c = *(const unsigned char *)*s; *s = *s + 1; return c; } }