/* * Regular expression subroutines * Copyright * (C) 1992 Joseph H. Allen * * This file is part of JOE (Joe's Own Editor) */ #include "types.h" /* Parse one character. It can be UTF-8 if utf8 is set. b has pointer to string length or NULL for zero-terminated strings */ /* Returns the character or -256 for a category (in which case *cat is filled in if it's not NULL) */ /* Returns NUL if a/b is at end of string */ /* Returns UTF8_BAD or UTF8_INCOMPLETE for bad UTF-8 sequences */ int escape(int utf8, const char **a, ptrdiff_t *b, struct Cclass **cat) { int c = 0; const char *s = *a; ptrdiff_t l; if (cat) *cat = 0; if (b) l = *b; else l = -1; if ((b ? l >= 2 : s[1]) && *s == '\\') { /* Backslash.. */ ++s; --l; switch (*s) { case 'i': { /* Identifier start */ ++s; --l; c = -256; if (cat) *cat = cclass_alpha_; break; } case 'I': { /* Not identifier start */ ++s; --l; c = -256; if (cat) *cat = cclass_notalpha_; break; } case 'c': { /* Identifier continue */ ++s; --l; c = -256; if (cat) *cat = cclass_alnum_; break; } case 'C': { /* Not identifier continue */ ++s; --l; c = -256; if (cat) *cat = cclass_notalnum_; break; } case 'w': { /* Word */ ++s; --l; c = -256; if (cat) *cat = cclass_word; break; } case 'W': { /* Not a word */ ++s; --l; c = -256; if (cat) *cat = cclass_notword; break; } case 's': { /* Space */ ++s; --l; c = -256; if (cat) *cat = cclass_space; break; } case 'S': { /* Not a space */ ++s; --l; c = -256; if (cat) *cat = cclass_notspace; break; } case 'd': { /* Digit */ ++s; --l; c = -256; if (cat) *cat = cclass_digit; break; } case 'D': { /* Not a digit */ ++s; --l; c = -256; if (cat) *cat = cclass_notdigit; break; #if 0 } case 'c': { /* Control */ c = 0; ++s; --l; if (l) { c = (*s++ & 0x1F); --l; } break; #endif } case 'n': { /* Newline */ c = 10; ++s; --l; break; } case 't': case '9': { /* Tab */ c = 9; ++s; --l; break; } case 'a': { /* Alert */ c = 7; ++s; --l; break; } case 'b': case '8': { /* Backspace */ c = 8; ++s; --l; break; } case 'f': { /* Formfeed */ c = 12; ++s; --l; break; } case 'e': { /* Escape */ c = 27; ++s; --l; break; } case 'r': { /* Carriage return */ c = 13; ++s; --l; break; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { /* Octal */ c = *s - '0'; ++s; --l; if ((!b || l > 0) && *s >= '0' && *s <= '7') { c = c * 8 + s[1] - '0'; ++s; --l; } if ((!b || l > 0) && *s >= '0' && *s <= '7') { c = c * 8 + s[1] - '0'; ++s; --l; } break; } case 'p': { /* Character class made from unicode category */ ++s; --l; c = 'X'; if ((!b || l > 0) && *s == '{') { char buf[80]; ptrdiff_t idx = 0; ++s; --l; while ((!b || l > 0) && *s && *s != '}') { if (idx != SIZEOF(buf) - 1) buf[idx++] = *s; ++s; --l; } if ((!b || l > 0) && *s == '}') { ++s; --l; } buf[idx] = 0; c = -256; if (cat) *cat = unicode(buf); } break; } case 'x': case 'X': { /* Hex or Unicode */ c = 0; ++s; --l; if ((!b || l > 0) && *s == '{') { ++s; --l; while (!b || l > 0) { if (*s >= '0' && *s <= '9') { c = c * 16 + *s - '0'; ++s; --l; } else if (*s >= 'A' && *s <= 'F') { c = c * 16 + *s - 'A' + 10; ++s; --l; } else if (*s >= 'a' && *s <= 'f') { c = c * 16 + *s - 'a' + 10; ++s; --l; } else { break; } } if ((!b || l > 0) && *s == '}') { ++s; --l; } } else { if ((!b || l > 0) && *s >= '0' && *s <= '9') { c = c * 16 + *s - '0'; ++s; --l; } else if ((!b || l > 0) && *s >= 'A' && *s <= 'F') { c = c * 16 + *s - 'A' + 10; ++s; --l; } else if ((!b || l > 0) && *s >= 'a' && *s <= 'f') { c = c * 16 + *s - 'a' + 10; ++s; --l; } if ((!b || l > 0) && *s >= '0' && *s <= '9') { c = c * 16 + *s - '0'; ++s; --l; } else if ((!b || l > 0) && *s >= 'A' && *s <= 'F') { c = c * 16 + *s - 'A' + 10; ++s; --l; } else if ((!b || l > 0) && *s >= 'a' && *s <= 'f') { c = c * 16 + *s - 'a' + 10; ++s; --l; } } break; } default: { /* Character as-is after backslash */ if (utf8) c = utf8_decode_fwrd(&s, (b ? &l : NULL)); else { c = *(const unsigned char *)s++; --l; } break; } } } else if (!b || l > 0) { if (utf8) { c = utf8_decode_fwrd(&s, (b ? &l : NULL)); } else { c = *(const unsigned char *)s++; --l; } } *a = s; if (b) *b = l; return c; } /* Compile regular expression */ /* Create tree node */ static int mk_node(struct regcomp *r, int ty, int nl, int nr) { int no; if (r->len == r->size) { r->size *= 2; r->nodes = (struct node *)joe_realloc(r->nodes, SIZEOF(struct node) * r->size); } no = r->len++; r->nodes[no].type = ty; r->nodes[no].r = nr; r->nodes[no].l = nl; cclass_init(r->nodes[no].cclass); return no; } /* Print tree */ static void ind(int x) { while (x--) logmessage_0(" "); } static void show(struct regcomp *r, int no, int x) { if (no != -1) { if (r->nodes[no].type >= 0) { ind(x); logmessage_1("'%c'\n", r->nodes[no].type); } else { ind(x); logmessage_1("%c\n", -r->nodes[no].type); } if (r->nodes[no].type == -'[') { ind(x + 1); cclass_show(r->nodes[no].cclass); } show(r, r->nodes[no].l, x + 1); show(r, r->nodes[no].r, x + 1); } } /* Conventional syntax regex */ static int do_parse_conventional(struct regcomp *g, int prec, int fold) { int no = -1; /* Get first item */ again: if (!g->l || *g->ptr == ')' || *g->ptr == '|') { no = -1; } else if (*g->ptr == '(') { ++g->ptr; --g->l; if (g->l >= 2 && g->ptr[0] == '?' && g->ptr[1] == '#') { /* Ignore comment */ /* printf("Comment\n"); */ g->ptr += 2; g->l -= 2; while (g->l && *g->ptr != ')') { ++g->ptr; --g->l; } if (g->l && *g->ptr == ')') { ++g->ptr; --g->l; } goto again; } else if (g->l >= 2 && g->ptr[0] == '?' && g->ptr[1] == ':') { /* Grouping, no sub-match addressing */ g->ptr += 2; no = mk_node(g, -'(', -1, do_parse_conventional(g, 0, fold)); } else { /* Grouping with sub-match addressing */ no = mk_node(g, -'{', -1, do_parse_conventional(g, 0, fold)); } if (g->l && *g->ptr == ')') { ++g->ptr; --g->l; } else { printf("Unbalanced parenthesis\n"); return 0; } } else if (g->l && (*g->ptr == '.' || *g->ptr == '^' || *g->ptr == '$')) { no = mk_node(g, -*g->ptr++, -1, -1); --g->l; } else if (g->l >= 2 && g->ptr[0] == '\\' && (g->ptr[1] == '!')) { no = mk_node(g, -'e', -1, -1); g->ptr += 2; g->l -= 2; } else if (g->l >= 2 && g->ptr[0] == '\\' && (g->ptr[1] == 'y')) { no = mk_node(g, -'{', -1, mk_node(g, -'*', -1, mk_node(g, -'.', -1, -1))); g->ptr += 2; g->l -= 2; } else if (g->l >= 2 && g->ptr[0] == '\\' && (g->ptr[1] == 'Y')) { no = mk_node(g, -'{', -1, mk_node(g, -'*', -1, mk_node(g, -'e', -1, -1))); g->ptr += 2; g->l -= 2; } else if (g->l >= 2 && g->ptr[0] == '\\' && (g->ptr[1] == 'b' || g->ptr[1] == 'B' || g->ptr[1] == 'A' || g->ptr[1] == 'Z' || g->ptr[1] == 'z' || g->ptr[1] == '>' || g->ptr[1] == '<')) { ++g->ptr; --g->l; no = mk_node(g, -*g->ptr++, -1, -1); --g->l; } else if (g->l && *g->ptr == '[') { struct Cclass *m; int inv = 0; no = mk_node(g, -'[', -1, -1); m = g->nodes[no].cclass; ++g->ptr; --g->l; if (g->l && *g->ptr == '^') { inv = 1; ++g->ptr; --g->l; } if (g->l && *g->ptr == ']') { cclass_add(m, ']', ']'); ++g->ptr; --g->l; } while (g->l && *g->ptr != ']') { struct Cclass *cat; int first, last; first = escape(g->cmap->type, &g->ptr, &g->l, &cat); if (first == -256) { cclass_union(m, cclass_remap(cat, g->cmap)); } else { if (g->l >= 2 && g->ptr[0] == '-' && g->ptr[1] != ']') { ++g->ptr; --g->l; last = escape(g->cmap->type, &g->ptr, &g->l, &cat); } else { last = first; } if (fold) { if (first >= 0) first = joe_tolower(g->cmap, first); if (last >= 0) last = joe_tolower(g->cmap, last); } cclass_add(m, first, last); } } if (g->l && *g->ptr == ']') { ++g->ptr; --g->l; } if (inv) cclass_inv(m); } else { struct Cclass *cat; int ch = escape(g->cmap->type, &g->ptr, &g->l, &cat); if (ch == -256) { no = mk_node(g, -'[', -1, -1); if (cat) cat = cclass_remap(cat, g->cmap); cclass_union(g->nodes[no].cclass, cat); } else { if (fold) { if (g->cmap->type) { /* Unicode folding... */ int idx = rmap_lookup(rtree_fold, ch, 0); if (idx < FOLDMAGIC) no = mk_node(g, ch + idx, -1, -1); else { idx -= FOLDMAGIC; no = mk_node(g, fold_repl[idx][0], -1, -1); if (fold_repl[idx][1]) { no = mk_node(g, -',', no, mk_node(g, fold_repl[idx][1], -1, -1)); } if (fold_repl[idx][2]) { no = mk_node(g, -',', no, mk_node(g, fold_repl[idx][2], -1, -1)); } } } else { no = mk_node(g, joe_tolower(g->cmap, ch), -1, -1); } } else { no = mk_node(g, ch, -1, -1); } } } /* Extend it */ for (;;) { if (g->l && *g->ptr == '*') { while (g->l && *g->ptr == '*') { ++g->ptr; --g->l; } no = mk_node(g, -'*', -1, no); } else if (g->l && *g->ptr == '+') { while (g->l && *g->ptr == '+') { ++g->ptr; --g->l; } no = mk_node(g, -'+', -1, no); } else if (g->l && *g->ptr == '?') { while (g->l && *g->ptr == '?') { ++g->ptr; --g->l; } no = mk_node(g, -'?', -1, no); } else if (g->l && *g->ptr == '{') { int org = no; int min = 0, max = 0; ++g->ptr; --g->l; while (g->l && *g->ptr >= '0' && *g->ptr <= '9') { min = min * 10 + *g->ptr++ - '0'; --g->l; } if (g->l && *g->ptr == ',') { ++g->ptr; --g->l; if (g->l && *g->ptr >= '0' && *g->ptr <= '9') { while (g->l && *g->ptr >= '0' && *g->ptr <= '9') { max = max * 10 + *g->ptr++ - '0'; g->l--; } } else { max = -1; } } if (g->l && *g->ptr == '}') { ++g->ptr; --g->l; } /* Turn max into optional */ if (max > min) max -= min; /* Do min */ no = -1; while (min--) { if (no == -1) no = org; else no = mk_node(g, -',', no, org); } if (max == 0) { /* Exact */ } else if (max == -1) { /* Open */ no = mk_node(g, -',', no, mk_node(g, -'*', -1, org)); } else { /* Max */ int q = -1; while (max--) { if (q == -1) q = mk_node(g, -'?', -1, org); else q = mk_node(g, -'?', -1, mk_node(g, -',', q, org)); } no = mk_node(g, -',', no, q); } } else if (g->l && *g->ptr == '|') { if (prec < 1) { ++g->ptr; --g->l; no = mk_node(g,-'|', no, do_parse_conventional(g, 1, fold)); } else { break; } } else if (!g->l || *g->ptr == ')') { break; } else if (prec < 2) { no = mk_node(g, -',', no, do_parse_conventional(g, 2, fold)); } else break; } return no; } /* JOE syntax regex */ static int do_parse(struct regcomp *g, int prec, int fold) { int no = -1; /* Get first item */ again: if (!g->l || (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == ')') || (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '|')) { no = -1; } else if (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '(') { g->ptr += 2; g->l -= 2; if (g->l >= 2 && g->ptr[0] == '?' && g->ptr[1] == '#') { /* Ignore comment */ /* printf("Comment\n"); */ g->ptr += 2; g->l -= 2; while (g->l >= 2 && !(g->ptr[0] == '\\' && g->ptr[1] == ')')) { ++g->ptr; --g->l; } if (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == ')') { g->ptr += 2; g->l -= 2; } else if (!g->err) { g->err = "Missing \\\\)"; } goto again; } else if (g->l >= 2 && g->ptr[0] == '?' && g->ptr[1] == ':') { /* Grouping, no sub-match addressing */ g->ptr += 2; no = mk_node(g, -'(', -1, do_parse(g, 0, fold)); } else { /* Grouping with sub-match addressing */ no = mk_node(g, -'{', -1, do_parse(g, 0, fold)); } if (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == ')') { g->ptr += 2; g->l -= 2; } else if (!g->err) { g->err = "Missing \\\\)"; } } else if (g->l >= 2 && g->ptr[0] == '\\' && (g->ptr[1] == '^' || g->ptr[1] == '$' || g->ptr[1] == '>' || g->ptr[1] == '<' /* || g->ptr[1] == 'b' || g->ptr[1] == 'B' || g->ptr[1] == 'A' || g->ptr[1] == 'Z' || g->ptr[1] == 'z' */)) { no = mk_node(g, -g->ptr[1], -1, -1); g->ptr += 2; g->l -= 2; } else if (g->l >= 2 && g->ptr[0] == '\\' && (g->ptr[1] == '.')) { no = mk_node(g, -'.', -1, -1); g->ptr += 2; g->l -= 2; } else if (g->l >= 2 && g->ptr[0] == '\\' && (g->ptr[1] == 'y')) { no = mk_node(g, -'{', -1, mk_node(g, -'*', -1, mk_node(g, -'.', -1, -1))); g->ptr += 2; g->l -= 2; } else if (g->l >= 2 && g->ptr[0] == '\\' && (g->ptr[1] == 'Y')) { no = mk_node(g, -'{', -1, mk_node(g, -'*', -1, mk_node(g, -'e', -1, -1))); g->ptr += 2; g->l -= 2; } else if (g->l >= 2 && g->ptr[0] == '\\' && (g->ptr[1] == '!')) { no = mk_node(g, -'e', -1, -1); g->ptr += 2; g->l -= 2; } else if (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '[') { struct Cclass *m; int inv = 0; no = mk_node(g, -'[', -1, -1); m = g->nodes[no].cclass; g->ptr += 2; g->l -= 2; if (g->l && *g->ptr == '^') { inv = 1; ++g->ptr; --g->l; } if (g->l && *g->ptr == ']') { cclass_add(m, ']', ']'); ++g->ptr; --g->l; } while (g->l && *g->ptr != ']') { struct Cclass *cat; int first, last; first = escape(g->cmap->type, &g->ptr, &g->l, &cat); if (first == -256) { cclass_union(m, cclass_remap(cat, g->cmap)); } else { if (g->l >= 2 && g->ptr[0] == '-' && g->ptr[1] != ']') { ++g->ptr; --g->l; last = escape(g->cmap->type, &g->ptr, &g->l, &cat); } else { last = first; } if (fold) { if (first >= 0) first = joe_tolower(g->cmap, first); if (last >= 0) last = joe_tolower(g->cmap, last); } cclass_add(m, first, last); } } if (g->l && *g->ptr == ']') { ++g->ptr; --g->l; } else if (!g->err) { g->err = "Missing ]"; } if (inv) cclass_inv(m); } else { struct Cclass *cat; int ch = escape(g->cmap->type, &g->ptr, &g->l, &cat); if (ch == -256) { no = mk_node(g, -'[', -1, -1); cclass_union(g->nodes[no].cclass, cclass_remap(cat, g->cmap)); } else { if (fold) { if (g->cmap->type) { /* Unicode folding... */ int idx = rmap_lookup(rtree_fold, ch, 0); if (idx < FOLDMAGIC) no = mk_node(g, ch + idx, -1, -1); else { idx -= FOLDMAGIC; no = mk_node(g, fold_repl[idx][0], -1, -1); if (fold_repl[idx][1]) { no = mk_node(g, -',', no, mk_node(g, fold_repl[idx][1], -1, -1)); } if (fold_repl[idx][2]) { no = mk_node(g, -',', no, mk_node(g, fold_repl[idx][2], -1, -1)); } } } else { no = mk_node(g, joe_tolower(g->cmap, ch), -1, -1); } } else { no = mk_node(g, ch, -1, -1); } } } /* Extend it */ for (;;) { if (g->l >=2 && g->ptr[0] == '\\' && g->ptr[1] == '*') { while (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '*') { g->ptr += 2; g->l -= 2; } no = mk_node(g, -'*', -1, no); } else if (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '+') { while (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '+') { g->ptr += 2; g->l -= 2; } no = mk_node(g, -'+', -1, no); } else if (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '?') { while (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '?') { g->ptr += 2; g->l -= 2; } no = mk_node(g, -'?', -1, no); } else if (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '{') { int org = no; int min = 0, max = 0; g->ptr += 2; g->l -= 2; while (g->l && *g->ptr >= '0' && *g->ptr <= '9') { min = min * 10 + *g->ptr++ - '0'; --g->l; } if (g->l && *g->ptr == ',') { ++g->ptr; --g->l; if (g->l && *g->ptr >= '0' && *g->ptr <= '9') { while (g->l && *g->ptr >= '0' && *g->ptr <= '9') { max = max * 10 + *g->ptr++ - '0'; g->l--; } } else { max = -1; } } if (g->l && *g->ptr == '}') { ++g->ptr; --g->l; } else if (!g->err) { g->err = "Missing }"; } /* Turn max into optional */ if (max > min) max -= min; /* Do min */ no = -1; while (min--) { if (no == -1) no = org; else no = mk_node(g, -',', no, org); } if (max == 0) { /* Exact */ } else if (max == -1) { /* Open */ no = mk_node(g, -',', no, mk_node(g, -'*', -1, org)); } else { /* Max */ int q = -1; while (max--) { if (q == -1) q = mk_node(g, -'?', -1, org); else q = mk_node(g, -'?', -1, mk_node(g, -',', q, org)); } no = mk_node(g, -',', no, q); } } else if (g->l >= 2 && g->ptr[0] == '\\' && g->ptr[1] == '|') { if (prec < 1) { g->ptr += 2; g->l -= 2; no = mk_node(g, -'|', no, do_parse(g, 1, fold)); } else { break; } } else if (!g->l || (g->ptr[0] == '\\' && g->ptr[1] == ')')) { break; } else if (prec < 2) { no = mk_node(g, -',', no, do_parse(g, 2, fold)); } else break; } return no; } /* Disassembler */ static const char *iname(int c) { if (c >= 0) return "CHAR"; else switch(c) { case iEXPR: return "EXPR"; case iDOT: return "DOT"; case iBOL: return "BOL"; case iEOL: return "EOL"; case iBOW: return "BOW"; case iEOW: return "EOW"; case iBRA: return "BRA"; case iKET: return "KET"; case iFORK: return "FORK"; case iJUMP: return "JUMP"; case iCLASS: return "CLASS"; case iEND: return "END"; default: return "HUH?"; } } static void unasm(Frag *f) { ptrdiff_t pc = 0; logmessage_0("PC INSN\n"); logmessage_0("-- ----\n"); for (;;) { ptrdiff_t i = pc; int c = fetchi(f, &pc); if (c >= 0) logmessage_2("%lld: '%c'\n", (long long)i, c); else switch(c) { case iDOT: logmessage_1("%lld: .\n", (long long)i); break; case iEXPR: logmessage_1("%lld: expr\n", (long long)i); break; case iBOL: logmessage_1("%lld: ^\n", (long long)i); break; case iEOL: logmessage_1("%lld: $\n", (long long)i); break; case iBOW: logmessage_1("%lld: <\n", (long long)i); break; case iEOW: logmessage_1("%lld: >\n", (long long)i); break; case iBRA: logmessage_2("%lld: bra %d\n", (long long)i, fetchi(f, &pc)); break; case iKET: logmessage_2("%lld: ket %d\n", (long long)i, fetchi(f, &pc)); break; case iFORK: { int arg = fetchi(f, &pc); logmessage_2("%lld: fork %lld\n", (long long)i, (long long)(arg + (pc - SIZEOF(int)))); break; } case iJUMP: { int arg = fetchi(f, &pc); logmessage_2("%lld: jump %lld\n", (long long)i, (long long)(arg + (pc - SIZEOF(int)))); break; } case iCLASS: { struct Cclass *r = (struct Cclass *)fetchp(f, &pc); logmessage_1("%lld: class ", (long long)i); cclass_show(r); break; } case iEND: logmessage_1("%lld: end\n", (long long)i); return; } } } /* Determine leading prefix of search string */ /* We can use Boyer-Moore on the prefix if: Character set is byte coded. Character set is UTF-8 coded, but no folding is requested. Character set is UTF-8 coded, folding is requested, but character is below 128 */ static int extract(struct regcomp *g, int no, int fold) { while (no != -1) { if (g->nodes[no].type == -',') { if (!extract(g, g->nodes[no].l, fold)) { no = g->nodes[no].r; continue; /* This keeps recursion depth low */ } else { return -1; } } else if ((g->nodes[no].type >= 0) && ( (g->nodes[no].type < 128 || !g->cmap->type || !fold))) { if (g->cmap->type) { /* UTF-8 */ char buf[8]; ptrdiff_t x; utf8_encode(buf, g->nodes[no].type); for (x = 0; buf[x]; ++x) { if (g->prefix_len + 1 == g->prefix_size) { g->prefix_size *= 2; g->prefix = (char *)joe_realloc(g->prefix, g->prefix_size); } g->prefix[g->prefix_len++] = buf[x]; } } else { /* Bytes */ if (g->prefix_len + 1 == g->prefix_size) { g->prefix_size *= 2; g->prefix = (char *)joe_realloc(g->prefix, g->prefix_size); } g->prefix[g->prefix_len++] = TO_CHAR_OK(g->nodes[no].type); } return 0; } else { /* Anything else, and we're done. */ return -1; } } return 0; } /* Convert parse-tree into code with infinite loops */ static void codegen(struct regcomp *g, int no, int *end) { while(no != -1) { switch (g->nodes[no].type) { case -'{': { int my_bra_no = g->bra_no++; emiti(g->frag, iBRA); emiti(g->frag, my_bra_no); codegen(g, g->nodes[no].r, 0); emiti(g->frag, iKET); emiti(g->frag, my_bra_no); break; } case -'(': { no = g->nodes[no].r; continue; } case -'*': { ptrdiff_t targ, start; emiti(g->frag, iFORK); targ = emiti(g->frag, 0); align_frag(g->frag, SIZEOF(int)); start = g->frag->len; codegen(g, g->nodes[no].r, 0); emiti(g->frag, iFORK); emit_branch(g->frag, start); fixup_branch(g->frag,targ); break; } case -'+': { ptrdiff_t start; align_frag(g->frag, SIZEOF(int)); start = g->frag->len; codegen(g, g->nodes[no].r, 0); emiti(g->frag, iFORK); emit_branch(g->frag, start); break; } case -'?': { ptrdiff_t targ; emiti(g->frag, iFORK); targ = emiti(g->frag, 0); codegen(g, g->nodes[no].r, 0); fixup_branch(g->frag, targ); break; } case -'|': { ptrdiff_t alt; int first; int my_end = 0; if (!end) { end = &my_end; first = 1; } else { first = 0; } emiti(g->frag, iFORK); alt = emiti(g->frag, 0); /* First */ codegen(g, g->nodes[no].l, 0); emiti(g->frag, iJUMP); *end = (int)emiti(g->frag, *end); /* Rest */ fixup_branch(g->frag, alt); codegen(g, g->nodes[no].r, end); /* All JUMPs go here */ if (first) { frag_link(g->frag, *end); } break; } case -',': { codegen(g, g->nodes[no].l, 0); no = g->nodes[no].r; continue; /* This keeps recursion depth low */ } case -'.': { emiti(g->frag, iDOT); break; } case -'e': { emiti(g->frag, iEXPR); break; } case -'^': { emiti(g->frag, iBOL); break; } case -'$': { emiti(g->frag, iEOL); break; } case -'<': { emiti(g->frag, iBOW); break; } case -'>': { emiti(g->frag, iEOW); break; } case -'[': { emiti(g->frag, iCLASS); emitp(g->frag, g->nodes[no].cclass); cclass_opt(g->nodes[no].cclass); break; } default: { emiti(g->frag, g->nodes[no].type); break; } } break; } } /* User level of parser: call it with a regex- it returns a program in a malloc block */ /* #define DEBUG 1 */ struct regcomp *joe_regcomp(struct charmap *cmap, const char *s, ptrdiff_t len, int fold, int stdfmt, int debug) { int no; struct regcomp *g; g = (struct regcomp *)joe_malloc(SIZEOF(struct regcomp)); g->len = 0; g->size = 10; g->nodes = (struct node *)joe_malloc(SIZEOF(struct node) * g->size); g->cmap = cmap; iz_frag(g->frag, SIZEOF(int)); g->bra_no = 0; g->err = 0; g->prefix_len = 0; g->prefix_size = 32; g->prefix = (char *)joe_malloc(g->prefix_size); /* Parse expression */ g->ptr = s; g->l = len; if (stdfmt) no = do_parse_conventional(g, 0, fold); else no = do_parse(g, 0, fold); if (g->l && !g->err) { g->err = "Extra junk at end of expression"; } /* Extract leading prefix */ extract(g, no, fold); g->prefix[g->prefix_len] = 0; /* Print parse tree */ if (debug) { logmessage_0("Parse tree:\n"); show(g, no, 0); logmessage_1("Leading prefix '%s'\n", g->prefix); } /* Convert tree into NFA in the form of byte code */ /* Surround regex with .*( ).* */ #if 0 /* .* */ emiti(g->frag, iFORK); b = emit_branch(g->frag, 0); if (debug) { logmessage_1("Total size = %d\n",g->frag->len); } a = emiti(g->frag, iDOT); emiti(g->frag, iFORK); emit_branch(g->frag, a); fixup_branch(g->frag, b); /* ( */ emiti(g->frag, iBRA); emiti(g->frag, 0); #endif /* User's regex */ codegen(g, no, 0); #if 0 /* ) */ emiti(g->frag, iKET); emiti(g->frag, 0); /* .* */ emiti(g->frag, iFORK); b = emit_branch(g->frag, 0); a = emiti(g->frag, iDOT); emiti(g->frag, iFORK); emit_branch(g->frag, a); fixup_branch(g->frag, b); #endif emiti(g->frag, iEND); /* Give back space */ fin_code(g->frag); /* Free parse tree */ /* joe_free(g->nodes); g->nodes = 0; */ /* Unassemble code */ if (debug) { logmessage_0("NFA-program:\n"); unasm(g->frag); logmessage_1("Total size = %lld\n",(long long)g->frag->len); } return g; } void joe_regfree(struct regcomp *g) { if (g->nodes) joe_free(g->nodes); clr_frag(g->frag); joe_free(g->prefix); joe_free(g); } #if 0 int rmatch(const unsigned char *pattern, const unsigned char *s, ...) { int match; Regex_t r[1]; Regmatch_t pmatch[MAX_MATCHES]; unsigned char **results[MAX_MATCHES]; int n = 0; va_list ap; va_start(ap, s); while (results[n] = va_arg(ap, unsigned char **)) ++n; printf("%d\n",n); va_end(ap); if (Regcomp(r, pattern, 0)) return -1; if (n > r[0]->bra_no) { Regfree(r); return -2; } match = Regexec(r, s, n, pmatch, 0); Regfree(r); if (!match) { int x; for (x = 0; x != n; ++x) { int y; int start = pmatch[x].rm_so; int len = pmatch[x].rm_eo - pmatch[x].rm_so; unsigned char *q = (unsigned char *)malloc(len + 1); for (y = 0; y != len; ++y) q[y] = s[start + y]; q[y] = 0; *results[x] = q; } } return match; } #endif /* Regular expression matcher */ #define STACK_SIZE 8 struct thread { unsigned char *pc; Regmatch_t pos[MAX_MATCHES]; int sp; int stack[STACK_SIZE]; }; static int better(Regmatch_t *a, Regmatch_t *b, int bra_no) { int y; for (y = 0; y != bra_no; ++y) { if (a[y].rm_so < b[y].rm_so) return 1; if (a[y].rm_so > b[y].rm_so) return 0; if (a[y].rm_eo < b[y].rm_eo) return 1; if (a[y].rm_eo > b[y].rm_eo) return 0; } return 0; } static int add_thread(struct thread *pool, unsigned char *start, int l, int le, unsigned char *pc, Regmatch_t *pos, int bra_no, int *stack, int sp) { int x; Regmatch_t *d; int t; for (t = l; t != le; ++t) { if (pool[t].pc == pc && pool[t].sp == sp && (!sp || !memcmp(pool[t].stack, stack, (size_t)(sp * SIZEOF(int))))) { /* PCs are same, state is same... */ int y; for (y = 0; y != bra_no; ++y) { if (pool[t].pos[y].rm_so == pos[y].rm_so && pool[t].pos[y].rm_eo == pos[y].rm_eo) { /* Identical... move on */ } else if (pool[t].pos[y].rm_eo != -1 && pos[y].rm_eo != -1) { /* Both are complete */ if (pool[t].pos[y].rm_so < pos[y].rm_so) return le; /* Current one starts earlier */ else if (pool[t].pos[y].rm_so > pos[y].rm_so) { /* New one starts earlier */ for (x = 0; x != bra_no; ++x) pool[t].pos[x] = pos[x]; return le; } else if (pool[t].pos[y].rm_eo < pos[y].rm_eo) { /* Current is shorter */ return le; } else if (pool[t].pos[y].rm_eo > pos[y].rm_eo) { /* New one is shorter */ for (x = 0; x != bra_no; ++x) pool[t].pos[x] = pos[x]; return le; } /* They are identical: look at next field */ } else if (pool[t].pos[y].rm_so != -1 && pos[y].rm_so != -1) { if (pool[t].pos[y].rm_so < pos[y].rm_so) return le; /* Current one starts earlier */ else if (pool[t].pos[y].rm_so > pos[y].rm_so) { /* New one starts earlier */ for (x = 0; x != bra_no; ++x) pool[t].pos[x] = pos[x]; return le; } } else { /* Add if there are no identical others */ break; } } if (y == bra_no) return le; /* They are equal, leave the current one. */ } } /* Add it */ if (le - l == MAX_THREADS) { printf("ran out of threads\n"); exit(-1); return le; } pool[le].pc = pc; d = pool[le].pos; for (x = 0;x != bra_no; ++x) d[x] = pos[x]; pool[le].sp = sp; for (x = 0; x != sp; ++x) pool[le].stack[x] = stack[x]; return le + 1; } static int add_thread1(struct thread *pool, unsigned char *start, int l, int le, unsigned char *pc, Regmatch_t *pos, int bra_no, int *stack, int sp) { int x; Regmatch_t *d; int t; for (t = l; t != le; ++t) { if (pool[t].pc == pc && pool[t].sp == sp && (!sp || !memcmp(pool[t].stack, stack, (size_t)(sp * SIZEOF(int))))) { /* PCs are same, state is same... */ int y; for (y = 0; y != bra_no; ++y) { if (pool[t].pos[y].rm_so == pos[y].rm_so && pool[t].pos[y].rm_eo == pos[y].rm_eo) { /* Identical... move on */ } else break; } if (y == bra_no) return le; /* They are equal, leave the current one. */ } } /* Add it */ if (le - l == MAX_THREADS) { printf("ran out of threads\n"); exit(-1); return le; } pool[le].pc = pc; d = pool[le].pos; for (x = 0;x != bra_no; ++x) d[x] = pos[x]; pool[le].sp = sp; for (x = 0; x != sp; ++x) pool[le].stack[x] = stack[x]; return le + 1; } int joe_regexec(struct regcomp *g, P *p, int nmatch, Regmatch_t *matches, int fold) { struct thread pool[MAX_THREADS * 2]; int start_bol = pisbol(p); int repl1 = 0, repl2 = 0; int cl, cle, nl, nle; int t; int c, d; int match = -1; int bra_no = g->bra_no; off_t byte; if (nmatch < bra_no) bra_no = nmatch; /* First thread */ cl = 0; cle = 1; nl = MAX_THREADS; nle = nl; c = '^'; /* Not a word character */ pool[cl].pc = g->frag->start; pool[cl].sp = 0; for (c = 0; c != bra_no; ++c) { pool[cl].pos[c].rm_so = -1; pool[cl].pos[c].rm_eo = -1; } /* We need the previous character: consider the case where regex begins with < */ c = prgetc(p); if (c != NO_MORE_DATA) pgetc(p); /* Scan string */ do { d = c; byte = p->byte; if (fold) { /* Case folding */ if (g->cmap->type) { /* Unicode folding */ if (repl1) { c = repl1; repl1 = 0; } else if (repl2) { c = repl2; repl2 = 0; } else { int idx; c = pgetc(p); idx = rmap_lookup(rtree_fold, c, 0); if (idx < FOLDMAGIC) { /* Replace with single character */ c += idx; } else { /* Replace with string */ idx -= FOLDMAGIC; c = fold_repl[idx][0]; repl1 = fold_repl[idx][1]; repl2 = fold_repl[idx][2]; } } } else { c = joe_tolower(g->cmap, pgetc(p)); } } else { /* No case folding */ c = pgetc(p); } #ifdef DEBUG logmessage_2("'%c' (%lld)\n", c, (long long)byte); #endif /* Give character to all threads */ for (t = cl; t != cle; ++t) { unsigned char *pc = pool[t].pc; for (;;) { int i; i = *(int *)pc; #ifdef DEBUG logmessage_3(" Thread %d PC=%lld insn=%s\n", t - cl, (long long)(pc - g->frag->start), iname(i)); #endif if (i >= 0) { /* A single character */ if (c == i) { nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } } else switch (i) { case iEXPR: { if (c != NO_MORE_DATA) { int e; if (pool[t].sp) e = pool[t].stack[pool[t].sp - 1]; else e = 0; switch (e) { case 0: { int psh; check: psh = 0; switch (c) { case '(': psh = '('; break; case '[': psh = '['; break; case '{': psh = '{'; break; case '"': psh = '"'; break; case '\'': psh = '\''; break; case '/': psh = '/'; break; } if (psh) { /* Stay */ #ifdef DEBUG logmessage_1(" Push %c\n", c); #endif pool[t].stack[pool[t].sp++] = psh; nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } else { /* Eat */ nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } break; } case '(': { int psh = 0; switch (c) { case '(': psh = '('; break; case '[': psh = '['; break; case '{': psh = '{'; break; case '"': psh = '"'; break; case '\'': psh = '\''; break; case '/': psh = '/'; break; } if (psh) { /* Stay */ #ifdef DEBUG logmessage_1(" Push %c\n", c); #endif pool[t].stack[pool[t].sp++] = psh; nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } else if (c == ')') { #ifdef DEBUG logmessage_0(" Pop )\n"); #endif if (pool[t].sp == 1) nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, 0); else nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp - 1); } else { nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } break; } case '[': { int psh = 0; switch (c) { case '(': psh = '('; break; case '[': psh = '['; break; case '{': psh = '{'; break; case '"': psh = '"'; break; case '\'': psh = '\''; break; case '/': psh = '/'; break; } if (psh) { /* Stay */ pool[t].stack[pool[t].sp++] = psh; nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } else if (c == ']') { if (pool[t].sp == 1) nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, 0); else nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp - 1); } else { nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } break; } case '{': { int psh = 0; switch (c) { case '(': psh = '('; break; case '[': psh = '['; break; case '{': psh = '{'; break; case '"': psh = '"'; break; case '\'': psh = '\''; break; case '/': psh = '/'; break; } if (psh) { /* Stay */ pool[t].stack[pool[t].sp++] = psh; nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } else if (c == '}') { if (pool[t].sp == 1) nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, 0); else nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp - 1); } else { nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } break; } case '"': { if (c == '"') { if (pool[t].sp == 1) nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, 0); else nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp - 1); } else if (c == '\\') { pool[t].stack[pool[t].sp++] = '\\'; nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } else { nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } break; } case '\'': { if (c == '\'') { if (pool[t].sp == 1) nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, 0); else nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp - 1); } else if (c == '\\') { pool[t].stack[pool[t].sp++] = '\\'; nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } else { nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } break; } case '/': { if (c == '*') { pool[t].stack[pool[t].sp - 1] = '*'; nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } else { --pool[t].sp; goto check; } break; } case '\\': { if (pool[t].sp == 1) nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, 0); else nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp - 1); break; } case '*': { if (c == '*') { pool[t].stack[pool[t].sp - 1] = 'E'; nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } else { nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } break; } case 'E': { if (c == '*') { nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } else if (c == '/') { if (pool[t].sp == 1) nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, 0); else nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp - 1); } else { pool[t].stack[pool[t].sp - 1] = '*'; nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } break; } } } break; } case iDOT: { if (c != NO_MORE_DATA && c != '\n') /* . doesn't match end of string */ nle = add_thread(pool, g->frag->start, nl, nle, pc + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, pool[t].sp); break; } case iBOL: { if (start_bol || d == '\n') { pc += SIZEOF(int); continue; } break; } case iEOL: { if (c == NO_MORE_DATA || c == '\n') { pc += SIZEOF(int); continue; } break; } case iBOW: { if (joe_isalnum_(g->cmap, c) && !joe_isalnum_(g->cmap, d)) { pc += SIZEOF(int); continue; } break; } case iEOW: { if (!joe_isalnum_(g->cmap, c) && joe_isalnum_(g->cmap, d)) { pc += SIZEOF(int); continue; } break; } case iBRA: { int idx = *(int *)(pc + SIZEOF(int)); if (idx < bra_no) pool[t].pos[idx].rm_so = byte; pc += 2 * SIZEOF(int); continue; } case iKET: { int idx = *(int *)(pc + SIZEOF(int)); if (idx < bra_no) pool[t].pos[idx].rm_eo = byte; pc += 2 * SIZEOF(int); continue; } case iFORK: { cle = add_thread1(pool, g->frag->start, cl, cle, pc + *(int *)(pc + SIZEOF(int)) + SIZEOF(int), pool[t].pos, bra_no, pool[t].stack, pool[t].sp); pc += 2 * SIZEOF(int); continue; } case iJUMP: { pc += *(int *)(pc + SIZEOF(int)) + SIZEOF(int); continue; } case iCLASS: { struct Cclass *cclass; pc += SIZEOF(int); pc += align_o((pc - (unsigned char *)0), SIZEOF(struct Cclass *)); cclass = *(struct Cclass **)pc; if (cclass_lookup(cclass, c)) { pc += SIZEOF(struct Cclass *); pc += align_o((pc - (unsigned char *)0), SIZEOF(int)); nle = add_thread(pool, g->frag->start, nl, nle, pc, pool[t].pos, bra_no, pool[t].stack, pool[t].sp); } break; } case iEND: { if (match || better(pool[t].pos, matches, bra_no)) { int x; for (x = 0; x != bra_no; ++x) { matches[x] = pool[t].pos[x]; } match = 0; } break; } } /* If we fall out of the switch, we break the for */ break; } } /* New becomes current */ cl = nl; cle = nle; /* Too many threads? */ if (cle - cl == MAX_THREADS) return -2; /* Current becomes new */ if (nl == MAX_THREADS) nl = 0; else nl = MAX_THREADS; nle = nl; start_bol = 0; } while (match && c != NO_MORE_DATA && cl != cle); if (!match && c != NO_MORE_DATA) prgetc(p); for (c = bra_no; c < nmatch; ++c) { matches[c].rm_so = -1; } return match; }