JOE - Joe's own editor Mercurial

Brought to you by: jhallen, jjjordan, marx_sk, shallot
[26e832]: / joe / utf8.c Maximize Restore History
324 lines (298 with data), 7.4 kB

/*
 *	UTF-8 Utilities
 *	Copyright
 *		(C) 2004 Joseph H. Allen
 *
 *	This file is part of JOE (Joe's Own Editor)
 */
#include "types.h"

#include <string.h>

/* UTF-8 encoder
 *
 * Converts the character code c into its UTF-8 byte sequence.
 *
 * buf must have room for 7 bytes: up to 6 bytes of encoded character plus
 * the 0 terminator which is always appended.
 *
 * Returns the number of bytes in the sequence (terminator not counted).
 */

ptrdiff_t utf8_encode(char *buf,int c)
{
	ptrdiff_t n;	/* Number of bytes in the sequence */
	ptrdiff_t i;
	int lead;	/* Marker bits of the first byte */
	int bits = c;	/* Bits of c which have not been emitted yet */

	/* Pick the sequence length and the matching lead byte marker */
	if (c < 0x80) {
		n = 1;
		lead = 0x00;
	} else if (c < 0x800) {
		n = 2;
		lead = 0xc0;
	} else if (c < 0x10000) {
		n = 3;
		lead = 0xe0;
	} else if (c < 0x200000) {
		n = 4;
		lead = 0xf0;
	} else if (c < 0x4000000) {
		n = 5;
		lead = 0xf8;
	} else {
		n = 6;
		lead = 0xfc;
	}

	/* Each continuation byte carries 6 bits: fill them in back to front */
	for (i = n - 1; i > 0; --i) {
		buf[i] = TO_CHAR_OK((0x80 | (bits & 0x3f)));
		bits >>= 6;
	}

	/* Whatever is left over goes into the lead byte */
	buf[0] = TO_CHAR_OK((lead | bits));
	buf[n] = 0;

	return n;
}

/* UTF-8 decoder
 *
 * Feed the bytes of the input to this function one at a time.  The state
 * machine in utf8_sm keeps track of the sequence in progress.
 *
 * Returns 0 - 7FFFFFFF: decoded character
 *        UTF8_ACCEPTED: byte accepted, no character decoded yet.
 *      UTF8_INCOMPLETE: a sequence was in progress, but c is not a
 *                       continuation byte.  The sequence is abandoned; c
 *                       itself is neither stored nor decoded.
 *             UTF8_BAD: no sequence in progress, but c is between 128 - 191,
 *                       or is 254 or 255
 */

int utf8_decode(struct utf8_sm *utf8_sm,char c)
{
	int follow;	/* Continuation bytes expected after this lead byte */
	int mask;	/* Payload bits held by this lead byte */

	if (utf8_sm->state) {
		/* In the middle of a sequence: only 10xxxxxx is acceptable */
		if ((c&0xC0)!=0x80) {
			utf8_sm->state = 0;
			return UTF8_INCOMPLETE;
		}
		utf8_sm->buf[utf8_sm->ptr++] = c;
		--utf8_sm->state;
		utf8_sm->accu = ((utf8_sm->accu<<6)|(c&0x3F));
		if (utf8_sm->state)
			return UTF8_ACCEPTED;
		/* That was the last byte of the sequence */
		return utf8_sm->accu;
	}

	if ((c&0x80)==0x00) {
		/* 0 - 127: single byte character */
		utf8_sm->buf[0] = c;
		utf8_sm->ptr = 1;
		utf8_sm->state = 0;
		return c;
	}

	/* Classify the lead byte */
	if ((c&0xE0)==0xC0) {
		/* 192 - 223 */
		follow = 1;
		mask = 0x1F;
	} else if ((c&0xF0)==0xE0) {
		/* 224 - 239 */
		follow = 2;
		mask = 0x0F;
	} else if ((c&0xF8)==0xF0) {
		/* 240 - 247 */
		follow = 3;
		mask = 0x07;
	} else if ((c&0xFC)==0xF8) {
		/* 248 - 251 */
		follow = 4;
		mask = 0x03;
	} else if ((c&0xFE)==0xFC) {
		/* 252 - 253 */
		follow = 5;
		mask = 0x01;
	} else {
		/* 128 - 191, 254, 255: can not start a sequence */
		utf8_sm->ptr = 0;
		utf8_sm->state = 0;
		return UTF8_BAD;
	}

	/* Begin a new multi-byte sequence */
	utf8_sm->buf[0] = c;
	utf8_sm->ptr = 1;
	utf8_sm->state = follow;
	utf8_sm->accu = (c&mask);
	return UTF8_ACCEPTED;
}

/* Reset a UTF-8 decoder state machine: no sequence in progress and
 * nothing stored in its byte buffer. */

void utf8_init(struct utf8_sm *utf8_sm)
{
	utf8_sm->state = 0;
	utf8_sm->ptr = 0;
}

/* Decode the first UTF-8 sequence of the string s.
 *
 * Bytes are fed to a fresh decoder until it reports something other than
 * UTF8_ACCEPTED.  That result (a character or an error code from
 * utf8_decode) is returned.
 */

int utf8_decode_string(const char *s)
{
	struct utf8_sm sm;
	int result;

	utf8_init(&sm);
	for (;;) {
		result = utf8_decode(&sm, *s++);
		if (result != UTF8_ACCEPTED)
			return result;
	}
}

/* Decode and advance
 *
 * Returns: 0 - 7FFFFFFF: decoded character
 *  UTF8_INCOMPLETE: incomplete sequence
 *  UTF8_BAD: bad start of sequence found.
 *
 * p/plen are always advanced in such a way that repeated called to utf8_decode_fwrd do not cause
 * infinite loops.
 *
 * Pass NULL in plen for zero-terminated strings
 */

int utf8_decode_fwrd(const char **p,ptrdiff_t *plen)
{
	struct utf8_sm sm;
	const char *s = *p;
	ptrdiff_t len;
	int c = UTF8_INCOMPLETE; /* Return this on no more input. */
	if (plen)
		len = *plen;
	else
		len = -1;

	utf8_init(&sm);

	while (plen ? (len != 0) : (*s != 0)) {
		c = utf8_decode(&sm, *s);
		if (c >= 0) {
			/* We've got a character */
			--len;
			++s;
			break;
		} else if (c == UTF8_INCOMPLETE) {
			/* Bad sequence detected.  Caller should feed rest of string in again. */
			break;
		} else if (c == UTF8_BAD) {
			/* Bad start of UTF-8 sequence.  We need to eat this char to avoid infinite loops. */
			--len;
			++s;
			/* But we should tell the caller that something bad was found. */
			break;
		} else {
			/* If c is -1, utf8_decode accepted the character, so we should get the next one. */
			--len;
			++s;
		}
	}

	if (plen)
		*plen = len;
	*p = s;

	return c;
}

/* Fetch the next character from *s and step past it.
 *
 * When map->type is non-zero the string is decoded with utf8_decode_fwrd.
 * Otherwise a single byte is taken, as an unsigned value.  *len is
 * decremented as well if len is not NULL.
 */

int fwrd_c(struct charmap *map, const char **s, ptrdiff_t *len)
{
	const unsigned char *cur;
	int ch;

	if (map->type)
		return utf8_decode_fwrd(s, len);

	/* One byte per character */
	cur = (const unsigned char *)*s;
	ch = *cur;
	*s = (const char *)(cur + 1);
	if (len)
		--*len;
	return ch;
}

/* UTF-16 */

/* Initialize UTF-16 decoder state machine.
 *
 * A zero state means that no high surrogate is pending: utf16_decode and
 * utf16r_decode store the high surrogate they are holding in sm->state.
 */

void utf16_init(struct utf16_sm *sm)
{
	sm->state = 0;
}

/* UTF-16 decoder
 *
 * Feed the 16-bit code units of the input to this function one at a time.
 * A pending high surrogate is remembered in sm->state.
 *
 * Returns the decoded character, or:
 *   UTF16_ACCEPTED: high surrogate stored, waiting for the low surrogate
 *   UTF16_INCOMPLETE: a high surrogate was pending, but another high
 *                     surrogate arrived.  The new one is now pending.
 *   UTF16_BAD: low surrogate with no high surrogate pending
 *
 * If a high surrogate is pending and c is not a surrogate at all, the
 * pending surrogate is dropped and c is returned as is.
 */
int utf16_decode(struct utf16_sm *sm, unsigned short c)
{
	if (sm->state) {
		if (c >= 0xDC00 && c <= 0xDFFF) {
			/* Pair complete: combine the two 10-bit halves */
			int rtn = ((sm->state - 0xD800) << 10) + (c - 0xDC00) + 0x10000;
			sm->state = 0;
			return rtn;
		} else if (c >= 0xD800 && c <= 0xDBFF) {
			sm->state = c;
			return UTF16_INCOMPLETE;
		} else {
			/* Sequence was incomplete, we should signal an error somehow..
			   Forget the pending high surrogate, otherwise a later stray
			   low surrogate would be paired with it. */
			sm->state = 0;
			return c;
		}
	} else {
		if (c >= 0xD800 && c <= 0xDBFF) {
			sm->state = c;
			return UTF16_ACCEPTED;
		} else if (c >= 0xDC00 && c <= 0xDFFF) {
			return UTF16_BAD;
		} else {
			return c;
		}
	}
}

/* UTF-16 decoder for byte-reversed input
 *
 * Same as utf16_decode, except that the two bytes of each code unit are
 * swapped before the unit is examined.
 *
 * Returns the decoded character, or:
 *   UTF16_ACCEPTED: high surrogate stored, waiting for the low surrogate
 *   UTF16_INCOMPLETE: a high surrogate was pending, but another high
 *                     surrogate arrived.  The new one is now pending.
 *   UTF16_BAD: low surrogate with no high surrogate pending
 *
 * If a high surrogate is pending and the unit is not a surrogate at all,
 * the pending surrogate is dropped and the (swapped) unit is returned.
 */
int utf16r_decode(struct utf16_sm *sm, unsigned short c)
{
	c = (unsigned short)((c >> 8) + (c << 8)); /* Reverse bytes */
	if (sm->state) {
		if (c >= 0xDC00 && c <= 0xDFFF) {
			/* Pair complete: combine the two 10-bit halves */
			int rtn = ((sm->state - 0xD800) << 10) + (c - 0xDC00) + 0x10000;
			sm->state = 0;
			return rtn;
		} else if (c >= 0xD800 && c <= 0xDBFF) {
			sm->state = c;
			return UTF16_INCOMPLETE;
		} else {
			/* Sequence was incomplete, we should signal an error somehow..
			   Forget the pending high surrogate, otherwise a later stray
			   low surrogate would be paired with it. */
			sm->state = 0;
			return c;
		}
	} else {
		if (c >= 0xD800 && c <= 0xDBFF) {
			sm->state = c;
			return UTF16_ACCEPTED;
		} else if (c >= 0xDC00 && c <= 0xDFFF) {
			return UTF16_BAD;
		} else {
			return c;
		}
	}
}

/* UTF-16 encoder
 *
 * Writes character c to buf as one or two 16-bit code units in the byte
 * order of the machine.  buf needs room for 4 bytes; no terminator is
 * written.
 *
 * Returns the number of bytes written (2 or 4), or UTF16_BAD if c is
 * negative, a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF.  Nothing is
 * written in that case.
 */

ptrdiff_t utf16_encode(char *buf, int c)
{
	unsigned short d;

	if ((c >= 0 && c < 0xD800) || (c >= 0xE000 && c < 0x10000)) {
		d = (unsigned short)c;
		/* buf is a char buffer with no alignment guarantee: copy the
		   bytes instead of storing through a cast pointer */
		memcpy(buf, &d, sizeof d);
		return 2;
	} else if (c >= 0x10000 && c < 0x110000) {
		/* Split into a surrogate pair of 10 bits each */
		c -= 0x10000;
		d = (unsigned short)(0xD800 + (c >> 10));
		memcpy(buf, &d, sizeof d);
		d = (unsigned short)(0xDC00 + (c & 0x3FF));
		memcpy(buf + 2, &d, sizeof d);
		return 4;
	} else {
		return UTF16_BAD;
	}
}

/* UTF-16R encoder
 *
 * Same as utf16_encode, except that the two bytes of each 16-bit code unit
 * are swapped before it is written, giving the byte order opposite to that
 * of the machine.  buf needs room for 4 bytes; no terminator is written.
 *
 * Returns the number of bytes written (2 or 4), or UTF16_BAD if c is
 * negative, a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF.  Nothing is
 * written in that case.
 */

ptrdiff_t utf16r_encode(char *buf, int c)
{
	unsigned short d;

	if ((c >= 0 && c < 0xD800) || (c >= 0xE000 && c < 0x10000)) {
		d = (unsigned short)c;
		d = (unsigned short)((d >> 8) + (d << 8)); /* Reverse bytes */
		/* buf is a char buffer with no alignment guarantee: copy the
		   bytes instead of storing through a cast pointer */
		memcpy(buf, &d, sizeof d);
		return 2;
	} else if (c >= 0x10000 && c < 0x110000) {
		/* Split into a surrogate pair of 10 bits each */
		c -= 0x10000;
		d = (unsigned short)(0xD800 + (c >> 10));
		d = (unsigned short)((d >> 8) + (d << 8)); /* Reverse bytes */
		memcpy(buf, &d, sizeof d);
		d = (unsigned short)(0xDC00 + (c & 0x3FF));
		d = (unsigned short)((d >> 8) + (d << 8)); /* Reverse bytes */
		memcpy(buf + 2, &d, sizeof d);
		return 4;
	} else {
		return UTF16_BAD;
	}
}
JOE - Joe's own editor Mercurial

Branches

Tags

[26e832]: / joe / utf8.c Maximize Restore History

324 lines (298 with data), 7.4 kB