JOE - Joe's own editor Mercurial

Brought to you by: jhallen, jjjordan, marx_sk, shallot
[c4b27c]: / joe / utf8.c Maximize Restore History
623 lines (551 with data), 14.7 kB

/*
 *	UTF-8 Utilities
 *	Copyright
 *		(C) 2004 Joseph H. Allen
 *
 *	This file is part of JOE (Joe's Own Editor)
 */
#include "types.h"

/* Under AmigaOS we have setlocale() but don't have langinfo.h and associated stuff,
 * so we have to disable the whole piece of code
 */
#ifdef __amigaos
#undef HAVE_SETLOCALE
#endif

/* Cygwin has CODESET, but it's crummy */
#ifdef __CYGWIN__
#undef HAVE_SETLOCALE
#endif

/* If it looks old, forget it */
#ifndef CODESET
#undef HAVE_SETLOCALE
#endif

#if defined(HAVE_LOCALE_H) && defined(HAVE_SETLOCALE)
#	include <locale.h>
#       include <langinfo.h>
#endif

/* nl_langinfo(CODESET) is broken on many systems.  If HAVE_SETLOCALE is undefined,
   JOE uses a limited internal version instead */

/* UTF-8 Encoder
 *
 * c is unicode character.
 * buf is 7 byte buffer- utf-8 coded character is written to this followed by a 0 termination.
 * returns length (not including terminator).
 */

ptrdiff_t utf8_encode(char *buf,int c)
{
	if (c < 0x80) {
		buf[0] = TO_CHAR_OK(c);
		buf[1] = 0;
		return 1;
	} else if(c < 0x800) {
		buf[0] = TO_CHAR_OK((0xc0|(c>>6)));
		buf[1] = TO_CHAR_OK((0x80|(c&0x3F)));
		buf[2] = 0;
		return 2;
	} else if(c < 0x10000) {
		buf[0] = TO_CHAR_OK((0xe0|(c>>12)));
		buf[1] = TO_CHAR_OK((0x80|((c>>6)&0x3f)));
		buf[2] = TO_CHAR_OK((0x80|((c)&0x3f)));
		buf[3] = 0;
		return 3;
	} else if(c < 0x200000) {
		buf[0] = TO_CHAR_OK((0xf0|(c>>18)));
		buf[1] = TO_CHAR_OK((0x80|((c>>12)&0x3f)));
		buf[2] = TO_CHAR_OK((0x80|((c>>6)&0x3f)));
		buf[3] = TO_CHAR_OK((0x80|((c)&0x3f)));
		buf[4] = 0;
		return 4;
	} else if(c < 0x4000000) {
		buf[0] = TO_CHAR_OK((0xf8|(c>>24)));
		buf[1] = TO_CHAR_OK((0x80|((c>>18)&0x3f)));
		buf[2] = TO_CHAR_OK((0x80|((c>>12)&0x3f)));
		buf[3] = TO_CHAR_OK((0x80|((c>>6)&0x3f)));
		buf[4] = TO_CHAR_OK((0x80|((c)&0x3f)));
		buf[5] = 0;
		return 5;
	} else {
		buf[0] = TO_CHAR_OK((0xfC|(c>>30)));
		buf[1] = TO_CHAR_OK((0x80|((c>>24)&0x3f)));
		buf[2] = TO_CHAR_OK((0x80|((c>>18)&0x3f)));
		buf[3] = TO_CHAR_OK((0x80|((c>>12)&0x3f)));
		buf[4] = TO_CHAR_OK((0x80|((c>>6)&0x3f)));
		buf[5] = TO_CHAR_OK((0x80|((c)&0x3f)));
		buf[6] = 0;
		return 6;
	}
}

/* UTF-8 Decoder
 *
 * Returns 0 - 7FFFFFFF: decoded character
 *                   -1: byte accepted, no character decoded yet.
 *                   -2: incomplete byte sequence
 *                   -3: no byte sequence started, but character is between 128 - 191, 254 or 255
 */

int utf8_decode(struct utf8_sm *utf8_sm,char c)
{
	if (utf8_sm->state) {
		if ((c&0xC0)==0x80) {
			utf8_sm->buf[utf8_sm->ptr++] = c;
			--utf8_sm->state;
			utf8_sm->accu = ((utf8_sm->accu<<6)|(c&0x3F));
			if(!utf8_sm->state)
				return utf8_sm->accu;
		} else {
			utf8_sm->state = 0;
			return -2;
		}
	} else if ((c&0xE0)==0xC0) {
		/* 192 - 223 */
		utf8_sm->buf[0] = c;
		utf8_sm->ptr = 1;
		utf8_sm->state = 1;
		utf8_sm->accu = (c&0x1F);
	} else if ((c&0xF0)==0xE0) {
		/* 224 - 239 */
		utf8_sm->buf[0] = c;
		utf8_sm->ptr = 1;
		utf8_sm->state = 2;
		utf8_sm->accu = (c&0x0F);
	} else if ((c&0xF8)==0xF0) {
		/* 240 - 247 */
		utf8_sm->buf[0] = c;
		utf8_sm->ptr = 1;
		utf8_sm->state = 3;
		utf8_sm->accu = (c&0x07);
	} else if ((c&0xFC)==0xF8) {
		/* 248 - 251 */
		utf8_sm->buf[0] = c;
		utf8_sm->ptr = 1;
		utf8_sm->state = 4;
		utf8_sm->accu = (c&0x03);
	} else if ((c&0xFE)==0xFC) {
		/* 252 - 253 */
		utf8_sm->buf[0] = c;
		utf8_sm->ptr = 1;
		utf8_sm->state = 5;
		utf8_sm->accu = (c&0x01);
	} else if ((c&0x80)==0x00) {
		/* 0 - 127 */
		utf8_sm->buf[0] = c;
		utf8_sm->ptr = 1;
		utf8_sm->state = 0;
		return c;
	} else {
		/* 128 - 191, 254, 255 */
		utf8_sm->ptr = 0;
		utf8_sm->state = 0;
		return -3;
	}
	return -1;
}

/* Initialize state machine */

void utf8_init(struct utf8_sm *utf8_sm)
{
	utf8_sm->ptr = 0;
	utf8_sm->state = 0;
}

/* Decode an entire string */

int utf8_decode_string(const char *s)
{
	struct utf8_sm sm;
	ptrdiff_t x;
	int c = -1;
	utf8_init(&sm);
	for(x=0;s[x];++x)
		c = utf8_decode(&sm,s[x]);
	return c;
}

/* Decode and advance
 *
 * Returns: 0 - 7FFFFFFF: decoded character
 *  -2: incomplete sequence
 *  -3: bad start of sequence found.
 *
 * p/plen are always advanced in such a way that repeated called to utf8_decode_fwrd do not cause
 * infinite loops.
 */

int utf8_decode_fwrd(const char **p,ptrdiff_t *plen)
{
	struct utf8_sm sm;
	const char *s = *p;
	ptrdiff_t len;
	int c = -2; /* Return this on no more input. */
	if (plen)
		len = *plen;
	else
		len = -1;

	utf8_init(&sm);

	while (len) {
		c = utf8_decode(&sm, *s);
		if (c >= 0) {
			/* We've got a character */
			--len;
			++s;
			break;
		} else if (c == -2) {
			/* Bad sequence detected.  Caller should feed rest of string in again. */
			break;
		} else if (c == -3) {
			/* Bad start of UTF-8 sequence.  We need to eat this char to avoid infinite loops. */
			--len;
			++s;
			/* But we should tell the caller that something bad was found. */
			break;
		} else {
			/* If c is -1, utf8_decode accepted the character, so we should get the next one. */
			--len;
			++s;
		}
	}

	if (plen)
		*plen = len;
	*p = s;

	return c;
}

/* For systems (BSD) with no nl_langinfo(CODESET) */

/*
 * This is a quick-and-dirty emulator of the nl_langinfo(CODESET)
 * function defined in the Single Unix Specification for those systems
 * (FreeBSD, etc.) that don't have one yet. It behaves as if it had
 * been called after setlocale(LC_CTYPE, ""), that is it looks at
 * the locale environment variables.
 *
 * http://www.opengroup.org/onlinepubs/7908799/xsh/langinfo.h.html
 *
 * Please extend it as needed and suggest improvements to the author.
 * This emulator will hopefully become redundant soon as
 * nl_langinfo(CODESET) becomes more widely implemented.
 *
 * Since the proposed Li18nux encoding name registry is still not mature,
 * the output follows the MIME registry where possible:
 *
 *   http://www.iana.org/assignments/character-sets
 *
 * A possible autoconf test for the availability of nl_langinfo(CODESET)
 * can be found in
 *
 *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#activate
 *
 * Markus.Kuhn@cl.cam.ac.uk -- 2002-03-11
 * Permission to use, copy, modify, and distribute this software
 * for any purpose and without fee is hereby granted. The author
 * disclaims all warranties with regard to this software.
 *
 * Latest version:
 *
 *   http://www.cl.cam.ac.uk/~mgk25/ucs/langinfo.c
 */

const char *joe_getcodeset(char *l)
{
  static char buf[16];
  char *p;
  
  if (l || ((l = getenv("LC_ALL"))   && *l) ||
      ((l = getenv("LC_CTYPE")) && *l) ||
      ((l = getenv("LANG"))     && *l)) {

    /* check standardized locales */
    if (!zcmp(l, "C") || !zcmp(l, "POSIX"))
      return "ascii";

    /* check for encoding name fragment */
    if (zstr(l, "UTF") || zstr(l, "utf"))
      return "UTF-8";

    if ((p = zstr(l, "8859-"))) {
      memcpy(buf, "ISO-8859-\0\0", 12);
      p += 5;
      if (*p >= '0' && *p <= '9') {
	buf[9] = *p++;
	if (*p >= '0' && *p <= '9') buf[10] = *p++;
	return buf;
      }
    }

    if (zstr(l, "KOI8-R")) return "KOI8-R";
    if (zstr(l, "KOI8-U")) return "KOI8-U";
    if (zstr(l, "620")) return "TIS-620";
    if (zstr(l, "1251")) return "CP1251";
    if (zstr(l, "2312")) return "GB2312";
    if (zstr(l, "HKSCS")) return "Big5HKSCS";   /* no MIME charset */
    if (zstr(l, "Big5") || zstr(l, "BIG5")) return "Big5";
    if (zstr(l, "GBK")) return "GBK";           /* no MIME charset */
    if (zstr(l, "18030")) return "GB18030";     /* no MIME charset */
    if (zstr(l, "Shift_JIS") || zstr(l, "SJIS")) return "Shift_JIS";
    /* check for conclusive modifier */
    if (zstr(l, "euro")) return "ISO-8859-15";
    /* check for language (and perhaps country) codes */
    if (zstr(l, "zh_TW")) return "Big5";
    if (zstr(l, "zh_HK")) return "Big5HKSCS";   /* no MIME charset */
    if (zstr(l, "zh")) return "GB2312";
    if (zstr(l, "ja")) return "EUC-JP";
    if (zstr(l, "ko")) return "EUC-KR";
    if (zstr(l, "ru")) return "KOI8-R";
    if (zstr(l, "uk")) return "KOI8-U";
    if (zstr(l, "pl") || zstr(l, "hr") ||
	zstr(l, "hu") || zstr(l, "cs") ||
	zstr(l, "sk") || zstr(l, "sl")) return "ISO-8859-2";
    if (zstr(l, "eo") || zstr(l, "mt")) return "ISO-8859-3";
    if (zstr(l, "el")) return "ISO-8859-7";
    if (zstr(l, "he")) return "ISO-8859-8";
    if (zstr(l, "tr")) return "ISO-8859-9";
    if (zstr(l, "th")) return "TIS-620";      /* or ISO-8859-11 */
    if (zstr(l, "lt")) return "ISO-8859-13";
    if (zstr(l, "cy")) return "ISO-8859-14";
    if (zstr(l, "ro")) return "ISO-8859-2";   /* or ISO-8859-16 */
    if (zstr(l, "am") || zstr(l, "vi")) return "UTF-8";
    /* Send me further rules if you like, but don't forget that we are
     * *only* interested in locale naming conventions on platforms
     * that do not already provide an nl_langinfo(CODESET) implementation. */
    return "ISO-8859-1"; /* should perhaps be "UTF-8" instead */
  }
  return "ascii";
}

/* Initialize locale for JOE */

const char *codeset;	/* Codeset of terminal */

const char *non_utf8_codeset;
			/* Codeset of local language non-UTF-8 */

const char *locale_lang;
			/* Our local language */

const char *locale_msgs;
			/* Language to use for editor messages */

struct charmap *locale_map;
			/* Character map of terminal, default map for new files */

struct charmap *locale_map_non_utf8;
			/* Old, non-utf8 version of locale */

void joe_locale()
{
	const char *sc;
	char *s, *t, *u;

	sc=getenv("LC_ALL");
	if (!sc || !*sc) {
		sc=getenv("LC_MESSAGES");
		if (!sc || !*sc) {
			sc=getenv("LANG");
		}
	}

	if (!sc)
		sc = "C";
	s=zdup(sc);

	if ((t=zrchr(s,'.')))
		*t = 0;

	if (!zicmp(s, "C") || !zicmp(s, "POSIX"))
		locale_msgs = "en_US"; /* Because aspell has no default */
	else
		locale_msgs = s;

	sc=getenv("LC_ALL");
	if (!sc || !*sc) {
		sc=getenv("LC_CTYPE");
		if (!sc || !*sc) {
			sc=getenv("LANG");
		}
	}

	if (!sc)
		sc = "C";

	s = zdup(sc);

	u = zdup(s);

	if ((t=zrchr(s,'.')))
		*t = 0;

	locale_lang = s;

#ifdef HAVE_SETLOCALE
	setlocale(LC_ALL,s);
	non_utf8_codeset = zdup(nl_langinfo(CODESET));
#else
	non_utf8_codeset = joe_getcodeset(s);
#endif


	/* printf("joe_locale\n"); */
#ifdef HAVE_SETLOCALE
	/* printf("set_locale\n"); */
	setlocale(LC_ALL,"");
#ifdef ENABLE_NLS
	/* Set up gettext() */
	bindtextdomain(PACKAGE, LOCALEDIR);
	textdomain(PACKAGE);
	/* printf("%s %s %s\n",PACKAGE,LOCALEDIR,joe_gettext("New File")); */
#endif
	codeset = zdup(nl_langinfo(CODESET));
#else
	codeset = joe_getcodeset(u);
#endif

	locale_map = find_charmap(codeset);
	if (!locale_map)
		locale_map = find_charmap("ascii");

	locale_map_non_utf8 = find_charmap(non_utf8_codeset);
	if (!locale_map_non_utf8)
		locale_map_non_utf8 = find_charmap("ascii");

	fdefault.charmap = locale_map;
	pdefault.charmap = locale_map;

/*
	printf("Character set is %s\n",locale_map->name);

	for(x=0;x!=128;++x)
		printf("%x	space=%d blank=%d alpha=%d alnum=%d punct=%d print=%d\n",
		       x,joe_isspace(locale_map,x), joe_isblank(locale_map,x), joe_isalpha_(locale_map,x),
		       joe_isalnum_(locale_map,x), joe_ispunct(locale_map,x), joe_isprint(locale_map,x));
*/

/* Use iconv() */
#ifdef junk
	to_utf = iconv_open("UTF-8", non_utf8_codeset);
	from_utf = iconv_open(non_utf8_codeset, "UTF-8");
#endif

/* For no locale support */
#ifdef junk
	locale_map = find_charmap("ascii");
	fdefault.charmap = locale_map;
	pdefault.charmap = locale_map;
#endif
	init_gettext(locale_msgs);
}

void to_utf8(struct charmap *map,char *s,int c)
{
	int d = to_uni(map,c);

	if (d==-1)
		utf8_encode(s,'?');
	else
		utf8_encode(s,d);
#ifdef junk
	/* Iconv() way */
	char buf[10];
	char *ibufp = buf;
	char *obufp = s;
	ptrdiff_t ibuf_sz=1;
	ptrdiff_t obuf_sz= 10;
	buf[0]=c;
	buf[1]=0;

	if (to_utf==(iconv_t)-1 ||
	    iconv(to_utf,(char **)&ibufp,&ibuf_sz,(char **)&obufp,&obuf_sz)==(size_t)-1) {
		s[0]='?';
		s[1]=0;
	} else {
		*obufp = 0;
	}
#endif
}

int from_utf8(struct charmap *map,const char *s)
{
	int d = utf8_decode_string(s);
	int c = from_uni(map,d);
	if (c==-1)
		return '?';
	else
		return c;

#ifdef junk
	/* Iconv() way */
	ptrdiff_t ibuf_sz=zlen(s);
	char *ibufp=s;
	ptrdiff_t obuf_sz=10;
	char obuf[10];
	char *obufp = obuf;


	if (from_utf==(iconv_t)-1 ||
	    iconv(from_utf,(char **)&ibufp,&ibuf_sz,(char **)&obufp,&obuf_sz)==((size_t)-1))
		return '?';
	else
		return obuf[0];
#endif
}

void my_iconv(char *dest, ptrdiff_t destsiz, struct charmap *dest_map,
              const char *src, struct charmap *src_map)
{
	if (dest_map == src_map) {
		zlcpy (dest, destsiz, src);
		return;
	}

	if (src_map->type) {
		/* src is UTF-8 */
		if (dest_map->type) {
			/* UTF-8 to UTF-8? */
			zlcpy (dest, destsiz, src);
		} else {
			--destsiz;
			/* UTF-8 to non-UTF-8 */
			while (*src && destsiz) {
				ptrdiff_t len = -1;
				int c = utf8_decode_fwrd(&src, &len);
				if (c >= 0) {
					int d = from_uni(dest_map, c);
					if (d >= 0)
						*dest++ = TO_CHAR_OK(d);
					else
						*dest++ = '?';
				} else
					*dest++ = 'X';
				--destsiz;
			}
			*dest = 0;
		}
	} else {
		/* src is not UTF-8 */
		if (!dest_map->type) {
			/* Non UTF-8 to non-UTF-8 */
			--destsiz;
			while (*src && destsiz) {
				int c = to_uni(src_map, *src++);
				int d;
				if (c >= 0) {
					d = from_uni(dest_map, c);
					if (d >= 0)
						*dest++ = TO_CHAR_OK(d);
					else
						*dest++ = '?';
				} else
					*dest++ = '?';
				--destsiz;
			}
			*dest = 0;
		} else {
			/* Non-UTF-8 to UTF-8 */
			--destsiz;
			while (*src && destsiz >= 6) {
				int c = to_uni(src_map, *src++);
				if (c >= 0) {
					ptrdiff_t l = utf8_encode(dest, c);
					destsiz -= l;
					dest += l;
				} else {
					*dest++ = '?';
					--destsiz;
				}
			}
			*dest = 0;
		}
	}
}

/* Guess character set */

int guess_non_utf8;
int guess_utf8;

struct charmap *guess_map(const char *buf, ptrdiff_t len)
{
	const char *p;
	ptrdiff_t plen;
	int c;
	int flag;

	/* No info? Use default */
	if (!len || (!guess_non_utf8 && !guess_utf8))
		return locale_map;

	/* Does it look like UTF-8? */
	p = buf;
	plen = len;
	c = 0;
	flag = 0;
	while (plen) {
		/* Break if we could possibly run out of data in
		   the middle of utf-8 sequence */
		if (plen < 7)
			break;
		/* if (*p >= 128) (for unsigned char) */
		if (*p < 0)
			flag = 1;
		c = utf8_decode_fwrd(&p, &plen);
		if (c < 0)
			break;
	}

	if (flag && c >= 0) {
		/* There are characters above 128, and there are no utf-8 errors */
		if (locale_map->type || !guess_utf8)
			return locale_map;
		else
			return find_charmap("utf-8");
	}

	if (!flag || !guess_non_utf8) {
		/* No characters above 128 */
		return locale_map;
	} else {
		/* Choose non-utf8 version of locale */
		return locale_map_non_utf8;
	}
}
JOE - Joe's own editor Mercurial

Branches

Tags

[c4b27c]: / joe / utf8.c Maximize Restore History

623 lines (551 with data), 14.7 kB