-/* UCS-4 methods */
-/* */
-/* UCS-4 character codes are implemented as nonnegative integers. */
-/* */
-/************************************************************************/
-
-Lisp_Object ucs_to_mule_table[65536];
-Lisp_Object mule_to_ucs_table;
-
-DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
-Map UCS-4 code CODE to Mule character CHARACTER.
-
-Return T on success, NIL on failure.
-*/
- (code, character))
-{
- unsigned int c;
-
- CHECK_CHAR (character);
- CHECK_INT (code);
- c = XINT (code);
-
- if (c < sizeof (ucs_to_mule_table))
- {
- ucs_to_mule_table[c] = character;
- return Qt;
- }
- else
- return Qnil;
-}
-
-static Lisp_Object
-ucs_to_char (unsigned long code)
-{
- if (code < sizeof (ucs_to_mule_table))
- {
- return ucs_to_mule_table[code];
- }
- else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
- {
- unsigned int c;
-
- code -= 0xe00000;
- c = code % (94 * 94);
- return make_char
- (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
- (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
- CHARSET_LEFT_TO_RIGHT),
- c / 94 + 33, c % 94 + 33));
- }
- else
- return Qnil;
-}
-
-DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
-Return Mule character corresponding to UCS code CODE (a positive integer).
-*/
- (code))
-{
- CHECK_NATNUM (code);
- return ucs_to_char (XINT (code));
-}
-
-DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
-Map Mule character CHARACTER to UCS code CODE (a positive integer).
-*/
- (character, code))
-{
- /* #### Isn't this gilding the lily? Fput_char_table checks its args.
- Fset_char_ucs is more restrictive on index arg, but should
- check code arg in a char_table method. */
- CHECK_CHAR (character);
- CHECK_NATNUM (code);
- return Fput_char_table (character, code, mule_to_ucs_table);
-}
-
-DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
-Return the UCS code (a positive integer) corresponding to CHARACTER.
-*/
- (character))
-{
- return Fget_char_table (character, mule_to_ucs_table);
-}
-
-/* Decode a UCS-4 character into a buffer. If the lookup fails, use
- <GETA MARK> (U+3013) of JIS X 0208, which means correct character
- is not found, instead.
- #### do something more appropriate (use blob?)
- Danger, Will Robinson! Data loss. Should we signal user? */
-static void
-decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
-{
- Lisp_Object chr = ucs_to_char (ch);
-
- if (! NILP (chr))
- {
- Bufbyte work[MAX_EMCHAR_LEN];
- int len;
-
- ch = XCHAR (chr);
- len = (ch < 128) ?
- simple_set_charptr_emchar (work, ch) :
- non_ascii_set_charptr_emchar (work, ch);
- Dynarr_add_many (dst, work, len);
- }
- else
- {
- Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
- Dynarr_add (dst, 34 + 128);
- Dynarr_add (dst, 46 + 128);
- }
-}
-
-static unsigned long
-mule_char_to_ucs4 (Lisp_Object charset,
- unsigned char h, unsigned char l)
-{
- Lisp_Object code
- = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
- mule_to_ucs_table);
-
- if (INTP (code))
- {
- return XINT (code);
- }
- else if ( (XCHARSET_DIMENSION (charset) == 2) &&
- (XCHARSET_CHARS (charset) == 94) )
- {
- unsigned char final = XCHARSET_FINAL (charset);
-
- if ( ('@' <= final) && (final < 0x7f) )
- {
- return 0xe00000 + (final - '@') * 94 * 94
- + ((h & 127) - 33) * 94 + (l & 127) - 33;
- }
- else
- {
- return '?';
- }
- }
- else
- {
- return '?';
- }
-}
-
-static void
-encode_ucs4 (Lisp_Object charset,
- unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
-{
- unsigned long code = mule_char_to_ucs4 (charset, h, l);
- Dynarr_add (dst, code >> 24);
- Dynarr_add (dst, (code >> 16) & 255);
- Dynarr_add (dst, (code >> 8) & 255);
- Dynarr_add (dst, code & 255);
-}
-
-static int
-detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
- unsigned int n)
-{
- while (n--)
- {
- int c = *src++;
- switch (st->ucs4.in_byte)
- {
- case 0:
- if (c >= 128)
- return 0;
- else
- st->ucs4.in_byte++;
- break;
- case 3:
- st->ucs4.in_byte = 0;
- break;
- default:
- st->ucs4.in_byte++;
- }
- }
- return CODING_CATEGORY_UCS4_MASK;
-}
-
-static void
-decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
- unsigned_char_dynarr *dst, unsigned int n)
-{
- struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
- unsigned int flags = str->flags;
- unsigned int ch = str->ch;
-
- while (n--)
- {
- unsigned char c = *src++;
- switch (flags)
- {
- case 0:
- ch = c;
- flags = 3;
- break;
- case 1:
- decode_ucs4 ( ( ch << 8 ) | c, dst);
- ch = 0;
- flags = 0;
- break;
- default:
- ch = ( ch << 8 ) | c;
- flags--;
- }
- }
- if (flags & CODING_STATE_END)
- DECODE_OUTPUT_PARTIAL_CHAR (ch);
-
- str->flags = flags;
- str->ch = ch;
-}
-
-static void
-encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
- unsigned_char_dynarr *dst, unsigned int n)
-{
- struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- unsigned int flags = str->flags;
- unsigned int ch = str->ch;
- unsigned char char_boundary = str->iso2022.current_char_boundary;
- Lisp_Object charset = str->iso2022.current_charset;
-
-#ifdef ENABLE_COMPOSITE_CHARS
- /* flags for handling composite chars. We do a little switcharoo
- on the source while we're outputting the composite char. */
- unsigned int saved_n = 0;
- CONST unsigned char *saved_src = NULL;
- int in_composite = 0;
-
- back_to_square_n:
-#endif
-
- while (n--)
- {
- unsigned char c = *src++;
-
- if (BYTE_ASCII_P (c))
- { /* Processing ASCII character */
- ch = 0;
- encode_ucs4 (Vcharset_ascii, c, 0, dst);
- char_boundary = 1;
- }
- else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
- { /* Processing Leading Byte */
- ch = 0;
- charset = CHARSET_BY_LEADING_BYTE (c);
- if (LEADING_BYTE_PREFIX_P(c))
- ch = c;
- char_boundary = 0;
- }
- else
- { /* Processing Non-ASCII character */
- char_boundary = 1;
- if (EQ (charset, Vcharset_control_1))
- {
- encode_ucs4 (Vcharset_control_1, c, 0, dst);
- }
- else
- {
- switch (XCHARSET_REP_BYTES (charset))
- {
- case 2:
- encode_ucs4 (charset, c, 0, dst);
- break;
- case 3:
- if (XCHARSET_PRIVATE_P (charset))
- {
- encode_ucs4 (charset, c, 0, dst);
- ch = 0;
- }
- else if (ch)
- {
-#ifdef ENABLE_COMPOSITE_CHARS
- if (EQ (charset, Vcharset_composite))
- {
- if (in_composite)
- {
- /* #### Bother! We don't know how to
- handle this yet. */
- Dynarr_add (dst, 0);
- Dynarr_add (dst, 0);
- Dynarr_add (dst, 0);
- Dynarr_add (dst, '~');
- }
- else
- {
- Emchar emch = MAKE_CHAR (Vcharset_composite,
- ch & 0x7F, c & 0x7F);
- Lisp_Object lstr = composite_char_string (emch);
- saved_n = n;
- saved_src = src;
- in_composite = 1;
- src = XSTRING_DATA (lstr);
- n = XSTRING_LENGTH (lstr);
- }
- }
- else
-#endif /* ENABLE_COMPOSITE_CHARS */
- {
- encode_ucs4(charset, ch, c, dst);
- }
- ch = 0;
- }
- else
- {
- ch = c;
- char_boundary = 0;
- }
- break;
- case 4:
- if (ch)
- {
- encode_ucs4 (charset, ch, c, dst);
- ch = 0;
- }
- else
- {
- ch = c;
- char_boundary = 0;
- }
- break;
- default:
- abort ();
- }
- }
- }
- }
-
-#ifdef ENABLE_COMPOSITE_CHARS
- if (in_composite)
- {
- n = saved_n;
- src = saved_src;
- in_composite = 0;
- goto back_to_square_n; /* Wheeeeeeeee ..... */
- }
-#endif /* ENABLE_COMPOSITE_CHARS */
-
- str->flags = flags;
- str->ch = ch;
- str->iso2022.current_char_boundary = char_boundary;
- str->iso2022.current_charset = charset;
-
- /* Verbum caro factum est! */
-}
-
-\f
-/************************************************************************/
-/* UTF-8 methods */
-/************************************************************************/
-
-static int
-detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
- unsigned int n)
-{
- while (n--)
- {
- unsigned char c = *src++;
- switch (st->utf8.in_byte)
- {
- case 0:
- if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
- return 0;
- else if (c >= 0xfc)
- st->utf8.in_byte = 5;
- else if (c >= 0xf8)
- st->utf8.in_byte = 4;
- else if (c >= 0xf0)
- st->utf8.in_byte = 3;
- else if (c >= 0xe0)
- st->utf8.in_byte = 2;
- else if (c >= 0xc0)
- st->utf8.in_byte = 1;
- else if (c >= 0x80)
- return 0;
- break;
- default:
- if ((c & 0xc0) != 0x80)
- return 0;
- else
- st->utf8.in_byte--;
- }
- }
- return CODING_CATEGORY_UTF8_MASK;
-}
-
-static void
-decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
- unsigned_char_dynarr *dst, unsigned int n)
-{
- struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
- unsigned int flags = str->flags;
- unsigned int ch = str->ch;
- eol_type_t eol_type = str->eol_type;
-
- while (n--)
- {
- unsigned char c = *src++;
- switch (flags)
- {
- case 0:
- if ( c >= 0xfc )
- {
- ch = c & 0x01;
- flags = 5;
- }
- else if ( c >= 0xf8 )
- {
- ch = c & 0x03;
- flags = 4;
- }
- else if ( c >= 0xf0 )
- {
- ch = c & 0x07;
- flags = 3;
- }
- else if ( c >= 0xe0 )
- {
- ch = c & 0x0f;
- flags = 2;
- }
- else if ( c >= 0xc0 )
- {
- ch = c & 0x1f;
- flags = 1;
- }
- else
- {
- DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
- decode_ucs4 (c, dst);
- }
- break;
- case 1:
- ch = ( ch << 6 ) | ( c & 0x3f );
- decode_ucs4 (ch, dst);
- ch = 0;
- flags = 0;
- break;
- default:
- ch = ( ch << 6 ) | ( c & 0x3f );
- flags--;
- }
- label_continue_loop:;
- }
-
- if (flags & CODING_STATE_END)
- DECODE_OUTPUT_PARTIAL_CHAR (ch);
-
- str->flags = flags;
- str->ch = ch;
-}
-
-static void
-encode_utf8 (Lisp_Object charset,
- unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
-{
- unsigned long code = mule_char_to_ucs4 (charset, h, l);
- if ( code <= 0x7f )
- {
- Dynarr_add (dst, code);
- }
- else if ( code <= 0x7ff )
- {
- Dynarr_add (dst, (code >> 6) | 0xc0);
- Dynarr_add (dst, (code & 0x3f) | 0x80);
- }
- else if ( code <= 0xffff )
- {
- Dynarr_add (dst, (code >> 12) | 0xe0);
- Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
- Dynarr_add (dst, (code & 0x3f) | 0x80);
- }
- else if ( code <= 0x1fffff )
- {
- Dynarr_add (dst, (code >> 18) | 0xf0);
- Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
- Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
- Dynarr_add (dst, (code & 0x3f) | 0x80);
- }
- else if ( code <= 0x3ffffff )
- {
- Dynarr_add (dst, (code >> 24) | 0xf8);
- Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
- Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
- Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
- Dynarr_add (dst, (code & 0x3f) | 0x80);
- }
- else
- {
- Dynarr_add (dst, (code >> 30) | 0xfc);
- Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
- Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
- Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
- Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
- Dynarr_add (dst, (code & 0x3f) | 0x80);
- }
-}
-
-static void
-encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
- unsigned_char_dynarr *dst, unsigned int n)
-{
- struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- unsigned int flags = str->flags;
- unsigned int ch = str->ch;
- eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
- unsigned char char_boundary = str->iso2022.current_char_boundary;
- Lisp_Object charset = str->iso2022.current_charset;
-
-#ifdef ENABLE_COMPOSITE_CHARS
- /* flags for handling composite chars. We do a little switcharoo
- on the source while we're outputting the composite char. */
- unsigned int saved_n = 0;
- CONST unsigned char *saved_src = NULL;
- int in_composite = 0;
-
- back_to_square_n:
-#endif /* ENABLE_COMPOSITE_CHARS */
-
- while (n--)
- {
- unsigned char c = *src++;
-
- if (BYTE_ASCII_P (c))
- { /* Processing ASCII character */
- ch = 0;
- if (c == '\n')
- {
- if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
- Dynarr_add (dst, '\r');
- if (eol_type != EOL_CR)
- Dynarr_add (dst, c);
- }
- else
- encode_utf8 (Vcharset_ascii, c, 0, dst);
- char_boundary = 1;
- }
- else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
- { /* Processing Leading Byte */
- ch = 0;
- charset = CHARSET_BY_LEADING_BYTE (c);
- if (LEADING_BYTE_PREFIX_P(c))
- ch = c;
- char_boundary = 0;
- }
- else
- { /* Processing Non-ASCII character */
- char_boundary = 1;
- if (EQ (charset, Vcharset_control_1))
- {
- encode_utf8 (Vcharset_control_1, c, 0, dst);
- }
- else
- {
- switch (XCHARSET_REP_BYTES (charset))
- {
- case 2:
- encode_utf8 (charset, c, 0, dst);
- break;
- case 3:
- if (XCHARSET_PRIVATE_P (charset))
- {
- encode_utf8 (charset, c, 0, dst);
- ch = 0;
- }
- else if (ch)
- {
-#ifdef ENABLE_COMPOSITE_CHARS
- if (EQ (charset, Vcharset_composite))
- {
- if (in_composite)
- {
- /* #### Bother! We don't know how to
- handle this yet. */
- encode_utf8 (Vcharset_ascii, '~', 0, dst);
- }
- else
- {
- Emchar emch = MAKE_CHAR (Vcharset_composite,
- ch & 0x7F, c & 0x7F);
- Lisp_Object lstr = composite_char_string (emch);
- saved_n = n;
- saved_src = src;
- in_composite = 1;
- src = XSTRING_DATA (lstr);
- n = XSTRING_LENGTH (lstr);
- }
- }
- else
-#endif /* ENABLE_COMPOSITE_CHARS */
- {
- encode_utf8 (charset, ch, c, dst);
- }
- ch = 0;
- }
- else
- {
- ch = c;
- char_boundary = 0;
- }
- break;
- case 4:
- if (ch)
- {
- encode_utf8 (charset, ch, c, dst);
- ch = 0;
- }
- else
- {
- ch = c;
- char_boundary = 0;
- }
- break;
- default:
- abort ();
- }
- }
- }
- }
-
-#ifdef ENABLE_COMPOSITE_CHARS
- if (in_composite)
- {
- n = saved_n;
- src = saved_src;
- in_composite = 0;
- goto back_to_square_n; /* Wheeeeeeeee ..... */
- }
-#endif
-
- str->flags = flags;
- str->ch = ch;
- str->iso2022.current_char_boundary = char_boundary;
- str->iso2022.current_charset = charset;
-
- /* Verbum caro factum est! */
-}
-
-\f
-/************************************************************************/