- if (INTP (code))
- {
- return XINT (code);
- }
- else if ( (XCHARSET_DIMENSION (charset) == 2) &&
- (XCHARSET_CHARS (charset) == 94) )
- {
- unsigned char final = XCHARSET_FINAL (charset);
-
- if ( ('@' <= final) && (final < 0x7f) )
- {
- return 0xe00000 + (final - '@') * 94 * 94
- + ((h & 127) - 33) * 94 + (l & 127) - 33;
- }
- else
- {
- return '?';
- }
- }
- else
- {
- return '?';
- }
-}
-
-/* Append to DST the value that mule_char_to_ucs4 () returns for the
-   character (CHARSET, H, L), as four bytes, most significant byte
-   first.  */
-static void
-encode_ucs4 (Lisp_Object charset,
-             unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
-{
-  unsigned long ucs = mule_char_to_ucs4 (charset, h, l);
-  int shift;
-
-  /* The top byte is added without a mask; the lower three are masked.  */
-  Dynarr_add (dst, ucs >> 24);
-  for (shift = 16; shift >= 0; shift -= 8)
-    {
-      Dynarr_add (dst, (ucs >> shift) & 255);
-    }
-}
-
-/* Scan the N bytes at SRC for data that cannot be UCS-4.
-   st->ucs4.in_byte is the position (0..3) inside the current four-byte
-   group and is kept in ST between calls.  Returns 0 as soon as the
-   first byte of a group is 128 or more, otherwise
-   CODING_CATEGORY_UCS4_MASK.  */
-static int
-detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
-                    unsigned int n)
-{
-  unsigned int i;
-
-  for (i = 0; i < n; i++)
-    {
-      /* Only the first byte of each group is inspected.  */
-      if (st->ucs4.in_byte == 0 && src[i] >= 128)
-        return 0;
-
-      if (st->ucs4.in_byte == 3)
-        st->ucs4.in_byte = 0;
-      else
-        st->ucs4.in_byte++;
-    }
-
-  return CODING_CATEGORY_UCS4_MASK;
-}
-
-/* Decode the N bytes of UCS-4 at SRC, passing each completed code to
-   decode_ucs4 () together with DST.  Four consecutive bytes make up one
-   code, most significant byte first.  ch holds the bytes gathered so far
-   and counter how many bytes of the current code are still missing; both
-   are loaded from the decoding stream and stored back at the end, so a
-   code may be split across two calls.  */
-static void
-decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
-                    unsigned_char_dynarr *dst, unsigned int n)
-{
-  struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
-  unsigned int flags = str->flags;
-  unsigned int ch = str->ch;
-  unsigned char counter = str->counter;
-
-  while (n--)
-    {
-      unsigned char c = *src++;
-      switch (counter)
-        {
-        case 0:
-          /* First byte of a code: three more to come.  */
-          ch = c;
-          counter = 3;
-          break;
-        case 1:
-          /* Last byte: the code is complete.  */
-          decode_ucs4 ( ( ch << 8 ) | c, dst);
-          ch = 0;
-          counter = 0;
-          break;
-        default:
-          ch = ( ch << 8 ) | c;
-          counter--;
-        }
-    }
-
-  /* CODING_STATE_END is looked up in flags, the way decode_coding_utf8 ()
-     does it.  counter is only the number of bytes still missing, so
-     testing it made this flush depend on where the input happened to be
-     cut.  */
-  if (flags & CODING_STATE_END)
-    DECODE_OUTPUT_PARTIAL_CHAR (ch);
-
-  str->flags = flags;
-  str->ch = ch;
-  str->counter = counter;
-}
-
-static void
-encode_coding_ucs4 (Lstream *encoding, const unsigned char *src,
- unsigned_char_dynarr *dst, unsigned int n)
-{ /* Encode the N bytes at SRC onto DST as UCS-4.  Every completed
-     character is handed to encode_ucs4 () with its charset and position
-     byte(s).  The partly read character (ch, charset, char_boundary) is
-     loaded from the encoding stream here and stored back at the end, so
-     a character may be split across two calls. */
- struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- unsigned int flags = str->flags; /* written back unchanged below */
- unsigned int ch = str->ch; /* pending prefix or first position byte, else 0 */
- unsigned char char_boundary = str->iso2022.current_char_boundary;
- Lisp_Object charset = str->iso2022.current_charset;
-
-#ifdef ENABLE_COMPOSITE_CHARS
- /* flags for handling composite chars. We do a little switcharoo
- on the source while we're outputting the composite char. */
- unsigned int saved_n = 0;
- const unsigned char *saved_src = NULL;
- int in_composite = 0;
-
- back_to_square_n:
-#endif
-
- while (n--)
- {
- unsigned char c = *src++;
-
- if (BYTE_ASCII_P (c))
- { /* Processing ASCII character */
- ch = 0;
- encode_ucs4 (Vcharset_ascii, c, 0, dst);
- char_boundary = 1;
- }
- else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
- { /* Processing Leading Byte.  A prefix leading byte is kept in ch, so
-     the byte after it also lands here and is the one that finally
-     selects the charset. */
- ch = 0;
- charset = CHARSET_BY_LEADING_BYTE (c);
- if (LEADING_BYTE_PREFIX_P(c))
- ch = c;
- char_boundary = 0;
- }
- else
- { /* Processing Non-ASCII character */
- char_boundary = 1;
- if (EQ (charset, Vcharset_control_1))
- {
- encode_ucs4 (Vcharset_control_1, c, 0, dst);
- }
- else
- {
- switch (XCHARSET_REP_BYTES (charset))
- {
- case 2: /* C alone completes the character */
- encode_ucs4 (charset, c, 0, dst);
- break;
- case 3:
- if (XCHARSET_PRIVATE_P (charset))
- { /* private charset: C alone completes the character */
- encode_ucs4 (charset, c, 0, dst);
- ch = 0;
- }
- else if (ch) /* C is the second position byte; ch holds the first */
- {
-#ifdef ENABLE_COMPOSITE_CHARS
- if (EQ (charset, Vcharset_composite))
- {
- if (in_composite) /* composite char met while expanding another one */
- {
- /* #### Bother! We don't know how to
- handle this yet. */
- Dynarr_add (dst, 0);
- Dynarr_add (dst, 0);
- Dynarr_add (dst, 0);
- Dynarr_add (dst, '~'); /* the four bytes 0 0 0 '~' are added directly */
- }
- else
- {
- Emchar emch = MAKE_CHAR (Vcharset_composite,
- ch & 0x7F, c & 0x7F);
- Lisp_Object lstr = composite_char_string (emch);
- saved_n = n; /* remember where the real input stopped ... */
- saved_src = src;
- in_composite = 1;
- src = XSTRING_DATA (lstr); /* ... and let the loop run over lstr instead */
- n = XSTRING_LENGTH (lstr);
- }
- }
- else
-#endif /* ENABLE_COMPOSITE_CHARS */
- {
- encode_ucs4(charset, ch, c, dst);
- }
- ch = 0;
- }
- else /* first position byte: keep it in ch and wait for the second */
- {
- ch = c;
- char_boundary = 0;
- }
- break;
- case 4:
- if (ch) /* C is the second position byte; ch holds the first */
- {
- encode_ucs4 (charset, ch, c, dst);
- ch = 0;
- }
- else /* first position byte: keep it in ch and wait for the second */
- {
- ch = c;
- char_boundary = 0;
- }
- break;
- default:
- abort ();
- }
- }
- }
- }
-
-#ifdef ENABLE_COMPOSITE_CHARS
- if (in_composite) /* the substituted string is used up: resume the saved input */
- {
- n = saved_n;
- src = saved_src;
- in_composite = 0;
- goto back_to_square_n; /* Wheeeeeeeee ..... */
- }
-#endif /* ENABLE_COMPOSITE_CHARS */
-
- str->flags = flags;
- str->ch = ch;
- str->iso2022.current_char_boundary = char_boundary;
- str->iso2022.current_charset = charset;
-
- /* Verbum caro factum est! */
-}
-
-\f
-/************************************************************************/
-/* UTF-8 methods */
-/************************************************************************/
-
-/* Scan the N bytes at SRC for data that cannot be UTF-8.
-   st->utf8.in_byte is the number of continuation bytes still expected
-   and is kept in ST between calls.  Returns 0 on ESC, SI or SO outside
-   a sequence, on a byte in 0x80..0xbf where a lead byte is expected, or
-   on a byte that is not of the form 10xxxxxx where a continuation byte
-   is expected; otherwise returns CODING_CATEGORY_UTF8_MASK.  */
-static int
-detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
-                    unsigned int n)
-{
-  unsigned int i;
-
-  for (i = 0; i < n; i++)
-    {
-      unsigned char byte = src[i];
-
-      if (st->utf8.in_byte != 0)
-        {
-          /* Inside a sequence: only 10xxxxxx is acceptable.  */
-          if ((byte & 0xc0) != 0x80)
-            return 0;
-          st->utf8.in_byte--;
-          continue;
-        }
-
-      if (byte == ISO_CODE_ESC || byte == ISO_CODE_SI || byte == ISO_CODE_SO)
-        return 0;
-
-      if (byte < 0x80)
-        continue;               /* plain single byte */
-      if (byte < 0xc0)
-        return 0;               /* continuation byte with no lead byte */
-
-      /* Lead byte: note how many continuation bytes have to follow.  */
-      if (byte < 0xe0)
-        st->utf8.in_byte = 1;
-      else if (byte < 0xf0)
-        st->utf8.in_byte = 2;
-      else if (byte < 0xf8)
-        st->utf8.in_byte = 3;
-      else if (byte < 0xfc)
-        st->utf8.in_byte = 4;
-      else
-        st->utf8.in_byte = 5;
-    }
-
-  return CODING_CATEGORY_UTF8_MASK;
-}
-
-static void
-decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
- unsigned_char_dynarr *dst, unsigned int n)
-{ /* Decode the N bytes of UTF-8 at SRC, passing every completed code to
-     decode_ucs4 () together with DST.  ch accumulates the code and
-     counter holds how many continuation bytes are still expected; both
-     are loaded from the decoding stream here and stored back at the end,
-     so a sequence may be split across two calls. */
- struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
- unsigned int flags = str->flags;
- unsigned int ch = str->ch;
- eol_type_t eol_type = str->eol_type;
- unsigned char counter = str->counter;
-
- while (n--)
- {
- unsigned char c = *src++;
- switch (counter)
- {
- case 0: /* not inside a sequence: C is a lead byte or a single byte */
- if ( c >= 0xfc )
- { /* 1 payload bit here, 5 continuation bytes follow */
- ch = c & 0x01;
- counter = 5;
- }
- else if ( c >= 0xf8 )
- { /* 2 payload bits here, 4 continuation bytes follow */
- ch = c & 0x03;
- counter = 4;
- }
- else if ( c >= 0xf0 )
- { /* 3 payload bits here, 3 continuation bytes follow */
- ch = c & 0x07;
- counter = 3;
- }
- else if ( c >= 0xe0 )
- { /* 4 payload bits here, 2 continuation bytes follow */
- ch = c & 0x0f;
- counter = 2;
- }
- else if ( c >= 0xc0 )
- { /* 5 payload bits here, 1 continuation byte follows */
- ch = c & 0x1f;
- counter = 1;
- }
- else /* c < 0xc0, which also covers stray bytes 0x80..0xbf */
- {
- DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); /* macro defined elsewhere; NOTE(review): presumably it may jump to label_continue_loop -- confirm */
- decode_ucs4 (c, dst);
- }
- break;
- case 1: /* last continuation byte: the code is complete */
- ch = ( ch << 6 ) | ( c & 0x3f ); /* the top two bits of C are not checked */
- decode_ucs4 (ch, dst);
- ch = 0;
- counter = 0;
- break;
- default: /* a continuation byte with more still to come */
- ch = ( ch << 6 ) | ( c & 0x3f );
- counter--;
- }
- label_continue_loop:; /* not referenced by name in this function's own text */
- }
-
- if (flags & CODING_STATE_END)
- DECODE_OUTPUT_PARTIAL_CHAR (ch);
-
- str->flags = flags;
- str->ch = ch;
- str->counter = counter;
-}
-
-/* Append to DST the value that mule_char_to_ucs4 () returns for the
-   character (CHARSET, H, L), written as a UTF-8 style sequence of one
-   to six bytes: a lead byte, then continuation bytes of the form
-   10xxxxxx carrying six bits each, most significant group first.  */
-static void
-encode_utf8 (Lisp_Object charset,
-             unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
-{
-  unsigned long ucs = mule_char_to_ucs4 (charset, h, l);
-  unsigned char lead;   /* marker bits of the lead byte */
-  int trail;            /* number of continuation bytes */
-
-  if (ucs <= 0x7f)
-    {
-      /* One byte, stored as is.  */
-      Dynarr_add (dst, ucs);
-      return;
-    }
-
-  if (ucs <= 0x7ff)
-    {
-      lead = 0xc0;
-      trail = 1;
-    }
-  else if (ucs <= 0xffff)
-    {
-      lead = 0xe0;
-      trail = 2;
-    }
-  else if (ucs <= 0x1fffff)
-    {
-      lead = 0xf0;
-      trail = 3;
-    }
-  else if (ucs <= 0x3ffffff)
-    {
-      lead = 0xf8;
-      trail = 4;
-    }
-  else
-    {
-      lead = 0xfc;
-      trail = 5;
-    }
-
-  /* Lead byte: the bits above all continuation groups, plus the marker.  */
-  Dynarr_add (dst, (ucs >> (6 * trail)) | lead);
-
-  /* Continuation bytes, highest six-bit group first.  */
-  while (trail-- > 0)
-    {
-      Dynarr_add (dst, ((ucs >> (6 * trail)) & 0x3f) | 0x80);
-    }
-}
-
-static void
-encode_coding_utf8 (Lstream *encoding, const unsigned char *src,
- unsigned_char_dynarr *dst, unsigned int n)
-{ /* Encode the N bytes at SRC onto DST as UTF-8.  Every completed
-     character is handed to encode_utf8 () with its charset and position
-     byte(s); a newline is written according to the coding system's EOL
-     type instead.  The partly read character (ch, charset,
-     char_boundary) is loaded from the encoding stream here and stored
-     back at the end, so a character may be split across two calls. */
- struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- unsigned int flags = str->flags; /* written back unchanged below */
- unsigned int ch = str->ch; /* pending prefix or first position byte, else 0 */
- eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
- unsigned char char_boundary = str->iso2022.current_char_boundary;
- Lisp_Object charset = str->iso2022.current_charset;
-
-#ifdef ENABLE_COMPOSITE_CHARS
- /* flags for handling composite chars. We do a little switcharoo
- on the source while we're outputting the composite char. */
- unsigned int saved_n = 0;
- const unsigned char *saved_src = NULL;
- int in_composite = 0;
-
- back_to_square_n:
-#endif /* ENABLE_COMPOSITE_CHARS */
-
- while (n--)
- {
- unsigned char c = *src++;
-
- if (BYTE_ASCII_P (c))
- { /* Processing ASCII character */
- ch = 0;
- if (c == '\n')
- { /* '\r' is written for every EOL type except LF and AUTODETECT,
-     the '\n' itself for every EOL type except CR */
- if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
- Dynarr_add (dst, '\r');
- if (eol_type != EOL_CR)
- Dynarr_add (dst, c);
- }
- else
- encode_utf8 (Vcharset_ascii, c, 0, dst);
- char_boundary = 1;
- }
- else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
- { /* Processing Leading Byte.  A prefix leading byte is kept in ch, so
-     the byte after it also lands here and is the one that finally
-     selects the charset. */
- ch = 0;
- charset = CHARSET_BY_LEADING_BYTE (c);
- if (LEADING_BYTE_PREFIX_P(c))
- ch = c;
- char_boundary = 0;
- }
- else
- { /* Processing Non-ASCII character */
- char_boundary = 1;
- if (EQ (charset, Vcharset_control_1))
- {
- encode_utf8 (Vcharset_control_1, c, 0, dst);
- }
- else
- {
- switch (XCHARSET_REP_BYTES (charset))
- {
- case 2: /* C alone completes the character */
- encode_utf8 (charset, c, 0, dst);
- break;
- case 3:
- if (XCHARSET_PRIVATE_P (charset))
- { /* private charset: C alone completes the character */
- encode_utf8 (charset, c, 0, dst);
- ch = 0;
- }
- else if (ch) /* C is the second position byte; ch holds the first */
- {
-#ifdef ENABLE_COMPOSITE_CHARS
- if (EQ (charset, Vcharset_composite))
- {
- if (in_composite) /* composite char met while expanding another one */
- {
- /* #### Bother! We don't know how to
- handle this yet. */
- encode_utf8 (Vcharset_ascii, '~', 0, dst);
- }
- else
- {
- Emchar emch = MAKE_CHAR (Vcharset_composite,
- ch & 0x7F, c & 0x7F);
- Lisp_Object lstr = composite_char_string (emch);
- saved_n = n; /* remember where the real input stopped ... */
- saved_src = src;
- in_composite = 1;
- src = XSTRING_DATA (lstr); /* ... and let the loop run over lstr instead */
- n = XSTRING_LENGTH (lstr);
- }
- }
- else
-#endif /* ENABLE_COMPOSITE_CHARS */
- {
- encode_utf8 (charset, ch, c, dst);
- }
- ch = 0;
- }
- else /* first position byte: keep it in ch and wait for the second */
- {
- ch = c;
- char_boundary = 0;
- }
- break;
- case 4:
- if (ch) /* C is the second position byte; ch holds the first */
- {
- encode_utf8 (charset, ch, c, dst);
- ch = 0;
- }
- else /* first position byte: keep it in ch and wait for the second */
- {
- ch = c;
- char_boundary = 0;
- }
- break;
- default:
- abort ();
- }
- }
- }
- }
-
-#ifdef ENABLE_COMPOSITE_CHARS
- if (in_composite) /* the substituted string is used up: resume the saved input */
- {
- n = saved_n;
- src = saved_src;
- in_composite = 0;
- goto back_to_square_n; /* Wheeeeeeeee ..... */
- }
-#endif
-
- str->flags = flags;
- str->ch = ch;
- str->iso2022.current_char_boundary = char_boundary;
- str->iso2022.current_charset = charset;
-
- /* Verbum caro factum est! */
-}
-
-\f
-/************************************************************************/
-/* ISO2022 methods */
-/************************************************************************/
-
-/* The following note describes the coding system ISO2022 briefly.
- Since the intention of this note is to help understand the
- functions in this file, some parts are NOT ACCURATE or OVERLY
- SIMPLIFIED. For thorough understanding, please refer to the
- original document of ISO2022.
-
- ISO2022 provides many mechanisms to encode several character sets
- in 7-bit and 8-bit environments. For 7-bit environments, all text
- is encoded using bytes less than 128. This may make the encoded
- text a little bit longer, but the text passes more easily through
- several gateways, some of which strip off MSB (Most Significant Bit).
-
- There are two kinds of character sets: control character set and
- graphic character set. The former contains control characters such
- as `newline' and `escape' to provide control functions (control
- functions are also provided by escape sequences). The latter
- contains graphic characters such as 'A' and '-'. Emacs recognizes
- two control character sets and many graphic character sets.
-
- Graphic character sets are classified into one of the following
- four classes, according to the number of bytes (DIMENSION) and
- number of characters in one dimension (CHARS) of the set:
- - DIMENSION1_CHARS94
- - DIMENSION1_CHARS96
- - DIMENSION2_CHARS94
- - DIMENSION2_CHARS96
-
- In addition, each character set is assigned an identification tag,
- unique for each set, called "final character" (denoted as <F>
- hereafter). The <F> of each character set is decided by ECMA(*)
- when it is registered in ISO. The code range of <F> is 0x30..0x7F
- (0x30..0x3F are for private use only).
-
- Note (*): ECMA = European Computer Manufacturers Association
-
- Here are examples of graphic character set [NAME(<F>)]:
- o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
- o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
- o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
- o DIMENSION2_CHARS96 -- none for the moment
-
- A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
- C0 [0x00..0x1F] -- control character plane 0
- GL [0x20..0x7F] -- graphic character plane 0
- C1 [0x80..0x9F] -- control character plane 1
- GR [0xA0..0xFF] -- graphic character plane 1
-
- A control character set is directly designated and invoked to C0 or
- C1 by an escape sequence. The most common case is that:
- - ISO646's control character set is designated/invoked to C0, and
- - ISO6429's control character set is designated/invoked to C1,
- and usually these designations/invocations are omitted in encoded
- text. In a 7-bit environment, only C0 can be used, and a control
- character for C1 is encoded by an appropriate escape sequence to
- fit into the environment. All control characters for C1 are
- defined to have corresponding escape sequences.
-
- A graphic character set is at first designated to one of four
- graphic registers (G0 through G3), then these graphic registers are
- invoked to GL or GR. These designations and invocations can be
- done independently. The most common case is that G0 is invoked to
- GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
- these invocations and designations are omitted in encoded text.
- In a 7-bit environment, only GL can be used.
-
- When a graphic character set of CHARS94 is invoked to GL, codes
- 0x20 and 0x7F of the GL area work as control characters SPACE and
- DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
- be used.
-
- There are two ways of invocation: locking-shift and single-shift.
- With locking-shift, the invocation lasts until the next different
- invocation, whereas with single-shift, the invocation affects the
- following character only and doesn't affect the locking-shift
- state. Invocations are done by the following control characters or
- escape sequences:
-
- ----------------------------------------------------------------------
- abbrev  function                  cntrl  escape seq  description
- ----------------------------------------------------------------------
- SI/LS0  (shift-in)                0x0F   none        invoke G0 into GL
- SO/LS1  (shift-out)               0x0E   none        invoke G1 into GL
- LS2     (locking-shift-2)         none   ESC 'n'     invoke G2 into GL
- LS3     (locking-shift-3)         none   ESC 'o'     invoke G3 into GL
- LS1R    (locking-shift-1 right)   none   ESC '~'     invoke G1 into GR (*)
- LS2R    (locking-shift-2 right)   none   ESC '}'     invoke G2 into GR (*)
- LS3R    (locking-shift 3 right)   none   ESC '|'     invoke G3 into GR (*)
- SS2     (single-shift-2)          0x8E   ESC 'N'     invoke G2 for one char
- SS3     (single-shift-3)          0x8F   ESC 'O'     invoke G3 for one char
- ----------------------------------------------------------------------
- (*) These are not used by any known coding system.
-
- Control characters for these functions are defined by macros
- ISO_CODE_XXX in `coding.h'.
-
- Designations are done by the following escape sequences: