+/* Heuristic detector for UCS-4 input.
+
+   Scans the N bytes starting at SRC.  st->ucs4.in_byte is the position of
+   the next byte inside the current four-byte group (it wraps from 3 back to
+   0) and is kept in ST, so the position carries over from one call to the
+   next.  The first byte of every group must be below 128; if it is not, 0
+   is returned at once.  If the whole buffer passes,
+   CODING_CATEGORY_UCS4_MASK is returned.  */
+static int
+detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
+                    unsigned int n)
+{
+  for (; n > 0; n--)
+    {
+      unsigned char byte = *src++;
+
+      /* Only the leading byte of a group is constrained.  */
+      if (st->ucs4.in_byte == 0 && byte > 0x7f)
+        return 0;
+
+      /* Step to the next position, wrapping after the fourth byte.  */
+      if (st->ucs4.in_byte == 3)
+        st->ucs4.in_byte = 0;
+      else
+        st->ucs4.in_byte++;
+    }
+
+  return CODING_CATEGORY_UCS4_MASK;
+}
+
+/* Decode N bytes of UCS-4 text from SRC, handing every completed code
+   point to decode_ucs4 (), which receives DST as its output array.
+
+   Each character is four bytes, most significant byte first.  `counter' is
+   the number of bytes still missing from the character being assembled in
+   `ch' (0 means we are at a character boundary).  Both are loaded from and
+   stored back to the decoding stream, so a character that is split across
+   two calls is reassembled correctly.
+
+   When the stream flags contain CODING_STATE_END, whatever is still
+   pending in `ch' is handed to DECODE_OUTPUT_PARTIAL_CHAR.  */
+static void
+decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
+                    unsigned_char_dynarr *dst, unsigned int n)
+{
+  struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
+  unsigned int flags = str->flags;
+  unsigned int ch = str->ch;
+  unsigned char counter = str->counter;
+
+  while (n--)
+    {
+      unsigned char c = *src++;
+      switch (counter)
+        {
+        case 0:
+          /* First byte of a new character; three more to come.  */
+          ch = c;
+          counter = 3;
+          break;
+        case 1:
+          /* Last byte: the character is complete.  */
+          decode_ucs4 ((ch << 8) | c, dst);
+          ch = 0;
+          counter = 0;
+          break;
+        default:
+          /* Second or third byte: keep accumulating.  */
+          ch = (ch << 8) | c;
+          counter--;
+          break;
+        }
+    }
+
+  /* CODING_STATE_END is a stream flag, so it has to be looked up in
+     `flags'.  `counter' only holds the number of bytes outstanding (0..3);
+     masking it with a flag bit would make the flush depend on where the
+     buffer happened to be cut.  decode_coding_utf8 tests `flags' in the
+     same way.  */
+  if (flags & CODING_STATE_END)
+    DECODE_OUTPUT_PARTIAL_CHAR (ch);
+
+  str->flags = flags;
+  str->ch = ch;
+  str->counter = counter;
+}
+
+static void
+encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n)
+{ /* Encode the N bytes of internal text at SRC, emitting every completed
+ character through encode_ucs4 () with DST as the output array.  The
+ scanner state (flags, pending byte `ch', char-boundary flag, current
+ charset) is read from the encoding stream on entry and written back on
+ exit.  When UTF2000 is defined the entire body is compiled out and this
+ function does nothing. */
+#ifndef UTF2000
+ struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
+ unsigned int flags = str->flags; /* only loaded and stored back here */
+ unsigned int ch = str->ch; /* pending prefix byte or first position byte; 0 if none */
+ unsigned char char_boundary = str->iso2022.current_char_boundary; /* 1 after a complete character, 0 in mid-character */
+ Lisp_Object charset = str->iso2022.current_charset; /* charset chosen by the most recent leading byte */
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ /* flags for handling composite chars. We do a little switcharoo
+ on the source while we're outputting the composite char. */
+ unsigned int saved_n = 0;
+ CONST unsigned char *saved_src = NULL;
+ int in_composite = 0;
+
+ back_to_square_n:
+#endif
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+
+ if (BYTE_ASCII_P (c))
+ { /* Processing ASCII character */
+ ch = 0;
+ encode_ucs4 (Vcharset_ascii, c, 0, dst);
+ char_boundary = 1;
+ }
+ else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
+ { /* Processing Leading Byte */
+ ch = 0;
+ charset = CHARSET_BY_LEADING_BYTE (c);
+ if (LEADING_BYTE_PREFIX_P(c)) /* keep a prefix byte in ch: the `ch' half of the test above then presumably routes the next byte here as well */
+ ch = c;
+ char_boundary = 0;
+ }
+ else
+ { /* Processing Non-ASCII character */
+ char_boundary = 1; /* reset to 0 below when C is only the first of two position bytes */
+ if (EQ (charset, Vcharset_control_1))
+ {
+ encode_ucs4 (Vcharset_control_1, c, 0, dst);
+ }
+ else
+ {
+ switch (XCHARSET_REP_BYTES (charset)) /* presumably the character's total byte length in the buffer -- TODO confirm */
+ {
+ case 2: /* C alone completes the character */
+ encode_ucs4 (charset, c, 0, dst);
+ break;
+ case 3:
+ if (XCHARSET_PRIVATE_P (charset)) /* private charset: encoded from C alone */
+ {
+ encode_ucs4 (charset, c, 0, dst);
+ ch = 0;
+ }
+ else if (ch) /* second position byte; ch holds the first */
+ {
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (EQ (charset, Vcharset_composite))
+ {
+ if (in_composite)
+ {
+ /* #### Bother! We don't know how to
+ handle this yet. */
+ Dynarr_add (dst, 0); /* nested composite: emit the four bytes 00 00 00 '~' instead */
+ Dynarr_add (dst, 0);
+ Dynarr_add (dst, 0);
+ Dynarr_add (dst, '~');
+ }
+ else
+ {
+ Emchar emch = MAKE_CHAR (Vcharset_composite,
+ ch & 0x7F, c & 0x7F);
+ Lisp_Object lstr = composite_char_string (emch);
+ saved_n = n; /* remember where we were in the real input ... */
+ saved_src = src;
+ in_composite = 1;
+ src = XSTRING_DATA (lstr); /* ... and let the loop read the composite's string instead */
+ n = XSTRING_LENGTH (lstr);
+ }
+ }
+ else
+#endif /* ENABLE_COMPOSITE_CHARS */
+ {
+ encode_ucs4(charset, ch, c, dst);
+ }
+ ch = 0;
+ }
+ else /* first position byte: stash it and wait for the second */
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ case 4:
+ if (ch) /* second position byte; ch holds the first */
+ {
+ encode_ucs4 (charset, ch, c, dst);
+ ch = 0;
+ }
+ else /* first position byte: stash it and wait for the second */
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ default: /* any other value of XCHARSET_REP_BYTES is treated as fatal */
+ abort ();
+ }
+ }
+ }
+ }
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (in_composite) /* the composite's string is used up: restore the real input and rerun the loop */
+ {
+ n = saved_n;
+ src = saved_src;
+ in_composite = 0;
+ goto back_to_square_n; /* Wheeeeeeeee ..... */
+ }
+#endif /* ENABLE_COMPOSITE_CHARS */
+
+ str->flags = flags; /* save the scanner state for the next call */
+ str->ch = ch;
+ str->iso2022.current_char_boundary = char_boundary;
+ str->iso2022.current_charset = charset;
+
+ /* Verbum caro factum est! */
+#endif
+}
+
+\f
+/************************************************************************/
+/* UTF-8 methods */
+/************************************************************************/
+
+/* Heuristic detector for UTF-8 input.
+
+   Scans the N bytes at SRC.  st->utf8.in_byte is the number of
+   continuation bytes (10xxxxxx) still expected for the sequence in
+   progress; it is kept in ST, so a sequence may span two calls.
+
+   Returns 0 as soon as a byte rules UTF-8 out:
+     - ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO at a character boundary;
+     - 0xFE or 0xFF, which occur in no UTF-8 sequence;
+     - a continuation byte (0x80..0xBF) where a lead byte is required;
+     - anything but a continuation byte inside a sequence.
+   Otherwise returns CODING_CATEGORY_UTF8_MASK.  */
+static int
+detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
+                    unsigned int n)
+{
+  while (n--)
+    {
+      unsigned char c = *src++;
+      switch (st->utf8.in_byte)
+        {
+        case 0:
+          if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+            return 0;
+          else if (c >= 0xfe)
+            /* Not a lead byte of any length; treating it as the start of
+               a six-byte sequence would let invalid input pass.  */
+            return 0;
+          else if (c >= 0xfc)
+            st->utf8.in_byte = 5;
+          else if (c >= 0xf8)
+            st->utf8.in_byte = 4;
+          else if (c >= 0xf0)
+            st->utf8.in_byte = 3;
+          else if (c >= 0xe0)
+            st->utf8.in_byte = 2;
+          else if (c >= 0xc0)
+            st->utf8.in_byte = 1;
+          else if (c >= 0x80)
+            /* Stray continuation byte.  */
+            return 0;
+          break;
+        default:
+          if ((c & 0xc0) != 0x80)
+            return 0;
+          else
+            st->utf8.in_byte--;
+          break;
+        }
+    }
+  return CODING_CATEGORY_UTF8_MASK;
+}
+
+/* Decode N bytes of UTF-8 from SRC, handing every completed code point to
+   decode_ucs4 (), which receives DST as its output array.
+
+   `counter' is the number of continuation bytes still expected and `ch'
+   the value accumulated so far; both are loaded from and stored back to
+   the decoding stream.  Sequences of up to six bytes are recognised by
+   their lead byte.  Every byte below 0xC0 seen at a character boundary is
+   first run through DECODE_HANDLE_EOL_TYPE and then passed to
+   decode_ucs4 () unchanged.  Continuation bytes are not validated: their
+   low six bits are taken as they are.  */
+static void
+decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
+                    unsigned_char_dynarr *dst, unsigned int n)
+{
+  struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
+  unsigned int flags = str->flags;
+  unsigned int ch = str->ch;
+  eol_type_t eol_type = str->eol_type;
+  unsigned char counter = str->counter;
+
+  while (n--)
+    {
+      unsigned char c = *src++;
+
+      switch (counter)
+        {
+        case 0:
+          /* At a character boundary: classify the byte, smallest range
+             first.  */
+          if (c < 0xc0)
+            {
+              DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
+              decode_ucs4 (c, dst);
+            }
+          else if (c < 0xe0)
+            {
+              counter = 1;
+              ch = c & 0x1f;
+            }
+          else if (c < 0xf0)
+            {
+              counter = 2;
+              ch = c & 0x0f;
+            }
+          else if (c < 0xf8)
+            {
+              counter = 3;
+              ch = c & 0x07;
+            }
+          else if (c < 0xfc)
+            {
+              counter = 4;
+              ch = c & 0x03;
+            }
+          else
+            {
+              counter = 5;
+              ch = c & 0x01;
+            }
+          break;
+
+        default:
+          /* Inside a sequence: shift in six more bits, and emit the
+             character once the last expected byte has arrived.  */
+          ch = (ch << 6) | (c & 0x3f);
+          if (--counter == 0)
+            {
+              decode_ucs4 (ch, dst);
+              ch = 0;
+            }
+          break;
+        }
+      /* Nothing in this function's own text jumps here; presumably
+         DECODE_HANDLE_EOL_TYPE does, so the label must stay.  */
+    label_continue_loop:;
+    }
+
+  if (flags & CODING_STATE_END)
+    DECODE_OUTPUT_PARTIAL_CHAR (ch);
+
+  str->flags = flags;
+  str->ch = ch;
+  str->counter = counter;
+}
+
+#ifndef UTF2000
+/* Append to DST the UTF-8 form of the character given by CHARSET and the
+   position bytes H and L.
+
+   The character is first converted with mule_char_to_ucs4 ().  Values up
+   to 0x7f are emitted as a single byte; larger values use two to six
+   bytes, chosen by magnitude (limits 0x7ff, 0xffff, 0x1fffff, 0x3ffffff,
+   and everything above).  */
+static void
+encode_utf8 (Lisp_Object charset,
+             unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
+{
+  unsigned long code = mule_char_to_ucs4 (charset, h, l);
+  unsigned char lead;   /* marker bits of the first byte */
+  int trailing;         /* number of 10xxxxxx bytes after the first */
+
+  if (code <= 0x7f)
+    {
+      Dynarr_add (dst, code);
+      return;
+    }
+
+  if (code <= 0x7ff)
+    {
+      trailing = 1;
+      lead = 0xc0;
+    }
+  else if (code <= 0xffff)
+    {
+      trailing = 2;
+      lead = 0xe0;
+    }
+  else if (code <= 0x1fffff)
+    {
+      trailing = 3;
+      lead = 0xf0;
+    }
+  else if (code <= 0x3ffffff)
+    {
+      trailing = 4;
+      lead = 0xf8;
+    }
+  else
+    {
+      trailing = 5;
+      lead = 0xfc;
+    }
+
+  /* First byte: marker bits plus the topmost payload bits.  */
+  Dynarr_add (dst, (code >> (6 * trailing)) | lead);
+
+  /* Remaining bytes: six payload bits each, highest group first.  */
+  while (trailing-- > 0)
+    {
+      Dynarr_add (dst, ((code >> (6 * trailing)) & 0x3f) | 0x80);
+    }
+}
+#endif
+
+static void
+encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n)
+{ /* Encode the N bytes of internal text at SRC as UTF-8 into DST.  With
+ UTF2000 defined the bytes are copied through unchanged, except that '\n'
+ is rewritten according to the coding system's EOL type.  Without it,
+ characters are reassembled from leading and position bytes and emitted
+ through encode_utf8 ().  The scanner state is read from the encoding
+ stream on entry and written back on exit. */
+ struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
+ unsigned int flags = str->flags; /* only loaded and stored back here */
+ unsigned int ch = str->ch; /* non-UTF2000: pending prefix byte or first position byte */
+ eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
+ unsigned char char_boundary = str->iso2022.current_char_boundary; /* UTF2000: bytes still expected in the current sequence; otherwise 1 after a complete character, 0 in mid-character */
+#ifdef UTF2000
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+ switch (char_boundary)
+ {
+ case 0: /* at a character boundary: a lead byte sets how many bytes follow */
+ if ( c >= 0xfc )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 5;
+ }
+ else if ( c >= 0xf8 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 4;
+ }
+ else if ( c >= 0xf0 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 3;
+ }
+ else if ( c >= 0xe0 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 2;
+ }
+ else if ( c >= 0xc0 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 1;
+ }
+ else /* any byte below 0xc0 is a complete character by itself here */
+ {
+ if (c == '\n')
+ {
+ if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) /* neither LF nor autodetect: emit '\r' first */
+ Dynarr_add (dst, '\r');
+ if (eol_type != EOL_CR) /* anything but CR: emit the '\n' itself */
+ Dynarr_add (dst, c);
+ }
+ else
+ Dynarr_add (dst, c);
+ char_boundary = 0;
+ }
+ break;
+ case 1: /* last byte of the current sequence */
+ Dynarr_add (dst, c);
+ char_boundary = 0;
+ break;
+ default: /* a middle byte: copy it and count down */
+ Dynarr_add (dst, c);
+ char_boundary--;
+ }
+ }
+#else /* not UTF2000 */
+ Lisp_Object charset = str->iso2022.current_charset; /* charset chosen by the most recent leading byte */
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ /* flags for handling composite chars. We do a little switcharoo
+ on the source while we're outputting the composite char. */
+ unsigned int saved_n = 0;
+ CONST unsigned char *saved_src = NULL;
+ int in_composite = 0;
+
+ back_to_square_n:
+#endif /* ENABLE_COMPOSITE_CHARS */
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+
+ if (BYTE_ASCII_P (c))
+ { /* Processing ASCII character */
+ ch = 0;
+ if (c == '\n')
+ {
+ if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) /* neither LF nor autodetect: emit '\r' first */
+ Dynarr_add (dst, '\r');
+ if (eol_type != EOL_CR) /* anything but CR: emit the '\n' itself */
+ Dynarr_add (dst, c);
+ }
+ else
+ encode_utf8 (Vcharset_ascii, c, 0, dst);
+ char_boundary = 1;
+ }
+ else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
+ { /* Processing Leading Byte */
+ ch = 0;
+ charset = CHARSET_BY_LEADING_BYTE (c);
+ if (LEADING_BYTE_PREFIX_P(c)) /* keep a prefix byte in ch: the `ch' half of the test above then presumably routes the next byte here as well */
+ ch = c;
+ char_boundary = 0;
+ }
+ else
+ { /* Processing Non-ASCII character */
+ char_boundary = 1; /* reset to 0 below when C is only the first of two position bytes */
+ if (EQ (charset, Vcharset_control_1))
+ {
+ encode_utf8 (Vcharset_control_1, c, 0, dst);
+ }
+ else
+ {
+ switch (XCHARSET_REP_BYTES (charset)) /* presumably the character's total byte length in the buffer -- TODO confirm */
+ {
+ case 2: /* C alone completes the character */
+ encode_utf8 (charset, c, 0, dst);
+ break;
+ case 3:
+ if (XCHARSET_PRIVATE_P (charset)) /* private charset: encoded from C alone */
+ {
+ encode_utf8 (charset, c, 0, dst);
+ ch = 0;
+ }
+ else if (ch) /* second position byte; ch holds the first */
+ {
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (EQ (charset, Vcharset_composite))
+ {
+ if (in_composite)
+ {
+ /* #### Bother! We don't know how to
+ handle this yet. */
+ encode_utf8 (Vcharset_ascii, '~', 0, dst); /* nested composite: emit '~' instead */
+ }
+ else
+ {
+ Emchar emch = MAKE_CHAR (Vcharset_composite,
+ ch & 0x7F, c & 0x7F);
+ Lisp_Object lstr = composite_char_string (emch);
+ saved_n = n; /* remember where we were in the real input ... */
+ saved_src = src;
+ in_composite = 1;
+ src = XSTRING_DATA (lstr); /* ... and let the loop read the composite's string instead */
+ n = XSTRING_LENGTH (lstr);
+ }
+ }
+ else
+#endif /* ENABLE_COMPOSITE_CHARS */
+ {
+ encode_utf8 (charset, ch, c, dst);
+ }
+ ch = 0;
+ }
+ else /* first position byte: stash it and wait for the second */
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ case 4:
+ if (ch) /* second position byte; ch holds the first */
+ {
+ encode_utf8 (charset, ch, c, dst);
+ ch = 0;
+ }
+ else /* first position byte: stash it and wait for the second */
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ default: /* any other value of XCHARSET_REP_BYTES is treated as fatal */
+ abort ();
+ }
+ }
+ }
+ }
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (in_composite) /* the composite's string is used up: restore the real input and rerun the loop */
+ {
+ n = saved_n;
+ src = saved_src;
+ in_composite = 0;
+ goto back_to_square_n; /* Wheeeeeeeee ..... */
+ }
+#endif
+
+#endif /* not UTF2000 */
+ str->flags = flags; /* save the scanner state for the next call */
+ str->ch = ch;
+ str->iso2022.current_char_boundary = char_boundary;
+#ifndef UTF2000
+ str->iso2022.current_charset = charset;
+#endif
+
+ /* Verbum caro factum est! */
+}
+
+\f
+/************************************************************************/
+/* ISO2022 methods */
+/************************************************************************/
+
+/* The following note describes the coding system ISO2022 briefly.
+ Since the intention of this note is to help understand the
+ functions in this file, some parts are NOT ACCURATE or OVERLY
+ SIMPLIFIED. For thorough understanding, please refer to the
+ original document of ISO2022.
+
+ ISO2022 provides many mechanisms to encode several character sets
+ in 7-bit and 8-bit environments. For 7-bit environments, all text
+ is encoded using bytes less than 128. This may make the encoded
+ text a little bit longer, but the text passes more easily through
+ several gateways, some of which strip off MSB (Most Significant Bit).
+
+ There are two kinds of character sets: control character set and
+ graphic character set. The former contains control characters such
+ as `newline' and `escape' to provide control functions (control
+ functions are also provided by escape sequences). The latter
+ contains graphic characters such as 'A' and '-'. Emacs recognizes
+ two control character sets and many graphic character sets.
+
+ Graphic character sets are classified into one of the following
+ four classes, according to the number of bytes (DIMENSION) and
+ number of characters in one dimension (CHARS) of the set:
+ - DIMENSION1_CHARS94
+ - DIMENSION1_CHARS96
+ - DIMENSION2_CHARS94
+ - DIMENSION2_CHARS96
+
+ In addition, each character set is assigned an identification tag,
+ unique for each set, called "final character" (denoted as <F>
+ hereafter). The <F> of each character set is decided by ECMA(*)
+ when it is registered in ISO. The code range of <F> is 0x30..0x7F
+ (0x30..0x3F are for private use only).
+
+ Note (*): ECMA = European Computer Manufacturers Association
+
+ Here are examples of graphic character set [NAME(<F>)]:
+ o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
+ o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
+ o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
+ o DIMENSION2_CHARS96 -- none for the moment
+
+ A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
+ C0 [0x00..0x1F] -- control character plane 0
+ GL [0x20..0x7F] -- graphic character plane 0
+ C1 [0x80..0x9F] -- control character plane 1
+ GR [0xA0..0xFF] -- graphic character plane 1
+
+ A control character set is directly designated and invoked to C0 or
+ C1 by an escape sequence. The most common case is that:
+ - ISO646's control character set is designated/invoked to C0, and
+ - ISO6429's control character set is designated/invoked to C1,
+ and usually these designations/invocations are omitted in encoded
+ text. In a 7-bit environment, only C0 can be used, and a control
+ character for C1 is encoded by an appropriate escape sequence to
+ fit into the environment. All control characters for C1 are
+ defined to have corresponding escape sequences.
+
+ A graphic character set is at first designated to one of four
+ graphic registers (G0 through G3), then these graphic registers are
+ invoked to GL or GR. These designations and invocations can be
+ done independently. The most common case is that G0 is invoked to
+ GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
+ these invocations and designations are omitted in encoded text.
+ In a 7-bit environment, only GL can be used.
+
+ When a graphic character set of CHARS94 is invoked to GL, codes
+ 0x20 and 0x7F of the GL area work as control characters SPACE and
+ DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
+ be used.
+
+ There are two ways of invocation: locking-shift and single-shift.
+ With locking-shift, the invocation lasts until the next different
+ invocation, whereas with single-shift, the invocation affects the
+ following character only and doesn't affect the locking-shift
+ state. Invocations are done by the following control characters or
+ escape sequences:
+
+ ----------------------------------------------------------------------
+ abbrev function cntrl escape seq description
+ ----------------------------------------------------------------------
+ SI/LS0 (shift-in) 0x0F none invoke G0 into GL
+ SO/LS1 (shift-out) 0x0E none invoke G1 into GL
+ LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
+ LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
+ LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
+ LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
+ LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
+ SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
+ SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
+ ----------------------------------------------------------------------
+ (*) These are not used by any known coding system.
+
+ Control characters for these functions are defined by macros
+ ISO_CODE_XXX in `coding.h'.
+
+ Designations are done by the following escape sequences:
+ ----------------------------------------------------------------------
+ escape sequence description
+ ----------------------------------------------------------------------
+ ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
+ ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
+ ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
+ ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
+ ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
+ ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
+ ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
+ ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
+ ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
+ ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
+ ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
+ ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
+ ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
+ ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
+ ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
+ ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
+ ----------------------------------------------------------------------
+
+ In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
+ of dimension 1, chars 94, and final character <F>, etc...
+
+ Note (*): Although these designations are not allowed in ISO2022,
+ Emacs accepts them on decoding, and produces them on encoding
+ CHARS96 character sets in a coding system which is characterized as
+ 7-bit environment, non-locking-shift, and non-single-shift.
+
+ Note (**): If <F> is '@', 'A', or 'B', the intermediate character
+ '(' can be omitted. We refer to this as "short-form" hereafter.
+
+ Now you may notice that there are a lot of ways for encoding the