- unsigned char final = XCHARSET_FINAL (charset);
-
- if ( ('@' <= final) && (final < 0x7f) )
- {
- return 0xe00000 + (final - '@') * 94 * 94
- + ((h & 127) - 33) * 94 + (l & 127) - 33;
- }
- else
- {
- return '?';
- }
- }
- else
- {
- return '?';
- }
-}
-
-/* Append the UCS-4 code of the character (CHARSET, H, L), as computed by
-   mule_char_to_ucs4, to DST as four bytes, most significant byte first.  */
-static void
-encode_ucs4 (Lisp_Object charset,
-             unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
-{
-  unsigned long ucs = mule_char_to_ucs4 (charset, h, l);
-  int shift;
-
-  /* Walk from the top byte (bits 24..31) down to the bottom byte.  */
-  for (shift = 24; shift >= 0; shift -= 8)
-    {
-      unsigned char octet = (unsigned char) ((ucs >> shift) & 0xff);
-      Dynarr_add (dst, octet);
-    }
-}
-
-/* Detection step for UCS-4 over the N bytes at SRC.
-   ST->ucs4.in_byte holds the position (0..3) inside the current 4-byte
-   group and is kept across calls.  Returns 0 as soon as the first byte
-   of a group is 128 or above; otherwise returns
-   CODING_CATEGORY_UCS4_MASK.  */
-static int
-detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
-                    unsigned int n)
-{
-  for (; n > 0; n--, src++)
-    {
-      if (st->ucs4.in_byte == 3)
-        /* Last byte of the group: start over.  */
-        st->ucs4.in_byte = 0;
-      else if (st->ucs4.in_byte == 0 && *src >= 128)
-        /* The leading byte of a group must be below 128.  */
-        return 0;
-      else
-        st->ucs4.in_byte++;
-    }
-
-  return CODING_CATEGORY_UCS4_MASK;
-}
-
-/* Decode the N bytes at SRC as UCS-4, most significant byte first, and
-   hand each completed 32-bit code to decode_ucs4, which appends to DST.
-
-   A code may straddle two calls: the bytes gathered so far (CH) and the
-   number of bytes still missing (COUNTER) are loaded from the decoding
-   stream on entry and stored back on exit.  */
-static void
-decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
-                    unsigned_char_dynarr *dst, unsigned int n)
-{
-  struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
-  unsigned int flags = str->flags;
-  unsigned int ch = str->ch;
-  unsigned char counter = str->counter;
-
-  while (n--)
-    {
-      unsigned char c = *src++;
-      switch (counter)
-        {
-        case 0:
-          /* First byte of a new code; three more are expected.  */
-          ch = c;
-          counter = 3;
-          break;
-        case 1:
-          /* Fourth byte: the code is complete.  */
-          decode_ucs4 ( ( ch << 8 ) | c, dst);
-          ch = 0;
-          counter = 0;
-          break;
-        default:
-          /* Second or third byte: keep accumulating.  */
-          ch = ( ch << 8 ) | c;
-          counter--;
-        }
-    }
-
-  /* The end-of-stream marker is a bit in FLAGS.  The previous code
-     tested COUNTER, the byte countdown, against CODING_STATE_END, which
-     could flush a partial character in the middle of a 4-byte group;
-     decode_coding_utf8 tests FLAGS in the same place.  */
-  if (flags & CODING_STATE_END)
-    DECODE_OUTPUT_PARTIAL_CHAR (ch);
-
-  str->flags = flags;
-  str->ch = ch;
-  str->counter = counter;
-}
-
-static void
-encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
- unsigned_char_dynarr *dst, unsigned int n)
-{ /* Encode the N bytes at SRC as UCS-4, appending the output to DST.
-     Each completed character is emitted through encode_ucs4.  The
-     pending byte CH, the current charset and the char-boundary flag are
-     loaded from the encoding stream on entry and stored back on exit,
-     so a character may straddle two calls.
-     NOTE(review): SRC is presumably Mule internal-format text -- confirm. */
- struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- unsigned int flags = str->flags; /* not modified here; written back as read */
- unsigned int ch = str->ch; /* pending prefix or first position byte; 0 if none */
- unsigned char char_boundary = str->iso2022.current_char_boundary; /* 1 after a complete character, 0 inside one */
- Lisp_Object charset = str->iso2022.current_charset; /* charset chosen by the most recent leading byte */
-
-#ifdef ENABLE_COMPOSITE_CHARS
- /* flags for handling composite chars. We do a little switcharoo
- on the source while we're outputting the composite char. */
- unsigned int saved_n = 0;
- CONST unsigned char *saved_src = NULL;
- int in_composite = 0;
-
- back_to_square_n: /* re-entered after a composite expansion has been consumed */
-#endif
-
- while (n--)
- {
- unsigned char c = *src++;
-
- if (BYTE_ASCII_P (c))
- { /* Processing ASCII character */
- ch = 0;
- encode_ucs4 (Vcharset_ascii, c, 0, dst);
- char_boundary = 1;
- }
- else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
- { /* Processing Leading Byte */
- ch = 0;
- charset = CHARSET_BY_LEADING_BYTE (c);
- if (LEADING_BYTE_PREFIX_P(c)) /* prefix byte: keep it in CH so the next byte is also routed to this branch */
- ch = c;
- char_boundary = 0;
- }
- else
- { /* Processing Non-ASCII character */
- char_boundary = 1;
- if (EQ (charset, Vcharset_control_1))
- {
- encode_ucs4 (Vcharset_control_1, c, 0, dst); /* emitted from this single byte */
- }
- else
- {
- switch (XCHARSET_REP_BYTES (charset)) /* NOTE(review): presumably bytes per character including leading byte(s) -- confirm */
- {
- case 2: /* C alone identifies the character */
- encode_ucs4 (charset, c, 0, dst);
- break;
- case 3:
- if (XCHARSET_PRIVATE_P (charset)) /* private charset: C alone, no second byte is awaited */
- {
- encode_ucs4 (charset, c, 0, dst);
- ch = 0;
- }
- else if (ch) /* second position byte; CH holds the first */
- {
-#ifdef ENABLE_COMPOSITE_CHARS
- if (EQ (charset, Vcharset_composite))
- {
- if (in_composite)
- {
- /* #### Bother! We don't know how to
- handle this yet. */
- Dynarr_add (dst, 0);
- Dynarr_add (dst, 0);
- Dynarr_add (dst, 0);
- Dynarr_add (dst, '~'); /* together: the four bytes encode_ucs4 would emit for code '~' */
- }
- else
- {
- Emchar emch = MAKE_CHAR (Vcharset_composite,
- ch & 0x7F, c & 0x7F);
- Lisp_Object lstr = composite_char_string (emch);
- saved_n = n; /* remember where we were in the real input */
- saved_src = src;
- in_composite = 1;
- src = XSTRING_DATA (lstr); /* read from the expansion string instead */
- n = XSTRING_LENGTH (lstr);
- }
- }
- else
-#endif /* ENABLE_COMPOSITE_CHARS */
- {
- encode_ucs4(charset, ch, c, dst);
- }
- ch = 0;
- }
- else
- {
- ch = c; /* first position byte: hold it until the second arrives */
- char_boundary = 0;
- }
- break;
- case 4:
- if (ch) /* second position byte; CH holds the first */
- {
- encode_ucs4 (charset, ch, c, dst);
- ch = 0;
- }
- else
- {
- ch = c; /* first position byte: hold it until the second arrives */
- char_boundary = 0;
- }
- break;
- default:
- abort (); /* any other rep-bytes value is treated as a fatal inconsistency */
- }
- }
- }
- }
-
-#ifdef ENABLE_COMPOSITE_CHARS
- if (in_composite) /* expansion string exhausted: go back to the saved input */
- {
- n = saved_n;
- src = saved_src;
- in_composite = 0;
- goto back_to_square_n; /* Wheeeeeeeee ..... */
- }
-#endif /* ENABLE_COMPOSITE_CHARS */
-
- str->flags = flags;
- str->ch = ch;
- str->iso2022.current_char_boundary = char_boundary;
- str->iso2022.current_charset = charset;
-
- /* Verbum caro factum est! */
-}
-
-\f
-/************************************************************************/
-/* UTF-8 methods */
-/************************************************************************/
-
-/* Detection step for UTF-8 over the N bytes at SRC.
-   ST->utf8.in_byte counts the continuation bytes still expected for the
-   current sequence and is kept across calls.  Returns 0 on ESC, SI or
-   SO outside a sequence, on a byte in 0x80..0xBF where a lead byte is
-   expected, or on a non-continuation byte inside a sequence; otherwise
-   returns CODING_CATEGORY_UTF8_MASK.  */
-static int
-detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
-                    unsigned int n)
-{
-  while (n--)
-    {
-      unsigned char c = *src++;
-
-      if (st->utf8.in_byte != 0)
-        {
-          /* Inside a sequence: only 10xxxxxx bytes are acceptable.  */
-          if ((c & 0xc0) != 0x80)
-            return 0;
-          st->utf8.in_byte--;
-          continue;
-        }
-
-      /* At the start of a sequence.  */
-      if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
-        return 0;
-
-      if (c < 0x80)
-        continue;               /* plain single byte */
-      else if (c < 0xc0)
-        return 0;               /* stray continuation byte */
-      else if (c < 0xe0)
-        st->utf8.in_byte = 1;
-      else if (c < 0xf0)
-        st->utf8.in_byte = 2;
-      else if (c < 0xf8)
-        st->utf8.in_byte = 3;
-      else if (c < 0xfc)
-        st->utf8.in_byte = 4;
-      else
-        st->utf8.in_byte = 5;
-    }
-
-  return CODING_CATEGORY_UTF8_MASK;
-}
-
-/* Decode the N bytes at SRC as UTF-8 and hand every completed code to
-   decode_ucs4, which appends to DST.
-
-   CH accumulates the payload bits of the current sequence and COUNTER
-   is the number of continuation bytes still expected; both are loaded
-   from the decoding stream on entry and stored back on exit, so a
-   sequence may straddle two calls.  Bytes below 0xC0 seen at the start
-   of a sequence first go through DECODE_HANDLE_EOL_TYPE (which may jump
-   to label_continue_loop) and are then decoded as single-byte codes.  */
-static void
-decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
-                    unsigned_char_dynarr *dst, unsigned int n)
-{
-  struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
-  unsigned int flags = str->flags;
-  unsigned int ch = str->ch;
-  eol_type_t eol_type = str->eol_type;
-  unsigned char counter = str->counter;
-
-  while (n--)
-    {
-      unsigned char c = *src++;
-      switch (counter)
-        {
-        case 0:
-          /* Start of a sequence: classify the byte, smallest range first.  */
-          if ( c < 0xc0 )
-            {
-              DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
-              decode_ucs4 (c, dst);
-            }
-          else if ( c < 0xe0 )
-            {
-              counter = 1;
-              ch = c & 0x1f;
-            }
-          else if ( c < 0xf0 )
-            {
-              counter = 2;
-              ch = c & 0x0f;
-            }
-          else if ( c < 0xf8 )
-            {
-              counter = 3;
-              ch = c & 0x07;
-            }
-          else if ( c < 0xfc )
-            {
-              counter = 4;
-              ch = c & 0x03;
-            }
-          else
-            {
-              counter = 5;
-              ch = c & 0x01;
-            }
-          break;
-        default:
-          /* Continuation byte: shift in its six payload bits and emit
-             the code once the last expected byte has arrived.  */
-          ch = ( ch << 6 ) | ( c & 0x3f );
-          if (--counter == 0)
-            {
-              decode_ucs4 (ch, dst);
-              ch = 0;
-            }
-          break;
-        }
-    label_continue_loop:;
-    }
-
-  if (flags & CODING_STATE_END)
-    DECODE_OUTPUT_PARTIAL_CHAR (ch);
-
-  str->flags = flags;
-  str->ch = ch;
-  str->counter = counter;
-}
-
-/* Append to DST the UTF-8 form of the character (CHARSET, H, L), whose
-   code is obtained from mule_char_to_ucs4.  Codes up to 0x7f take one
-   byte; larger codes take a lead byte plus one to five continuation
-   bytes, each carrying six bits of the code.  */
-static void
-encode_utf8 (Lisp_Object charset,
-             unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
-{
-  unsigned long code = mule_char_to_ucs4 (charset, h, l);
-  unsigned long lead_marker;    /* high bits identifying the lead byte */
-  int trailing;                 /* number of continuation bytes */
-
-  if ( code <= 0x7f )
-    {
-      Dynarr_add (dst, code);
-      return;
-    }
-
-  if ( code <= 0x7ff )
-    {
-      trailing = 1;
-      lead_marker = 0xc0;
-    }
-  else if ( code <= 0xffff )
-    {
-      trailing = 2;
-      lead_marker = 0xe0;
-    }
-  else if ( code <= 0x1fffff )
-    {
-      trailing = 3;
-      lead_marker = 0xf0;
-    }
-  else if ( code <= 0x3ffffff )
-    {
-      trailing = 4;
-      lead_marker = 0xf8;
-    }
-  else
-    {
-      trailing = 5;
-      lead_marker = 0xfc;
-    }
-
-  /* Lead byte: the bits above all continuation payloads.  */
-  Dynarr_add (dst, (code >> (6 * trailing)) | lead_marker);
-
-  /* Continuation bytes, most significant six-bit group first.  */
-  while (trailing > 0)
-    {
-      trailing--;
-      Dynarr_add (dst, ((code >> (6 * trailing)) & 0x3f) | 0x80);
-    }
-}
-
-static void
-encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
- unsigned_char_dynarr *dst, unsigned int n)
-{ /* Encode the N bytes at SRC as UTF-8, appending the output to DST.
-     Each completed character is emitted through encode_utf8, except
-     '\n', which is written according to the coding system's EOL type.
-     The pending byte CH, the current charset and the char-boundary flag
-     are loaded from the encoding stream on entry and stored back on
-     exit, so a character may straddle two calls.
-     NOTE(review): SRC is presumably Mule internal-format text -- confirm. */
- struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- unsigned int flags = str->flags; /* not modified here; written back as read */
- unsigned int ch = str->ch; /* pending prefix or first position byte; 0 if none */
- eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
- unsigned char char_boundary = str->iso2022.current_char_boundary; /* 1 after a complete character, 0 inside one */
- Lisp_Object charset = str->iso2022.current_charset; /* charset chosen by the most recent leading byte */
-
-#ifdef ENABLE_COMPOSITE_CHARS
- /* flags for handling composite chars. We do a little switcharoo
- on the source while we're outputting the composite char. */
- unsigned int saved_n = 0;
- CONST unsigned char *saved_src = NULL;
- int in_composite = 0;
-
- back_to_square_n: /* re-entered after a composite expansion has been consumed */
-#endif /* ENABLE_COMPOSITE_CHARS */
-
- while (n--)
- {
- unsigned char c = *src++;
-
- if (BYTE_ASCII_P (c))
- { /* Processing ASCII character */
- ch = 0;
- if (c == '\n')
- {
- if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) /* every type other than LF and AUTODETECT gets a '\r' */
- Dynarr_add (dst, '\r');
- if (eol_type != EOL_CR) /* every type other than CR keeps the '\n' */
- Dynarr_add (dst, c);
- }
- else
- encode_utf8 (Vcharset_ascii, c, 0, dst);
- char_boundary = 1;
- }
- else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
- { /* Processing Leading Byte */
- ch = 0;
- charset = CHARSET_BY_LEADING_BYTE (c);
- if (LEADING_BYTE_PREFIX_P(c)) /* prefix byte: keep it in CH so the next byte is also routed to this branch */
- ch = c;
- char_boundary = 0;
- }
- else
- { /* Processing Non-ASCII character */
- char_boundary = 1;
- if (EQ (charset, Vcharset_control_1))
- {
- encode_utf8 (Vcharset_control_1, c, 0, dst); /* emitted from this single byte */
- }
- else
- {
- switch (XCHARSET_REP_BYTES (charset)) /* NOTE(review): presumably bytes per character including leading byte(s) -- confirm */
- {
- case 2: /* C alone identifies the character */
- encode_utf8 (charset, c, 0, dst);
- break;
- case 3:
- if (XCHARSET_PRIVATE_P (charset)) /* private charset: C alone, no second byte is awaited */
- {
- encode_utf8 (charset, c, 0, dst);
- ch = 0;
- }
- else if (ch) /* second position byte; CH holds the first */
- {
-#ifdef ENABLE_COMPOSITE_CHARS
- if (EQ (charset, Vcharset_composite))
- {
- if (in_composite)
- {
- /* #### Bother! We don't know how to
- handle this yet. */
- encode_utf8 (Vcharset_ascii, '~', 0, dst); /* placeholder output for a nested composite */
- }
- else
- {
- Emchar emch = MAKE_CHAR (Vcharset_composite,
- ch & 0x7F, c & 0x7F);
- Lisp_Object lstr = composite_char_string (emch);
- saved_n = n; /* remember where we were in the real input */
- saved_src = src;
- in_composite = 1;
- src = XSTRING_DATA (lstr); /* read from the expansion string instead */
- n = XSTRING_LENGTH (lstr);
- }
- }
- else
-#endif /* ENABLE_COMPOSITE_CHARS */
- {
- encode_utf8 (charset, ch, c, dst);
- }
- ch = 0;
- }
- else
- {
- ch = c; /* first position byte: hold it until the second arrives */
- char_boundary = 0;
- }
- break;
- case 4:
- if (ch) /* second position byte; CH holds the first */
- {
- encode_utf8 (charset, ch, c, dst);
- ch = 0;
- }
- else
- {
- ch = c; /* first position byte: hold it until the second arrives */
- char_boundary = 0;
- }
- break;
- default:
- abort (); /* any other rep-bytes value is treated as a fatal inconsistency */
- }
- }
- }
- }
-
-#ifdef ENABLE_COMPOSITE_CHARS
- if (in_composite) /* expansion string exhausted: go back to the saved input */
- {
- n = saved_n;
- src = saved_src;
- in_composite = 0;
- goto back_to_square_n; /* Wheeeeeeeee ..... */
- }
-#endif
-
- str->flags = flags;
- str->ch = ch;
- str->iso2022.current_char_boundary = char_boundary;
- str->iso2022.current_charset = charset;
-
- /* Verbum caro factum est! */
-}
-
-\f
-/************************************************************************/
-/* ISO2022 methods */
-/************************************************************************/
-
-/* The following note describes the coding system ISO2022 briefly.
- Since the intention of this note is to help understand the
- functions in this file, some parts are NOT ACCURATE or OVERLY
- SIMPLIFIED. For thorough understanding, please refer to the
- original document of ISO2022.
-
- ISO2022 provides many mechanisms to encode several character sets
- in 7-bit and 8-bit environments. For 7-bit environments, all text
- is encoded using bytes less than 128. This may make the encoded
- text a little bit longer, but the text passes more easily through
- several gateways, some of which strip off MSB (Most Significant Bit).
-
- There are two kinds of character sets: control character set and
- graphic character set. The former contains control characters such
- as `newline' and `escape' to provide control functions (control
- functions are also provided by escape sequences). The latter
- contains graphic characters such as 'A' and '-'. Emacs recognizes
- two control character sets and many graphic character sets.
-
- Graphic character sets are classified into one of the following
- four classes, according to the number of bytes (DIMENSION) and
- number of characters in one dimension (CHARS) of the set:
- - DIMENSION1_CHARS94
- - DIMENSION1_CHARS96
- - DIMENSION2_CHARS94
- - DIMENSION2_CHARS96
-
- In addition, each character set is assigned an identification tag,
- unique for each set, called "final character" (denoted as <F>
- hereafter). The <F> of each character set is decided by ECMA(*)
- when it is registered in ISO. The code range of <F> is 0x30..0x7F
- (0x30..0x3F are for private use only).
-
- Note (*): ECMA = European Computer Manufacturers Association
-
- Here are examples of graphic character set [NAME(<F>)]:
- o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
- o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
- o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
- o DIMENSION2_CHARS96 -- none for the moment
-
- A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
- C0 [0x00..0x1F] -- control character plane 0
- GL [0x20..0x7F] -- graphic character plane 0
- C1 [0x80..0x9F] -- control character plane 1
- GR [0xA0..0xFF] -- graphic character plane 1
-
- A control character set is directly designated and invoked to C0 or
- C1 by an escape sequence. The most common case is that:
- - ISO646's control character set is designated/invoked to C0, and
- - ISO6429's control character set is designated/invoked to C1,
- and usually these designations/invocations are omitted in encoded
- text. In a 7-bit environment, only C0 can be used, and a control
- character for C1 is encoded by an appropriate escape sequence to
- fit into the environment. All control characters for C1 are
- defined to have corresponding escape sequences.
-
- A graphic character set is at first designated to one of four
- graphic registers (G0 through G3), then these graphic registers are
- invoked to GL or GR. These designations and invocations can be
- done independently. The most common case is that G0 is invoked to
- GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
- these invocations and designations are omitted in encoded text.
- In a 7-bit environment, only GL can be used.
-
- When a graphic character set of CHARS94 is invoked to GL, codes
- 0x20 and 0x7F of the GL area work as control characters SPACE and
- DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
- be used.
-
- There are two ways of invocation: locking-shift and single-shift.
- With locking-shift, the invocation lasts until the next different
- invocation, whereas with single-shift, the invocation affects the
- following character only and doesn't affect the locking-shift
- state. Invocations are done by the following control characters or
- escape sequences:
-
- ----------------------------------------------------------------------
- abbrev function cntrl escape seq description
- ----------------------------------------------------------------------
- SI/LS0 (shift-in) 0x0F none invoke G0 into GL
- SO/LS1 (shift-out) 0x0E none invoke G1 into GL
- LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
- LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
- LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
- LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
- LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
- SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
- SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
- ----------------------------------------------------------------------
- (*) These are not used by any known coding system.
-
- Control characters for these functions are defined by macros
- ISO_CODE_XXX in `coding.h'.
-
- Designations are done by the following escape sequences:
- ----------------------------------------------------------------------
- escape sequence description
- ----------------------------------------------------------------------
- ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
- ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
- ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
- ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
- ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
- ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
- ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
- ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
- ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
- ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
- ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
- ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
- ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
- ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
- ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
- ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
- ----------------------------------------------------------------------
-
- In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
- of dimension 1, chars 94, and final character <F>, etc...
-
- Note (*): Although these designations are not allowed in ISO2022,
- Emacs accepts them on decoding, and produces them on encoding
- CHARS96 character sets in a coding system which is characterized as
- 7-bit environment, non-locking-shift, and non-single-shift.
-
- Note (**): If <F> is '@', 'A', or 'B', the intermediate character
- '(' can be omitted. We refer to this as "short-form" hereafter.
-
- Now you may notice that there are a lot of ways for encoding the
- same multilingual text in ISO2022. Actually, there exist many
- coding systems such as Compound Text (used in X11's inter client
- communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
- (used in Korean internet), EUC (Extended UNIX Code, used in Asian
- localized platforms), and all of these are variants of ISO2022.
-
- In addition to the above, Emacs handles two more kinds of escape
- sequences: ISO6429's direction specification and Emacs' private
- sequence for specifying character composition.
-
- ISO6429's direction specification takes the following form:
- o CSI ']' -- end of the current direction
- o CSI '0' ']' -- end of the current direction
- o CSI '1' ']' -- start of left-to-right text
- o CSI '2' ']' -- start of right-to-left text
- The control character CSI (0x9B: control sequence introducer) is
- abbreviated to the escape sequence ESC '[' in a 7-bit environment.
-
- Character composition specification takes the following form:
- o ESC '0' -- start character composition
- o ESC '1' -- end character composition
- Since these are not standard escape sequences of any ISO standard,
- their use with these meanings is restricted to Emacs only. */
-
-/* Put the ISO2022 decoder state ISO back into its initial condition.
-   Each of the four charset slots is loaded from CODING_SYSTEM's initial
-   charsets, or set to Qt when CODING_SYSTEM is nil; the escape-parsing
-   state, the left/right register selection (0 and 1) and the direction
-   and literal-output flags are cleared.  When composite characters are
-   enabled, an existing composite buffer is emptied as well.  */
-static void
-reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
-{
-  int reg;
-
-  for (reg = 0; reg < 4; reg++)
-    {
-      iso->invalid_designated[reg] = 0;
-      if (NILP (coding_system))
-        iso->charset[reg] = Qt;
-      else
-        iso->charset[reg] =
-          XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, reg);
-    }
-
-  /* Escape-sequence parser.  */
-  iso->esc = ISO_ESC_NOTHING;
-  iso->esc_bytes_index = 0;
-
-  /* Register selection for the left and right halves.  */
-  iso->register_left = 0;
-  iso->register_right = 1;
-
-  /* Direction handling and literal output.  */
-  iso->switched_dir_and_no_valid_charset_yet = 0;
-  iso->invalid_switch_dir = 0;
-  iso->output_direction_sequence = 0;
-  iso->output_literally = 0;
-
-#ifdef ENABLE_COMPOSITE_CHARS
-  if (iso->composite_chars)
-    Dynarr_reset (iso->composite_chars);
-#endif
-}
-
-static int
-fit_to_be_escape_quoted (unsigned char c)
-{
- switch (c)
- {
- case ISO_CODE_ESC:
- case ISO_CODE_CSI:
- case ISO_CODE_SS2:
- case ISO_CODE_SS3:
- case ISO_CODE_SO:
- case ISO_CODE_SI:
- return 1;