+ if (c < sizeof (fcd->ucs_to_mule_table))
+ {
+ fcd->ucs_to_mule_table[c] = character;
+ return Qt;
+ }
+ else
+ return Qnil;
+}
+
+static Lisp_Object
+ucs_to_char (unsigned long code)
+{
+ if (code < sizeof (fcd->ucs_to_mule_table))
+ {
+ return fcd->ucs_to_mule_table[code];
+ }
+ else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
+ {
+ unsigned int c;
+
+ code -= 0xe00000;
+ c = code % (94 * 94);
+ return make_char
+ (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
+ (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
+ CHARSET_LEFT_TO_RIGHT),
+ c / 94 + 33, c % 94 + 33));
+ }
+ else
+ return Qnil;
+}
+
+DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
+Return Mule character corresponding to UCS code CODE (a positive integer).
+*/
+ (code))
+{
+ CHECK_NATNUM (code);
+ return ucs_to_char (XINT (code));
+}
+
+DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
+Map Mule character CHARACTER to UCS code CODE (a positive integer).
+*/
+ (character, code))
+{
+ /* #### Isn't this gilding the lily? Fput_char_table checks its args.
+ Fset_char_ucs is more restrictive on index arg, but should
+ check code arg in a char_table method. */
+ CHECK_CHAR (character);
+ CHECK_NATNUM (code);
+ return Fput_char_table (character, code, mule_to_ucs_table);
+}
+
+DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
+Return the UCS code (a positive integer) corresponding to CHARACTER.
+*/
+ (character))
+{
+ return Fget_char_table (character, mule_to_ucs_table);
+}
+#endif
+
+#ifdef UTF2000
+#define decode_ucs4 DECODE_ADD_UCS_CHAR
+#else
+/* Decode a UCS-4 character into a buffer. If the lookup fails, use
+ <GETA MARK> (U+3013) of JIS X 0208, which means correct character
+ is not found, instead.
+ #### do something more appropriate (use blob?)
+ Danger, Will Robinson! Data loss. Should we signal user? */
+static void
+decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
+{
+ Lisp_Object chr = ucs_to_char (ch);
+
+ if (! NILP (chr))
+ {
+ Bufbyte work[MAX_EMCHAR_LEN];
+ int len;
+
+ ch = XCHAR (chr);
+ len = (ch < 128) ?
+ simple_set_charptr_emchar (work, ch) :
+ non_ascii_set_charptr_emchar (work, ch);
+ Dynarr_add_many (dst, work, len);
+ }
+ else
+ {
+ Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
+ Dynarr_add (dst, 34 + 128);
+ Dynarr_add (dst, 46 + 128);
+ }
+}
+#endif
+
+#ifndef UTF2000
+static unsigned long
+mule_char_to_ucs4 (Lisp_Object charset,
+ unsigned char h, unsigned char l)
+{
+ Lisp_Object code
+ = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
+ mule_to_ucs_table);
+
+ if (INTP (code))
+ {
+ return XINT (code);
+ }
+ else if ( (XCHARSET_DIMENSION (charset) == 2) &&
+ (XCHARSET_CHARS (charset) == 94) )
+ {
+ unsigned char final = XCHARSET_FINAL (charset);
+
+ if ( ('@' <= final) && (final < 0x7f) )
+ {
+ return 0xe00000 + (final - '@') * 94 * 94
+ + ((h & 127) - 33) * 94 + (l & 127) - 33;
+ }
+ else
+ {
+ return '?';
+ }
+ }
+ else
+ {
+ return '?';
+ }
+}
+
+static void
+encode_ucs4 (Lisp_Object charset,
+ unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
+{
+ unsigned long code = mule_char_to_ucs4 (charset, h, l);
+ Dynarr_add (dst, code >> 24);
+ Dynarr_add (dst, (code >> 16) & 255);
+ Dynarr_add (dst, (code >> 8) & 255);
+ Dynarr_add (dst, code & 255);
+}
+#endif
+
+static int
+detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
+ unsigned int n)
+{
+ while (n--)
+ {
+ int c = *src++;
+ switch (st->ucs4.in_byte)
+ {
+ case 0:
+ if (c >= 128)
+ return 0;
+ else
+ st->ucs4.in_byte++;
+ break;
+ case 3:
+ st->ucs4.in_byte = 0;
+ break;
+ default:
+ st->ucs4.in_byte++;
+ }
+ }
+ return CODING_CATEGORY_UCS4_MASK;
+}
+
+static void
+decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n)
+{
+ struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ unsigned char counter = str->counter;
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+ switch (counter)
+ {
+ case 0:
+ ch = c;
+ counter = 3;
+ break;
+ case 1:
+ decode_ucs4 ( ( ch << 8 ) | c, dst);
+ ch = 0;
+ counter = 0;
+ break;
+ default:
+ ch = ( ch << 8 ) | c;
+ counter--;
+ }
+ }
+ if (counter & CODING_STATE_END)
+ DECODE_OUTPUT_PARTIAL_CHAR (ch);
+
+ str->flags = flags;
+ str->ch = ch;
+ str->counter = counter;
+}
+
+static void
+encode_coding_ucs4 (Lstream *encoding, const unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n)
+{
+#ifndef UTF2000
+ struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ unsigned char char_boundary = str->iso2022.current_char_boundary;
+ Lisp_Object charset = str->iso2022.current_charset;
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ /* flags for handling composite chars. We do a little switcharoo
+ on the source while we're outputting the composite char. */
+ unsigned int saved_n = 0;
+ const unsigned char *saved_src = NULL;
+ int in_composite = 0;
+
+ back_to_square_n:
+#endif
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+
+ if (BYTE_ASCII_P (c))
+ { /* Processing ASCII character */
+ ch = 0;
+ encode_ucs4 (Vcharset_ascii, c, 0, dst);
+ char_boundary = 1;
+ }
+ else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
+ { /* Processing Leading Byte */
+ ch = 0;
+ charset = CHARSET_BY_LEADING_BYTE (c);
+ if (LEADING_BYTE_PREFIX_P(c))
+ ch = c;
+ char_boundary = 0;
+ }
+ else
+ { /* Processing Non-ASCII character */
+ char_boundary = 1;
+ if (EQ (charset, Vcharset_control_1))
+ {
+ encode_ucs4 (Vcharset_control_1, c, 0, dst);
+ }
+ else
+ {
+ switch (XCHARSET_REP_BYTES (charset))
+ {
+ case 2:
+ encode_ucs4 (charset, c, 0, dst);
+ break;
+ case 3:
+ if (XCHARSET_PRIVATE_P (charset))
+ {
+ encode_ucs4 (charset, c, 0, dst);
+ ch = 0;
+ }
+ else if (ch)
+ {
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (EQ (charset, Vcharset_composite))
+ {
+ if (in_composite)
+ {
+ /* #### Bother! We don't know how to
+ handle this yet. */
+ Dynarr_add (dst, 0);
+ Dynarr_add (dst, 0);
+ Dynarr_add (dst, 0);
+ Dynarr_add (dst, '~');
+ }
+ else
+ {
+ Emchar emch = MAKE_CHAR (Vcharset_composite,
+ ch & 0x7F, c & 0x7F);
+ Lisp_Object lstr = composite_char_string (emch);
+ saved_n = n;
+ saved_src = src;
+ in_composite = 1;
+ src = XSTRING_DATA (lstr);
+ n = XSTRING_LENGTH (lstr);
+ }
+ }
+ else
+#endif /* ENABLE_COMPOSITE_CHARS */
+ {
+ encode_ucs4(charset, ch, c, dst);
+ }
+ ch = 0;
+ }
+ else
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ case 4:
+ if (ch)
+ {
+ encode_ucs4 (charset, ch, c, dst);
+ ch = 0;
+ }
+ else
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ default:
+ abort ();
+ }
+ }
+ }
+ }
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (in_composite)
+ {
+ n = saved_n;
+ src = saved_src;
+ in_composite = 0;
+ goto back_to_square_n; /* Wheeeeeeeee ..... */
+ }
+#endif /* ENABLE_COMPOSITE_CHARS */
+
+ str->flags = flags;
+ str->ch = ch;
+ str->iso2022.current_char_boundary = char_boundary;
+ str->iso2022.current_charset = charset;
+
+ /* Verbum caro factum est! */
+#endif
+}
+
+\f
+/************************************************************************/
+/* UTF-8 methods */
+/************************************************************************/
+
+static int
+detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
+ unsigned int n)
+{
+ while (n--)
+ {
+ unsigned char c = *src++;
+ switch (st->utf8.in_byte)
+ {
+ case 0:
+ if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+ return 0;
+ else if (c >= 0xfc)
+ st->utf8.in_byte = 5;
+ else if (c >= 0xf8)
+ st->utf8.in_byte = 4;
+ else if (c >= 0xf0)
+ st->utf8.in_byte = 3;
+ else if (c >= 0xe0)
+ st->utf8.in_byte = 2;
+ else if (c >= 0xc0)
+ st->utf8.in_byte = 1;
+ else if (c >= 0x80)
+ return 0;
+ break;
+ default:
+ if ((c & 0xc0) != 0x80)
+ return 0;
+ else
+ st->utf8.in_byte--;
+ }
+ }
+ return CODING_CATEGORY_UTF8_MASK;
+}
+
+static void
+decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n)
+{
+ struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = str->eol_type;
+ unsigned char counter = str->counter;
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+ switch (counter)
+ {
+ case 0:
+ if ( c >= 0xfc )
+ {
+ ch = c & 0x01;
+ counter = 5;
+ }
+ else if ( c >= 0xf8 )
+ {
+ ch = c & 0x03;
+ counter = 4;
+ }
+ else if ( c >= 0xf0 )
+ {
+ ch = c & 0x07;
+ counter = 3;
+ }
+ else if ( c >= 0xe0 )
+ {
+ ch = c & 0x0f;
+ counter = 2;
+ }
+ else if ( c >= 0xc0 )
+ {
+ ch = c & 0x1f;
+ counter = 1;
+ }
+ else
+ {
+ DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
+ decode_ucs4 (c, dst);
+ }
+ break;
+ case 1:
+ ch = ( ch << 6 ) | ( c & 0x3f );
+ decode_ucs4 (ch, dst);
+ ch = 0;
+ counter = 0;
+ break;
+ default:
+ ch = ( ch << 6 ) | ( c & 0x3f );
+ counter--;
+ }
+ label_continue_loop:;
+ }
+
+ if (flags & CODING_STATE_END)
+ DECODE_OUTPUT_PARTIAL_CHAR (ch);
+
+ str->flags = flags;
+ str->ch = ch;
+ str->counter = counter;
+}
+
+#ifndef UTF2000
+static void
+encode_utf8 (Lisp_Object charset,
+ unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
+{
+ unsigned long code = mule_char_to_ucs4 (charset, h, l);
+ if ( code <= 0x7f )
+ {
+ Dynarr_add (dst, code);
+ }
+ else if ( code <= 0x7ff )
+ {
+ Dynarr_add (dst, (code >> 6) | 0xc0);
+ Dynarr_add (dst, (code & 0x3f) | 0x80);
+ }
+ else if ( code <= 0xffff )
+ {
+ Dynarr_add (dst, (code >> 12) | 0xe0);
+ Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
+ Dynarr_add (dst, (code & 0x3f) | 0x80);
+ }
+ else if ( code <= 0x1fffff )
+ {
+ Dynarr_add (dst, (code >> 18) | 0xf0);
+ Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
+ Dynarr_add (dst, (code & 0x3f) | 0x80);
+ }
+ else if ( code <= 0x3ffffff )
+ {
+ Dynarr_add (dst, (code >> 24) | 0xf8);
+ Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
+ Dynarr_add (dst, (code & 0x3f) | 0x80);
+ }
+ else
+ {
+ Dynarr_add (dst, (code >> 30) | 0xfc);
+ Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
+ Dynarr_add (dst, (code & 0x3f) | 0x80);
+ }
+}
+#endif
+
+static void
+encode_coding_utf8 (Lstream *encoding, const unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n)
+{
+ struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
+ unsigned char char_boundary = str->iso2022.current_char_boundary;
+#ifdef UTF2000
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+ switch (char_boundary)
+ {
+ case 0:
+ if ( c >= 0xfc )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 5;
+ }
+ else if ( c >= 0xf8 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 4;
+ }
+ else if ( c >= 0xf0 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 3;
+ }
+ else if ( c >= 0xe0 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 2;
+ }
+ else if ( c >= 0xc0 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 1;
+ }
+ else
+ {
+ if (c == '\n')
+ {
+ if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
+ Dynarr_add (dst, '\r');
+ if (eol_type != EOL_CR)
+ Dynarr_add (dst, c);
+ }
+ else
+ Dynarr_add (dst, c);
+ char_boundary = 0;
+ }
+ break;
+ case 1:
+ Dynarr_add (dst, c);
+ char_boundary = 0;
+ break;
+ default:
+ Dynarr_add (dst, c);
+ char_boundary--;
+ }
+ }
+#else /* not UTF2000 */
+ Lisp_Object charset = str->iso2022.current_charset;
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ /* flags for handling composite chars. We do a little switcharoo
+ on the source while we're outputting the composite char. */
+ unsigned int saved_n = 0;
+ const unsigned char *saved_src = NULL;
+ int in_composite = 0;
+
+ back_to_square_n:
+#endif /* ENABLE_COMPOSITE_CHARS */
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+
+ if (BYTE_ASCII_P (c))
+ { /* Processing ASCII character */
+ ch = 0;
+ if (c == '\n')
+ {
+ if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
+ Dynarr_add (dst, '\r');
+ if (eol_type != EOL_CR)
+ Dynarr_add (dst, c);
+ }
+ else
+ encode_utf8 (Vcharset_ascii, c, 0, dst);
+ char_boundary = 1;
+ }
+ else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
+ { /* Processing Leading Byte */
+ ch = 0;
+ charset = CHARSET_BY_LEADING_BYTE (c);
+ if (LEADING_BYTE_PREFIX_P(c))
+ ch = c;
+ char_boundary = 0;
+ }
+ else
+ { /* Processing Non-ASCII character */
+ char_boundary = 1;
+ if (EQ (charset, Vcharset_control_1))
+ {
+ encode_utf8 (Vcharset_control_1, c, 0, dst);
+ }
+ else
+ {
+ switch (XCHARSET_REP_BYTES (charset))
+ {
+ case 2:
+ encode_utf8 (charset, c, 0, dst);
+ break;
+ case 3:
+ if (XCHARSET_PRIVATE_P (charset))
+ {
+ encode_utf8 (charset, c, 0, dst);
+ ch = 0;
+ }
+ else if (ch)
+ {
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (EQ (charset, Vcharset_composite))
+ {
+ if (in_composite)
+ {
+ /* #### Bother! We don't know how to
+ handle this yet. */
+ encode_utf8 (Vcharset_ascii, '~', 0, dst);
+ }
+ else
+ {
+ Emchar emch = MAKE_CHAR (Vcharset_composite,
+ ch & 0x7F, c & 0x7F);
+ Lisp_Object lstr = composite_char_string (emch);
+ saved_n = n;
+ saved_src = src;
+ in_composite = 1;
+ src = XSTRING_DATA (lstr);
+ n = XSTRING_LENGTH (lstr);
+ }
+ }
+ else
+#endif /* ENABLE_COMPOSITE_CHARS */
+ {
+ encode_utf8 (charset, ch, c, dst);
+ }
+ ch = 0;
+ }
+ else
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ case 4:
+ if (ch)
+ {
+ encode_utf8 (charset, ch, c, dst);
+ ch = 0;
+ }
+ else
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ default:
+ abort ();
+ }
+ }
+ }
+ }
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (in_composite)
+ {
+ n = saved_n;
+ src = saved_src;
+ in_composite = 0;
+ goto back_to_square_n; /* Wheeeeeeeee ..... */
+ }
+#endif
+
+#endif /* not UTF2000 */
+ str->flags = flags;
+ str->ch = ch;
+ str->iso2022.current_char_boundary = char_boundary;
+#ifndef UTF2000
+ str->iso2022.current_charset = charset;
+#endif
+
+ /* Verbum caro factum est! */
+}
+
+\f
+/************************************************************************/
+/* ISO2022 methods */
+/************************************************************************/
+
+/* The following note describes the coding system ISO2022 briefly.
+ Since the intention of this note is to help understand the
+ functions in this file, some parts are NOT ACCURATE or OVERLY
+ SIMPLIFIED. For thorough understanding, please refer to the
+ original document of ISO2022.
+
+ ISO2022 provides many mechanisms to encode several character sets
+ in 7-bit and 8-bit environments. For 7-bit environments, all text
+ is encoded using bytes less than 128. This may make the encoded
+ text a little bit longer, but the text passes more easily through
+ several gateways, some of which strip off MSB (Most Signigant Bit).
+
+ There are two kinds of character sets: control character set and
+ graphic character set. The former contains control characters such
+ as `newline' and `escape' to provide control functions (control
+ functions are also provided by escape sequences). The latter
+ contains graphic characters such as 'A' and '-'. Emacs recognizes
+ two control character sets and many graphic character sets.
+
+ Graphic character sets are classified into one of the following
+ four classes, according to the number of bytes (DIMENSION) and
+ number of characters in one dimension (CHARS) of the set:
+ - DIMENSION1_CHARS94
+ - DIMENSION1_CHARS96
+ - DIMENSION2_CHARS94
+ - DIMENSION2_CHARS96
+
+ In addition, each character set is assigned an identification tag,
+ unique for each set, called "final character" (denoted as <F>
+ hereafter). The <F> of each character set is decided by ECMA(*)
+ when it is registered in ISO. The code range of <F> is 0x30..0x7F
+ (0x30..0x3F are for private use only).
+
+ Note (*): ECMA = European Computer Manufacturers Association
+
+ Here are examples of graphic character set [NAME(<F>)]:
+ o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
+ o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
+ o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
+ o DIMENSION2_CHARS96 -- none for the moment
+
+ A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
+ C0 [0x00..0x1F] -- control character plane 0
+ GL [0x20..0x7F] -- graphic character plane 0
+ C1 [0x80..0x9F] -- control character plane 1
+ GR [0xA0..0xFF] -- graphic character plane 1