From: tomo Date: Mon, 8 Jul 2002 08:42:53 +0000 (+0000) Subject: (Vcoded_charset_entity_reference_alist): New variable. X-Git-Tag: r21-2-44-0_18-m12~9 X-Git-Url: http://git.chise.org/gitweb/?a=commitdiff_plain;h=7532868f03090d56d24722bb26f37e3c9a65fe03;p=chise%2Fxemacs-chise.git (Vcoded_charset_entity_reference_alist): New variable. (Quse_entity_reference): New variable. (Qd): New variable. (Qx): New variable. (QX): New variable. (coding_system_description): Add description for `ccs_priority_list'. (mark_coding_system): Mark `ccs_priority_list' in UTF-2000. (allocate_coding_system): Initialize `ccs_priority_list' in UTF-2000. (Fmake_coding_system): Add description about 'use-entity-reference; setup CODING_SYSTEM_USE_ENTITY_REFERENCE(codesys). (Fcoding_system_property): Accept `disable-composition' and `use-entity-reference' in UTF-2000. (struct decoding_stream): Add new member `er_counter' and `er_buf' in UTF-2000. (decode_flush_er_chars): New inline function. (decode_add_er_char): New function. (decode_coding_utf8): Use `decode_flush_er_chars' and `decode_add_er_char'. (COMPOSE_FLUSH_CHARS): Use `decode_add_er_char' instead of `DECODE_ADD_UCS_CHAR'. (COMPOSE_ADD_CHAR): Likewise. (reset_decoding_stream): Initialize `str->er_counter' in UTF-2000. (decode_coding_big5): Use `decode_defined_char' instead of `DECODE_DEFINED_CHAR'. (decode_coding_utf8): Decode entity-reference if CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys). (char_encode_utf8): Encode non-Unicode characters as entity-references if CODING_SYSTEM_USE_ENTITY_REFERENCE(str->codesys). (syms_of_file_coding): Add new symbols `use-entity-reference', `d', `x', `X'. (vars_of_file_coding): Add new variable `coded-charset-entity-reference-alist'. (complex_vars_of_file_coding): Declare `disable-composition' and `use-entity-reference' to be coding-system-properties in UTF-2000. --- diff --git a/src/text-coding.c b/src/text-coding.c index 2a0ed68..796e757 100644 --- a/src/text-coding.c +++ b/src/text-coding.c @@ -1,7 +1,7 @@ /* Code conversion functions. Copyright (C) 1991, 1995 Free Software Foundation, Inc. Copyright (C) 1995 Sun Microsystems, Inc. - Copyright (C) 1999,2000,2001 MORIOKA Tomohiko + Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko This file is part of XEmacs. @@ -23,6 +23,7 @@ Boston, MA 02111-1307, USA. */ /* Synched up with: Mule 2.3. Not in FSF. */ /* Rewritten by Ben Wing . */ +/* Rewritten by MORIOKA Tomohiko for XEmacs UTF-2000. */ #include #include "lisp.h" @@ -46,6 +47,8 @@ Lisp_Object Vcoding_system_for_read; Lisp_Object Vcoding_system_for_write; Lisp_Object Vfile_name_coding_system; +Lisp_Object Vcoded_charset_entity_reference_alist; + /* Table of symbols identifying each coding category. */ Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST]; @@ -103,6 +106,8 @@ Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift; #endif #ifdef UTF2000 Lisp_Object Qdisable_composition; +Lisp_Object Quse_entity_reference; +Lisp_Object Qd, Qx, QX; #endif Lisp_Object Qencode, Qdecode; @@ -329,6 +334,9 @@ static const struct lrecord_description coding_system_description[] = { { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) }, +#ifdef UTF2000 + { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) }, +#endif #endif { XD_END } }; @@ -397,6 +405,9 @@ mark_coding_system (Lisp_Object obj) } mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys)); +#ifdef UTF2000 + mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys)); +#endif return CODING_SYSTEM_POST_READ_CONVERSION (codesys); } @@ -645,6 +656,9 @@ allocate_coding_system (enum coding_system_type type, Lisp_Object name) CODING_SYSTEM_TYPE (codesys) = type; CODING_SYSTEM_MNEMONIC (codesys) = Qnil; #ifdef MULE +#ifdef UTF2000 + CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil; +#endif if (type == CODESYS_ISO2022) { int i; @@ -814,6 +828,9 @@ character set. Recognized properties are: If non-nil, composition/decomposition for combining characters are disabled. +'use-entity-reference + If non-nil, SGML style entity-reference is used for non-system-characters. + 'post-read-conversion Function called after a file has been read in, to perform the decoding. Called with two arguments, START and END, denoting @@ -965,6 +982,8 @@ if TYPE is 'ccl: #ifdef UTF2000 else if (EQ (key, Qdisable_composition)) CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value); + else if (EQ (key, Quse_entity_reference)) + CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value); #endif #ifdef MULE else if (ty == CODESYS_ISO2022) @@ -1427,6 +1446,12 @@ Return the PROP property of CODING-SYSTEM. else if (EQ (prop, Qpre_write_conversion)) return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system); #ifdef MULE +#ifdef UTF2000 + else if (EQ (prop, Qdisable_composition)) + return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil; + else if (EQ (prop, Quse_entity_reference)) + return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil; +#endif else if (type == CODESYS_ISO2022) { if (EQ (prop, Qcharset_g0)) @@ -2224,6 +2249,9 @@ struct decoding_stream unsigned char counter; #endif #ifdef UTF2000 + unsigned char er_counter; + unsigned char er_buf[16]; + unsigned combined_char_count; Emchar combined_chars[16]; Lisp_Object combining_table; @@ -2235,6 +2263,128 @@ struct decoding_stream extern Lisp_Object Vcharacter_composition_table; INLINE_HEADER void +decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst); +INLINE_HEADER void +decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst) +{ + if ( str->er_counter > 0) + { + Dynarr_add_many (dst, str->er_buf, str->er_counter); + str->er_counter = 0; + } +} + +void decode_add_er_char (struct decoding_stream *str, Emchar character, + unsigned_char_dynarr* dst); +void +decode_add_er_char (struct decoding_stream *str, Emchar c, + unsigned_char_dynarr* dst) +{ + if (str->er_counter == 0) + { + if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys) + && (c == '&') ) + { + str->er_buf[0] = '&'; + str->er_counter++; + } + else + DECODE_ADD_UCS_CHAR (c, dst); + } + else if (c == ';') + { + Lisp_Object string = make_string (str->er_buf, + str->er_counter); + Lisp_Object rest = Vcoded_charset_entity_reference_alist; + Lisp_Object cell; + Lisp_Object ret; + Lisp_Object pat; + Lisp_Object ccs; + int base; + + while (!NILP (rest)) + { + cell = Fcar (rest); + ccs = Fcar (cell); + if (NILP (ccs = Ffind_charset (ccs))) + continue; + + cell = Fcdr (cell); + ret = Fcar (cell); + if (STRINGP (ret)) + pat = ret; + else + continue; + + cell = Fcdr (cell); + cell = Fcdr (cell); + ret = Fcar (cell); + if (EQ (ret, Qd)) + { + pat = concat3 (build_string ("^&"), + pat, build_string ("\\([0-9]+\\)$")); + base = 10; + } + else if (EQ (ret, Qx)) + { + pat = concat3 (build_string ("^&"), + pat, build_string ("\\([0-9a-f]+\\)$")); + base = 16; + } + else if (EQ (ret, QX)) + { + pat = concat3 (build_string ("^&"), + pat, build_string ("\\([0-9A-F]+\\)$")); + base = 16; + } + else + continue; + + if (!NILP (Fstring_match (pat, string, Qnil, Qnil))) + { + int code + = XINT (Fstring_to_number + (Fsubstring (string, + Fmatch_beginning (make_int (1)), + Fmatch_end (make_int (1))), + make_int (base))); + + DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst); + goto decoded; + } + rest = Fcdr (rest); + } + if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"), + string, Qnil, Qnil))) + { + int code + = XINT (Fstring_to_number + (Fsubstring (string, + Fmatch_beginning (make_int (1)), + Fmatch_end (make_int (1))), + make_int (16))); + + DECODE_ADD_UCS_CHAR (code, dst); + } + else + { + Dynarr_add_many (dst, str->er_buf, str->er_counter); + Dynarr_add (dst, ';'); + } + decoded: + str->er_counter = 0; + } + else if ( (str->er_counter >= 16) || (c >= 0x7F) ) + { + Dynarr_add_many (dst, str->er_buf, str->er_counter); + str->er_counter = 0; + DECODE_ADD_UCS_CHAR (c, dst); + } + else + str->er_buf[str->er_counter++] = c; +} + +INLINE_HEADER void COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst); INLINE_HEADER void COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst) @@ -2242,19 +2392,19 @@ COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst) unsigned i; for (i = 0; i < str->combined_char_count; i++) - DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst); + decode_add_er_char (str, str->combined_chars[i], dst); str->combined_char_count = 0; str->combining_table = Qnil; } -void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character, - unsigned_char_dynarr* dst); +void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character, + unsigned_char_dynarr* dst); void -COMPOSE_ADD_CHAR(struct decoding_stream *str, - Emchar character, unsigned_char_dynarr* dst) +COMPOSE_ADD_CHAR (struct decoding_stream *str, + Emchar character, unsigned_char_dynarr* dst) { if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys)) - DECODE_ADD_UCS_CHAR (character, dst); + decode_add_er_char (str, character, dst); else if (!CHAR_TABLEP (str->combining_table)) { Lisp_Object ret @@ -2262,7 +2412,7 @@ COMPOSE_ADD_CHAR(struct decoding_stream *str, character); if (NILP (ret)) - DECODE_ADD_UCS_CHAR (character, dst); + decode_add_er_char (str, character, dst); else { str->combined_chars[0] = character; @@ -2284,7 +2434,7 @@ COMPOSE_ADD_CHAR(struct decoding_stream *str, char2); if (NILP (ret)) { - DECODE_ADD_UCS_CHAR (char2, dst); + decode_add_er_char (str, character, dst); str->combined_char_count = 0; str->combining_table = Qnil; } @@ -2303,7 +2453,7 @@ COMPOSE_ADD_CHAR(struct decoding_stream *str, else { COMPOSE_FLUSH_CHARS (str, dst); - DECODE_ADD_UCS_CHAR (character, dst); + decode_add_er_char (str, character, dst); } } } @@ -2446,6 +2596,7 @@ reset_decoding_stream (struct decoding_stream *str) str->counter = 0; #endif /* MULE */ #ifdef UTF2000 + str->er_counter = 0; str->combined_char_count = 0; str->combining_table = Qnil; #endif @@ -3624,7 +3775,7 @@ decode_coding_big5 (Lstream *decoding, const Extbyte *src, { #ifdef UTF2000 int code_point = (cpos << 8) | c; - Emchar char_id = DECODE_DEFINED_CHAR (ccs, code_point); + Emchar char_id = decode_defined_char (ccs, code_point); if (char_id < 0) char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point); @@ -4011,35 +4162,42 @@ decode_coding_utf8 (Lstream *decoding, const Extbyte *src, unsigned char c = *(unsigned char *)src++; if (counter == 0) { - if ( c < 0xC0 ) + if ( c < ' ' ) { + decode_flush_er_chars (str, dst); DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); DECODE_ADD_UCS_CHAR (c, dst); } - else if ( c < 0xE0 ) - { - cpos = c & 0x1f; - counter = 1; - } - else if ( c < 0xF0 ) - { - cpos = c & 0x0f; - counter = 2; - } - else if ( c < 0xF8 ) - { - cpos = c & 0x07; - counter = 3; - } - else if ( c < 0xFC ) - { - cpos = c & 0x03; - counter = 4; - } + else if ( c < 0xC0 ) + decode_add_er_char (str, c, dst); else { - cpos = c & 0x01; - counter = 5; + decode_flush_er_chars (str, dst); + if ( c < 0xE0 ) + { + cpos = c & 0x1f; + counter = 1; + } + else if ( c < 0xF0 ) + { + cpos = c & 0x0f; + counter = 2; + } + else if ( c < 0xF8 ) + { + cpos = c & 0x07; + counter = 3; + } + else if ( c < 0xFC ) + { + cpos = c & 0x03; + counter = 4; + } + else + { + cpos = c & 0x01; + counter = 5; + } } } else if ( (c & 0xC0) == 0x80 ) @@ -4065,12 +4223,15 @@ decode_coding_utf8 (Lstream *decoding, const Extbyte *src, } if (flags & CODING_STATE_END) - if (counter > 0) - { - decode_output_utf8_partial_char (counter, cpos, dst); - cpos = 0; - counter = 0; - } + { + decode_flush_er_chars (str, dst); + if (counter > 0) + { + decode_output_utf8_partial_char (counter, cpos, dst); + cpos = 0; + counter = 0; + } + } str->flags = flags; str->cpos = cpos; str->counter = counter; @@ -4093,40 +4254,117 @@ char_encode_utf8 (struct encoding_stream *str, Emchar ch, { Dynarr_add (dst, ch); } - else if (ch <= 0x7ff) - { - Dynarr_add (dst, (ch >> 6) | 0xc0); - Dynarr_add (dst, (ch & 0x3f) | 0x80); - } - else if (ch <= 0xffff) - { - Dynarr_add (dst, (ch >> 12) | 0xe0); - Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80); - Dynarr_add (dst, (ch & 0x3f) | 0x80); - } - else if (ch <= 0x1fffff) - { - Dynarr_add (dst, (ch >> 18) | 0xf0); - Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80); - Dynarr_add (dst, (ch & 0x3f) | 0x80); - } - else if (ch <= 0x3ffffff) - { - Dynarr_add (dst, (ch >> 24) | 0xf8); - Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80); - Dynarr_add (dst, (ch & 0x3f) | 0x80); - } else { - Dynarr_add (dst, (ch >> 30) | 0xfc); - Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80); - Dynarr_add (dst, (ch & 0x3f) | 0x80); + int code_point = charset_code_point (Vcharset_ucs, ch); + + if ( (code_point < 0) || (code_point > 0x10FFFF) ) + { + if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)) + { + Lisp_Object rest = Vcoded_charset_entity_reference_alist; + Lisp_Object cell; + Lisp_Object ret; + Lisp_Object ccs; + int format_columns, idx; + char buf[16], format[16]; + + while (!NILP (rest)) + { + cell = Fcar (rest); + ccs = Fcar (cell); + if (!NILP (ccs = Ffind_charset (ccs))) + { + if ( (code_point + = charset_code_point (ccs, ch)) >= 0 ) + { + cell = Fcdr (cell); + ret = Fcar (cell); + if (STRINGP (ret) + && ((idx =XSTRING_LENGTH (ret)) <= 6)) + { + strncpy (format, XSTRING_DATA (ret), idx); + } + else + continue; + + cell = Fcdr (cell); + ret = Fcar (cell); + if (INTP (ret)) + { + format [idx++] = '%'; + format_columns = XINT (ret); + if ( (2 <= format_columns) + && (format_columns <= 8) ) + { + format [idx++] = '0'; + format [idx++] = '0' + format_columns; + } + } + + cell = Fcdr (cell); + ret = Fcar (cell); + if (EQ (ret, Qd)) + format [idx++] = 'd'; + else if (EQ (ret, Qx)) + format [idx++] = 'x'; + else if (EQ (ret, QX)) + format [idx++] = 'X'; + else + continue; + format [idx++] = 0; + + sprintf (buf, format, code_point); + Dynarr_add (dst, '&'); + Dynarr_add_many (dst, buf, strlen (buf)); + Dynarr_add (dst, ';'); + return; + } + } + rest = Fcdr (rest); + } + sprintf (buf, "&MCS-%08X;", ch); + Dynarr_add_many (dst, buf, strlen (buf)); + return; + } + else + code_point = ch; + } + if (code_point <= 0x7ff) + { + Dynarr_add (dst, (code_point >> 6) | 0xc0); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } + else if (code_point <= 0xffff) + { + Dynarr_add (dst, (code_point >> 12) | 0xe0); + Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } + else if (code_point <= 0x1fffff) + { + Dynarr_add (dst, (code_point >> 18) | 0xf0); + Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } + else if (code_point <= 0x3ffffff) + { + Dynarr_add (dst, (code_point >> 24) | 0xf8); + Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } + else + { + Dynarr_add (dst, (code_point >> 30) | 0xfc); + Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } } } @@ -5789,6 +6027,10 @@ syms_of_file_coding (void) #endif /* MULE */ #ifdef UTF2000 defsymbol (&Qdisable_composition, "disable-composition"); + defsymbol (&Quse_entity_reference, "use-entity-reference"); + defsymbol (&Qd, "d"); + defsymbol (&Qx, "x"); + defsymbol (&QX, "X"); #endif defsymbol (&Qencode, "encode"); defsymbol (&Qdecode, "decode"); @@ -5892,6 +6134,17 @@ Coding system used to convert pathnames when accessing files. */ ); Vfile_name_coding_system = Qnil; + DEFVAR_LISP ("coded-charset-entity-reference-alist", + &Vcoded_charset_entity_reference_alist /* +Alist of coded-charset vs corresponding entity-reference. +Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE). +CCS is coded-charset. +CODE-COLUMNS is columns of code-point of entity-reference. +CODE-TYPE is format type of code-point of entity-reference. +`d' means decimal value and `x' means hexadecimal value. +*/ ); + Vcoded_charset_entity_reference_alist = Qnil; + DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /* Non-nil means the buffer contents are regarded as multi-byte form of characters, not a binary code. This affects the display, file I/O, @@ -5948,6 +6201,10 @@ complex_vars_of_file_coding (void) DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode); DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode); +#ifdef UTF2000 + DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition); + DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference); +#endif #endif /* MULE */ /* Need to create this here or we're really screwed. */ Fmake_coding_system