X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=src%2Ftext-coding.c;h=187d2a90e82e815323ee76494f8f47ac7044e259;hb=edb1d7f5d06e1f3ca783853fe435f41eaa32ea8e;hp=b0f987625e1e680b17f60a4349b684fcbc2dbcdf;hpb=7c800d4821261afacd6326a55941484246ad6de6;p=chise%2Fxemacs-chise.git- diff --git a/src/text-coding.c b/src/text-coding.c index b0f9876..187d2a9 100644 --- a/src/text-coding.c +++ b/src/text-coding.c @@ -1,7 +1,7 @@ /* Code conversion functions. Copyright (C) 1991, 1995 Free Software Foundation, Inc. Copyright (C) 1995 Sun Microsystems, Inc. - Copyright (C) 1999,2000 MORIOKA Tomohiko + Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko This file is part of XEmacs. @@ -23,6 +23,7 @@ Boston, MA 02111-1307, USA. */ /* Synched up with: Mule 2.3. Not in FSF. */ /* Rewritten by Ben Wing . */ +/* Rewritten by MORIOKA Tomohiko for XEmacs UTF-2000. */ #include #include "lisp.h" @@ -46,18 +47,20 @@ Lisp_Object Vcoding_system_for_read; Lisp_Object Vcoding_system_for_write; Lisp_Object Vfile_name_coding_system; +Lisp_Object Vcoded_charset_entity_reference_alist; + /* Table of symbols identifying each coding category. */ -Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1]; +Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST]; struct file_coding_dump { /* Coding system currently associated with each coding category. */ - Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1]; + Lisp_Object coding_category_system[CODING_CATEGORY_LAST]; /* Table of all coding categories in decreasing order of priority. This describes a permutation of the possible coding categories. */ - int coding_category_by_priority[CODING_CATEGORY_LAST + 1]; + int coding_category_by_priority[CODING_CATEGORY_LAST]; #if defined(MULE) && !defined(UTF2000) Lisp_Object ucs_to_mule_table[65536]; @@ -65,7 +68,7 @@ struct file_coding_dump { } *fcd; static const struct lrecord_description fcd_description_1[] = { - { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 }, + { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST }, #if defined(MULE) && !defined(UTF2000) { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) }, #endif @@ -103,6 +106,8 @@ Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift; #endif #ifdef UTF2000 Lisp_Object Qdisable_composition; +Lisp_Object Quse_entity_reference; +Lisp_Object Qd, Qx, QX; #endif Lisp_Object Qencode, Qdecode; @@ -197,8 +202,10 @@ static int detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n); static void decode_coding_big5 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, size_t n); -static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src, - unsigned_char_dynarr *dst, size_t n); +void char_encode_big5 (struct encoding_stream *str, Emchar c, + unsigned_char_dynarr *dst, unsigned int *flags); +void char_finish_big5 (struct encoding_stream *str, + unsigned_char_dynarr *dst, unsigned int *flags); static int detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n); @@ -327,6 +334,9 @@ static const struct lrecord_description coding_system_description[] = { { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) }, +#ifdef UTF2000 + { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) }, +#endif #endif { XD_END } }; @@ -377,6 +387,13 @@ mark_coding_system (Lisp_Object obj) } } break; +#ifdef UTF2000 + + case CODESYS_BIG5: + mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)); + mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)); + break; +#endif case CODESYS_CCL: mark_object (CODING_SYSTEM_CCL_DECODE (codesys)); @@ -388,6 +405,9 @@ mark_coding_system (Lisp_Object obj) } mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys)); +#ifdef UTF2000 + mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys)); +#endif return CODING_SYSTEM_POST_READ_CONVERSION (codesys); } @@ -636,12 +656,28 @@ allocate_coding_system (enum coding_system_type type, Lisp_Object name) CODING_SYSTEM_TYPE (codesys) = type; CODING_SYSTEM_MNEMONIC (codesys) = Qnil; #ifdef MULE +#ifdef UTF2000 + CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil; +#endif if (type == CODESYS_ISO2022) { int i; for (i = 0; i < 4; i++) CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil; } +#ifdef UTF2000 + if (type == CODESYS_BIG5) + { + CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0) + = Vcharset_ascii; + CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) + = Vcharset_chinese_big5; + CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) + = Qnil; + CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3) + = Qnil; + } +#endif else if (type == CODESYS_CCL) { CODING_SYSTEM_CCL_DECODE (codesys) = Qnil; @@ -792,6 +828,9 @@ character set. Recognized properties are: If non-nil, composition/decomposition for combining characters are disabled. +'use-entity-reference + If non-nil, SGML style entity-reference is used for non-system-characters. + 'post-read-conversion Function called after a file has been read in, to perform the decoding. Called with two arguments, START and END, denoting @@ -887,7 +926,6 @@ if TYPE is 'ccl: (name, type, doc_string, props)) { Lisp_Coding_System *codesys; - Lisp_Object rest, key, value; enum coding_system_type ty; int need_to_setup_eol_systems = 1; @@ -919,122 +957,135 @@ if TYPE is 'ccl: CHECK_STRING (doc_string); CODING_SYSTEM_DOC_STRING (codesys) = doc_string; - EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props) - { - if (EQ (key, Qmnemonic)) - { - if (!NILP (value)) - CHECK_STRING (value); - CODING_SYSTEM_MNEMONIC (codesys) = value; - } + { + EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props) + { + if (EQ (key, Qmnemonic)) + { + if (!NILP (value)) + CHECK_STRING (value); + CODING_SYSTEM_MNEMONIC (codesys) = value; + } - else if (EQ (key, Qeol_type)) - { - need_to_setup_eol_systems = NILP (value); - if (EQ (value, Qt)) - value = Qnil; - CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value); - } + else if (EQ (key, Qeol_type)) + { + need_to_setup_eol_systems = NILP (value); + if (EQ (value, Qt)) + value = Qnil; + CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value); + } - else if (EQ (key, Qpost_read_conversion)) - CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value; - else if (EQ (key, Qpre_write_conversion)) - CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value; + else if (EQ (key, Qpost_read_conversion)) + CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value; + else if (EQ (key, Qpre_write_conversion)) + CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value; #ifdef UTF2000 - else if (EQ (key, Qdisable_composition)) - CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value); + else if (EQ (key, Qdisable_composition)) + CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value); + else if (EQ (key, Quse_entity_reference)) + CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value); #endif #ifdef MULE - else if (ty == CODESYS_ISO2022) - { + else if (ty == CODESYS_ISO2022) + { #define FROB_INITIAL_CHARSET(charset_num) \ CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \ ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value)) - if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0); - else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1); - else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2); - else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3); + if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0); + else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1); + else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2); + else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3); #define FROB_FORCE_CHARSET(charset_num) \ CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value) - else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0); - else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1); - else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2); - else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3); + else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0); + else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1); + else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2); + else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3); #define FROB_BOOLEAN_PROPERTY(prop) \ CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value) - else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT); - else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL); - else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL); - else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN); - else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT); - else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429); - else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED); + else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT); + else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL); + else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL); + else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN); + else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT); + else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429); + else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED); - else if (EQ (key, Qinput_charset_conversion)) - { - codesys->iso2022.input_conv = - Dynarr_new (charset_conversion_spec); - parse_charset_conversion_specs (codesys->iso2022.input_conv, - value); - } - else if (EQ (key, Qoutput_charset_conversion)) - { - codesys->iso2022.output_conv = - Dynarr_new (charset_conversion_spec); - parse_charset_conversion_specs (codesys->iso2022.output_conv, - value); - } - else - signal_simple_error ("Unrecognized property", key); - } - else if (EQ (type, Qccl)) - { - Lisp_Object sym; - struct ccl_program test_ccl; - Extbyte *suffix; - - /* Check key first. */ - if (EQ (key, Qdecode)) - suffix = "-ccl-decode"; - else if (EQ (key, Qencode)) - suffix = "-ccl-encode"; - else - signal_simple_error ("Unrecognized property", key); + else if (EQ (key, Qinput_charset_conversion)) + { + codesys->iso2022.input_conv = + Dynarr_new (charset_conversion_spec); + parse_charset_conversion_specs (codesys->iso2022.input_conv, + value); + } + else if (EQ (key, Qoutput_charset_conversion)) + { + codesys->iso2022.output_conv = + Dynarr_new (charset_conversion_spec); + parse_charset_conversion_specs (codesys->iso2022.output_conv, + value); + } + else + signal_simple_error ("Unrecognized property", key); + } +#ifdef UTF2000 + else if (ty == CODESYS_BIG5) + { + if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0); + else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1); + else + signal_simple_error ("Unrecognized property", key); + } +#endif + else if (EQ (type, Qccl)) + { + Lisp_Object sym; + struct ccl_program test_ccl; + Extbyte *suffix; + + /* Check key first. */ + if (EQ (key, Qdecode)) + suffix = "-ccl-decode"; + else if (EQ (key, Qencode)) + suffix = "-ccl-encode"; + else + signal_simple_error ("Unrecognized property", key); - /* If value is vector, register it as a ccl program - associated with an newly created symbol for - backward compatibility. */ - if (VECTORP (value)) - { - sym = Fintern (concat2 (Fsymbol_name (name), - build_string (suffix)), - Qnil); - Fregister_ccl_program (sym, value); - } - else - { - CHECK_SYMBOL (value); - sym = value; - } - /* check if the given ccl programs are valid. */ - if (setup_ccl_program (&test_ccl, sym) < 0) - signal_simple_error ("Invalid CCL program", value); + /* If value is vector, register it as a ccl program + associated with an newly created symbol for + backward compatibility. */ + if (VECTORP (value)) + { + sym = Fintern (concat2 (Fsymbol_name (name), + build_string (suffix)), + Qnil); + Fregister_ccl_program (sym, value); + } + else + { + CHECK_SYMBOL (value); + sym = value; + } + /* check if the given ccl programs are valid. */ + if (setup_ccl_program (&test_ccl, sym) < 0) + signal_simple_error ("Invalid CCL program", value); - if (EQ (key, Qdecode)) - CODING_SYSTEM_CCL_DECODE (codesys) = sym; - else if (EQ (key, Qencode)) - CODING_SYSTEM_CCL_ENCODE (codesys) = sym; + if (EQ (key, Qdecode)) + CODING_SYSTEM_CCL_DECODE (codesys) = sym; + else if (EQ (key, Qencode)) + CODING_SYSTEM_CCL_ENCODE (codesys) = sym; - } + } #endif /* MULE */ - else - signal_simple_error ("Unrecognized property", key); - } + else + signal_simple_error ("Unrecognized property", key); + } + } if (need_to_setup_eol_systems) setup_eol_coding_systems (codesys); @@ -1395,6 +1446,12 @@ Return the PROP property of CODING-SYSTEM. else if (EQ (prop, Qpre_write_conversion)) return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system); #ifdef MULE +#ifdef UTF2000 + else if (EQ (prop, Qdisable_composition)) + return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil; + else if (EQ (prop, Quse_entity_reference)) + return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil; +#endif else if (type == CODESYS_ISO2022) { if (EQ (prop, Qcharset_g0)) @@ -1464,7 +1521,7 @@ decode_coding_category (Lisp_Object symbol) int i; CHECK_SYMBOL (symbol); - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) if (EQ (coding_category_symbol[i], symbol)) return i; @@ -1480,7 +1537,7 @@ Return a list of all recognized coding categories. int i; Lisp_Object list = Qnil; - for (i = CODING_CATEGORY_LAST; i >= 0; i--) + for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) list = Fcons (coding_category_symbol[i], list); return list; } @@ -1494,13 +1551,13 @@ previously. */ (list)) { - int category_to_priority[CODING_CATEGORY_LAST + 1]; + int category_to_priority[CODING_CATEGORY_LAST]; int i, j; Lisp_Object rest; /* First generate a list that maps coding categories to priorities. */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) category_to_priority[i] = -1; /* Highest priority comes from the specified list. */ @@ -1517,7 +1574,7 @@ previously. /* Now go through the existing categories by priority to retrieve the categories not yet specified and preserve their priority order. */ - for (j = 0; j <= CODING_CATEGORY_LAST; j++) + for (j = 0; j < CODING_CATEGORY_LAST; j++) { int cat = fcd->coding_category_by_priority[j]; if (category_to_priority[cat] < 0) @@ -1527,7 +1584,7 @@ previously. /* Now we need to construct the inverse of the mapping we just constructed. */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) fcd->coding_category_by_priority[category_to_priority[i]] = i; /* Phew! That was confusing. */ @@ -1542,7 +1599,7 @@ Return a list of coding categories in descending order of priority. int i; Lisp_Object list = Qnil; - for (i = CODING_CATEGORY_LAST; i >= 0; i--) + for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]], list); return list; @@ -1792,7 +1849,7 @@ coding_system_from_mask (int mask) #endif /* Look through the coding categories by priority and find the first one that is allowed. */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) { cat = fcd->coding_category_by_priority[i]; if ((mask & (1 << cat)) && @@ -1990,7 +2047,7 @@ type. Optional arg BUFFER defaults to the current buffer. #ifdef MULE decst.mask = postprocess_iso2022_mask (decst.mask); #endif - for (i = CODING_CATEGORY_LAST; i >= 0; i--) + for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) { int sys = fcd->coding_category_by_priority[i]; if (decst.mask & (1 << sys)) @@ -2192,6 +2249,9 @@ struct decoding_stream unsigned char counter; #endif #ifdef UTF2000 + unsigned char er_counter; + unsigned char er_buf[16]; + unsigned combined_char_count; Emchar combined_chars[16]; Lisp_Object combining_table; @@ -2200,7 +2260,129 @@ struct decoding_stream }; #ifdef UTF2000 -extern Lisp_Object Vcharacter_composition_table; +extern Lisp_Object Qcomposition; + +INLINE_HEADER void +decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst); +INLINE_HEADER void +decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst) +{ + if ( str->er_counter > 0) + { + Dynarr_add_many (dst, str->er_buf, str->er_counter); + str->er_counter = 0; + } +} + +void decode_add_er_char (struct decoding_stream *str, Emchar character, + unsigned_char_dynarr* dst); +void +decode_add_er_char (struct decoding_stream *str, Emchar c, + unsigned_char_dynarr* dst) +{ + if (str->er_counter == 0) + { + if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys) + && (c == '&') ) + { + str->er_buf[0] = '&'; + str->er_counter++; + } + else + DECODE_ADD_UCS_CHAR (c, dst); + } + else if (c == ';') + { + Lisp_Object string = make_string (str->er_buf, + str->er_counter); + Lisp_Object rest = Vcoded_charset_entity_reference_alist; + Lisp_Object cell; + Lisp_Object ret; + Lisp_Object pat; + Lisp_Object ccs; + int base; + + while (!NILP (rest)) + { + cell = Fcar (rest); + ccs = Fcar (cell); + if (NILP (ccs = Ffind_charset (ccs))) + continue; + + cell = Fcdr (cell); + ret = Fcar (cell); + if (STRINGP (ret)) + pat = ret; + else + continue; + + cell = Fcdr (cell); + cell = Fcdr (cell); + ret = Fcar (cell); + if (EQ (ret, Qd)) + { + pat = concat3 (build_string ("^&"), + pat, build_string ("\\([0-9]+\\)$")); + base = 10; + } + else if (EQ (ret, Qx)) + { + pat = concat3 (build_string ("^&"), + pat, build_string ("\\([0-9a-f]+\\)$")); + base = 16; + } + else if (EQ (ret, QX)) + { + pat = concat3 (build_string ("^&"), + pat, build_string ("\\([0-9A-F]+\\)$")); + base = 16; + } + else + continue; + + if (!NILP (Fstring_match (pat, string, Qnil, Qnil))) + { + int code + = XINT (Fstring_to_number + (Fsubstring (string, + Fmatch_beginning (make_int (1)), + Fmatch_end (make_int (1))), + make_int (base))); + + DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst); + goto decoded; + } + rest = Fcdr (rest); + } + if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"), + string, Qnil, Qnil))) + { + int code + = XINT (Fstring_to_number + (Fsubstring (string, + Fmatch_beginning (make_int (1)), + Fmatch_end (make_int (1))), + make_int (16))); + + DECODE_ADD_UCS_CHAR (code, dst); + } + else + { + Dynarr_add_many (dst, str->er_buf, str->er_counter); + Dynarr_add (dst, ';'); + } + decoded: + str->er_counter = 0; + } + else if ( (str->er_counter >= 16) || (c >= 0x7F) ) + { + Dynarr_add_many (dst, str->er_buf, str->er_counter); + str->er_counter = 0; + DECODE_ADD_UCS_CHAR (c, dst); + } + else + str->er_buf[str->er_counter++] = c; +} INLINE_HEADER void COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst); @@ -2210,26 +2392,26 @@ COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst) unsigned i; for (i = 0; i < str->combined_char_count; i++) - DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst); + decode_add_er_char (str, str->combined_chars[i], dst); str->combined_char_count = 0; str->combining_table = Qnil; } -void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character, - unsigned_char_dynarr* dst); +void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character, + unsigned_char_dynarr* dst); void -COMPOSE_ADD_CHAR(struct decoding_stream *str, - Emchar character, unsigned_char_dynarr* dst) +COMPOSE_ADD_CHAR (struct decoding_stream *str, + Emchar character, unsigned_char_dynarr* dst) { if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys)) - DECODE_ADD_UCS_CHAR (character, dst); - else if (!CHAR_ID_TABLE_P (str->combining_table)) + decode_add_er_char (str, character, dst); + else if (!CONSP (str->combining_table)) { Lisp_Object ret - = get_char_id_table (character, Vcharacter_composition_table); + = Fget_char_attribute (make_char (character), Qcomposition, Qnil); if (NILP (ret)) - DECODE_ADD_UCS_CHAR (character, dst); + decode_add_er_char (str, character, dst); else { str->combined_chars[0] = character; @@ -2239,16 +2421,15 @@ COMPOSE_ADD_CHAR(struct decoding_stream *str, } else { - Lisp_Object ret - = get_char_id_table (character, str->combining_table); + Lisp_Object ret = Fcdr (Fassq (make_char (character), str->combining_table)); if (CHARP (ret)) { Emchar char2 = XCHARVAL (ret); - ret = get_char_id_table (char2, Vcharacter_composition_table); + ret = Fget_char_attribute (make_char (character), Qcomposition, Qnil); if (NILP (ret)) { - DECODE_ADD_UCS_CHAR (char2, dst); + decode_add_er_char (str, character, dst); str->combined_char_count = 0; str->combining_table = Qnil; } @@ -2259,15 +2440,10 @@ COMPOSE_ADD_CHAR(struct decoding_stream *str, str->combining_table = ret; } } - else if (CHAR_ID_TABLE_P (ret)) - { - str->combined_chars[str->combined_char_count++] = character; - str->combining_table = ret; - } else { COMPOSE_FLUSH_CHARS (str, dst); - DECODE_ADD_UCS_CHAR (character, dst); + decode_add_er_char (str, character, dst); } } } @@ -2410,6 +2586,7 @@ reset_decoding_stream (struct decoding_stream *str) str->counter = 0; #endif /* MULE */ #ifdef UTF2000 + str->er_counter = 0; str->combined_char_count = 0; str->combining_table = Qnil; #endif @@ -2900,6 +3077,10 @@ reset_encoding_stream (struct encoding_stream *str) str->encode_char = &char_encode_shift_jis; str->finish = &char_finish_shift_jis; break; + case CODESYS_BIG5: + str->encode_char = &char_encode_big5; + str->finish = &char_finish_big5; + break; default: break; } @@ -3015,9 +3196,6 @@ mule_encode (Lstream *encoding, const Bufbyte *src, encode_coding_no_conversion (encoding, src, dst, n); break; #ifdef MULE - case CODESYS_BIG5: - encode_coding_big5 (encoding, src, dst, n); - break; case CODESYS_CCL: str->ccl.last_block = str->flags & CODING_STATE_END; /* When applying ccl program to stream, MUST NOT set NULL @@ -3445,8 +3623,13 @@ Return the corresponding character code in SHIFT-JIS as a cons of two bytes. contains frequently used characters and the latter contains less frequently used characters. */ +#ifdef UTF2000 +#define BYTE_BIG5_TWO_BYTE_1_P(c) \ + ((c) >= 0x81 && (c) <= 0xFE) +#else #define BYTE_BIG5_TWO_BYTE_1_P(c) \ ((c) >= 0xA1 && (c) <= 0xFE) +#endif /* Is this the second byte of a Shift-JIS two-byte char? */ @@ -3532,8 +3715,11 @@ detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n) while (n--) { unsigned char c = *(unsigned char *)src++; - if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO || - (c >= 0x80 && c <= 0xA0)) + if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO +#ifndef UTF2000 + || (c >= 0x80 && c <= 0xA0) +#endif + ) return 0; if (st->big5.in_second_byte) { @@ -3541,7 +3727,13 @@ detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n) if (c < 0x40 || (c >= 0x80 && c <= 0xA0)) return 0; } - else if (c >= 0xA1) + else if ( +#ifdef UTF2000 + c >= 0x81 +#else + c >= 0xA1 +#endif + ) st->big5.in_second_byte = 1; } return CODING_CATEGORY_BIG5_MASK; @@ -3557,6 +3749,11 @@ decode_coding_big5 (Lstream *decoding, const Extbyte *src, unsigned int flags = str->flags; unsigned int cpos = str->cpos; eol_type_t eol_type = str->eol_type; +#ifdef UTF2000 + Lisp_Object ccs + = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA + (decoding)->codesys, 1); +#endif while (n--) { @@ -3567,9 +3764,12 @@ decode_coding_big5 (Lstream *decoding, const Extbyte *src, if (BYTE_BIG5_TWO_BYTE_2_P (c)) { #ifdef UTF2000 - DECODE_ADD_UCS_CHAR - (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c), - dst); + int code_point = (cpos << 8) | c; + Emchar char_id = decode_defined_char (ccs, code_point); + + if (char_id < 0) + char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point); + DECODE_ADD_UCS_CHAR (char_id, dst); #else unsigned char b1, b2, b3; DECODE_BIG5 (cpos, c, b1, b2, b3); @@ -3604,66 +3804,78 @@ decode_coding_big5 (Lstream *decoding, const Extbyte *src, /* Convert internally-formatted data to Big5. */ -static void -encode_coding_big5 (Lstream *encoding, const Bufbyte *src, - unsigned_char_dynarr *dst, size_t n) +void +char_encode_big5 (struct encoding_stream *str, Emchar ch, + unsigned_char_dynarr *dst, unsigned int *flags) { -#ifndef UTF2000 - unsigned char c; - struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); - unsigned int flags = str->flags; - unsigned int ch = str->ch; eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); - while (n--) + if (ch == '\n') { - c = *src++; - if (c == '\n') - { - if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) - Dynarr_add (dst, '\r'); - if (eol_type != EOL_CR) - Dynarr_add (dst, '\n'); - } - else if (BYTE_ASCII_P (c)) + if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) + Dynarr_add (dst, '\r'); + if (eol_type != EOL_CR) + Dynarr_add (dst, ch); + } + else + { +#ifdef UTF2000 + int code_point; + Lisp_Object ccs + = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1); + + if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0) + Dynarr_add (dst, code_point); + else if ((code_point = charset_code_point (ccs, ch)) >= 0) { - /* ASCII. */ - Dynarr_add (dst, c); + Dynarr_add (dst, code_point >> 8); + Dynarr_add (dst, code_point & 0xFF); } - else if (BUFBYTE_LEADING_BYTE_P (c)) + else if ((code_point + = charset_code_point (Vcharset_chinese_big5, ch)) >= 0) { - if (c == LEADING_BYTE_CHINESE_BIG5_1 || - c == LEADING_BYTE_CHINESE_BIG5_2) - { - /* A recognized leading byte. */ - ch = c; - continue; /* not done with this character. */ - } - /* otherwise just ignore this character. */ + Dynarr_add (dst, code_point >> 8); + Dynarr_add (dst, code_point & 0xFF); } - else if (ch == LEADING_BYTE_CHINESE_BIG5_1 || - ch == LEADING_BYTE_CHINESE_BIG5_2) + else if ((code_point + = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0) { - /* Previous char was a recognized leading byte. */ - ch = (ch << 8) | c; - continue; /* not done with this character. */ + unsigned int I + = ((code_point >> 8) - 33) * (0xFF - 0xA1) + + ((code_point & 0xFF) - 33); + unsigned char b1 = I / BIG5_SAME_ROW + 0xA1; + unsigned char b2 = I % BIG5_SAME_ROW; + + b2 += b2 < 0x3F ? 0x40 : 0x62; + Dynarr_add (dst, b1); + Dynarr_add (dst, b2); } - else if (ch) + else if ((code_point + = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0) { - /* Encountering second byte of a Big5 character. */ + unsigned int I + = ((code_point >> 8) - 33) * (0xFF - 0xA1) + + ((code_point & 0xFF) - 33); unsigned char b1, b2; - ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2); + I += BIG5_SAME_ROW * (0xC9 - 0xA1); + b1 = I / BIG5_SAME_ROW + 0xA1; + b2 = I % BIG5_SAME_ROW; + b2 += b2 < 0x3F ? 0x40 : 0x62; Dynarr_add (dst, b1); Dynarr_add (dst, b2); } - - ch = 0; + else + Dynarr_add (dst, '?'); +#else +#endif } +} - str->flags = flags; - str->ch = ch; -#endif +void +char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst, + unsigned int *flags) +{ } @@ -3837,6 +4049,95 @@ detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n) } static void +decode_output_utf8_partial_char (unsigned char counter, + unsigned int cpos, + unsigned_char_dynarr *dst) +{ + if (counter == 5) + DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst); + else if (counter == 4) + { + if (cpos < (1 << 6)) + DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst); + else + { + DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst); + DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst); + } + } + else if (counter == 3) + { + if (cpos < (1 << 6)) + DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst); + else if (cpos < (1 << 12)) + { + DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst); + DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst); + } + else + { + DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); + } + } + else if (counter == 2) + { + if (cpos < (1 << 6)) + DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst); + else if (cpos < (1 << 12)) + { + DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst); + DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst); + } + else if (cpos < (1 << 18)) + { + DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); + } + else + { + DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); + } + } + else + { + if (cpos < (1 << 6)) + DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst); + else if (cpos < (1 << 12)) + { + DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst); + DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst); + } + else if (cpos < (1 << 18)) + { + DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); + } + else if (cpos < (1 << 24)) + { + DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); + } + else + { + DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); + DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); + } + } +} + +static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, size_t n) { @@ -3851,35 +4152,44 @@ decode_coding_utf8 (Lstream *decoding, const Extbyte *src, unsigned char c = *(unsigned char *)src++; if (counter == 0) { - if ( c < 0xC0 ) + if ( c < ' ' ) { + COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); DECODE_ADD_UCS_CHAR (c, dst); } - else if ( c < 0xE0 ) - { - cpos = c & 0x1f; - counter = 1; - } - else if ( c < 0xF0 ) - { - cpos = c & 0x0f; - counter = 2; - } - else if ( c < 0xF8 ) - { - cpos = c & 0x07; - counter = 3; - } - else if ( c < 0xFC ) - { - cpos = c & 0x03; - counter = 4; - } + else if ( c < 0xC0 ) + /* decode_add_er_char (str, c, dst); */ + COMPOSE_ADD_CHAR (str, c, dst); else { - cpos = c & 0x01; - counter = 5; + /* decode_flush_er_chars (str, dst); */ + if ( c < 0xE0 ) + { + cpos = c & 0x1f; + counter = 1; + } + else if ( c < 0xF0 ) + { + cpos = c & 0x0f; + counter = 2; + } + else if ( c < 0xF8 ) + { + cpos = c & 0x07; + counter = 3; + } + else if ( c < 0xFC ) + { + cpos = c & 0x03; + counter = 4; + } + else + { + cpos = c & 0x01; + counter = 5; + } } } else if ( (c & 0xC0) == 0x80 ) @@ -3887,7 +4197,8 @@ decode_coding_utf8 (Lstream *decoding, const Extbyte *src, cpos = ( cpos << 6 ) | ( c & 0x3f ); if (counter == 1) { - DECODE_ADD_UCS_CHAR (cpos, dst); + /* DECODE_ADD_UCS_CHAR (cpos, dst); */ + COMPOSE_ADD_CHAR (str, cpos, dst); cpos = 0; counter = 0; } @@ -3896,88 +4207,9 @@ decode_coding_utf8 (Lstream *decoding, const Extbyte *src, } else { - if (counter == 5) - DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst); - else if (counter == 4) - { - if (cpos < (1 << 6)) - DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst); - else - { - DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst); - DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst); - } - } - else if (counter == 3) - { - if (cpos < (1 << 6)) - DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst); - else if (cpos < (1 << 12)) - { - DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst); - DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst); - } - else - { - DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); - } - } - else if (counter == 2) - { - if (cpos < (1 << 6)) - DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst); - else if (cpos < (1 << 12)) - { - DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst); - DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst); - } - else if (cpos < (1 << 18)) - { - DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); - } - else - { - DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); - } - } - else - { - if (cpos < (1 << 6)) - DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst); - else if (cpos < (1 << 12)) - { - DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst); - DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst); - } - else if (cpos < (1 << 18)) - { - DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); - } - else if (cpos < (1 << 24)) - { - DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); - } - else - { - DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst); - DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst); - } - } + COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); + decode_output_utf8_partial_char (counter, cpos, dst); DECODE_ADD_BINARY_CHAR (c, dst); cpos = 0; counter = 0; @@ -3986,12 +4218,16 @@ decode_coding_utf8 (Lstream *decoding, const Extbyte *src, } if (flags & CODING_STATE_END) - if (counter > 0) - { - DECODE_ADD_BINARY_CHAR (cpos, dst); - cpos = 0; - counter = 0; - } + { + COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); + if (counter > 0) + { + decode_output_utf8_partial_char (counter, cpos, dst); + cpos = 0; + counter = 0; + } + } str->flags = flags; str->cpos = cpos; str->counter = counter; @@ -4014,40 +4250,117 @@ char_encode_utf8 (struct encoding_stream *str, Emchar ch, { Dynarr_add (dst, ch); } - else if (ch <= 0x7ff) - { - Dynarr_add (dst, (ch >> 6) | 0xc0); - Dynarr_add (dst, (ch & 0x3f) | 0x80); - } - else if (ch <= 0xffff) - { - Dynarr_add (dst, (ch >> 12) | 0xe0); - Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80); - Dynarr_add (dst, (ch & 0x3f) | 0x80); - } - else if (ch <= 0x1fffff) - { - Dynarr_add (dst, (ch >> 18) | 0xf0); - Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80); - Dynarr_add (dst, (ch & 0x3f) | 0x80); - } - else if (ch <= 0x3ffffff) - { - Dynarr_add (dst, (ch >> 24) | 0xf8); - Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80); - Dynarr_add (dst, (ch & 0x3f) | 0x80); - } else { - Dynarr_add (dst, (ch >> 30) | 0xfc); - Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80); - Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80); - Dynarr_add (dst, (ch & 0x3f) | 0x80); + int code_point = charset_code_point (Vcharset_ucs, ch); + + if ( (code_point < 0) || (code_point > 0x10FFFF) ) + { + if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)) + { + Lisp_Object rest = Vcoded_charset_entity_reference_alist; + Lisp_Object cell; + Lisp_Object ret; + Lisp_Object ccs; + int format_columns, idx; + char buf[16], format[16]; + + while (!NILP (rest)) + { + cell = Fcar (rest); + ccs = Fcar (cell); + if (!NILP (ccs = Ffind_charset (ccs))) + { + if ( (code_point + = charset_code_point (ccs, ch)) >= 0 ) + { + cell = Fcdr (cell); + ret = Fcar (cell); + if (STRINGP (ret) + && ((idx =XSTRING_LENGTH (ret)) <= 6)) + { + strncpy (format, XSTRING_DATA (ret), idx); + } + else + continue; + + cell = Fcdr (cell); + ret = Fcar (cell); + if (INTP (ret)) + { + format [idx++] = '%'; + format_columns = XINT (ret); + if ( (2 <= format_columns) + && (format_columns <= 8) ) + { + format [idx++] = '0'; + format [idx++] = '0' + format_columns; + } + } + + cell = Fcdr (cell); + ret = Fcar (cell); + if (EQ (ret, Qd)) + format [idx++] = 'd'; + else if (EQ (ret, Qx)) + format [idx++] = 'x'; + else if (EQ (ret, QX)) + format [idx++] = 'X'; + else + continue; + format [idx++] = 0; + + sprintf (buf, format, code_point); + Dynarr_add (dst, '&'); + Dynarr_add_many (dst, buf, strlen (buf)); + Dynarr_add (dst, ';'); + return; + } + } + rest = Fcdr (rest); + } + sprintf (buf, "&MCS-%08X;", ch); + Dynarr_add_many (dst, buf, strlen (buf)); + return; + } + else + code_point = ch; + } + if (code_point <= 0x7ff) + { + Dynarr_add (dst, (code_point >> 6) | 0xc0); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } + else if (code_point <= 0xffff) + { + Dynarr_add (dst, (code_point >> 12) | 0xe0); + Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } + else if (code_point <= 0x1fffff) + { + Dynarr_add (dst, (code_point >> 18) | 0xf0); + Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } + else if (code_point <= 0x3ffffff) + { + Dynarr_add (dst, (code_point >> 24) | 0xf8); + Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } + else + { + Dynarr_add (dst, (code_point >> 30) | 0xfc); + Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80); + Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80); + Dynarr_add (dst, (code_point & 0x3f) | 0x80); + } } } @@ -4883,6 +5196,7 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, case ISO_ESC_LITERAL: COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); DECODE_ADD_BINARY_CHAR (c, dst); break; @@ -4904,6 +5218,7 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, /* Output the (possibly invalid) sequence */ int i; COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); for (i = 0; i < str->iso2022.esc_bytes_index; i++) DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst); flags &= CODING_STATE_ISO2022_LOCK; @@ -4915,6 +5230,7 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, escape sequence; it could mess things up anyway. Just add it now. */ COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); DECODE_ADD_BINARY_CHAR (c, dst); } } @@ -4931,6 +5247,7 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, if (counter) { COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); while (counter > 0) { counter--; @@ -4947,12 +5264,14 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, if (flags & CODING_STATE_SS2) { COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst); flags &= ~CODING_STATE_SS2; } if (flags & CODING_STATE_SS3) { COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst); flags &= ~CODING_STATE_SS3; } @@ -4964,6 +5283,7 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, if (c == '\r') { COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); if (eol_type == EOL_CR) Dynarr_add (dst, '\n'); else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) @@ -4987,6 +5307,7 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1)) { COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); DECODE_ADD_BINARY_CHAR (c, dst); } } @@ -5002,6 +5323,7 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, if (c == '\r') { COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); if (eol_type == EOL_CR) Dynarr_add (dst, '\n'); else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) @@ -5038,6 +5360,7 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, to preserve it for the output. */ { COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); while (counter > 0) { counter--; @@ -5141,6 +5464,7 @@ decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, if (flags & CODING_STATE_END) { COMPOSE_FLUSH_CHARS (str, dst); + decode_flush_er_chars (str, dst); DECODE_OUTPUT_PARTIAL_CHAR (cpos); } str->flags = flags; @@ -5710,6 +6034,10 @@ syms_of_file_coding (void) #endif /* MULE */ #ifdef UTF2000 defsymbol (&Qdisable_composition, "disable-composition"); + defsymbol (&Quse_entity_reference, "use-entity-reference"); + defsymbol (&Qd, "d"); + defsymbol (&Qx, "x"); + defsymbol (&QX, "X"); #endif defsymbol (&Qencode, "encode"); defsymbol (&Qdecode, "decode"); @@ -5764,10 +6092,10 @@ vars_of_file_coding (void) int i; fcd = xnew (struct file_coding_dump); - dumpstruct (&fcd, &fcd_description); + dump_add_root_struct_ptr (&fcd, &fcd_description); /* Initialize to something reasonable ... */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) + for (i = 0; i < CODING_CATEGORY_LAST; i++) { fcd->coding_category_system[i] = Qnil; fcd->coding_category_by_priority[i] = i; @@ -5813,6 +6141,17 @@ Coding system used to convert pathnames when accessing files. */ ); Vfile_name_coding_system = Qnil; + DEFVAR_LISP ("coded-charset-entity-reference-alist", + &Vcoded_charset_entity_reference_alist /* +Alist of coded-charset vs corresponding entity-reference. +Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE). +CCS is coded-charset. +CODE-COLUMNS is columns of code-point of entity-reference. +CODE-TYPE is format type of code-point of entity-reference. +`d' means decimal value and `x' means hexadecimal value. +*/ ); + Vcoded_charset_entity_reference_alist = Qnil; + DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /* Non-nil means the buffer contents are regarded as multi-byte form of characters, not a binary code. This affects the display, file I/O, @@ -5831,7 +6170,7 @@ complex_vars_of_file_coding (void) make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ); the_codesys_prop_dynarr = Dynarr_new (codesys_prop); - dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description); + dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description); #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \ { \ @@ -5869,6 +6208,10 @@ complex_vars_of_file_coding (void) DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode); DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode); +#ifdef UTF2000 + DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition); + DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference); +#endif #endif /* MULE */ /* Need to create this here or we're really screwed. */ Fmake_coding_system @@ -5907,7 +6250,7 @@ complex_vars_of_file_coding (void) #if defined(MULE) && !defined(UTF2000) { - unsigned int i; + size_t i; for (i = 0; i < countof (fcd->ucs_to_mule_table); i++) fcd->ucs_to_mule_table[i] = Qnil;