/* Code conversion functions.
Copyright (C) 1991, 1995 Free Software Foundation, Inc.
Copyright (C) 1995 Sun Microsystems, Inc.
- Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
+ Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko
This file is part of XEmacs.
/* Synched up with: Mule 2.3. Not in FSF. */
/* Rewritten by Ben Wing <ben@xemacs.org>. */
+/* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs UTF-2000. */
#include <config.h>
#include "lisp.h"
Lisp_Object Vcoding_system_for_write;
Lisp_Object Vfile_name_coding_system;
+Lisp_Object Vcoded_charset_entity_reference_alist;
+
/* Table of symbols identifying each coding category. */
Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
#endif
#ifdef UTF2000
Lisp_Object Qdisable_composition;
+Lisp_Object Quse_entity_reference;
+Lisp_Object Qd, Qx, QX;
#endif
Lisp_Object Qencode, Qdecode;
{ XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
{ XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
{ XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
+#ifdef UTF2000
+ { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
+#endif
#endif
{ XD_END }
};
}
mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
+#ifdef UTF2000
+ mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
+#endif
return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
}
CODING_SYSTEM_TYPE (codesys) = type;
CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
#ifdef MULE
+#ifdef UTF2000
+ CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
+#endif
if (type == CODESYS_ISO2022)
{
int i;
If non-nil, composition/decomposition for combining characters
are disabled.
+'use-entity-reference
+ If non-nil, SGML style entity-reference is used for non-system-characters.
+
'post-read-conversion
Function called after a file has been read in, to perform the
decoding. Called with two arguments, START and END, denoting
#ifdef UTF2000
else if (EQ (key, Qdisable_composition))
CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
+ else if (EQ (key, Quse_entity_reference))
+ CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
#endif
#ifdef MULE
else if (ty == CODESYS_ISO2022)
else if (EQ (prop, Qpre_write_conversion))
return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
#ifdef MULE
+#ifdef UTF2000
+ else if (EQ (prop, Qdisable_composition))
+ return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
+ else if (EQ (prop, Quse_entity_reference))
+ return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
+#endif
else if (type == CODESYS_ISO2022)
{
if (EQ (prop, Qcharset_g0))
unsigned char counter;
#endif
#ifdef UTF2000
+ unsigned char er_counter;
+ unsigned char er_buf[16];
+
unsigned combined_char_count;
Emchar combined_chars[16];
Lisp_Object combining_table;
extern Lisp_Object Vcharacter_composition_table;
INLINE_HEADER void
+decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
+INLINE_HEADER void
+decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
+{
+ if ( str->er_counter > 0)
+ {
+ Dynarr_add_many (dst, str->er_buf, str->er_counter);
+ str->er_counter = 0;
+ }
+}
+
+void decode_add_er_char (struct decoding_stream *str, Emchar character,
+ unsigned_char_dynarr* dst);
+void
+decode_add_er_char (struct decoding_stream *str, Emchar c,
+ unsigned_char_dynarr* dst)
+{
+ if (str->er_counter == 0)
+ {
+ if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
+ && (c == '&') )
+ {
+ str->er_buf[0] = '&';
+ str->er_counter++;
+ }
+ else
+ DECODE_ADD_UCS_CHAR (c, dst);
+ }
+ else if (c == ';')
+ {
+ Lisp_Object string = make_string (str->er_buf,
+ str->er_counter);
+ Lisp_Object rest = Vcoded_charset_entity_reference_alist;
+ Lisp_Object cell;
+ Lisp_Object ret;
+ Lisp_Object pat;
+ Lisp_Object ccs;
+ int base;
+
+ while (!NILP (rest))
+ {
+ cell = Fcar (rest);
+ ccs = Fcar (cell);
+ if (NILP (ccs = Ffind_charset (ccs)))
+ continue;
+
+ cell = Fcdr (cell);
+ ret = Fcar (cell);
+ if (STRINGP (ret))
+ pat = ret;
+ else
+ continue;
+
+ cell = Fcdr (cell);
+ cell = Fcdr (cell);
+ ret = Fcar (cell);
+ if (EQ (ret, Qd))
+ {
+ pat = concat3 (build_string ("^&"),
+ pat, build_string ("\\([0-9]+\\)$"));
+ base = 10;
+ }
+ else if (EQ (ret, Qx))
+ {
+ pat = concat3 (build_string ("^&"),
+ pat, build_string ("\\([0-9a-f]+\\)$"));
+ base = 16;
+ }
+ else if (EQ (ret, QX))
+ {
+ pat = concat3 (build_string ("^&"),
+ pat, build_string ("\\([0-9A-F]+\\)$"));
+ base = 16;
+ }
+ else
+ continue;
+
+ if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
+ {
+ int code
+ = XINT (Fstring_to_number
+ (Fsubstring (string,
+ Fmatch_beginning (make_int (1)),
+ Fmatch_end (make_int (1))),
+ make_int (base)));
+
+ DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst);
+ goto decoded;
+ }
+ rest = Fcdr (rest);
+ }
+ if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
+ string, Qnil, Qnil)))
+ {
+ int code
+ = XINT (Fstring_to_number
+ (Fsubstring (string,
+ Fmatch_beginning (make_int (1)),
+ Fmatch_end (make_int (1))),
+ make_int (16)));
+
+ DECODE_ADD_UCS_CHAR (code, dst);
+ }
+ else
+ {
+ Dynarr_add_many (dst, str->er_buf, str->er_counter);
+ Dynarr_add (dst, ';');
+ }
+ decoded:
+ str->er_counter = 0;
+ }
+ else if ( (str->er_counter >= 16) || (c >= 0x7F) )
+ {
+ Dynarr_add_many (dst, str->er_buf, str->er_counter);
+ str->er_counter = 0;
+ DECODE_ADD_UCS_CHAR (c, dst);
+ }
+ else
+ str->er_buf[str->er_counter++] = c;
+}
+
+INLINE_HEADER void
COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
INLINE_HEADER void
COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
unsigned i;
for (i = 0; i < str->combined_char_count; i++)
- DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
+ decode_add_er_char (str, str->combined_chars[i], dst);
str->combined_char_count = 0;
str->combining_table = Qnil;
}
-void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
- unsigned_char_dynarr* dst);
+void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
+ unsigned_char_dynarr* dst);
void
-COMPOSE_ADD_CHAR(struct decoding_stream *str,
- Emchar character, unsigned_char_dynarr* dst)
+COMPOSE_ADD_CHAR (struct decoding_stream *str,
+ Emchar character, unsigned_char_dynarr* dst)
{
if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
- DECODE_ADD_UCS_CHAR (character, dst);
+ decode_add_er_char (str, character, dst);
else if (!CHAR_TABLEP (str->combining_table))
{
Lisp_Object ret
character);
if (NILP (ret))
- DECODE_ADD_UCS_CHAR (character, dst);
+ decode_add_er_char (str, character, dst);
else
{
str->combined_chars[0] = character;
char2);
if (NILP (ret))
{
- DECODE_ADD_UCS_CHAR (char2, dst);
+ decode_add_er_char (str, character, dst);
str->combined_char_count = 0;
str->combining_table = Qnil;
}
else
{
COMPOSE_FLUSH_CHARS (str, dst);
- DECODE_ADD_UCS_CHAR (character, dst);
+ decode_add_er_char (str, character, dst);
}
}
}
str->counter = 0;
#endif /* MULE */
#ifdef UTF2000
+ str->er_counter = 0;
str->combined_char_count = 0;
str->combining_table = Qnil;
#endif
{
#ifdef UTF2000
int code_point = (cpos << 8) | c;
- Emchar char_id = DECODE_DEFINED_CHAR (ccs, code_point);
+ Emchar char_id = decode_defined_char (ccs, code_point);
if (char_id < 0)
char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
unsigned char c = *(unsigned char *)src++;
if (counter == 0)
{
- if ( c < 0xC0 )
+ if ( c < ' ' )
{
+ decode_flush_er_chars (str, dst);
DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
DECODE_ADD_UCS_CHAR (c, dst);
}
- else if ( c < 0xE0 )
- {
- cpos = c & 0x1f;
- counter = 1;
- }
- else if ( c < 0xF0 )
- {
- cpos = c & 0x0f;
- counter = 2;
- }
- else if ( c < 0xF8 )
- {
- cpos = c & 0x07;
- counter = 3;
- }
- else if ( c < 0xFC )
- {
- cpos = c & 0x03;
- counter = 4;
- }
+ else if ( c < 0xC0 )
+ decode_add_er_char (str, c, dst);
else
{
- cpos = c & 0x01;
- counter = 5;
+ decode_flush_er_chars (str, dst);
+ if ( c < 0xE0 )
+ {
+ cpos = c & 0x1f;
+ counter = 1;
+ }
+ else if ( c < 0xF0 )
+ {
+ cpos = c & 0x0f;
+ counter = 2;
+ }
+ else if ( c < 0xF8 )
+ {
+ cpos = c & 0x07;
+ counter = 3;
+ }
+ else if ( c < 0xFC )
+ {
+ cpos = c & 0x03;
+ counter = 4;
+ }
+ else
+ {
+ cpos = c & 0x01;
+ counter = 5;
+ }
}
}
else if ( (c & 0xC0) == 0x80 )
}
if (flags & CODING_STATE_END)
- if (counter > 0)
- {
- decode_output_utf8_partial_char (counter, cpos, dst);
- cpos = 0;
- counter = 0;
- }
+ {
+ decode_flush_er_chars (str, dst);
+ if (counter > 0)
+ {
+ decode_output_utf8_partial_char (counter, cpos, dst);
+ cpos = 0;
+ counter = 0;
+ }
+ }
str->flags = flags;
str->cpos = cpos;
str->counter = counter;
{
Dynarr_add (dst, ch);
}
- else if (ch <= 0x7ff)
- {
- Dynarr_add (dst, (ch >> 6) | 0xc0);
- Dynarr_add (dst, (ch & 0x3f) | 0x80);
- }
- else if (ch <= 0xffff)
- {
- Dynarr_add (dst, (ch >> 12) | 0xe0);
- Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
- Dynarr_add (dst, (ch & 0x3f) | 0x80);
- }
- else if (ch <= 0x1fffff)
- {
- Dynarr_add (dst, (ch >> 18) | 0xf0);
- Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
- Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
- Dynarr_add (dst, (ch & 0x3f) | 0x80);
- }
- else if (ch <= 0x3ffffff)
- {
- Dynarr_add (dst, (ch >> 24) | 0xf8);
- Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
- Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
- Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
- Dynarr_add (dst, (ch & 0x3f) | 0x80);
- }
else
{
- Dynarr_add (dst, (ch >> 30) | 0xfc);
- Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
- Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
- Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
- Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
- Dynarr_add (dst, (ch & 0x3f) | 0x80);
+ int code_point = charset_code_point (Vcharset_ucs, ch);
+
+ if ( (code_point < 0) || (code_point > 0x10FFFF) )
+ {
+ if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
+ {
+ Lisp_Object rest = Vcoded_charset_entity_reference_alist;
+ Lisp_Object cell;
+ Lisp_Object ret;
+ Lisp_Object ccs;
+ int format_columns, idx;
+ char buf[16], format[16];
+
+ while (!NILP (rest))
+ {
+ cell = Fcar (rest);
+ ccs = Fcar (cell);
+ if (!NILP (ccs = Ffind_charset (ccs)))
+ {
+ if ( (code_point
+ = charset_code_point (ccs, ch)) >= 0 )
+ {
+ cell = Fcdr (cell);
+ ret = Fcar (cell);
+ if (STRINGP (ret)
+ && ((idx =XSTRING_LENGTH (ret)) <= 6))
+ {
+ strncpy (format, XSTRING_DATA (ret), idx);
+ }
+ else
+ continue;
+
+ cell = Fcdr (cell);
+ ret = Fcar (cell);
+ if (INTP (ret))
+ {
+ format [idx++] = '%';
+ format_columns = XINT (ret);
+ if ( (2 <= format_columns)
+ && (format_columns <= 8) )
+ {
+ format [idx++] = '0';
+ format [idx++] = '0' + format_columns;
+ }
+ }
+
+ cell = Fcdr (cell);
+ ret = Fcar (cell);
+ if (EQ (ret, Qd))
+ format [idx++] = 'd';
+ else if (EQ (ret, Qx))
+ format [idx++] = 'x';
+ else if (EQ (ret, QX))
+ format [idx++] = 'X';
+ else
+ continue;
+ format [idx++] = 0;
+
+ sprintf (buf, format, code_point);
+ Dynarr_add (dst, '&');
+ Dynarr_add_many (dst, buf, strlen (buf));
+ Dynarr_add (dst, ';');
+ return;
+ }
+ }
+ rest = Fcdr (rest);
+ }
+ sprintf (buf, "&MCS-%08X;", ch);
+ Dynarr_add_many (dst, buf, strlen (buf));
+ return;
+ }
+ else
+ code_point = ch;
+ }
+ if (code_point <= 0x7ff)
+ {
+ Dynarr_add (dst, (code_point >> 6) | 0xc0);
+ Dynarr_add (dst, (code_point & 0x3f) | 0x80);
+ }
+ else if (code_point <= 0xffff)
+ {
+ Dynarr_add (dst, (code_point >> 12) | 0xe0);
+ Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
+ Dynarr_add (dst, (code_point & 0x3f) | 0x80);
+ }
+ else if (code_point <= 0x1fffff)
+ {
+ Dynarr_add (dst, (code_point >> 18) | 0xf0);
+ Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
+ Dynarr_add (dst, (code_point & 0x3f) | 0x80);
+ }
+ else if (code_point <= 0x3ffffff)
+ {
+ Dynarr_add (dst, (code_point >> 24) | 0xf8);
+ Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
+ Dynarr_add (dst, (code_point & 0x3f) | 0x80);
+ }
+ else
+ {
+ Dynarr_add (dst, (code_point >> 30) | 0xfc);
+ Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
+ Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
+ Dynarr_add (dst, (code_point & 0x3f) | 0x80);
+ }
}
}
#endif /* MULE */
#ifdef UTF2000
defsymbol (&Qdisable_composition, "disable-composition");
+ defsymbol (&Quse_entity_reference, "use-entity-reference");
+ defsymbol (&Qd, "d");
+ defsymbol (&Qx, "x");
+ defsymbol (&QX, "X");
#endif
defsymbol (&Qencode, "encode");
defsymbol (&Qdecode, "decode");
*/ );
Vfile_name_coding_system = Qnil;
+ DEFVAR_LISP ("coded-charset-entity-reference-alist",
+ &Vcoded_charset_entity_reference_alist /*
+Alist of coded-charset vs corresponding entity-reference.
+Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
+CCS is coded-charset.
+CODE-COLUMNS is columns of code-point of entity-reference.
+CODE-TYPE is format type of code-point of entity-reference.
+`d' means decimal value and `x' means hexadecimal value.
+*/ );
+ Vcoded_charset_entity_reference_alist = Qnil;
+
DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
Non-nil means the buffer contents are regarded as multi-byte form
of characters, not a binary code. This affects the display, file I/O,
DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
+#ifdef UTF2000
+ DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
+ DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
+#endif
#endif /* MULE */
/* Need to create this here or we're really screwed. */
Fmake_coding_system