#include <config.h>
#include "lisp.h"
+
#include "buffer.h"
#include "elhash.h"
#include "insdel.h"
#include "lstream.h"
#ifdef MULE
#include "mule-ccl.h"
+#include "chartab.h"
#endif
#include "file-coding.h"
Lisp_Object Qcoding_system_p;
-Lisp_Object Qno_conversion, Qccl, Qiso2022;
+Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
/* Qinternal in general.c */
Lisp_Object Qmnemonic, Qeol_type;
Lisp_Object Qpre_write_conversion;
#ifdef MULE
+Lisp_Object Qucs4, Qutf8;
Lisp_Object Qbig5, Qshift_jis;
Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
#endif
Lisp_Object Qencode, Qdecode;
-Lisp_Object Vcoding_system_hashtable;
+Lisp_Object Vcoding_system_hash_table;
int enable_multibyte_characters;
/* Index for next byte to store in ISO escape sequence. */
int esc_bytes_index;
+#ifdef ENABLE_COMPOSITE_CHARS
/* Stuff seen so far when composing a string. */
unsigned_char_dynarr *composite_chars;
+#endif
/* If we saw an invalid designation sequence for a particular
register, we flag it here and switch to ASCII. The next time we
static void encode_coding_big5 (Lstream *encoding,
CONST unsigned char *src,
unsigned_char_dynarr *dst, unsigned int n);
+static int detect_coding_ucs4 (struct detection_state *st,
+ CONST unsigned char *src,
+ unsigned int n);
+static void decode_coding_ucs4 (Lstream *decoding,
+ CONST unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n);
+static void encode_coding_ucs4 (Lstream *encoding,
+ CONST unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n);
+static int detect_coding_utf8 (struct detection_state *st,
+ CONST unsigned char *src,
+ unsigned int n);
+static void decode_coding_utf8 (Lstream *decoding,
+ CONST unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n);
+static void encode_coding_utf8 (Lstream *encoding,
+ CONST unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n);
static int postprocess_iso2022_mask (int mask);
static void reset_iso2022 (Lisp_Object coding_system,
struct iso2022_decoder *iso);
static void print_coding_system (Lisp_Object, Lisp_Object, int);
static void finalize_coding_system (void *header, int for_disksave);
+#ifdef MULE
+static const struct lrecord_description ccs_description_1[] = { /* Lisp_Object
+   fields of one charset_conversion_spec, starting at from_charset. */
+ { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 }, /* presumably 2 = from_charset + to_charset -- TODO confirm */
+ { XD_END }
+};
+/* Size of one charset_conversion_spec paired with its field list above. */
+static const struct struct_description ccs_description = {
+ sizeof(charset_conversion_spec),
+ ccs_description_1
+};
+/* Field list for a charset_conversion_spec_dynarr; XD_DYNARR_DESC (defined
+   elsewhere) is given ccs_description as the element description. */
+static const struct lrecord_description ccsd_description_1[] = {
+ XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
+ { XD_END }
+};
+/* Size of the dynarr header paired with its field list above; referenced by
+   coding_system_description for iso2022.input_conv and iso2022.output_conv. */
+static const struct struct_description ccsd_description = {
+ sizeof(charset_conversion_spec_dynarr),
+ ccsd_description_1
+};
+#endif
+
+static const struct lrecord_description coding_system_description[] = { /*
+   Field description of struct Lisp_Coding_System, handed to
+   DEFINE_LRECORD_IMPLEMENTATION for the "coding-system" lrecord.
+   NOTE(review): the last number in each XD_LISP_OBJECT entry looks like a
+   count of consecutive Lisp_Object slots starting at the named field --
+   confirm against the lrecord_description definition. */
+ { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
+ { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
+ { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
+#ifdef MULE
+ { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
+ { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description }, /* pointer to a charset conversion dynarr */
+ { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description }, /* likewise for output */
+ { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
+#endif
+ { XD_END }
+};
+
DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
mark_coding_system, print_coding_system,
finalize_coding_system,
- 0, 0, struct Lisp_Coding_System);
+ 0, 0, coding_system_description,
+ struct Lisp_Coding_System);
static Lisp_Object
mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
{
- struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
+ Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
- (markobj) (CODING_SYSTEM_NAME (codesys));
- (markobj) (CODING_SYSTEM_DOC_STRING (codesys));
- (markobj) (CODING_SYSTEM_MNEMONIC (codesys));
- (markobj) (CODING_SYSTEM_EOL_LF (codesys));
- (markobj) (CODING_SYSTEM_EOL_CRLF (codesys));
- (markobj) (CODING_SYSTEM_EOL_CR (codesys));
+ markobj (CODING_SYSTEM_NAME (codesys));
+ markobj (CODING_SYSTEM_DOC_STRING (codesys));
+ markobj (CODING_SYSTEM_MNEMONIC (codesys));
+ markobj (CODING_SYSTEM_EOL_LF (codesys));
+ markobj (CODING_SYSTEM_EOL_CRLF (codesys));
+ markobj (CODING_SYSTEM_EOL_CR (codesys));
switch (CODING_SYSTEM_TYPE (codesys))
{
int i;
case CODESYS_ISO2022:
for (i = 0; i < 4; i++)
- (markobj) (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
+ markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
if (codesys->iso2022.input_conv)
{
for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
{
struct charset_conversion_spec *ccs =
Dynarr_atp (codesys->iso2022.input_conv, i);
- (markobj) (ccs->from_charset);
- (markobj) (ccs->to_charset);
+ markobj (ccs->from_charset);
+ markobj (ccs->to_charset);
}
}
if (codesys->iso2022.output_conv)
{
struct charset_conversion_spec *ccs =
Dynarr_atp (codesys->iso2022.output_conv, i);
- (markobj) (ccs->from_charset);
- (markobj) (ccs->to_charset);
+ markobj (ccs->from_charset);
+ markobj (ccs->to_charset);
}
}
break;
case CODESYS_CCL:
- (markobj) (CODING_SYSTEM_CCL_DECODE (codesys));
- (markobj) (CODING_SYSTEM_CCL_ENCODE (codesys));
+ markobj (CODING_SYSTEM_CCL_DECODE (codesys));
+ markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
break;
#endif /* MULE */
default:
break;
}
- (markobj) (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
+ markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
}
print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
int escapeflag)
{
- struct Lisp_Coding_System *c = XCODING_SYSTEM (obj);
+ Lisp_Coding_System *c = XCODING_SYSTEM (obj);
if (print_readably)
error ("printing unreadable object #<coding_system 0x%x>",
c->header.uid);
static void
finalize_coding_system (void *header, int for_disksave)
{
- struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header;
+ Lisp_Coding_System *c = (Lisp_Coding_System *) header;
/* Since coding systems never go away, this function is not
necessary. But it would be necessary if we changed things
so that coding systems could go away. */
{
switch (type)
{
+ default: abort ();
case EOL_LF: return Qlf;
case EOL_CRLF: return Qcrlf;
case EOL_CR: return Qcr;
case EOL_AUTODETECT: return Qnil;
- default: abort (); return Qnil; /* not reached */
}
}
static void
-setup_eol_coding_systems (struct Lisp_Coding_System *codesys)
+setup_eol_coding_systems (Lisp_Coding_System *codesys)
{
Lisp_Object codesys_obj;
int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
else
CHECK_SYMBOL (coding_system_or_name);
- return Fgethash (coding_system_or_name, Vcoding_system_hashtable, Qnil);
+ return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
}
DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
};
/* Mapper handed to elisp_maphash over the coding-system hash table.
   Conses the name of the coding system stored as the entry's value onto
   the front of the list that the closure points to.  The entry's key is
   not used.  Always returns 0. */
static int
-add_coding_system_to_list_mapper (CONST void *hash_key, void *hash_contents,
+add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
void *coding_system_list_closure)
{
/* This function can GC */
- Lisp_Object key, contents;
- Lisp_Object *coding_system_list;
struct coding_system_list_closure *cscl =
(struct coding_system_list_closure *) coding_system_list_closure;
- CVOID_TO_LISP (key, hash_key);
- VOID_TO_LISP (contents, hash_contents);
- coding_system_list = cscl->coding_system_list;
+ Lisp_Object *coding_system_list = cscl->coding_system_list;
- *coding_system_list = Fcons (XCODING_SYSTEM (contents)->name,
+ *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
*coding_system_list);
return 0;
}
GCPRO1 (coding_system_list);
coding_system_list_closure.coding_system_list = &coding_system_list;
- elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hashtable,
+ elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
&coding_system_list_closure);
UNGCPRO;
return XCODING_SYSTEM_NAME (coding_system);
}
-static struct Lisp_Coding_System *
+static Lisp_Coding_System *
allocate_coding_system (enum coding_system_type type, Lisp_Object name)
{
- struct Lisp_Coding_System *codesys =
- alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system);
+ Lisp_Coding_System *codesys =
+ alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
zero_lcrecord (codesys);
CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
characters will only be present if you explicitly insert them.)
'shift-jis
Shift-JIS (a Japanese encoding commonly used in PC operating systems).
+'ucs-4
+ ISO 10646 UCS-4 encoding.
+'utf-8
+ ISO 10646 UTF-8 encoding.
'iso2022
Any ISO2022-compliant encoding. Among other things, this includes
JIS (the Japanese encoding commonly used for e-mail), EUC (the
*/
(name, type, doc_string, props))
{
- struct Lisp_Coding_System *codesys;
+ Lisp_Coding_System *codesys;
Lisp_Object rest, key, value;
enum coding_system_type ty;
int need_to_setup_eol_systems = 1;
else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
+ else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
+ else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
#endif
else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
{
Lisp_Object codesys_obj;
XSETCODING_SYSTEM (codesys_obj, codesys);
- Fputhash (name, codesys_obj, Vcoding_system_hashtable);
+ Fputhash (name, codesys_obj, Vcoding_system_hash_table);
return codesys_obj;
}
}
allocate_coding_system
(XCODING_SYSTEM_TYPE (old_coding_system),
new_name));
- Fputhash (new_name, new_coding_system, Vcoding_system_hashtable);
+ Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
}
{
- struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
- struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
+ Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
+ Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
memcpy (((char *) to ) + sizeof (to->header),
((char *) from) + sizeof (from->header),
sizeof (*from) - sizeof (from->header));
return new_coding_system;
}
+DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
+Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
+*/
+ (alias, coding_system))
+{
+ CHECK_SYMBOL (alias);
+ if (!NILP (Ffind_coding_system (alias))) /* refuse to reuse an existing name */
+ signal_simple_error ("Symbol already names a coding system", alias);
+ coding_system = Fget_coding_system (coding_system); /* from here on, the coding system object */
+ Fputhash (alias, coding_system, Vcoding_system_hash_table);
+
+ /* Set up aliases for subsidiaries.  When the coding system autodetects
+    its EOL type, each non-nil EOL subsidiary gets its own alias, named
+    ALIAS plus "-unix", "-dos" or "-mac", by calling this function again.
+    NOTE(review): ALIAS itself is already in the hash table here, so if one
+    of those calls signals (its name is already taken), ALIAS stays
+    defined. */
+ if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
+ {
+ Lisp_Object str;
+ XSETSTRING (str, symbol_name (XSYMBOL (alias)));
+#define FROB(type, name) \
+ do { \
+ Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
+ if (!NILP (subsidiary)) \
+ Fdefine_coding_system_alias \
+ (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
+ } while (0)
+ FROB (LF, "-unix");
+ FROB (CRLF, "-dos");
+ FROB (CR, "-mac");
+#undef FROB
+ }
+ /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
+ but it doesn't look intentional, so I'd rather return something
+ meaningful or nothing at all.  Hence this always returns nil. */
+ return Qnil;
+}
+
static Lisp_Object
subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
{
- struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
+ Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
Lisp_Object new_coding_system;
if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
{
switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
{
+ default: abort ();
case CODESYS_AUTODETECT: return Qundecided;
#ifdef MULE
case CODESYS_SHIFT_JIS: return Qshift_jis;
case CODESYS_ISO2022: return Qiso2022;
case CODESYS_BIG5: return Qbig5;
+ case CODESYS_UCS4: return Qucs4;
+ case CODESYS_UTF8: return Qutf8;
case CODESYS_CCL: return Qccl;
#endif
case CODESYS_NO_CONVERSION: return Qno_conversion;
#ifdef DEBUG_XEMACS
case CODESYS_INTERNAL: return Qinternal;
#endif
- default:
- abort ();
}
-
- return Qnil; /* not reached */
}
#ifdef MULE
struct
{
int mask;
+ int in_byte;
+ }
+ ucs4;
+
+ struct
+ {
+ int mask;
+ int in_byte;
+ }
+ utf8;
+
+ struct
+ {
+ int mask;
int initted;
struct iso2022_decoder iso;
unsigned int flags;
#ifdef MULE
st->shift_jis.mask = ~0;
st->big5.mask = ~0;
+ st->ucs4.mask = ~0;
+ st->utf8.mask = ~0;
st->iso2022.mask = ~0;
#endif
break;
st->shift_jis.mask = detect_coding_sjis (st, src, n);
if (!mask_has_at_most_one_bit_p (st->big5.mask))
st->big5.mask = detect_coding_big5 (st, src, n);
-
- st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask;
+ if (!mask_has_at_most_one_bit_p (st->utf8.mask))
+ st->utf8.mask = detect_coding_utf8 (st, src, n);
+ if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
+ st->ucs4.mask = detect_coding_ucs4 (st, src, n);
+
+ st->mask
+ = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
+ | st->utf8.mask | st->ucs4.mask;
#endif
{
int retval = mask_has_at_most_one_bit_p (st->mask);
}
}
if (NILP (retval))
- retval = Fget_coding_system (Qno_conversion);
+ retval = Fget_coding_system (Qraw_text);
return retval;
}
else
if (cat >= 0)
return coding_category_system[cat];
else
- return Fget_coding_system (Qno_conversion);
+ return Fget_coding_system (Qraw_text);
}
}
if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
*eol_type_in_out == EOL_AUTODETECT)
{
+ unsigned char random_buffer[4096];
+ int nread;
+ Lisp_Object coding_system = Qnil;
- while (1)
+ nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
+ if (nread)
{
- unsigned char random_buffer[4096];
- int nread;
+ unsigned char *cp = random_buffer;
- nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
- if (!nread)
- break;
- if (detect_coding_type (&decst, random_buffer, nread,
- XCODING_SYSTEM_TYPE (*codesys_in_out) !=
- CODESYS_AUTODETECT))
- break;
- }
+ while (cp < random_buffer + nread)
+ {
+ if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
+ (*cp++ == 'o') && (cp < random_buffer + nread) &&
+ (*cp++ == 'd') && (cp < random_buffer + nread) &&
+ (*cp++ == 'i') && (cp < random_buffer + nread) &&
+ (*cp++ == 'n') && (cp < random_buffer + nread) &&
+ (*cp++ == 'g') && (cp < random_buffer + nread) &&
+ (*cp++ == ':') && (cp < random_buffer + nread))
+ {
+ unsigned char coding_system_name[4096 - 6];
+ unsigned char *np = coding_system_name;
+ while ( (cp < random_buffer + nread)
+ && ((*cp == ' ') || (*cp == '\t')) )
+ {
+ cp++;
+ }
+ while ( (cp < random_buffer + nread) &&
+ (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
+ {
+ *np++ = *cp++;
+ }
+ *np = 0;
+ coding_system
+ = Ffind_coding_system (intern (coding_system_name));
+ break;
+ }
+ }
+ if (EQ(coding_system, Qnil))
+ do{
+ if (detect_coding_type (&decst, random_buffer, nread,
+ XCODING_SYSTEM_TYPE (*codesys_in_out)
+ != CODESYS_AUTODETECT))
+ break;
+ nread = Lstream_read (stream,
+ random_buffer, sizeof (random_buffer));
+ if (!nread)
+ break;
+ } while(1);
+ }
*eol_type_in_out = decst.eol_type;
if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
- *codesys_in_out = coding_system_from_mask (decst.mask);
+ {
+ if (EQ(coding_system, Qnil))
+ *codesys_in_out = coding_system_from_mask (decst.mask);
+ else
+ *codesys_in_out = coding_system;
+ }
}
-
/* If we absolutely can't determine the EOL type, just assume LF. */
if (*eol_type_in_out == EOL_AUTODETECT)
*eol_type_in_out = EOL_LF;
/* C should be a binary character in the range 0 - 255; convert
to internal format and add to Dynarr DST. C is expanded more than
once, so it must be an expression without side effects. */
+#ifdef UTF2000
+#define DECODE_ADD_BINARY_CHAR(c, dst) \
+do { /* UTF2000 variant: a non-ASCII byte becomes a two-byte sequence */ \
+ if (BYTE_ASCII_P (c)) /* ASCII byte: stored unchanged */ \
+ Dynarr_add (dst, c); \
+ else \
+ { \
+ Dynarr_add (dst, (c >> 6) | 0xc0); /* lead byte: 0xc0 plus the bits above the low six */ \
+ Dynarr_add (dst, (c & 0x3f) | 0x80); /* trail byte: 0x80 plus the low six bits */ \
+ } \
+} while (0)
+
+/* Append character code C to the Dynarr DST as a UTF-8-style byte
+   sequence: a single byte for codes up to 0x7f, otherwise a lead byte
+   followed by one to five trail bytes that carry six bits each. */
+INLINE void
+DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
+{
+  int trail_count;  /* trail bytes that follow the lead byte */
+  int lead_marker;  /* high marker bits of the lead byte */
+
+  if ( c <= 0x7f )
+    {
+      /* Single-byte case: the code is stored as is. */
+      Dynarr_add (dst, c);
+      return;
+    }
+
+  /* Pick the sequence length from the magnitude of the code. */
+  if ( c <= 0x7ff )
+    {
+      trail_count = 1;
+      lead_marker = 0xc0;
+    }
+  else if ( c <= 0xffff )
+    {
+      trail_count = 2;
+      lead_marker = 0xe0;
+    }
+  else if ( c <= 0x1fffff )
+    {
+      trail_count = 3;
+      lead_marker = 0xf0;
+    }
+  else if ( c <= 0x3ffffff )
+    {
+      trail_count = 4;
+      lead_marker = 0xf8;
+    }
+  else
+    {
+      trail_count = 5;
+      lead_marker = 0xfc;
+    }
+
+  /* Lead byte: marker bits plus whatever lies above the trail bits. */
+  Dynarr_add (dst, (c >> (6 * trail_count)) | lead_marker);
+
+  /* Trail bytes, most significant six-bit group first. */
+  while (trail_count > 0)
+    {
+      trail_count--;
+      Dynarr_add (dst, ((c >> (6 * trail_count)) & 0x3f) | 0x80);
+    }
+}
+#else
#define DECODE_ADD_BINARY_CHAR(c, dst) \
do { \
if (BYTE_ASCII_P (c)) \
Dynarr_add (dst, c); \
} \
} while (0)
+#endif
#define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
do { \
#define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
do { /* End-of-data handling.  The replacement lines act only when CODING_STATE_END is set in FLAGS: they run DECODE_OUTPUT_PARTIAL_CHAR on CH and then add '\r' to DST if CODING_STATE_CR is also set.  The removed lines ran DECODE_OUTPUT_PARTIAL_CHAR unconditionally. */ \
- DECODE_OUTPUT_PARTIAL_CHAR (ch); \
- if ((flags & CODING_STATE_END) && \
- (flags & CODING_STATE_CR)) \
- Dynarr_add (dst, '\r'); \
+ if (flags & CODING_STATE_END) \
+ { \
+ DECODE_OUTPUT_PARTIAL_CHAR (ch); \
+ if (flags & CODING_STATE_CR) \
+ Dynarr_add (dst, '\r'); \
+ } \
} while (0)
#define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
struct decoding_stream
{
/* Coding system that governs the conversion. */
- struct Lisp_Coding_System *codesys;
+ Lisp_Coding_System *codesys;
/* Stream that we read the encoded data from or
write the decoded data to. */
/* Additional information (the state of the running CCL program)
used by the CCL decoder. */
struct ccl_program ccl;
+
+ /* counter for UTF-8 or UCS-4 */
+ unsigned char counter;
#endif
struct detection_state decst;
};
and automatically marked. */
XSETLSTREAM (str_obj, str);
- (markobj) (str_obj);
+ markobj (str_obj);
if (str->imp->marker)
return (str->imp->marker) (str_obj, markobj);
else
{
setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
}
+ str->counter = 0;
#endif /* MULE */
str->flags = str->ch = 0;
}
}
Dynarr_free (str->runoff);
#ifdef MULE
+#ifdef ENABLE_COMPOSITE_CHARS
if (str->iso2022.composite_chars)
Dynarr_free (str->iso2022.composite_chars);
#endif
+#endif
return Lstream_close (str->other_end);
}
void
set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
{
- struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
+ Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
str->codesys = cs;
if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
case CODESYS_BIG5:
decode_coding_big5 (decoding, src, dst, n);
break;
+ case CODESYS_UCS4:
+ decode_coding_ucs4 (decoding, src, dst, n);
+ break;
+ case CODESYS_UTF8:
+ decode_coding_utf8 (decoding, src, dst, n);
+ break;
case CODESYS_CCL:
- ccl_driver (&str->ccl, src, dst, n, 0);
+ str->ccl.last_block = str->flags & CODING_STATE_END;
+ ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
break;
case CODESYS_ISO2022:
decode_coding_iso2022 (decoding, src, dst, n);
struct encoding_stream
{
/* Coding system that governs the conversion. */
- struct Lisp_Coding_System *codesys;
+ Lisp_Coding_System *codesys;
/* Stream that we read the encoded data from or
write the decoded data to. */
and automatically marked. */
XSETLSTREAM (str_obj, str);
- (markobj) (str_obj);
+ markobj (str_obj);
if (str->imp->marker)
return (str->imp->marker) (str_obj, markobj);
else
str->iso2022.register_right = 1;
str->iso2022.current_charset = Qnil;
str->iso2022.current_half = 0;
+#ifdef UTF2000
+ str->iso2022.current_char_boundary = 0;
+#else
str->iso2022.current_char_boundary = 1;
+#endif
break;
}
case CODESYS_CCL:
void
set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
{
- struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
+ Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
str->codesys = cs;
reset_encoding_stream (str);
case CODESYS_BIG5:
encode_coding_big5 (encoding, src, dst, n);
break;
+ case CODESYS_UCS4:
+ encode_coding_ucs4 (encoding, src, dst, n);
+ break;
+ case CODESYS_UTF8:
+ encode_coding_utf8 (encoding, src, dst, n);
+ break;
case CODESYS_CCL:
- ccl_driver (&str->ccl, src, dst, n, 0);
+ str->ccl.last_block = str->flags & CODING_STATE_END;
+ ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
break;
case CODESYS_ISO2022:
encode_coding_iso2022 (encoding, src, dst, n);
/* Shift-JIS is a coding system encoding three character sets: ASCII, right
half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
- as is. A character of JISX0201-Kana (TYPE94 character set) is
+ as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
encoded by "position-code + 0x80". A character of JISX0208
- (TYPE94x94 character set) is encoded in 2-byte but two
+ (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
position-codes are divided and shifted so that they fit in the range
below.
unsigned_char_dynarr *dst, unsigned int n)
{
unsigned char c;
- unsigned int flags, ch;
- enum eol_type eol_type;
struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
-
- CODING_STREAM_DECOMPOSE (str, flags, ch);
- eol_type = str->eol_type;
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = str->eol_type;
while (n--)
{
{
unsigned char e1, e2;
- Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
DECODE_SJIS (ch, c, e1, e2);
+#ifdef UTF2000
+ DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
+ e1 & 0x7F,
+ e2 & 0x7F), dst);
+#else
+ Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
Dynarr_add (dst, e1);
Dynarr_add (dst, e2);
+#endif
}
else
{
ch = c;
else if (BYTE_SJIS_KATAKANA_P (c))
{
+#ifdef UTF2000
+ DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
+ c & 0x7F, 0), dst);
+#else
Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
Dynarr_add (dst, c);
+#endif
}
else
DECODE_ADD_BINARY_CHAR (c, dst);
DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
- CODING_STREAM_COMPOSE (str, flags, ch);
+ str->flags = flags;
+ str->ch = ch;
}
/* Convert internally-formatted data to Shift-JIS. */
{
unsigned char c;
struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- unsigned int flags, ch;
- enum eol_type eol_type;
-
- CODING_STREAM_DECOMPOSE (str, flags, ch);
- eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
+#ifdef UTF2000
+ unsigned char char_boundary = str->iso2022.current_char_boundary;
+#endif
while (n--)
{
c = *src++;
+#ifdef UTF2000
+ switch (char_boundary)
+ {
+ case 0:
+ if ( c >= 0xfc )
+ {
+ ch = c & 0x01;
+ char_boundary = 5;
+ }
+ else if ( c >= 0xf8 )
+ {
+ ch = c & 0x03;
+ char_boundary = 4;
+ }
+ else if ( c >= 0xf0 )
+ {
+ ch = c & 0x07;
+ char_boundary = 3;
+ }
+ else if ( c >= 0xe0 )
+ {
+ ch = c & 0x0f;
+ char_boundary = 2;
+ }
+ else if ( c >= 0xc0 )
+ {
+ ch = c & 0x1f;
+ char_boundary = 1;
+ }
+ else
+ {
+ ch = 0;
+ if (c == '\n')
+ {
+ if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
+ Dynarr_add (dst, '\r');
+ if (eol_type != EOL_CR)
+ Dynarr_add (dst, c);
+ }
+ else
+ Dynarr_add (dst, c);
+ char_boundary = 0;
+ }
+ break;
+ case 1:
+ ch = ( ch << 6 ) | ( c & 0x3f );
+ {
+ Lisp_Object charset;
+ unsigned int c1, c2, s1, s2;
+
+ BREAKUP_CHAR (ch, charset, c1, c2);
+ if (EQ(charset, Vcharset_katakana_jisx0201))
+ {
+ Dynarr_add (dst, c1 | 0x80);
+ }
+ else if (EQ(charset, Vcharset_japanese_jisx0208))
+ {
+ ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
+ Dynarr_add (dst, s1);
+ Dynarr_add (dst, s2);
+ }
+ }
+ char_boundary = 0;
+ break;
+ default:
+ ch = ( ch << 6 ) | ( c & 0x3f );
+ char_boundary--;
+ }
+#else
if (c == '\n')
{
if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
ch = 0;
}
}
+#endif
}
- CODING_STREAM_COMPOSE (str, flags, ch);
+ str->flags = flags;
+ str->ch = ch;
+#ifdef UTF2000
+ str->iso2022.current_char_boundary = char_boundary;
+#endif
}
DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
Since the number of characters in Big5 is larger than maximum
characters in Emacs' charset (96x96), it can't be handled as one
- charset. So, in Emacs, Big5 is devided into two: `charset-big5-1'
- and `charset-big5-2'. Both <type>s are TYPE94x94. The former
+ charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
+ and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
contains frequently used characters and the latter contains less
frequently used characters. */
unsigned_char_dynarr *dst, unsigned int n)
{
unsigned char c;
- unsigned int flags, ch;
- enum eol_type eol_type;
struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
-
- CODING_STREAM_DECOMPOSE (str, flags, ch);
- eol_type = str->eol_type;
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = str->eol_type;
while (n--)
{
DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
- CODING_STREAM_COMPOSE (str, flags, ch);
+ str->flags = flags;
+ str->ch = ch;
}
/* Convert internally-formatted data to Big5. */
encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
unsigned_char_dynarr *dst, unsigned int n)
{
+#ifndef UTF2000
unsigned char c;
struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- unsigned int flags, ch;
- enum eol_type eol_type;
-
- CODING_STREAM_DECOMPOSE (str, flags, ch);
- eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
while (n--)
{
ch = 0;
}
- CODING_STREAM_COMPOSE (str, flags, ch);
+ str->flags = flags;
+ str->ch = ch;
+#endif
}
if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
BYTE_BIG5_TWO_BYTE_2_P (b2))
{
- int leading_byte;
+ Charset_ID leading_byte;
Lisp_Object charset;
DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
charset = CHARSET_BY_LEADING_BYTE (leading_byte);
\f
/************************************************************************/
-/* ISO2022 methods */
+/* UCS-4 methods */
+/* */
+/* UCS-4 character codes are implemented as nonnegative integers. */
+/* */
/************************************************************************/
-/* The following note describes the coding system ISO2022 briefly.
- Since the intention of this note is to help understanding of the
- programs in this file, some parts are NOT ACCURATE or OVERLY
- SIMPLIFIED. For thorough understanding, please refer to the
- original document of ISO2022.
+Lisp_Object ucs_to_mule_table[65536];
+Lisp_Object mule_to_ucs_table;
- ISO2022 provides many mechanisms to encode several character sets
- in 7-bit and 8-bit environments. If one chooses 7-bit environment,
- all text is encoded by codes of less than 128. This may make the
- encoded text a little bit longer, but the text get more stability
- to pass through several gateways (some of them strip off MSB).
+DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
+Map UCS-4 code CODE to Mule character CHARACTER.
- There are two kind of character sets: control character set and
- graphic character set. The former contains control characters such
- as `newline' and `escape' to provide control functions (control
- functions are provided also by escape sequence). The latter
- contains graphic characters such as 'A' and '-'. Emacs recognizes
- two control character sets and many graphic character sets.
+Return T on success, NIL on failure.
+*/
+       (code, character))
+{
+  /* c is unsigned, so a negative CODE wraps to a large value and fails
+     the bounds check below instead of indexing before the table.  */
+  unsigned int c;
- Graphic character sets are classified into one of four types,
- according to the dimension and number of characters in the set:
- TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each
- character set is assigned an identification byte, unique for each
- type, called "final character" (denoted as <F> hereafter). The <F>
- of each character set is decided by ECMA(*) when it is registered
- in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for
- private use only).
+  CHECK_CHAR (character);
+  CHECK_INT (code);
+  c = XINT (code);
- Note (*): ECMA = European Computer Manufacturers Association
+  /* Bound the index by the number of ELEMENTS in the table.  A bare
+     sizeof yields the size in bytes, which is larger than the element
+     count and would let out-of-range codes write past the array.  */
+  if (c < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
+    {
+      ucs_to_mule_table[c] = character;
+      return Qt;
+    }
+  else
+    return Qnil;
+}
- Here are examples of graphic character set [NAME(<F>)]:
- o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
- o TYPE96 -- right-half-of-ISO8859-1('A'), ...
- o TYPE94x94 -- GB2312('A'), JISX0208('B'), ...
- o TYPE96x96 -- none for the moment
+/* Map UCS code CODE to a Mule character object.
+
+   Codes that index into ucs_to_mule_table are answered from the table.
+   Codes starting at 0xe00000 are decoded arithmetically as characters
+   of a 94x94 charset: the charset's final byte is '@' plus the quotient
+   by 94*94, and the two position codes are derived from the remainder.
+   Any other code yields Qnil.  */
+static Lisp_Object
+ucs_to_char (unsigned long code)
+{
+  /* Compare against the element count, not the byte size, of the table;
+     a bare sizeof would allow reads past the end of the array.  */
+  if (code < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
+    {
+      return ucs_to_mule_table[code];
+    }
+  /* NOTE(review): the inclusive upper bound admits one code whose
+     charset index is 14, i.e. a 15th charset -- possibly an off-by-one;
+     confirm the intended range before changing it.  */
+  else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
+    {
+      unsigned int c;
+
+      code -= 0xe00000;
+      c = code % (94 * 94);
+      return make_char
+        (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
+                    (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
+                     CHARSET_LEFT_TO_RIGHT),
+                    c / 94 + 33, c % 94 + 33));
+    }
+  else
+    return Qnil;
+}
- A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
- C0 [0x00..0x1F] -- control character plane 0
- GL [0x20..0x7F] -- graphic character plane 0
- C1 [0x80..0x9F] -- control character plane 1
- GR [0xA0..0xFF] -- graphic character plane 1
+DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
+Return Mule character corresponding to UCS code CODE (a positive integer).
+*/
+       (code))
+{
+  unsigned long ucs;
+
+  /* Reject anything that is not a non-negative integer before
+     converting it for the lookup.  */
+  CHECK_NATNUM (code);
+  ucs = XINT (code);
+  return ucs_to_char (ucs);
+}
- A control character set is directly designated and invoked to C0 or
- C1 by an escape sequence. The most common case is that:
- - ISO646's control character set is designated/invoked to C0, and
- - ISO6429's control character set is designated/invoked to C1,
- and usually these designations/invocations are omitted in encoded
- text. In a 7-bit environment, only C0 can be used, and a control
- character for C1 is encoded by an appropriate escape sequence to
- fit into the environment. All control characters for C1 are
- defined to have corresponding escape sequences.
+DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
+Map Mule character CHARACTER to UCS code CODE (a positive integer).
+*/
+       (character, code))
+{
+  Lisp_Object result;
+
+  /* #### Possibly redundant: Fput_char_table validates its own
+     arguments.  This function is stricter about the index argument;
+     the check on CODE arguably belongs in a char_table method.  */
+  CHECK_CHAR (character);
+  CHECK_NATNUM (code);
+
+  result = Fput_char_table (character, code, mule_to_ucs_table);
+  return result;
+}
- A graphic character set is at first designated to one of four
- graphic registers (G0 through G3), then these graphic registers are
- invoked to GL or GR. These designations and invocations can be
- done independently. The most common case is that G0 is invoked to
- GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
- these invocations and designations are omitted in encoded text.
- In a 7-bit environment, only GL can be used.
+DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
+Return the UCS code (a positive integer) corresponding to CHARACTER.
+*/
+       (character))
+{
+  /* Plain lookup in the Mule-to-UCS char table; the result is whatever
+     the table holds for CHARACTER.  */
+  Lisp_Object ucs = Fget_char_table (character, mule_to_ucs_table);
+
+  return ucs;
+}
- When a graphic character set of TYPE94 or TYPE94x94 is invoked to
- GL, codes 0x20 and 0x7F of the GL area work as control characters
- SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area
- should not be used.
+#ifdef UTF2000
+#define decode_ucs4 DECODE_ADD_UCS_CHAR
+#else
+/* Append the internal representation of UCS-4 character CH to DST.
+   When CH has no Mule equivalent, the <GETA MARK> (U+3013) of
+   JIS X 0208 is emitted in its place to signal "character not found".
+   #### do something more appropriate (use blob?)
+   Danger, Will Robinson!  Data loss.  Should we signal user?  */
+static void
+decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
+{
+  Lisp_Object chr = ucs_to_char (ch);
- There are two ways of invocation: locking-shift and single-shift.
- With locking-shift, the invocation lasts until the next different
- invocation, whereas with single-shift, the invocation works only
- for the following character and doesn't affect locking-shift.
- Invocations are done by the following control characters or escape
- sequences.
+  if (NILP (chr))
+    {
+      /* Lookup failed: substitute the geta mark.  */
+      Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
+      Dynarr_add (dst, 34 + 128);
+      Dynarr_add (dst, 46 + 128);
+      return;
+    }
+
+  {
+    Bufbyte work[MAX_EMCHAR_LEN];
+    int len;
+
+    ch = XCHAR (chr);
+    if (ch < 128)
+      len = simple_set_charptr_emchar (work, ch);
+    else
+      len = non_ascii_set_charptr_emchar (work, ch);
+    Dynarr_add_many (dst, work, len);
+  }
+}
+#endif
- ----------------------------------------------------------------------
- abbrev function cntrl escape seq description
- ----------------------------------------------------------------------
- SI/LS0 (shift-in) 0x0F none invoke G0 into GL
- SO/LS1 (shift-out) 0x0E none invoke G1 into GL
- LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR
- LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
- LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR
- LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
- LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR
- SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
- SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
- ----------------------------------------------------------------------
- The first four are for locking-shift. Control characters for these
- functions are defined by macros ISO_CODE_XXX in `coding.h'.
+/* Return the UCS-4 code for the Mule character made of CHARSET and the
+   position bytes H and L (their high bit is masked off).
+
+   An integer entry in mule_to_ucs_table wins.  Otherwise a character
+   of a two-dimensional 94-char charset whose final byte lies in
+   ['@', 0x7f) is mapped arithmetically into the range starting at
+   0xe00000.  Everything else becomes '?'.  */
+static unsigned long
+mule_char_to_ucs4 (Lisp_Object charset,
+                   unsigned char h, unsigned char l)
+{
+  Lisp_Object mapped
+    = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
+                       mule_to_ucs_table);
- Designations are done by the following escape sequences.
- ----------------------------------------------------------------------
- escape sequence description
- ----------------------------------------------------------------------
- ESC '(' <F> designate TYPE94<F> to G0
- ESC ')' <F> designate TYPE94<F> to G1
- ESC '*' <F> designate TYPE94<F> to G2
- ESC '+' <F> designate TYPE94<F> to G3
- ESC ',' <F> designate TYPE96<F> to G0 (*)
- ESC '-' <F> designate TYPE96<F> to G1
- ESC '.' <F> designate TYPE96<F> to G2
- ESC '/' <F> designate TYPE96<F> to G3
- ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**)
- ESC '$' ')' <F> designate TYPE94x94<F> to G1
- ESC '$' '*' <F> designate TYPE94x94<F> to G2
- ESC '$' '+' <F> designate TYPE94x94<F> to G3
- ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*)
- ESC '$' '-' <F> designate TYPE96x96<F> to G1
- ESC '$' '.' <F> designate TYPE96x96<F> to G2
- ESC '$' '/' <F> designate TYPE96x96<F> to G3
- ----------------------------------------------------------------------
- In this list, "TYPE94<F>" means a graphic character set of type TYPE94
- and final character <F>, and etc.
+  /* Explicit table entry takes precedence.  */
+  if (INTP (mapped))
+    return XINT (mapped);
+
+  if (XCHARSET_DIMENSION (charset) == 2 && XCHARSET_CHARS (charset) == 94)
+    {
+      unsigned char final = XCHARSET_FINAL (charset);
- Note (*): Although these designations are not allowed in ISO2022,
- Emacs accepts them on decoding, and produces them on encoding
- TYPE96 or TYPE96x96 character set in a coding system which is
- characterized as 7-bit environment, non-locking-shift, and
- non-single-shift.
+      if (final >= '@' && final < 0x7f)
+        return 0xe00000 + (final - '@') * 94 * 94
+          + ((h & 127) - 33) * 94 + (l & 127) - 33;
+    }
+
+  /* No mapping available.  */
+  return '?';
+}
- Note (**): If <F> is '@', 'A', or 'B', the intermediate character
- '(' can be omitted. We call this as "short-form" here after.
+/* Append to DST the four bytes, most significant first, of the UCS-4
+   code that mule_char_to_ucs4 returns for (CHARSET, H, L).  */
+static void
+encode_ucs4 (Lisp_Object charset,
+             unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
+{
+  unsigned long code = mule_char_to_ucs4 (charset, h, l);
+  int shift;
+
+  for (shift = 24; shift >= 0; shift -= 8)
+    Dynarr_add (dst, (code >> shift) & 255);
+}
- Now you may notice that there are a lot of ways for encoding the
+/* Detection helper for UCS-4.  st->ucs4.in_byte tracks the position
+   (0..3) inside the current four-byte unit.  A byte >= 128 in position
+   0 rules UCS-4 out (returns 0); otherwise the UCS-4 category mask is
+   returned once all N bytes of SRC have been scanned.  */
+static int
+detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
+                    unsigned int n)
+{
+  for (; n > 0; n--)
+    {
+      int byte = *src++;
+
+      if (st->ucs4.in_byte == 0)
+        {
+          if (byte >= 128)
+            return 0;
+          st->ucs4.in_byte = 1;
+        }
+      else if (st->ucs4.in_byte == 3)
+        st->ucs4.in_byte = 0;
+      else
+        st->ucs4.in_byte++;
+    }
+  return CODING_CATEGORY_UCS4_MASK;
+}
+
+/* Decode N bytes of UCS-4 from SRC, appending the result to DST.
+
+   `counter' is the number of bytes still missing from the current
+   four-byte unit (0 means "at a unit boundary") and `ch' accumulates
+   the bytes seen so far.  Both, together with `flags', are loaded from
+   the decoding stream on entry and stored back on exit.  */
+static void
+decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
+                    unsigned_char_dynarr *dst, unsigned int n)
+{
+  struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
+  unsigned int flags = str->flags;
+  unsigned int ch = str->ch;
+  unsigned char counter = str->counter;
+
+  while (n--)
+    {
+      unsigned char c = *src++;
+      switch (counter)
+        {
+        case 0:
+          /* First byte of a unit: three more to come.  */
+          ch = c;
+          counter = 3;
+          break;
+        case 1:
+          /* Last byte of a unit: the code is complete.  */
+          decode_ucs4 ( ( ch << 8 ) | c, dst);
+          ch = 0;
+          counter = 0;
+          break;
+        default:
+          ch = ( ch << 8 ) | c;
+          counter--;
+        }
+    }
+  /* The end-of-stream bit lives in `flags'; testing it on the byte
+     counter (as was done before) mistook an odd counter value for
+     end of stream.  */
+  if (flags & CODING_STATE_END)
+    DECODE_OUTPUT_PARTIAL_CHAR (ch);
+
+  str->flags = flags;
+  str->ch = ch;
+  str->counter = counter;
+}
+
+/* Convert N bytes of internally-formatted data in SRC to UCS-4,
+ appending the result to DST. Each completed character is handed to
+ encode_ucs4. When UTF2000 is defined the whole body is compiled out.
+ The pending byte (ch), the current charset and the char-boundary flag
+ are loaded from the encoding stream on entry and stored back on exit. */
+static void
+encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n)
+{
+#ifndef UTF2000
+ struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ unsigned char char_boundary = str->iso2022.current_char_boundary;
+ Lisp_Object charset = str->iso2022.current_charset;
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ /* flags for handling composite chars. We do a little switcharoo
+ on the source while we're outputting the composite char. */
+ unsigned int saved_n = 0;
+ CONST unsigned char *saved_src = NULL;
+ int in_composite = 0;
+
+ back_to_square_n:
+#endif
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+
+ if (BYTE_ASCII_P (c))
+ { /* Processing ASCII character */
+ ch = 0;
+ encode_ucs4 (Vcharset_ascii, c, 0, dst);
+ char_boundary = 1;
+ }
+ else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
+ { /* Processing Leading Byte */
+ ch = 0;
+ charset = CHARSET_BY_LEADING_BYTE (c);
+ /* A prefix leading byte is kept in ch so that the next byte is
+ also routed through this branch. */
+ if (LEADING_BYTE_PREFIX_P(c))
+ ch = c;
+ char_boundary = 0;
+ }
+ else
+ { /* Processing Non-ASCII character */
+ char_boundary = 1;
+ if (EQ (charset, Vcharset_control_1))
+ {
+ encode_ucs4 (Vcharset_control_1, c, 0, dst);
+ }
+ else
+ {
+ switch (XCHARSET_REP_BYTES (charset))
+ {
+ case 2:
+ /* C alone identifies the character. */
+ encode_ucs4 (charset, c, 0, dst);
+ break;
+ case 3:
+ if (XCHARSET_PRIVATE_P (charset))
+ {
+ encode_ucs4 (charset, c, 0, dst);
+ ch = 0;
+ }
+ else if (ch)
+ {
+ /* ch holds the byte saved on the previous iteration;
+ together with C it completes the character. */
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (EQ (charset, Vcharset_composite))
+ {
+ if (in_composite)
+ {
+ /* #### Bother! We don't know how to
+ handle this yet. */
+ Dynarr_add (dst, 0);
+ Dynarr_add (dst, 0);
+ Dynarr_add (dst, 0);
+ Dynarr_add (dst, '~');
+ }
+ else
+ {
+ /* Redirect src/n to the composite char's string;
+ the saved position is restored after the loop. */
+ Emchar emch = MAKE_CHAR (Vcharset_composite,
+ ch & 0x7F, c & 0x7F);
+ Lisp_Object lstr = composite_char_string (emch);
+ saved_n = n;
+ saved_src = src;
+ in_composite = 1;
+ src = XSTRING_DATA (lstr);
+ n = XSTRING_LENGTH (lstr);
+ }
+ }
+ else
+#endif /* ENABLE_COMPOSITE_CHARS */
+ {
+ encode_ucs4(charset, ch, c, dst);
+ }
+ ch = 0;
+ }
+ else
+ {
+ /* First of two bytes: remember it and wait for the next. */
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ case 4:
+ if (ch)
+ {
+ encode_ucs4 (charset, ch, c, dst);
+ ch = 0;
+ }
+ else
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ default:
+ abort ();
+ }
+ }
+ }
+ }
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ /* The loop above ran over a composite char's string: switch back to
+ the saved source position and resume. */
+ if (in_composite)
+ {
+ n = saved_n;
+ src = saved_src;
+ in_composite = 0;
+ goto back_to_square_n; /* Wheeeeeeeee ..... */
+ }
+#endif /* ENABLE_COMPOSITE_CHARS */
+
+ str->flags = flags;
+ str->ch = ch;
+ str->iso2022.current_char_boundary = char_boundary;
+ str->iso2022.current_charset = charset;
+
+ /* Verbum caro factum est! */
+#endif
+}
+
+\f
+/************************************************************************/
+/* UTF-8 methods */
+/************************************************************************/
+
+/* Detection helper for UTF-8.  st->utf8.in_byte is the number of
+   continuation bytes still expected.  Returns 0 as soon as SRC cannot
+   be UTF-8 (ESC/SI/SO at a character start, a stray byte in
+   0x80..0xbf at a character start, or a non-continuation byte inside a
+   sequence); otherwise returns the UTF-8 category mask.  */
+static int
+detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
+                    unsigned int n)
+{
+  for (; n > 0; n--)
+    {
+      unsigned char byte = *src++;
+
+      if (st->utf8.in_byte != 0)
+        {
+          /* Inside a multibyte sequence: only 10xxxxxx is acceptable.  */
+          if ((byte & 0xc0) != 0x80)
+            return 0;
+          st->utf8.in_byte--;
+          continue;
+        }
+
+      /* At a character boundary.  */
+      if (byte == ISO_CODE_ESC || byte == ISO_CODE_SI || byte == ISO_CODE_SO)
+        return 0;
+
+      if (byte >= 0xfc)
+        st->utf8.in_byte = 5;
+      else if (byte >= 0xf8)
+        st->utf8.in_byte = 4;
+      else if (byte >= 0xf0)
+        st->utf8.in_byte = 3;
+      else if (byte >= 0xe0)
+        st->utf8.in_byte = 2;
+      else if (byte >= 0xc0)
+        st->utf8.in_byte = 1;
+      else if (byte >= 0x80)
+        return 0;
+    }
+  return CODING_CATEGORY_UTF8_MASK;
+}
+
+/* Decode N bytes of UTF-8 from SRC, appending the result to DST.
+ `counter' is the number of continuation bytes still expected and `ch'
+ accumulates the payload bits of the character being assembled. Both,
+ together with `flags', are loaded from the decoding stream on entry
+ and stored back on exit. */
+static void
+decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n)
+{
+ struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = str->eol_type;
+ unsigned char counter = str->counter;
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+ switch (counter)
+ {
+ case 0:
+ /* Start of a character: the lead byte fixes how many continuation
+ bytes follow and contributes the top payload bits. */
+ if ( c >= 0xfc )
+ {
+ ch = c & 0x01;
+ counter = 5;
+ }
+ else if ( c >= 0xf8 )
+ {
+ ch = c & 0x03;
+ counter = 4;
+ }
+ else if ( c >= 0xf0 )
+ {
+ ch = c & 0x07;
+ counter = 3;
+ }
+ else if ( c >= 0xe0 )
+ {
+ ch = c & 0x0f;
+ counter = 2;
+ }
+ else if ( c >= 0xc0 )
+ {
+ ch = c & 0x1f;
+ counter = 1;
+ }
+ else
+ {
+ /* Any byte below 0xc0 is decoded as a one-byte character. */
+ DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
+ decode_ucs4 (c, dst);
+ }
+ break;
+ case 1:
+ /* Final continuation byte: the character is complete. */
+ ch = ( ch << 6 ) | ( c & 0x3f );
+ decode_ucs4 (ch, dst);
+ ch = 0;
+ counter = 0;
+ break;
+ default:
+ ch = ( ch << 6 ) | ( c & 0x3f );
+ counter--;
+ }
+ /* NOTE(review): presumably the jump target used by
+ DECODE_HANDLE_EOL_TYPE -- confirm against the macro definition. */
+ label_continue_loop:;
+ }
+
+ if (flags & CODING_STATE_END)
+ DECODE_OUTPUT_PARTIAL_CHAR (ch);
+
+ str->flags = flags;
+ str->ch = ch;
+ str->counter = counter;
+}
+
+#ifndef UTF2000
+/* Append to DST the UTF-8 encoding (one to six bytes) of the UCS-4
+   code that mule_char_to_ucs4 returns for (CHARSET, H, L).  */
+static void
+encode_utf8 (Lisp_Object charset,
+             unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
+{
+  unsigned long code = mule_char_to_ucs4 (charset, h, l);
+  int trail;            /* number of continuation bytes */
+  int lead;             /* marker bits of the first byte */
+
+  if (code <= 0x7f)
+    {
+      Dynarr_add (dst, code);
+      return;
+    }
+
+  if (code <= 0x7ff)
+    {
+      trail = 1;
+      lead = 0xc0;
+    }
+  else if (code <= 0xffff)
+    {
+      trail = 2;
+      lead = 0xe0;
+    }
+  else if (code <= 0x1fffff)
+    {
+      trail = 3;
+      lead = 0xf0;
+    }
+  else if (code <= 0x3ffffff)
+    {
+      trail = 4;
+      lead = 0xf8;
+    }
+  else
+    {
+      trail = 5;
+      lead = 0xfc;
+    }
+
+  /* The first byte carries the marker plus the topmost payload bits;
+     each following byte carries six payload bits under 10xxxxxx.  */
+  Dynarr_add (dst, (code >> (6 * trail)) | lead);
+  while (trail-- > 0)
+    Dynarr_add (dst, ((code >> (6 * trail)) & 0x3f) | 0x80);
+}
+#endif
+
+/* Convert N bytes of internally-formatted data in SRC to UTF-8,
+ appending the result to DST.
+ With UTF2000 defined, bytes are copied through unchanged except that
+ '\n' is rewritten according to the coding system's eol type;
+ char_boundary then counts the bytes still missing from the current
+ multibyte character.
+ Without UTF2000, characters are reassembled from leading and position
+ bytes and each completed one is handed to encode_utf8.
+ Stream state is loaded on entry and stored back on exit. */
+static void
+encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
+ unsigned_char_dynarr *dst, unsigned int n)
+{
+ struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
+ unsigned char char_boundary = str->iso2022.current_char_boundary;
+#ifdef UTF2000
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+ switch (char_boundary)
+ {
+ case 0:
+ /* At a character start: the lead byte determines how many more
+ bytes belong to this character. */
+ if ( c >= 0xfc )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 5;
+ }
+ else if ( c >= 0xf8 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 4;
+ }
+ else if ( c >= 0xf0 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 3;
+ }
+ else if ( c >= 0xe0 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 2;
+ }
+ else if ( c >= 0xc0 )
+ {
+ Dynarr_add (dst, c);
+ char_boundary = 1;
+ }
+ else
+ {
+ if (c == '\n')
+ {
+ /* Emit '\r' unless the eol type is LF or autodetect, and
+ the '\n' itself unless the eol type is CR. */
+ if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
+ Dynarr_add (dst, '\r');
+ if (eol_type != EOL_CR)
+ Dynarr_add (dst, c);
+ }
+ else
+ Dynarr_add (dst, c);
+ char_boundary = 0;
+ }
+ break;
+ case 1:
+ Dynarr_add (dst, c);
+ char_boundary = 0;
+ break;
+ default:
+ Dynarr_add (dst, c);
+ char_boundary--;
+ }
+ }
+#else /* not UTF2000 */
+ Lisp_Object charset = str->iso2022.current_charset;
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ /* flags for handling composite chars. We do a little switcharoo
+ on the source while we're outputting the composite char. */
+ unsigned int saved_n = 0;
+ CONST unsigned char *saved_src = NULL;
+ int in_composite = 0;
+
+ back_to_square_n:
+#endif /* ENABLE_COMPOSITE_CHARS */
+
+ while (n--)
+ {
+ unsigned char c = *src++;
+
+ if (BYTE_ASCII_P (c))
+ { /* Processing ASCII character */
+ ch = 0;
+ if (c == '\n')
+ {
+ if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
+ Dynarr_add (dst, '\r');
+ if (eol_type != EOL_CR)
+ Dynarr_add (dst, c);
+ }
+ else
+ encode_utf8 (Vcharset_ascii, c, 0, dst);
+ char_boundary = 1;
+ }
+ else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
+ { /* Processing Leading Byte */
+ ch = 0;
+ charset = CHARSET_BY_LEADING_BYTE (c);
+ /* A prefix leading byte is kept in ch so that the next byte is
+ also routed through this branch. */
+ if (LEADING_BYTE_PREFIX_P(c))
+ ch = c;
+ char_boundary = 0;
+ }
+ else
+ { /* Processing Non-ASCII character */
+ char_boundary = 1;
+ if (EQ (charset, Vcharset_control_1))
+ {
+ encode_utf8 (Vcharset_control_1, c, 0, dst);
+ }
+ else
+ {
+ switch (XCHARSET_REP_BYTES (charset))
+ {
+ case 2:
+ /* C alone identifies the character. */
+ encode_utf8 (charset, c, 0, dst);
+ break;
+ case 3:
+ if (XCHARSET_PRIVATE_P (charset))
+ {
+ encode_utf8 (charset, c, 0, dst);
+ ch = 0;
+ }
+ else if (ch)
+ {
+ /* ch holds the byte saved on the previous iteration;
+ together with C it completes the character. */
+#ifdef ENABLE_COMPOSITE_CHARS
+ if (EQ (charset, Vcharset_composite))
+ {
+ if (in_composite)
+ {
+ /* #### Bother! We don't know how to
+ handle this yet. */
+ encode_utf8 (Vcharset_ascii, '~', 0, dst);
+ }
+ else
+ {
+ /* Redirect src/n to the composite char's string;
+ the saved position is restored after the loop. */
+ Emchar emch = MAKE_CHAR (Vcharset_composite,
+ ch & 0x7F, c & 0x7F);
+ Lisp_Object lstr = composite_char_string (emch);
+ saved_n = n;
+ saved_src = src;
+ in_composite = 1;
+ src = XSTRING_DATA (lstr);
+ n = XSTRING_LENGTH (lstr);
+ }
+ }
+ else
+#endif /* ENABLE_COMPOSITE_CHARS */
+ {
+ encode_utf8 (charset, ch, c, dst);
+ }
+ ch = 0;
+ }
+ else
+ {
+ /* First of two bytes: remember it and wait for the next. */
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ case 4:
+ if (ch)
+ {
+ encode_utf8 (charset, ch, c, dst);
+ ch = 0;
+ }
+ else
+ {
+ ch = c;
+ char_boundary = 0;
+ }
+ break;
+ default:
+ abort ();
+ }
+ }
+ }
+ }
+
+#ifdef ENABLE_COMPOSITE_CHARS
+ /* The loop above ran over a composite char's string: switch back to
+ the saved source position and resume. */
+ if (in_composite)
+ {
+ n = saved_n;
+ src = saved_src;
+ in_composite = 0;
+ goto back_to_square_n; /* Wheeeeeeeee ..... */
+ }
+#endif
+
+#endif /* not UTF2000 */
+ str->flags = flags;
+ str->ch = ch;
+ str->iso2022.current_char_boundary = char_boundary;
+#ifndef UTF2000
+ str->iso2022.current_charset = charset;
+#endif
+
+ /* Verbum caro factum est! */
+}
+
+\f
+/************************************************************************/
+/* ISO2022 methods */
+/************************************************************************/
+
+/* The following note describes the coding system ISO2022 briefly.
+ Since the intention of this note is to help understand the
+ functions in this file, some parts are NOT ACCURATE or OVERLY
+ SIMPLIFIED. For thorough understanding, please refer to the
+ original document of ISO2022.
+
+ ISO2022 provides many mechanisms to encode several character sets
+ in 7-bit and 8-bit environments. For 7-bit environments, all text
+ is encoded using bytes less than 128. This may make the encoded
+ text a little bit longer, but the text passes more easily through
+ several gateways, some of which strip off MSB (Most Significant Bit).
+
+ There are two kinds of character sets: control character set and
+ graphic character set. The former contains control characters such
+ as `newline' and `escape' to provide control functions (control
+ functions are also provided by escape sequences). The latter
+ contains graphic characters such as 'A' and '-'. Emacs recognizes
+ two control character sets and many graphic character sets.
+
+ Graphic character sets are classified into one of the following
+ four classes, according to the number of bytes (DIMENSION) and
+ number of characters in one dimension (CHARS) of the set:
+ - DIMENSION1_CHARS94
+ - DIMENSION1_CHARS96
+ - DIMENSION2_CHARS94
+ - DIMENSION2_CHARS96
+
+ In addition, each character set is assigned an identification tag,
+ unique for each set, called "final character" (denoted as <F>
+ hereafter). The <F> of each character set is decided by ECMA(*)
+ when it is registered in ISO. The code range of <F> is 0x30..0x7F
+ (0x30..0x3F are for private use only).
+
+ Note (*): ECMA = European Computer Manufacturers Association
+
+ Here are examples of graphic character set [NAME(<F>)]:
+ o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
+ o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
+ o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
+ o DIMENSION2_CHARS96 -- none for the moment
+
+ A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
+ C0 [0x00..0x1F] -- control character plane 0
+ GL [0x20..0x7F] -- graphic character plane 0
+ C1 [0x80..0x9F] -- control character plane 1
+ GR [0xA0..0xFF] -- graphic character plane 1
+
+ A control character set is directly designated and invoked to C0 or
+ C1 by an escape sequence. The most common case is that:
+ - ISO646's control character set is designated/invoked to C0, and
+ - ISO6429's control character set is designated/invoked to C1,
+ and usually these designations/invocations are omitted in encoded
+ text. In a 7-bit environment, only C0 can be used, and a control
+ character for C1 is encoded by an appropriate escape sequence to
+ fit into the environment. All control characters for C1 are
+ defined to have corresponding escape sequences.
+
+ A graphic character set is at first designated to one of four
+ graphic registers (G0 through G3), then these graphic registers are
+ invoked to GL or GR. These designations and invocations can be
+ done independently. The most common case is that G0 is invoked to
+ GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
+ these invocations and designations are omitted in encoded text.
+ In a 7-bit environment, only GL can be used.
+
+ When a graphic character set of CHARS94 is invoked to GL, codes
+ 0x20 and 0x7F of the GL area work as control characters SPACE and
+ DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
+ be used.
+
+ There are two ways of invocation: locking-shift and single-shift.
+ With locking-shift, the invocation lasts until the next different
+ invocation, whereas with single-shift, the invocation affects the
+ following character only and doesn't affect the locking-shift
+ state. Invocations are done by the following control characters or
+ escape sequences:
+
+ ----------------------------------------------------------------------
+ abbrev function cntrl escape seq description
+ ----------------------------------------------------------------------
+ SI/LS0 (shift-in) 0x0F none invoke G0 into GL
+ SO/LS1 (shift-out) 0x0E none invoke G1 into GL
+ LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
+ LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
+ LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
+ LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
+ LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
+ SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
+ SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
+ ----------------------------------------------------------------------
+ (*) These are not used by any known coding system.
+
+ Control characters for these functions are defined by macros
+ ISO_CODE_XXX in `coding.h'.
+
+ Designations are done by the following escape sequences:
+ ----------------------------------------------------------------------
+ escape sequence description
+ ----------------------------------------------------------------------
+ ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
+ ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
+ ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
+ ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
+ ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
+ ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
+ ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
+ ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
+ ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
+ ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
+ ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
+ ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
+ ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
+ ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
+ ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
+ ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
+ ----------------------------------------------------------------------
+
+ In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
+ of dimension 1, chars 94, and final character <F>, etc...
+
+ Note (*): Although these designations are not allowed in ISO2022,
+ Emacs accepts them on decoding, and produces them on encoding
+ CHARS96 character sets in a coding system which is characterized as
+ 7-bit environment, non-locking-shift, and non-single-shift.
+
+ Note (**): If <F> is '@', 'A', or 'B', the intermediate character
+ '(' can be omitted. We refer to this as "short-form" hereafter.
+
+ Now you may notice that there are a lot of ways for encoding the
same multilingual text in ISO2022. Actually, there exist many
- coding systems such as Compound Text (used in X's inter client
+ coding systems such as Compound Text (used in X11's inter client
communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
(used in Korean internet), EUC (Extended UNIX Code, used in Asian
localized platforms), and all of these are variants of ISO2022.
sequences: ISO6429's direction specification and Emacs' private
sequence for specifying character composition.
- ISO6429's direction specification takes the following format:
+ ISO6429's direction specification takes the following form:
o CSI ']' -- end of the current direction
o CSI '0' ']' -- end of the current direction
o CSI '1' ']' -- start of left-to-right text
o CSI '2' ']' -- start of right-to-left text
The control character CSI (0x9B: control sequence introducer) is
- abbreviated to the escape sequence ESC '[' in 7-bit environment.
+ abbreviated to the escape sequence ESC '[' in a 7-bit environment.
- Character composition specification takes the following format:
+ Character composition specification takes the following form:
o ESC '0' -- start character composition
o ESC '1' -- end character composition
- Since these are not standard escape sequences of any ISO, the use
- of them for these meanings is restricted to Emacs only. */
+ Since these are not standard escape sequences of any ISO standard,
+ their use with these meanings is restricted to Emacs only. */
static void
reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
iso->invalid_switch_dir = 0;
iso->output_direction_sequence = 0;
iso->output_literally = 0;
+#ifdef ENABLE_COMPOSITE_CHARS
if (iso->composite_chars)
Dynarr_reset (iso->composite_chars);
+#endif
}
static int
reg = 3; half = 1;
goto locking_shift;
+#ifdef ENABLE_COMPOSITE_CHARS
/**** composite ****/
case '0':
*flags = (*flags & CODING_STATE_ISO2022_LOCK) &
~CODING_STATE_COMPOSITE;
return 1;
+#endif /* ENABLE_COMPOSITE_CHARS */
/**** directionality ****/
detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
unsigned int n)
{
- int c;
int mask;
/* #### There are serious deficiencies in the recognition mechanism
- here. This needs to be much smarter if it's going to cut it. */
+ here. This needs to be much smarter if it's going to cut it.
+ The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
+ it should be detected as Latin-1.
+ All the ISO2022 stuff in this file should be synced up with the
+ code from FSF Emacs-20.4, in which Mule should be more or less stable.
+ Perhaps we should wait till R2L works in FSF Emacs? */
if (!st->iso2022.initted)
{
while (n--)
{
- c = *src++;
+ int c = *src++;
if (c >= 0xA0)
{
mask &= ~CODING_CATEGORY_ISO_7_MASK;
need to handle the CSI differently. */
static void
-restore_left_to_right_direction (struct Lisp_Coding_System *codesys,
+restore_left_to_right_direction (Lisp_Coding_System *codesys,
unsigned_char_dynarr *dst,
unsigned int *flags,
int internal_p)
need to handle the CSI differently. */
static void
-ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys,
+ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
unsigned_char_dynarr *dst, unsigned int *flags,
int internal_p)
{
decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
unsigned_char_dynarr *dst, unsigned int n)
{
- unsigned char c;
- unsigned int flags, ch;
- enum eol_type eol_type;
struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
- Lisp_Object coding_system;
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = str->eol_type;
+#ifdef ENABLE_COMPOSITE_CHARS
unsigned_char_dynarr *real_dst = dst;
+#endif
+ Lisp_Object coding_system;
- CODING_STREAM_DECOMPOSE (str, flags, ch);
- eol_type = str->eol_type;
XSETCODING_SYSTEM (coding_system, str->codesys);
+#ifdef ENABLE_COMPOSITE_CHARS
if (flags & CODING_STATE_COMPOSITE)
dst = str->iso2022.composite_chars;
+#endif /* ENABLE_COMPOSITE_CHARS */
while (n--)
{
- c = *src++;
+ unsigned char c = *src++;
if (flags & CODING_STATE_ESCAPE)
{ /* Within ESC sequence */
int retval = parse_iso2022_esc (coding_system, &str->iso2022,
{
switch (str->iso2022.esc)
{
+#ifdef ENABLE_COMPOSITE_CHARS
case ISO_ESC_START_COMPOSITE:
if (str->iso2022.composite_chars)
Dynarr_reset (str->iso2022.composite_chars);
Dynarr_add_many (dst, comstr, len);
break;
}
+#endif /* ENABLE_COMPOSITE_CHARS */
case ISO_ESC_LITERAL:
DECODE_ADD_BINARY_CHAR (c, dst);
else
{ /* Graphic characters */
Lisp_Object charset;
- int lb;
+#ifndef UTF2000
+ Charset_ID lb;
+#endif
int reg;
DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
charset = str->iso2022.charset[reg];
/* Error checking: */
- if (NILP (charset) || str->iso2022.invalid_designated[reg]
+ if (! CHARSETP (charset)
+ || str->iso2022.invalid_designated[reg]
|| (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
&& XCHARSET_CHARS (charset) == 94))
/* Mrmph. We are trying to invoke a register that has no
charset = new_charset;
}
+#ifdef UTF2000
+ if (XCHARSET_DIMENSION (charset) == 1)
+ {
+ DECODE_OUTPUT_PARTIAL_CHAR (ch);
+ DECODE_ADD_UCS_CHAR
+ (MAKE_CHAR (charset, c & 0x7F, 0), dst);
+ }
+ else if (ch)
+ {
+ DECODE_ADD_UCS_CHAR
+ (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
+ ch = 0;
+ }
+ else
+ ch = c;
+#else
lb = XCHARSET_LEADING_BYTE (charset);
switch (XCHARSET_REP_BYTES (charset))
{
else
ch = c;
}
+#endif
}
if (!ch)
if (flags & CODING_STATE_END)
DECODE_OUTPUT_PARTIAL_CHAR (ch);
- CODING_STREAM_COMPOSE (str, flags, ch);
+ str->flags = flags;
+ str->ch = ch;
}
iso2022_designate (Lisp_Object charset, unsigned char reg,
struct encoding_stream *str, unsigned_char_dynarr *dst)
{
- CONST char *inter94 = "()*+", *inter96= ",-./";
+ static CONST char inter94[] = "()*+";
+ static CONST char inter96[] = ",-./";
unsigned int type;
unsigned char final;
Lisp_Object old_charset = str->iso2022.charset[reg];
unsigned_char_dynarr *dst, unsigned int n)
{
unsigned char charmask, c;
- unsigned int flags, ch;
- enum eol_type eol_type;
unsigned char char_boundary;
struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- struct Lisp_Coding_System *codesys = str->codesys;
+ unsigned int flags = str->flags;
+ Emchar ch = str->ch;
+ Lisp_Coding_System *codesys = str->codesys;
+ eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
int i;
Lisp_Object charset;
int half;
+#ifdef UTF2000
+ unsigned int byte1, byte2;
+#endif
+#ifdef ENABLE_COMPOSITE_CHARS
/* flags for handling composite chars. We do a little switcharoo
on the source while we're outputting the composite char. */
unsigned int saved_n = 0;
CONST unsigned char *saved_src = NULL;
int in_composite = 0;
+#endif /* ENABLE_COMPOSITE_CHARS */
- CODING_STREAM_DECOMPOSE (str, flags, ch);
- eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
char_boundary = str->iso2022.current_char_boundary;
charset = str->iso2022.current_charset;
half = str->iso2022.current_half;
+#ifdef ENABLE_COMPOSITE_CHARS
back_to_square_n:
+#endif
+#ifdef UTF2000
+ while (n--)
+ {
+ c = *src++;
+
+ switch (char_boundary)
+ {
+ case 0:
+ if ( c >= 0xfc )
+ {
+ ch = c & 0x01;
+ char_boundary = 5;
+ }
+ else if ( c >= 0xf8 )
+ {
+ ch = c & 0x03;
+ char_boundary = 4;
+ }
+ else if ( c >= 0xf0 )
+ {
+ ch = c & 0x07;
+ char_boundary = 3;
+ }
+ else if ( c >= 0xe0 )
+ {
+ ch = c & 0x0f;
+ char_boundary = 2;
+ }
+ else if ( c >= 0xc0 )
+ {
+ ch = c & 0x1f;
+ char_boundary = 1;
+ }
+ else
+ {
+ ch = 0;
+
+ restore_left_to_right_direction (codesys, dst, &flags, 0);
+
+ /* Make sure G0 contains ASCII */
+ if ((c > ' ' && c < ISO_CODE_DEL) ||
+ !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
+ {
+ ensure_normal_shift (str, dst);
+ iso2022_designate (Vcharset_ascii, 0, str, dst);
+ }
+
+ /* If necessary, restore everything to the default state
+ at end-of-line */
+ if (c == '\n' &&
+ !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
+ {
+ restore_left_to_right_direction (codesys, dst, &flags, 0);
+
+ ensure_normal_shift (str, dst);
+
+ for (i = 0; i < 4; i++)
+ {
+ Lisp_Object initial_charset =
+ CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
+ iso2022_designate (initial_charset, i, str, dst);
+ }
+ }
+ if (c == '\n')
+ {
+ if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
+ Dynarr_add (dst, '\r');
+ if (eol_type != EOL_CR)
+ Dynarr_add (dst, c);
+ }
+ else
+ {
+ if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
+ && fit_to_be_escape_quoted (c))
+ Dynarr_add (dst, ISO_CODE_ESC);
+ Dynarr_add (dst, c);
+ }
+ char_boundary = 0;
+ }
+ break;
+ case 1:
+ ch = ( ch << 6 ) | ( c & 0x3f );
+
+ char_boundary = 0;
+ if ( (0x80 <= ch) && (ch <= 0x9f) )
+ {
+ charmask = (half == 0 ? 0x00 : 0x80);
+
+ if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
+ && fit_to_be_escape_quoted (ch))
+ Dynarr_add (dst, ISO_CODE_ESC);
+ /* you asked for it ... */
+ Dynarr_add (dst, ch);
+ }
+ else
+ {
+ int reg;
+
+ BREAKUP_CHAR (ch, charset, byte1, byte2);
+ ensure_correct_direction (XCHARSET_DIRECTION (charset),
+ codesys, dst, &flags, 0);
+
+ /* Now determine which register to use. */
+ reg = -1;
+ for (i = 0; i < 4; i++)
+ {
+ if (EQ (charset, str->iso2022.charset[i]) ||
+ EQ (charset,
+ CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
+ {
+ reg = i;
+ break;
+ }
+ }
+
+ if (reg == -1)
+ {
+ if (XCHARSET_GRAPHIC (charset) != 0)
+ {
+ if (!NILP (str->iso2022.charset[1]) &&
+ (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
+ CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
+ reg = 1;
+ else if (!NILP (str->iso2022.charset[2]))
+ reg = 2;
+ else if (!NILP (str->iso2022.charset[3]))
+ reg = 3;
+ else
+ reg = 0;
+ }
+ else
+ reg = 0;
+ }
+
+ iso2022_designate (charset, reg, str, dst);
+
+ /* Now invoke that register. */
+ switch (reg)
+ {
+ case 0:
+ ensure_normal_shift (str, dst);
+ half = 0;
+ break;
+
+ case 1:
+ if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
+ {
+ ensure_shift_out (str, dst);
+ half = 0;
+ }
+ else
+ half = 1;
+ break;
+
+ case 2:
+ if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
+ {
+ Dynarr_add (dst, ISO_CODE_ESC);
+ Dynarr_add (dst, 'N');
+ half = 0;
+ }
+ else
+ {
+ Dynarr_add (dst, ISO_CODE_SS2);
+ half = 1;
+ }
+ break;
+
+ case 3:
+ if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
+ {
+ Dynarr_add (dst, ISO_CODE_ESC);
+ Dynarr_add (dst, 'O');
+ half = 0;
+ }
+ else
+ {
+ Dynarr_add (dst, ISO_CODE_SS3);
+ half = 1;
+ }
+ break;
+
+ default:
+ abort ();
+ }
+
+ charmask = (half == 0 ? 0x00 : 0x80);
+
+ switch (XCHARSET_DIMENSION (charset))
+ {
+ case 1:
+ Dynarr_add (dst, byte1 | charmask);
+ break;
+ case 2:
+ Dynarr_add (dst, byte1 | charmask);
+ Dynarr_add (dst, byte2 | charmask);
+ break;
+ default:
+ abort ();
+ }
+ }
+ ch =0;
+ break;
+ default:
+ ch = ( ch << 6 ) | ( c & 0x3f );
+ char_boundary--;
+ }
+ }
+#else /* not UTF2000 */
+
while (n--)
{
c = *src++;
if (LEADING_BYTE_PREFIX_P(c))
ch = c;
else if (!EQ (charset, Vcharset_control_1)
- && !EQ (charset, Vcharset_composite))
+#ifdef ENABLE_COMPOSITE_CHARS
+ && !EQ (charset, Vcharset_composite)
+#endif
+ )
{
int reg;
}
else if (ch)
{
+#ifdef ENABLE_COMPOSITE_CHARS
if (EQ (charset, Vcharset_composite))
{
if (in_composite)
}
}
else
+#endif /* ENABLE_COMPOSITE_CHARS */
{
Dynarr_add (dst, ch & charmask);
Dynarr_add (dst, c & charmask);
}
}
}
+#endif /* not UTF2000 */
+#ifdef ENABLE_COMPOSITE_CHARS
if (in_composite)
{
n = saved_n;
Dynarr_add (dst, '1'); /* end composing */
goto back_to_square_n; /* Wheeeeeeeee ..... */
}
+#endif /* ENABLE_COMPOSITE_CHARS */
+#ifdef UTF2000
+ if ( (char_boundary == 0) && flags & CODING_STATE_END)
+#else
if (char_boundary && flags & CODING_STATE_END)
+#endif
{
restore_left_to_right_direction (codesys, dst, &flags, 0);
ensure_normal_shift (str, dst);
}
}
- CODING_STREAM_COMPOSE (str, flags, ch);
+ str->flags = flags;
+ str->ch = ch;
str->iso2022.current_char_boundary = char_boundary;
str->iso2022.current_charset = charset;
str->iso2022.current_half = half;
unsigned_char_dynarr *dst, unsigned int n)
{
unsigned char c;
- unsigned int flags, ch;
- enum eol_type eol_type;
struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
-
- CODING_STREAM_DECOMPOSE (str, flags, ch);
- eol_type = str->eol_type;
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = str->eol_type;
while (n--)
{
DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
- CODING_STREAM_COMPOSE (str, flags, ch);
+ str->flags = flags;
+ str->ch = ch;
}
static void
{
unsigned char c;
struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
- unsigned int flags, ch;
- enum eol_type eol_type;
-
- CODING_STREAM_DECOMPOSE (str, flags, ch);
- eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
+ unsigned int flags = str->flags;
+ unsigned int ch = str->ch;
+ eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
+#ifdef UTF2000
+ unsigned char char_boundary = str->iso2022.current_char_boundary;
+#endif
while (n--)
{
- c = *src++;
+ c = *src++;
+#ifdef UTF2000
+ switch (char_boundary)
+ {
+ case 0:
+ if ( c >= 0xfc )
+ {
+ ch = c & 0x01;
+ char_boundary = 5;
+ }
+ else if ( c >= 0xf8 )
+ {
+ ch = c & 0x03;
+ char_boundary = 4;
+ }
+ else if ( c >= 0xf0 )
+ {
+ ch = c & 0x07;
+ char_boundary = 3;
+ }
+ else if ( c >= 0xe0 )
+ {
+ ch = c & 0x0f;
+ char_boundary = 2;
+ }
+ else if ( c >= 0xc0 )
+ {
+ ch = c & 0x1f;
+ char_boundary = 1;
+ }
+ else
+ {
+ ch = 0;
+
+ if (c == '\n')
+ {
+ if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
+ Dynarr_add (dst, '\r');
+ if (eol_type != EOL_CR)
+ Dynarr_add (dst, c);
+ }
+ else
+ Dynarr_add (dst, c);
+ char_boundary = 0;
+ }
+ break;
+ case 1:
+ ch = ( ch << 6 ) | ( c & 0x3f );
+ Dynarr_add (dst, ch & 0xff);
+ char_boundary = 0;
+ break;
+ default:
+ ch = ( ch << 6 ) | ( c & 0x3f );
+ char_boundary--;
+ }
+#else /* not UTF2000 */
if (c == '\n')
{
if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
untranslatable character, so ignore it */
ch = 0;
}
+#endif /* not UTF2000 */
}
- CODING_STREAM_COMPOSE (str, flags, ch);
+ str->flags = flags;
+ str->ch = ch;
+#ifdef UTF2000
+ str->iso2022.current_char_boundary = char_boundary;
+#endif
}
\f
/* Determine coding system from coding format */
-#define FILE_NAME_CODING_SYSTEM \
- ((NILP (Vfile_name_coding_system) || \
- (EQ ((Vfile_name_coding_system), Qbinary))) ? \
- Qnil : Fget_coding_system (Vfile_name_coding_system))
-
/* #### not correct for all values of `fmt'! */
+static Lisp_Object
+external_data_format_to_coding_system (enum external_data_format fmt)
+{ /* Return the coding system object to use for external data format FMT, or Qnil when none is selected. */
+ switch (fmt)
+ {
+ case FORMAT_FILENAME:
+ case FORMAT_TERMINAL: /* both formats are governed by Vfile_name_coding_system */
+ if (EQ (Vfile_name_coding_system, Qnil) ||
+ EQ (Vfile_name_coding_system, Qbinary))
+ return Qnil; /* nil or `binary': no coding system is looked up */
+ else
+ return Fget_coding_system (Vfile_name_coding_system);
#ifdef MULE
-#define FMT_CODING_SYSTEM(fmt) \
- (((fmt) == FORMAT_FILENAME) ? FILE_NAME_CODING_SYSTEM : \
- ((fmt) == FORMAT_CTEXT ) ? Fget_coding_system (Qctext) : \
- ((fmt) == FORMAT_TERMINAL) ? FILE_NAME_CODING_SYSTEM : \
- Qnil)
-#else
-#define FMT_CODING_SYSTEM(fmt) \
- (((fmt) == FORMAT_FILENAME) ? FILE_NAME_CODING_SYSTEM : \
- ((fmt) == FORMAT_TERMINAL) ? FILE_NAME_CODING_SYSTEM : \
- Qnil)
+ case FORMAT_CTEXT: /* this case exists only in MULE builds */
+ return Fget_coding_system (Qctext);
#endif
+ default: /* any format not listed above yields Qnil */
+ return Qnil;
+ }
+}
Extbyte *
convert_to_external_format (CONST Bufbyte *ptr,
Extcount *len_out,
enum external_data_format fmt)
{
- Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
+ Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
if (!conversion_out_dynarr)
conversion_out_dynarr = Dynarr_new (Extbyte);
for (; ptr < end;)
{
+#ifdef UTF2000
+ Bufbyte c =
+ (*ptr < 0xc0) ? *ptr :
+ ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
+#else
Bufbyte c =
(BYTE_ASCII_P (*ptr)) ? *ptr :
(*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
(*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
'~';
-
+#endif
Dynarr_add (conversion_out_dynarr, (Extbyte) c);
INC_CHARPTR (ptr);
}
Bytecount *len_out,
enum external_data_format fmt)
{
- Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
+ Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
if (!conversion_in_dynarr)
conversion_in_dynarr = Dynarr_new (Bufbyte);
/************************************************************************/
void
-syms_of_mule_coding (void)
+syms_of_file_coding (void)
{
defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
deferror (&Qcoding_system_error, "coding-system-error",
DEFSUBR (Fcoding_system_name);
DEFSUBR (Fmake_coding_system);
DEFSUBR (Fcopy_coding_system);
+ DEFSUBR (Fdefine_coding_system_alias);
DEFSUBR (Fsubsidiary_coding_system);
DEFSUBR (Fcoding_system_type);
DEFSUBR (Fencode_shift_jis_char);
DEFSUBR (Fdecode_big5_char);
DEFSUBR (Fencode_big5_char);
+ DEFSUBR (Fset_ucs_char);
+ DEFSUBR (Fucs_char);
+ DEFSUBR (Fset_char_ucs);
+ DEFSUBR (Fchar_ucs);
#endif /* MULE */
defsymbol (&Qcoding_system_p, "coding-system-p");
defsymbol (&Qno_conversion, "no-conversion");
+ defsymbol (&Qraw_text, "raw-text");
#ifdef MULE
defsymbol (&Qbig5, "big5");
defsymbol (&Qshift_jis, "shift-jis");
+ defsymbol (&Qucs4, "ucs-4");
+ defsymbol (&Qutf8, "utf-8");
defsymbol (&Qccl, "ccl");
defsymbol (&Qiso2022, "iso2022");
#endif /* MULE */
"shift-jis");
defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
"big5");
+ defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
+ "ucs-4");
+ defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
+ "utf-8");
defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
"iso-7");
defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
}
void
-lstream_type_create_mule_coding (void)
+lstream_type_create_file_coding (void)
{
LSTREAM_HAS_METHOD (decoding, reader);
LSTREAM_HAS_METHOD (decoding, writer);
}
void
-vars_of_mule_coding (void)
+vars_of_file_coding (void)
{
int i;
}
void
-complex_vars_of_mule_coding (void)
+complex_vars_of_file_coding (void)
{
- staticpro (&Vcoding_system_hashtable);
- Vcoding_system_hashtable = make_lisp_hashtable (50, HASHTABLE_NONWEAK,
- HASHTABLE_EQ);
+ staticpro (&Vcoding_system_hash_table);
+ Vcoding_system_hash_table =
+ make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
#endif /* MULE */
/* Need to create this here or we're really screwed. */
- Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
- list2 (Qmnemonic, build_string ("Noconv")));
+ Fmake_coding_system
+ (Qraw_text, Qno_conversion,
+ build_string ("Raw text, which means it converts only line-break-codes."),
+ list2 (Qmnemonic, build_string ("Raw")));
+
+ Fmake_coding_system
+ (Qbinary, Qno_conversion,
+ build_string ("Binary, which means it does not convert anything."),
+ list4 (Qeol_type, Qlf,
+ Qmnemonic, build_string ("Binary")));
+
+#ifdef UTF2000
+ Fmake_coding_system
+ (Qutf8, Qutf8,
+ build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
+ list2 (Qmnemonic, build_string ("UTF8")));
+#endif
- Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
- Qbinary);
+ Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
/* Need this for bootstrapping */
coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
- Fget_coding_system (Qno_conversion);
+ Fget_coding_system (Qraw_text);
+
+#ifdef UTF2000
+ coding_category_system[CODING_CATEGORY_UTF8]
+ = Fget_coding_system (Qutf8);
+#endif
+
+#ifdef MULE
+ {
+ unsigned int i;
+
+ for (i = 0; i < 65536; i++)
+ ucs_to_mule_table[i] = Qnil;
+ }
+ staticpro (&mule_to_ucs_table);
+ mule_to_ucs_table = Fmake_char_table(Qgeneric);
+#endif /* MULE */
}