From: tomo Date: Mon, 17 May 1999 09:42:48 +0000 (+0000) Subject: Delete mule-coding.c and mule-coding.h. X-Git-Tag: r21-2-13-tomo-1~3 X-Git-Url: http://git.chise.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c6094656d3014bce89afa9512c22256ebbf6a1d1;p=chise%2Fxemacs-chise.git- Delete mule-coding.c and mule-coding.h. --- diff --git a/src/mule-coding.c b/src/mule-coding.c deleted file mode 100644 index 635b381..0000000 --- a/src/mule-coding.c +++ /dev/null @@ -1,4815 +0,0 @@ -/* Code conversion functions. - Copyright (C) 1991, 1995 Free Software Foundation, Inc. - Copyright (C) 1995 Sun Microsystems, Inc. - -This file is part of XEmacs. - -XEmacs is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 2, or (at your option) any -later version. - -XEmacs is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with XEmacs; see the file COPYING. If not, write to -the Free Software Foundation, Inc., 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA. */ - -/* Synched up with: Mule 2.3. Not in FSF. */ - -/* Rewritten by Ben Wing . */ - -#if 0 /* while file-coding not split up */ - -#include -#include "lisp.h" - -#include "buffer.h" -#include "elhash.h" -#include "insdel.h" -#include "lstream.h" -#include "mule-ccl.h" -#include "mule-coding.h" - -Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error; - -Lisp_Object Vkeyboard_coding_system; -Lisp_Object Vterminal_coding_system; -Lisp_Object Vcoding_system_for_read; -Lisp_Object Vcoding_system_for_write; -Lisp_Object Vfile_name_coding_system; - -/* Table of symbols identifying each coding category. */ -Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1]; - -/* Coding system currently associated with each coding category. */ -Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1]; - -/* Table of all coding categories in decreasing order of priority. - This describes a permutation of the possible coding categories. */ -int coding_category_by_priority[CODING_CATEGORY_LAST + 1]; - -Lisp_Object Qcoding_system_p; - -Lisp_Object Qbig5, Qshift_jis, Qno_conversion, Qccl, Qiso2022; -/* Qinternal in general.c */ - -Lisp_Object Qmnemonic, Qeol_type; -Lisp_Object Qcr, Qcrlf, Qlf; -Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf; -Lisp_Object Qpost_read_conversion; -Lisp_Object Qpre_write_conversion; - -Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3; -Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output; -Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output; -Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift; -Lisp_Object Qno_iso6429, Qescape_quoted; -Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion; - -Lisp_Object Qencode, Qdecode; - -Lisp_Object Qctext; - -Lisp_Object Vcoding_system_hash_table; - -int enable_multibyte_characters; - -/* Additional information used by the ISO2022 decoder and detector. */ -struct iso2022_decoder -{ - /* CHARSET holds the character sets currently assigned to the G0 - through G3 variables. It is initialized from the array - INITIAL_CHARSET in CODESYS. */ - Lisp_Object charset[4]; - - /* Which registers are currently invoked into the left (GL) and - right (GR) halves of the 8-bit encoding space? */ - int register_left, register_right; - - /* ISO_ESC holds a value indicating part of an escape sequence - that has already been seen. */ - enum iso_esc_flag esc; - - /* This records the bytes we've seen so far in an escape sequence, - in case the sequence is invalid (we spit out the bytes unchanged). */ - unsigned char esc_bytes[8]; - - /* Index for next byte to store in ISO escape sequence. */ - int esc_bytes_index; - - /* Stuff seen so far when composing a string. */ - unsigned_char_dynarr *composite_chars; - - /* If we saw an invalid designation sequence for a particular - register, we flag it here and switch to ASCII. The next time we - see a valid designation for this register, we turn off the flag - and do the designation normally, but pretend the sequence was - invalid. The effect of all this is that (most of the time) the - escape sequences for both the switch to the unknown charset, and - the switch back to the known charset, get inserted literally into - the buffer and saved out as such. The hope is that we can - preserve the escape sequences so that the resulting written out - file makes sense. If we don't do any of this, the designation - to the invalid charset will be preserved but that switch back - to the known charset will probably get eaten because it was - the same charset that was already present in the register. */ - unsigned char invalid_designated[4]; - - /* We try to do similar things as above for direction-switching - sequences. If we encountered a direction switch while an - invalid designation was present, or an invalid designation - just after a direction switch (i.e. no valid designation - encountered yet), we insert the direction-switch escape - sequence literally into the output stream, and later on - insert the corresponding direction-restoring escape sequence - literally also. */ - unsigned int switched_dir_and_no_valid_charset_yet :1; - unsigned int invalid_switch_dir :1; - - /* Tells the decoder to output the escape sequence literally - even though it was valid. Used in the games we play to - avoid lossage when we encounter invalid designations. */ - unsigned int output_literally :1; - /* We encountered a direction switch followed by an invalid - designation. We didn't output the direction switch - literally because we didn't know about the invalid designation; - but we have to do so now. */ - unsigned int output_direction_sequence :1; -}; - -EXFUN (Fcopy_coding_system, 2); -struct detection_state; -static int detect_coding_sjis (struct detection_state *st, - CONST unsigned char *src, - unsigned int n); -static void decode_coding_sjis (Lstream *decoding, - CONST unsigned char *src, - unsigned_char_dynarr *dst, - unsigned int n); -static void encode_coding_sjis (Lstream *encoding, - CONST unsigned char *src, - unsigned_char_dynarr *dst, - unsigned int n); -static int detect_coding_big5 (struct detection_state *st, - CONST unsigned char *src, - unsigned int n); -static void decode_coding_big5 (Lstream *decoding, - CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static void encode_coding_big5 (Lstream *encoding, - CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static int postprocess_iso2022_mask (int mask); -static void reset_iso2022 (Lisp_Object coding_system, - struct iso2022_decoder *iso); -static int detect_coding_iso2022 (struct detection_state *st, - CONST unsigned char *src, - unsigned int n); -static void decode_coding_iso2022 (Lstream *decoding, - CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static void encode_coding_iso2022 (Lstream *encoding, - CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static void decode_coding_no_conversion (Lstream *decoding, - CONST unsigned char *src, - unsigned_char_dynarr *dst, - unsigned int n); -static void encode_coding_no_conversion (Lstream *encoding, - CONST unsigned char *src, - unsigned_char_dynarr *dst, - unsigned int n); -static void mule_decode (Lstream *decoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); -static void mule_encode (Lstream *encoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n); - -typedef struct codesys_prop codesys_prop; -struct codesys_prop -{ - Lisp_Object sym; - int prop_type; -}; - -typedef struct -{ - Dynarr_declare (codesys_prop); -} codesys_prop_dynarr; - -codesys_prop_dynarr *the_codesys_prop_dynarr; - -enum codesys_prop_enum -{ - CODESYS_PROP_ALL_OK, - CODESYS_PROP_ISO2022, - CODESYS_PROP_CCL -}; - - -/************************************************************************/ -/* Coding system functions */ -/************************************************************************/ - -static Lisp_Object -mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object)) -{ - struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj); - - markobj (CODING_SYSTEM_NAME (codesys)); - markobj (CODING_SYSTEM_DOC_STRING (codesys)); - markobj (CODING_SYSTEM_MNEMONIC (codesys)); - markobj (CODING_SYSTEM_EOL_LF (codesys)); - markobj (CODING_SYSTEM_EOL_CRLF (codesys)); - markobj (CODING_SYSTEM_EOL_CR (codesys)); - - switch (CODING_SYSTEM_TYPE (codesys)) - { - int i; - case CODESYS_ISO2022: - for (i = 0; i < 4; i++) - markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)); - if (codesys->iso2022.input_conv) - { - for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++) - { - struct charset_conversion_spec *ccs = - Dynarr_atp (codesys->iso2022.input_conv, i); - markobj (ccs->from_charset); - markobj (ccs->to_charset); - } - } - if (codesys->iso2022.output_conv) - { - for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++) - { - struct charset_conversion_spec *ccs = - Dynarr_atp (codesys->iso2022.output_conv, i); - markobj (ccs->from_charset); - markobj (ccs->to_charset); - } - } - break; - - case CODESYS_CCL: - markobj (CODING_SYSTEM_CCL_DECODE (codesys)); - markobj (CODING_SYSTEM_CCL_ENCODE (codesys)); - break; - default: - break; - } - - markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys)); - return CODING_SYSTEM_POST_READ_CONVERSION (codesys); -} - -static void -print_coding_system (Lisp_Object obj, Lisp_Object printcharfun, - int escapeflag) -{ - struct Lisp_Coding_System *c = XCODING_SYSTEM (obj); - if (print_readably) - error ("printing unreadable object #", - c->header.uid); - - write_c_string ("#name, printcharfun, 1); - write_c_string (">", printcharfun); -} - -static void -finalize_coding_system (void *header, int for_disksave) -{ - struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header; - /* Since coding systems never go away, this function is not - necessary. But it would be necessary if we changed things - so that coding systems could go away. */ - if (!for_disksave) /* see comment in lstream.c */ - { - switch (CODING_SYSTEM_TYPE (c)) - { - case CODESYS_ISO2022: - if (c->iso2022.input_conv) - { - Dynarr_free (c->iso2022.input_conv); - c->iso2022.input_conv = 0; - } - if (c->iso2022.output_conv) - { - Dynarr_free (c->iso2022.output_conv); - c->iso2022.output_conv = 0; - } - break; - - default: - break; - } - } -} - -DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system, - mark_coding_system, print_coding_system, - finalize_coding_system, - 0, 0, struct Lisp_Coding_System); - -static enum eol_type -symbol_to_eol_type (Lisp_Object symbol) -{ - CHECK_SYMBOL (symbol); - if (NILP (symbol)) return EOL_AUTODETECT; - if (EQ (symbol, Qlf)) return EOL_LF; - if (EQ (symbol, Qcrlf)) return EOL_CRLF; - if (EQ (symbol, Qcr)) return EOL_CR; - - signal_simple_error ("Unrecognized eol type", symbol); - return EOL_AUTODETECT; /* not reached */ -} - -static Lisp_Object -eol_type_to_symbol (enum eol_type type) -{ - switch (type) - { - default: abort (); - case EOL_LF: return Qlf; - case EOL_CRLF: return Qcrlf; - case EOL_CR: return Qcr; - case EOL_AUTODETECT: return Qnil; - } -} - -static void -setup_eol_coding_systems (struct Lisp_Coding_System *codesys) -{ - Lisp_Object codesys_obj; - int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name); - char *codesys_name = (char *) alloca (len + 7); - Lisp_Object codesys_name_sym, sub_codesys_obj; - - /* kludge */ - - XSETCODING_SYSTEM (codesys_obj, codesys); - - memcpy (codesys_name, - string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len); - -#define DEFINE_SUB_CODESYS(op_sys, Type) do { \ - strcpy (codesys_name + len, "-" op_sys); \ - codesys_name_sym = intern (codesys_name); \ - sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \ - XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \ - CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \ -} while (0) - - DEFINE_SUB_CODESYS("unix", EOL_LF); - DEFINE_SUB_CODESYS("dos", EOL_CRLF); - DEFINE_SUB_CODESYS("mac", EOL_CR); -} - -DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /* -Return t if OBJECT is a coding system. -A coding system is an object that defines how text containing multiple -character sets is encoded into a stream of (typically 8-bit) bytes. -The coding system is used to decode the stream into a series of -characters (which may be from multiple charsets) when the text is read -from a file or process, and is used to encode the text back into the -same format when it is written out to a file or process. - -For example, many ISO2022-compliant coding systems (such as Compound -Text, which is used for inter-client data under the X Window System) -use escape sequences to switch between different charsets -- Japanese -Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked -with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See -`make-coding-system' for more information. - -Coding systems are normally identified using a symbol, and the -symbol is accepted in place of the actual coding system object whenever -a coding system is called for. (This is similar to how faces work.) -*/ - (object)) -{ - return CODING_SYSTEMP (object) ? Qt : Qnil; -} - -DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /* -Retrieve the coding system of the given name. - -If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply -returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol. -If there is no such coding system, nil is returned. Otherwise the -associated coding system object is returned. -*/ - (coding_system_or_name)) -{ - if (NILP (coding_system_or_name)) - coding_system_or_name = Qbinary; - if (CODING_SYSTEMP (coding_system_or_name)) - return coding_system_or_name; - CHECK_SYMBOL (coding_system_or_name); - - return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil); -} - -DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /* -Retrieve the coding system of the given name. -Same as `find-coding-system' except that if there is no such -coding system, an error is signaled instead of returning nil. -*/ - (name)) -{ - Lisp_Object coding_system = Ffind_coding_system (name); - - if (NILP (coding_system)) - signal_simple_error ("No such coding system", name); - return coding_system; -} - -/* We store the coding systems in hash tables with the names as the key and the - actual coding system object as the value. Occasionally we need to use them - in a list format. These routines provide us with that. */ -struct coding_system_list_closure -{ - Lisp_Object *coding_system_list; -}; - -static int -add_coding_system_to_list_mapper (CONST void *hash_key, void *hash_contents, - void *coding_system_list_closure) -{ - /* This function can GC */ - Lisp_Object key, contents; - Lisp_Object *coding_system_list; - struct coding_system_list_closure *cscl = - (struct coding_system_list_closure *) coding_system_list_closure; - CVOID_TO_LISP (key, hash_key); - VOID_TO_LISP (contents, hash_contents); - coding_system_list = cscl->coding_system_list; - - *coding_system_list = Fcons (XCODING_SYSTEM (contents)->name, - *coding_system_list); - return 0; -} - -DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /* -Return a list of the names of all defined coding systems. -*/ - ()) -{ - Lisp_Object coding_system_list = Qnil; - struct gcpro gcpro1; - struct coding_system_list_closure coding_system_list_closure; - - GCPRO1 (coding_system_list); - coding_system_list_closure.coding_system_list = &coding_system_list; - elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table, - &coding_system_list_closure); - UNGCPRO; - - return coding_system_list; -} - -DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /* -Return the name of the given coding system. -*/ - (coding_system)) -{ - coding_system = Fget_coding_system (coding_system); - return XCODING_SYSTEM_NAME (coding_system); -} - -static struct Lisp_Coding_System * -allocate_coding_system (enum coding_system_type type, Lisp_Object name) -{ - struct Lisp_Coding_System *codesys = - alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system); - - zero_lcrecord (codesys); - CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil; - CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil; - CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT; - CODING_SYSTEM_EOL_CRLF (codesys) = Qnil; - CODING_SYSTEM_EOL_CR (codesys) = Qnil; - CODING_SYSTEM_EOL_LF (codesys) = Qnil; - CODING_SYSTEM_TYPE (codesys) = type; - - if (type == CODESYS_ISO2022) - { - int i; - for (i = 0; i < 4; i++) - CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil; - } - else if (type == CODESYS_CCL) - { - CODING_SYSTEM_CCL_DECODE (codesys) = Qnil; - CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil; - } - - CODING_SYSTEM_NAME (codesys) = name; - - return codesys; -} - -/* Given a list of charset conversion specs as specified in a Lisp - program, parse it into STORE_HERE. */ - -static void -parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here, - Lisp_Object spec_list) -{ - Lisp_Object rest; - - EXTERNAL_LIST_LOOP (rest, spec_list) - { - Lisp_Object car = XCAR (rest); - Lisp_Object from, to; - struct charset_conversion_spec spec; - - if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car)))) - signal_simple_error ("Invalid charset conversion spec", car); - from = Fget_charset (XCAR (car)); - to = Fget_charset (XCAR (XCDR (car))); - if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to)) - signal_simple_error_2 - ("Attempted conversion between different charset types", - from, to); - spec.from_charset = from; - spec.to_charset = to; - - Dynarr_add (store_here, spec); - } -} - -/* Given a dynarr LOAD_HERE of internally-stored charset conversion - specs, return the equivalent as the Lisp programmer would see it. - - If LOAD_HERE is 0, return Qnil. */ - -static Lisp_Object -unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here) -{ - int i; - Lisp_Object result = Qnil; - - if (!load_here) - return Qnil; - for (i = 0; i < Dynarr_length (load_here); i++) - { - struct charset_conversion_spec *ccs = - Dynarr_atp (load_here, i); - result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result); - } - - return Fnreverse (result); -} - -DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /* -Register symbol NAME as a coding system. - -TYPE describes the conversion method used and should be one of - -nil or 'undecided - Automatic conversion. XEmacs attempts to detect the coding system - used in the file. -'no-conversion - No conversion. Use this for binary files and such. On output, - graphic characters that are not in ASCII or Latin-1 will be - replaced by a ?. (For a no-conversion-encoded buffer, these - characters will only be present if you explicitly insert them.) -'shift-jis - Shift-JIS (a Japanese encoding commonly used in PC operating systems). -'iso2022 - Any ISO2022-compliant encoding. Among other things, this includes - JIS (the Japanese encoding commonly used for e-mail), EUC (the - standard Unix encoding for Japanese and other languages), and - Compound Text (the encoding used in X11). You can specify more - specific information about the conversion with the FLAGS argument. -'big5 - Big5 (the encoding commonly used for Taiwanese). -'ccl - The conversion is performed using a user-written pseudo-code - program. CCL (Code Conversion Language) is the name of this - pseudo-code. -'internal - Write out or read in the raw contents of the memory representing - the buffer's text. This is primarily useful for debugging - purposes, and is only enabled when XEmacs has been compiled with - DEBUG_XEMACS defined (via the --debug configure option). - WARNING: Reading in a file using 'internal conversion can result - in an internal inconsistency in the memory representing a - buffer's text, which will produce unpredictable results and may - cause XEmacs to crash. Under normal circumstances you should - never use 'internal conversion. - -DOC-STRING is a string describing the coding system. - -PROPS is a property list, describing the specific nature of the -character set. Recognized properties are: - -'mnemonic - String to be displayed in the modeline when this coding system is - active. - -'eol-type - End-of-line conversion to be used. It should be one of - - nil - Automatically detect the end-of-line type (LF, CRLF, - or CR). Also generate subsidiary coding systems named - `NAME-unix', `NAME-dos', and `NAME-mac', that are - identical to this coding system but have an EOL-TYPE - value of 'lf, 'crlf, and 'cr, respectively. - 'lf - The end of a line is marked externally using ASCII LF. - Since this is also the way that XEmacs represents an - end-of-line internally, specifying this option results - in no end-of-line conversion. This is the standard - format for Unix text files. - 'crlf - The end of a line is marked externally using ASCII - CRLF. This is the standard format for MS-DOS text - files. - 'cr - The end of a line is marked externally using ASCII CR. - This is the standard format for Macintosh text files. - t - Automatically detect the end-of-line type but do not - generate subsidiary coding systems. (This value is - converted to nil when stored internally, and - `coding-system-property' will return nil.) - -'post-read-conversion - Function called after a file has been read in, to perform the - decoding. Called with two arguments, BEG and END, denoting - a region of the current buffer to be decoded. - -'pre-write-conversion - Function called before a file is written out, to perform the - encoding. Called with two arguments, BEG and END, denoting - a region of the current buffer to be encoded. - - -The following additional properties are recognized if TYPE is 'iso2022: - -'charset-g0 -'charset-g1 -'charset-g2 -'charset-g3 - The character set initially designated to the G0 - G3 registers. - The value should be one of - - -- A charset object (designate that character set) - -- nil (do not ever use this register) - -- t (no character set is initially designated to - the register, but may be later on; this automatically - sets the corresponding `force-g*-on-output' property) - -'force-g0-on-output -'force-g1-on-output -'force-g2-on-output -'force-g2-on-output - If non-nil, send an explicit designation sequence on output before - using the specified register. - -'short - If non-nil, use the short forms "ESC $ @", "ESC $ A", and - "ESC $ B" on output in place of the full designation sequences - "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B". - -'no-ascii-eol - If non-nil, don't designate ASCII to G0 at each end of line on output. - Setting this to non-nil also suppresses other state-resetting that - normally happens at the end of a line. - -'no-ascii-cntl - If non-nil, don't designate ASCII to G0 before control chars on output. - -'seven - If non-nil, use 7-bit environment on output. Otherwise, use 8-bit - environment. - -'lock-shift - If non-nil, use locking-shift (SO/SI) instead of single-shift - or designation by escape sequence. - -'no-iso6429 - If non-nil, don't use ISO6429's direction specification. - -'escape-quoted - If non-nil, literal control characters that are the same as - the beginning of a recognized ISO2022 or ISO6429 escape sequence - (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E), - SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character - so that they can be properly distinguished from an escape sequence. - (Note that doing this results in a non-portable encoding.) This - encoding flag is used for byte-compiled files. Note that ESC - is a good choice for a quoting character because there are no - escape sequences whose second byte is a character from the Control-0 - or Control-1 character sets; this is explicitly disallowed by the - ISO2022 standard. - -'input-charset-conversion - A list of conversion specifications, specifying conversion of - characters in one charset to another when decoding is performed. - Each specification is a list of two elements: the source charset, - and the destination charset. - -'output-charset-conversion - A list of conversion specifications, specifying conversion of - characters in one charset to another when encoding is performed. - The form of each specification is the same as for - 'input-charset-conversion. - - -The following additional properties are recognized (and required) -if TYPE is 'ccl: - -'decode - CCL program used for decoding (converting to internal format). - -'encode - CCL program used for encoding (converting to external format). -*/ - (name, type, doc_string, props)) -{ - struct Lisp_Coding_System *codesys; - Lisp_Object rest, key, value; - enum coding_system_type ty; - int need_to_setup_eol_systems = 1; - - /* Convert type to constant */ - if (NILP (type) || EQ (type, Qundecided)) - { ty = CODESYS_AUTODETECT; } - else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; } - else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; } - else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; } - else if (EQ (type, Qccl)) { ty = CODESYS_CCL; } - else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; } -#ifdef DEBUG_XEMACS - else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; } -#endif - else - signal_simple_error ("Invalid coding system type", type); - - CHECK_SYMBOL (name); - - codesys = allocate_coding_system (ty, name); - - if (NILP (doc_string)) - doc_string = build_string (""); - else - CHECK_STRING (doc_string); - CODING_SYSTEM_DOC_STRING (codesys) = doc_string; - - EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props) - { - if (EQ (key, Qmnemonic)) - { - if (!NILP (value)) - CHECK_STRING (value); - CODING_SYSTEM_MNEMONIC (codesys) = value; - } - - else if (EQ (key, Qeol_type)) - { - need_to_setup_eol_systems = NILP (value); - if (EQ (value, Qt)) - value = Qnil; - CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value); - } - - else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value; - else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value; - else if (ty == CODESYS_ISO2022) - { -#define FROB_INITIAL_CHARSET(charset_num) \ - CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \ - ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value)) - - if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0); - else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1); - else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2); - else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3); - -#define FROB_FORCE_CHARSET(charset_num) \ - CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value) - - else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0); - else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1); - else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2); - else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3); - -#define FROB_BOOLEAN_PROPERTY(prop) \ - CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value) - - else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT); - else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL); - else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL); - else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN); - else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT); - else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429); - else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED); - - else if (EQ (key, Qinput_charset_conversion)) - { - codesys->iso2022.input_conv = - Dynarr_new (charset_conversion_spec); - parse_charset_conversion_specs (codesys->iso2022.input_conv, - value); - } - else if (EQ (key, Qoutput_charset_conversion)) - { - codesys->iso2022.output_conv = - Dynarr_new (charset_conversion_spec); - parse_charset_conversion_specs (codesys->iso2022.output_conv, - value); - } - else - signal_simple_error ("Unrecognized property", key); - } - else if (EQ (type, Qccl)) - { - if (EQ (key, Qdecode)) - { - CHECK_VECTOR (value); - CODING_SYSTEM_CCL_DECODE (codesys) = value; - } - else if (EQ (key, Qencode)) - { - CHECK_VECTOR (value); - CODING_SYSTEM_CCL_ENCODE (codesys) = value; - } - else - signal_simple_error ("Unrecognized property", key); - } - else - signal_simple_error ("Unrecognized property", key); - } - - if (need_to_setup_eol_systems) - setup_eol_coding_systems (codesys); - - { - Lisp_Object codesys_obj; - XSETCODING_SYSTEM (codesys_obj, codesys); - Fputhash (name, codesys_obj, Vcoding_system_hash_table); - return codesys_obj; - } -} - -DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /* -Copy OLD-CODING-SYSTEM to NEW-NAME. -If NEW-NAME does not name an existing coding system, a new one will -be created. -*/ - (old_coding_system, new_name)) -{ - Lisp_Object new_coding_system; - old_coding_system = Fget_coding_system (old_coding_system); - new_coding_system = Ffind_coding_system (new_name); - if (NILP (new_coding_system)) - { - XSETCODING_SYSTEM (new_coding_system, - allocate_coding_system - (XCODING_SYSTEM_TYPE (old_coding_system), - new_name)); - Fputhash (new_name, new_coding_system, Vcoding_system_hash_table); - } - - { - struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system); - struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system); - memcpy (((char *) to ) + sizeof (to->header), - ((char *) from) + sizeof (from->header), - sizeof (*from) - sizeof (from->header)); - to->name = new_name; - } - return new_coding_system; -} - -static Lisp_Object -subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type) -{ - struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system); - Lisp_Object new_coding_system; - - if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) - return coding_system; - - switch (type) - { - case EOL_AUTODETECT: return coding_system; - case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break; - case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break; - case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break; - default: abort (); - } - - return NILP (new_coding_system) ? coding_system : new_coding_system; -} - -DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /* -Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE. -*/ - (coding_system, eol_type)) -{ - coding_system = Fget_coding_system (coding_system); - - return subsidiary_coding_system (coding_system, - symbol_to_eol_type (eol_type)); -} - - -/************************************************************************/ -/* Coding system accessors */ -/************************************************************************/ - -DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /* -Return the doc string for CODING-SYSTEM. -*/ - (coding_system)) -{ - coding_system = Fget_coding_system (coding_system); - return XCODING_SYSTEM_DOC_STRING (coding_system); -} - -DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /* -Return the type of CODING-SYSTEM. -*/ - (coding_system)) -{ - switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system))) - { - case CODESYS_AUTODETECT: return Qundecided; - case CODESYS_SHIFT_JIS: return Qshift_jis; - case CODESYS_ISO2022: return Qiso2022; - case CODESYS_BIG5: return Qbig5; - case CODESYS_CCL: return Qccl; - case CODESYS_NO_CONVERSION: return Qno_conversion; -#ifdef DEBUG_XEMACS - case CODESYS_INTERNAL: return Qinternal; -#endif - default: - abort (); - } - - return Qnil; /* not reached */ -} - -static -Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum) -{ - Lisp_Object cs - = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum); - - return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil; -} - -DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /* -Return initial charset of CODING-SYSTEM designated to GNUM. -GNUM allows 0 .. 3. -*/ - (coding_system, gnum)) -{ - coding_system = Fget_coding_system (coding_system); - CHECK_INT (gnum); - - return coding_system_charset (coding_system, XINT (gnum)); -} - -DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /* -Return the PROP property of CODING-SYSTEM. -*/ - (coding_system, prop)) -{ - int i, ok = 0; - enum coding_system_type type; - - coding_system = Fget_coding_system (coding_system); - CHECK_SYMBOL (prop); - type = XCODING_SYSTEM_TYPE (coding_system); - - for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++) - if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop)) - { - ok = 1; - switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type) - { - case CODESYS_PROP_ALL_OK: - break; - - case CODESYS_PROP_ISO2022: - if (type != CODESYS_ISO2022) - signal_simple_error - ("Property only valid in ISO2022 coding systems", - prop); - break; - - case CODESYS_PROP_CCL: - if (type != CODESYS_CCL) - signal_simple_error - ("Property only valid in CCL coding systems", - prop); - break; - - default: - abort (); - } - } - - if (!ok) - signal_simple_error ("Unrecognized property", prop); - - if (EQ (prop, Qname)) - return XCODING_SYSTEM_NAME (coding_system); - else if (EQ (prop, Qtype)) - return Fcoding_system_type (coding_system); - else if (EQ (prop, Qdoc_string)) - return XCODING_SYSTEM_DOC_STRING (coding_system); - else if (EQ (prop, Qmnemonic)) - return XCODING_SYSTEM_MNEMONIC (coding_system); - else if (EQ (prop, Qeol_type)) - return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system)); - else if (EQ (prop, Qeol_lf)) - return XCODING_SYSTEM_EOL_LF (coding_system); - else if (EQ (prop, Qeol_crlf)) - return XCODING_SYSTEM_EOL_CRLF (coding_system); - else if (EQ (prop, Qeol_cr)) - return XCODING_SYSTEM_EOL_CR (coding_system); - else if (EQ (prop, Qpost_read_conversion)) - return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system); - else if (EQ (prop, Qpre_write_conversion)) - return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system); - else if (type == CODESYS_ISO2022) - { - if (EQ (prop, Qcharset_g0)) - return coding_system_charset (coding_system, 0); - else if (EQ (prop, Qcharset_g1)) - return coding_system_charset (coding_system, 1); - else if (EQ (prop, Qcharset_g2)) - return coding_system_charset (coding_system, 2); - else if (EQ (prop, Qcharset_g3)) - return coding_system_charset (coding_system, 3); - -#define FORCE_CHARSET(charset_num) \ - (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \ - (coding_system, charset_num) ? Qt : Qnil) - - else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0); - else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1); - else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2); - else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3); - -#define LISP_BOOLEAN(prop) \ - (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil) - - else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT); - else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL); - else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL); - else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN); - else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT); - else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429); - else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED); - - else if (EQ (prop, Qinput_charset_conversion)) - return - unparse_charset_conversion_specs - (XCODING_SYSTEM (coding_system)->iso2022.input_conv); - else if (EQ (prop, Qoutput_charset_conversion)) - return - unparse_charset_conversion_specs - (XCODING_SYSTEM (coding_system)->iso2022.output_conv); - else - abort (); - } - else if (type == CODESYS_CCL) - { - if (EQ (prop, Qdecode)) - return XCODING_SYSTEM_CCL_DECODE (coding_system); - else if (EQ (prop, Qencode)) - return XCODING_SYSTEM_CCL_ENCODE (coding_system); - else - abort (); - } - else - abort (); - - return Qnil; /* not reached */ -} - - -/************************************************************************/ -/* Coding category functions */ -/************************************************************************/ - -static int -decode_coding_category (Lisp_Object symbol) -{ - int i; - - CHECK_SYMBOL (symbol); - for (i = 0; i <= CODING_CATEGORY_LAST; i++) - if (EQ (coding_category_symbol[i], symbol)) - return i; - - signal_simple_error ("Unrecognized coding category", symbol); - return 0; /* not reached */ -} - -DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /* -Return a list of all recognized coding categories. -*/ - ()) -{ - int i; - Lisp_Object list = Qnil; - - for (i = CODING_CATEGORY_LAST; i >= 0; i--) - list = Fcons (coding_category_symbol[i], list); - return list; -} - -DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /* -Change the priority order of the coding categories. -LIST should be list of coding categories, in descending order of -priority. Unspecified coding categories will be lower in priority -than all specified ones, in the same relative order they were in -previously. -*/ - (list)) -{ - int category_to_priority[CODING_CATEGORY_LAST + 1]; - int i, j; - Lisp_Object rest; - - /* First generate a list that maps coding categories to priorities. */ - - for (i = 0; i <= CODING_CATEGORY_LAST; i++) - category_to_priority[i] = -1; - - /* Highest priority comes from the specified list. */ - i = 0; - EXTERNAL_LIST_LOOP (rest, list) - { - int cat = decode_coding_category (XCAR (rest)); - - if (category_to_priority[cat] >= 0) - signal_simple_error ("Duplicate coding category in list", XCAR (rest)); - category_to_priority[cat] = i++; - } - - /* Now go through the existing categories by priority to retrieve - the categories not yet specified and preserve their priority - order. */ - for (j = 0; j <= CODING_CATEGORY_LAST; j++) - { - int cat = coding_category_by_priority[j]; - if (category_to_priority[cat] < 0) - category_to_priority[cat] = i++; - } - - /* Now we need to construct the inverse of the mapping we just - constructed. */ - - for (i = 0; i <= CODING_CATEGORY_LAST; i++) - coding_category_by_priority[category_to_priority[i]] = i; - - /* Phew! That was confusing. */ - return Qnil; -} - -DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /* -Return a list of coding categories in descending order of priority. -*/ - ()) -{ - int i; - Lisp_Object list = Qnil; - - for (i = CODING_CATEGORY_LAST; i >= 0; i--) - list = Fcons (coding_category_symbol[coding_category_by_priority[i]], - list); - return list; -} - -DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /* -Change the coding system associated with a coding category. -*/ - (coding_category, coding_system)) -{ - int cat = decode_coding_category (coding_category); - - coding_system = Fget_coding_system (coding_system); - coding_category_system[cat] = coding_system; - return Qnil; -} - -DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /* -Return the coding system associated with a coding category. -*/ - (coding_category)) -{ - int cat = decode_coding_category (coding_category); - Lisp_Object sys = coding_category_system[cat]; - - if (!NILP (sys)) - return XCODING_SYSTEM_NAME (sys); - return Qnil; -} - - -/************************************************************************/ -/* Detecting the encoding of data */ -/************************************************************************/ - -struct detection_state -{ - enum eol_type eol_type; - int seen_non_ascii; - int mask; - - struct - { - int mask; - int in_second_byte; - } - big5; - - struct - { - int mask; - int in_second_byte; - } - shift_jis; - - struct - { - int mask; - int initted; - struct iso2022_decoder iso; - unsigned int flags; - int high_byte_count; - unsigned int saw_single_shift:1; - } - iso2022; - - struct - { - int seen_anything; - int just_saw_cr; - } - eol; -}; - -static int -acceptable_control_char_p (int c) -{ - switch (c) - { - /* Allow and ignore control characters that you might - reasonably see in a text file */ - case '\r': - case '\n': - case '\t': - case 7: /* bell */ - case 8: /* backspace */ - case 11: /* vertical tab */ - case 12: /* form feed */ - case 26: /* MS-DOS C-z junk */ - case 31: /* '^_' -- for info */ - return 1; - default: - return 0; - } -} - -static int -mask_has_at_most_one_bit_p (int mask) -{ - /* Perhaps the only thing useful you learn from intensive Microsoft - technical interviews */ - return (mask & (mask - 1)) == 0; -} - -static enum eol_type -detect_eol_type (struct detection_state *st, CONST unsigned char *src, - unsigned int n) -{ - int c; - - while (n--) - { - c = *src++; - if (c == '\r') - st->eol.just_saw_cr = 1; - else - { - if (c == '\n') - { - if (st->eol.just_saw_cr) - return EOL_CRLF; - else if (st->eol.seen_anything) - return EOL_LF; - } - else if (st->eol.just_saw_cr) - return EOL_CR; - st->eol.just_saw_cr = 0; - } - st->eol.seen_anything = 1; - } - - return EOL_AUTODETECT; -} - -/* Attempt to determine the encoding and EOL type of the given text. - Before calling this function for the first type, you must initialize - st->eol_type as appropriate and initialize st->mask to ~0. - - st->eol_type holds the determined EOL type, or EOL_AUTODETECT if - not yet known. - - st->mask holds the determined coding category mask, or ~0 if only - ASCII has been seen so far. - - Returns: - - 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category - is present in st->mask - 1 == definitive answers are here for both st->eol_type and st->mask -*/ - -static int -detect_coding_type (struct detection_state *st, CONST unsigned char *src, - unsigned int n, int just_do_eol) -{ - int c; - - if (st->eol_type == EOL_AUTODETECT) - st->eol_type = detect_eol_type (st, src, n); - - if (just_do_eol) - return st->eol_type != EOL_AUTODETECT; - - if (!st->seen_non_ascii) - { - for (; n; n--, src++) - { - c = *src; - if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80) - { - st->seen_non_ascii = 1; - st->shift_jis.mask = ~0; - st->big5.mask = ~0; - st->iso2022.mask = ~0; - break; - } - } - } - - if (!n) - return 0; - - if (!mask_has_at_most_one_bit_p (st->iso2022.mask)) - st->iso2022.mask = detect_coding_iso2022 (st, src, n); - if (!mask_has_at_most_one_bit_p (st->shift_jis.mask)) - st->shift_jis.mask = detect_coding_sjis (st, src, n); - if (!mask_has_at_most_one_bit_p (st->big5.mask)) - st->big5.mask = detect_coding_big5 (st, src, n); - - st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask; - - { - int retval = mask_has_at_most_one_bit_p (st->mask); - st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK; - return retval && st->eol_type != EOL_AUTODETECT; - } -} - -static Lisp_Object -coding_system_from_mask (int mask) -{ - if (mask == ~0) - { - /* If the file was entirely or basically ASCII, use the - default value of `buffer-file-coding-system'. */ - Lisp_Object retval = - XBUFFER (Vbuffer_defaults)->buffer_file_coding_system; - if (!NILP (retval)) - { - retval = Ffind_coding_system (retval); - if (NILP (retval)) - { - warn_when_safe - (Qbad_variable, Qwarning, - "Invalid `default-buffer-file-coding-system', set to nil"); - XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil; - } - } - if (NILP (retval)) - retval = Fget_coding_system (Qno_conversion); - return retval; - } - else - { - int i; - int cat = -1; - - mask = postprocess_iso2022_mask (mask); - - /* Look through the coding categories by priority and find - the first one that is allowed. */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) - { - cat = coding_category_by_priority[i]; - if ((mask & (1 << cat)) && - !NILP (coding_category_system[cat])) - break; - } - if (cat >= 0) - return coding_category_system[cat]; - else - return Fget_coding_system (Qno_conversion); - } -} - -/* Given a seekable read stream and potential coding system and EOL type - as specified, do any autodetection that is called for. If the - coding system and/or EOL type are not autodetect, they will be left - alone; but this function will never return an autodetect coding system - or EOL type. - - This function does not automatically fetch subsidiary coding systems; - that should be unnecessary with the explicit eol-type argument. */ - -void -determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, - enum eol_type *eol_type_in_out) -{ - struct detection_state decst; - - if (*eol_type_in_out == EOL_AUTODETECT) - *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out); - - xzero (decst); - decst.eol_type = *eol_type_in_out; - decst.mask = ~0; - - /* If autodetection is called for, do it now. */ - if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT || - *eol_type_in_out == EOL_AUTODETECT) - { - - while (1) - { - unsigned char random_buffer[4096]; - int nread; - - nread = Lstream_read (stream, random_buffer, sizeof (random_buffer)); - if (!nread) - break; - if (detect_coding_type (&decst, random_buffer, nread, - XCODING_SYSTEM_TYPE (*codesys_in_out) != - CODESYS_AUTODETECT)) - break; - } - - *eol_type_in_out = decst.eol_type; - if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT) - *codesys_in_out = coding_system_from_mask (decst.mask); - } - - /* If we absolutely can't determine the EOL type, just assume LF. */ - if (*eol_type_in_out == EOL_AUTODETECT) - *eol_type_in_out = EOL_LF; - - Lstream_rewind (stream); -} - -DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /* -Detect coding system of the text in the region between START and END. -Returned a list of possible coding systems ordered by priority. -If only ASCII characters are found, it returns 'undecided or one of -its subsidiary coding systems according to a detected end-of-line -type. Optional arg BUFFER defaults to the current buffer. -*/ - (start, end, buffer)) -{ - Lisp_Object val = Qnil; - struct buffer *buf = decode_buffer (buffer, 0); - Bufpos b, e; - Lisp_Object instream, lb_instream; - Lstream *istr, *lb_istr; - struct detection_state decst; - struct gcpro gcpro1, gcpro2; - - get_buffer_range_char (buf, start, end, &b, &e, 0); - lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0); - lb_istr = XLSTREAM (lb_instream); - instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary)); - istr = XLSTREAM (instream); - GCPRO2 (instream, lb_instream); - xzero (decst); - decst.eol_type = EOL_AUTODETECT; - decst.mask = ~0; - while (1) - { - unsigned char random_buffer[4096]; - int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer)); - - if (!nread) - break; - if (detect_coding_type (&decst, random_buffer, nread, 0)) - break; - } - - if (decst.mask == ~0) - val = subsidiary_coding_system (Fget_coding_system (Qundecided), - decst.eol_type); - else - { - int i; - - val = Qnil; - - decst.mask = postprocess_iso2022_mask (decst.mask); - - for (i = CODING_CATEGORY_LAST; i >= 0; i--) - { - int sys = coding_category_by_priority[i]; - if (decst.mask & (1 << sys)) - { - Lisp_Object codesys = coding_category_system[sys]; - if (!NILP (codesys)) - codesys = subsidiary_coding_system (codesys, decst.eol_type); - val = Fcons (codesys, val); - } - } - } - Lstream_close (istr); - UNGCPRO; - Lstream_delete (istr); - Lstream_delete (lb_istr); - return val; -} - - -/************************************************************************/ -/* Converting to internal Mule format ("decoding") */ -/************************************************************************/ - -/* A decoding stream is a stream used for decoding text (i.e. - converting from some external format to internal format). - The decoding-stream object keeps track of the actual coding - stream, the stream that is at the other end, and data that - needs to be persistent across the lifetime of the stream. */ - -/* Handle the EOL stuff related to just-read-in character C. - EOL_TYPE is the EOL type of the coding stream. - FLAGS is the current value of FLAGS in the coding stream, and may - be modified by this macro. (The macro only looks at the - CODING_STATE_CR flag.) DST is the Dynarr to which the decoded - bytes are to be written. You need to also define a local goto - label "label_continue_loop" that is at the end of the main - character-reading loop. - - If C is a CR character, then this macro handles it entirely and - jumps to label_continue_loop. Otherwise, this macro does not add - anything to DST, and continues normally. You should continue - processing C normally after this macro. */ - -#define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \ -do { \ - if (c == '\r') \ - { \ - if (eol_type == EOL_CR) \ - Dynarr_add (dst, '\n'); \ - else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \ - Dynarr_add (dst, c); \ - else \ - flags |= CODING_STATE_CR; \ - goto label_continue_loop; \ - } \ - else if (flags & CODING_STATE_CR) \ - { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \ - if (c != '\n') \ - Dynarr_add (dst, '\r'); \ - flags &= ~CODING_STATE_CR; \ - } \ -} while (0) - -/* C should be a binary character in the range 0 - 255; convert - to internal format and add to Dynarr DST. */ - -#define DECODE_ADD_BINARY_CHAR(c, dst) \ -do { \ - if (BYTE_ASCII_P (c)) \ - Dynarr_add (dst, c); \ - else if (BYTE_C1_P (c)) \ - { \ - Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \ - Dynarr_add (dst, c + 0x20); \ - } \ - else \ - { \ - Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \ - Dynarr_add (dst, c); \ - } \ -} while (0) - -#define DECODE_OUTPUT_PARTIAL_CHAR(ch) \ -do { \ - if (ch) \ - { \ - DECODE_ADD_BINARY_CHAR (ch, dst); \ - ch = 0; \ - } \ -} while (0) - -#define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \ -do { \ - DECODE_OUTPUT_PARTIAL_CHAR (ch); \ - if ((flags & CODING_STATE_END) && \ - (flags & CODING_STATE_CR)) \ - Dynarr_add (dst, '\r'); \ -} while (0) - -#define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding) - -struct decoding_stream -{ - /* Coding system that governs the conversion. */ - struct Lisp_Coding_System *codesys; - - /* Stream that we read the encoded data from or - write the decoded data to. */ - Lstream *other_end; - - /* If we are reading, then we can return only a fixed amount of - data, so if the conversion resulted in too much data, we store it - here for retrieval the next time around. */ - unsigned_char_dynarr *runoff; - - /* FLAGS holds flags indicating the current state of the decoding. - Some of these flags are dependent on the coding system. */ - unsigned int flags; - - /* CH holds a partially built-up character. Since we only deal - with one- and two-byte characters at the moment, we only use - this to store the first byte of a two-byte character. */ - unsigned int ch; - - /* EOL_TYPE specifies the type of end-of-line conversion that - currently applies. We need to keep this separate from the - EOL type stored in CODESYS because the latter might indicate - automatic EOL-type detection while the former will always - indicate a particular EOL type. */ - enum eol_type eol_type; - - /* Additional ISO2022 information. We define the structure above - because it's also needed by the detection routines. */ - struct iso2022_decoder iso2022; - - /* Additional information (the state of the running CCL program) - used by the CCL decoder. */ - struct ccl_program ccl; - - struct detection_state decst; -}; - -static int decoding_reader (Lstream *stream, unsigned char *data, size_t size); -static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size); -static int decoding_rewinder (Lstream *stream); -static int decoding_seekable_p (Lstream *stream); -static int decoding_flusher (Lstream *stream); -static int decoding_closer (Lstream *stream); -static Lisp_Object decoding_marker (Lisp_Object stream, - void (*markobj) (Lisp_Object)); - -DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding, - sizeof (struct decoding_stream)); - -static Lisp_Object -decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object)) -{ - Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end; - Lisp_Object str_obj; - - /* We do not need to mark the coding systems or charsets stored - within the stream because they are stored in a global list - and automatically marked. */ - - XSETLSTREAM (str_obj, str); - markobj (str_obj); - if (str->imp->marker) - return (str->imp->marker) (str_obj, markobj); - else - return Qnil; -} - -/* Read SIZE bytes of data and store it into DATA. We are a decoding stream - so we read data from the other end, decode it, and store it into DATA. */ - -static int -decoding_reader (Lstream *stream, unsigned char *data, size_t size) -{ - struct decoding_stream *str = DECODING_STREAM_DATA (stream); - unsigned char *orig_data = data; - int read_size; - int error_occurred = 0; - - /* We need to interface to mule_decode(), which expects to take some - amount of data and store the result into a Dynarr. We have - mule_decode() store into str->runoff, and take data from there - as necessary. */ - - /* We loop until we have enough data, reading chunks from the other - end and decoding it. */ - while (1) - { - /* Take data from the runoff if we can. Make sure to take at - most SIZE bytes, and delete the data from the runoff. */ - if (Dynarr_length (str->runoff) > 0) - { - size_t chunk = min (size, (size_t) Dynarr_length (str->runoff)); - memcpy (data, Dynarr_atp (str->runoff, 0), chunk); - Dynarr_delete_many (str->runoff, 0, chunk); - data += chunk; - size -= chunk; - } - - if (size == 0) - break; /* No more room for data */ - - if (str->flags & CODING_STATE_END) - /* This means that on the previous iteration, we hit the EOF on - the other end. We loop once more so that mule_decode() can - output any final stuff it may be holding, or any "go back - to a sane state" escape sequences. (This latter makes sense - during encoding.) */ - break; - - /* Exhausted the runoff, so get some more. DATA has at least - SIZE bytes left of storage in it, so it's OK to read directly - into it. (We'll be overwriting above, after we've decoded it - into the runoff.) */ - read_size = Lstream_read (str->other_end, data, size); - if (read_size < 0) - { - error_occurred = 1; - break; - } - if (read_size == 0) - /* There might be some more end data produced in the translation. - See the comment above. */ - str->flags |= CODING_STATE_END; - mule_decode (stream, data, str->runoff, read_size); - } - - if (data - orig_data == 0) - return error_occurred ? -1 : 0; - else - return data - orig_data; -} - -static int -decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size) -{ - struct decoding_stream *str = DECODING_STREAM_DATA (stream); - int retval; - - /* Decode all our data into the runoff, and then attempt to write - it all out to the other end. Remove whatever chunk we succeeded - in writing. */ - mule_decode (stream, data, str->runoff, size); - retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0), - Dynarr_length (str->runoff)); - if (retval > 0) - Dynarr_delete_many (str->runoff, 0, retval); - /* Do NOT return retval. The return value indicates how much - of the incoming data was written, not how many bytes were - written. */ - return size; -} - -static void -reset_decoding_stream (struct decoding_stream *str) -{ - if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022) - { - Lisp_Object coding_system; - XSETCODING_SYSTEM (coding_system, str->codesys); - reset_iso2022 (coding_system, &str->iso2022); - } - else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL) - { - setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys)); - } - - str->flags = str->ch = 0; -} - -static int -decoding_rewinder (Lstream *stream) -{ - struct decoding_stream *str = DECODING_STREAM_DATA (stream); - reset_decoding_stream (str); - Dynarr_reset (str->runoff); - return Lstream_rewind (str->other_end); -} - -static int -decoding_seekable_p (Lstream *stream) -{ - struct decoding_stream *str = DECODING_STREAM_DATA (stream); - return Lstream_seekable_p (str->other_end); -} - -static int -decoding_flusher (Lstream *stream) -{ - struct decoding_stream *str = DECODING_STREAM_DATA (stream); - return Lstream_flush (str->other_end); -} - -static int -decoding_closer (Lstream *stream) -{ - struct decoding_stream *str = DECODING_STREAM_DATA (stream); - if (stream->flags & LSTREAM_FL_WRITE) - { - str->flags |= CODING_STATE_END; - decoding_writer (stream, 0, 0); - } - Dynarr_free (str->runoff); - if (str->iso2022.composite_chars) - Dynarr_free (str->iso2022.composite_chars); - return Lstream_close (str->other_end); -} - -Lisp_Object -decoding_stream_coding_system (Lstream *stream) -{ - Lisp_Object coding_system; - struct decoding_stream *str = DECODING_STREAM_DATA (stream); - - XSETCODING_SYSTEM (coding_system, str->codesys); - return subsidiary_coding_system (coding_system, str->eol_type); -} - -void -set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) -{ - struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); - struct decoding_stream *str = DECODING_STREAM_DATA (lstr); - str->codesys = cs; - if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) - str->eol_type = CODING_SYSTEM_EOL_TYPE (cs); - reset_decoding_stream (str); -} - -/* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding - stream for writing, no automatic code detection will be performed. - The reason for this is that automatic code detection requires a - seekable input. Things will also fail if you open a decoding - stream for reading using a non-fully-specified coding system and - a non-seekable input stream. */ - -static Lisp_Object -make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys, - CONST char *mode) -{ - Lstream *lstr = Lstream_new (lstream_decoding, mode); - struct decoding_stream *str = DECODING_STREAM_DATA (lstr); - Lisp_Object obj; - - xzero (*str); - str->other_end = stream; - str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char); - str->eol_type = EOL_AUTODETECT; - if (!strcmp (mode, "r") - && Lstream_seekable_p (stream)) - /* We can determine the coding system now. */ - determine_real_coding_system (stream, &codesys, &str->eol_type); - set_decoding_stream_coding_system (lstr, codesys); - str->decst.eol_type = str->eol_type; - str->decst.mask = ~0; - XSETLSTREAM (obj, lstr); - return obj; -} - -Lisp_Object -make_decoding_input_stream (Lstream *stream, Lisp_Object codesys) -{ - return make_decoding_stream_1 (stream, codesys, "r"); -} - -Lisp_Object -make_decoding_output_stream (Lstream *stream, Lisp_Object codesys) -{ - return make_decoding_stream_1 (stream, codesys, "w"); -} - -/* Note: the decode_coding_* functions all take the same - arguments as mule_decode(), which is to say some SRC data of - size N, which is to be stored into dynamic array DST. - DECODING is the stream within which the decoding is - taking place, but no data is actually read from or - written to that stream; that is handled in decoding_reader() - or decoding_writer(). This allows the same functions to - be used for both reading and writing. */ - -static void -mule_decode (Lstream *decoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - struct decoding_stream *str = DECODING_STREAM_DATA (decoding); - - /* If necessary, do encoding-detection now. We do this when - we're a writing stream or a non-seekable reading stream, - meaning that we can't just process the whole input, - rewind, and start over. */ - - if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT || - str->eol_type == EOL_AUTODETECT) - { - Lisp_Object codesys; - - XSETCODING_SYSTEM (codesys, str->codesys); - detect_coding_type (&str->decst, src, n, - CODING_SYSTEM_TYPE (str->codesys) != - CODESYS_AUTODETECT); - if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT && - str->decst.mask != ~0) - /* #### This is cheesy. What we really ought to do is - buffer up a certain amount of data so as to get a - less random result. */ - codesys = coding_system_from_mask (str->decst.mask); - str->eol_type = str->decst.eol_type; - if (XCODING_SYSTEM (codesys) != str->codesys) - { - /* Preserve the CODING_STATE_END flag in case it was set. - If we erase it, bad things might happen. */ - int was_end = str->flags & CODING_STATE_END; - set_decoding_stream_coding_system (decoding, codesys); - if (was_end) - str->flags |= CODING_STATE_END; - } - } - - switch (CODING_SYSTEM_TYPE (str->codesys)) - { -#ifdef DEBUG_XEMACS - case CODESYS_INTERNAL: - Dynarr_add_many (dst, src, n); - break; -#endif - case CODESYS_AUTODETECT: - /* If we got this far and still haven't decided on the coding - system, then do no conversion. */ - case CODESYS_NO_CONVERSION: - decode_coding_no_conversion (decoding, src, dst, n); - break; - case CODESYS_SHIFT_JIS: - decode_coding_sjis (decoding, src, dst, n); - break; - case CODESYS_BIG5: - decode_coding_big5 (decoding, src, dst, n); - break; - case CODESYS_CCL: - ccl_driver (&str->ccl, src, dst, n, 0); - break; - case CODESYS_ISO2022: - decode_coding_iso2022 (decoding, src, dst, n); - break; - default: - abort (); - } -} - -DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /* -Decode the text between START and END which is encoded in CODING-SYSTEM. -This is useful if you've read in encoded text from a file without decoding -it (e.g. you read in a JIS-formatted file but used the `binary' or -`no-conversion' coding system, so that it shows up as "^[$B! [ENCODE AS BINARY] - ------> [DECODE AS SPECIFIED] - ------> [BUFFER] - */ - - while (1) - { - char tempbuf[1024]; /* some random amount */ - Bufpos newpos, even_newer_pos; - Bufpos oldpos = lisp_buffer_stream_startpos (istr); - int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); - - if (!size_in_bytes) - break; - newpos = lisp_buffer_stream_startpos (istr); - Lstream_write (ostr, tempbuf, size_in_bytes); - even_newer_pos = lisp_buffer_stream_startpos (istr); - buffer_delete_range (buf, even_newer_pos - (newpos - oldpos), - even_newer_pos, 0); - } - Lstream_close (istr); - Lstream_close (ostr); - UNGCPRO; - Lstream_delete (istr); - Lstream_delete (ostr); - Lstream_delete (XLSTREAM (de_outstream)); - Lstream_delete (XLSTREAM (lb_outstream)); - return Qnil; -} - - -/************************************************************************/ -/* Converting to an external encoding ("encoding") */ -/************************************************************************/ - -/* An encoding stream is an output stream. When you create the - stream, you specify the coding system that governs the encoding - and another stream that the resulting encoded data is to be - sent to, and then start sending data to it. */ - -#define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding) - -struct encoding_stream -{ - /* Coding system that governs the conversion. */ - struct Lisp_Coding_System *codesys; - - /* Stream that we read the encoded data from or - write the decoded data to. */ - Lstream *other_end; - - /* If we are reading, then we can return only a fixed amount of - data, so if the conversion resulted in too much data, we store it - here for retrieval the next time around. */ - unsigned_char_dynarr *runoff; - - /* FLAGS holds flags indicating the current state of the encoding. - Some of these flags are dependent on the coding system. */ - unsigned int flags; - - /* CH holds a partially built-up character. Since we only deal - with one- and two-byte characters at the moment, we only use - this to store the first byte of a two-byte character. */ - unsigned int ch; - - /* Additional information used by the ISO2022 encoder. */ - struct - { - /* CHARSET holds the character sets currently assigned to the G0 - through G3 registers. It is initialized from the array - INITIAL_CHARSET in CODESYS. */ - Lisp_Object charset[4]; - - /* Which registers are currently invoked into the left (GL) and - right (GR) halves of the 8-bit encoding space? */ - int register_left, register_right; - - /* Whether we need to explicitly designate the charset in the - G? register before using it. It is initialized from the - array FORCE_CHARSET_ON_OUTPUT in CODESYS. */ - unsigned char force_charset_on_output[4]; - - /* Other state variables that need to be preserved across - invocations. */ - Lisp_Object current_charset; - int current_half; - int current_char_boundary; - } iso2022; - - /* Additional information (the state of the running CCL program) - used by the CCL encoder. */ - struct ccl_program ccl; -}; - -static int encoding_reader (Lstream *stream, unsigned char *data, size_t size); -static int encoding_writer (Lstream *stream, CONST unsigned char *data, - size_t size); -static int encoding_rewinder (Lstream *stream); -static int encoding_seekable_p (Lstream *stream); -static int encoding_flusher (Lstream *stream); -static int encoding_closer (Lstream *stream); -static Lisp_Object encoding_marker (Lisp_Object stream, - void (*markobj) (Lisp_Object)); - -DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding, - sizeof (struct encoding_stream)); - -static Lisp_Object -encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object)) -{ - Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end; - Lisp_Object str_obj; - - /* We do not need to mark the coding systems or charsets stored - within the stream because they are stored in a global list - and automatically marked. */ - - XSETLSTREAM (str_obj, str); - markobj (str_obj); - if (str->imp->marker) - return (str->imp->marker) (str_obj, markobj); - else - return Qnil; -} - -/* Read SIZE bytes of data and store it into DATA. We are a encoding stream - so we read data from the other end, encode it, and store it into DATA. */ - -static int -encoding_reader (Lstream *stream, unsigned char *data, size_t size) -{ - struct encoding_stream *str = ENCODING_STREAM_DATA (stream); - unsigned char *orig_data = data; - int read_size; - int error_occurred = 0; - - /* We need to interface to mule_encode(), which expects to take some - amount of data and store the result into a Dynarr. We have - mule_encode() store into str->runoff, and take data from there - as necessary. */ - - /* We loop until we have enough data, reading chunks from the other - end and encoding it. */ - while (1) - { - /* Take data from the runoff if we can. Make sure to take at - most SIZE bytes, and delete the data from the runoff. */ - if (Dynarr_length (str->runoff) > 0) - { - int chunk = min ((int) size, Dynarr_length (str->runoff)); - memcpy (data, Dynarr_atp (str->runoff, 0), chunk); - Dynarr_delete_many (str->runoff, 0, chunk); - data += chunk; - size -= chunk; - } - - if (size == 0) - break; /* No more room for data */ - - if (str->flags & CODING_STATE_END) - /* This means that on the previous iteration, we hit the EOF on - the other end. We loop once more so that mule_encode() can - output any final stuff it may be holding, or any "go back - to a sane state" escape sequences. (This latter makes sense - during encoding.) */ - break; - - /* Exhausted the runoff, so get some more. DATA at least SIZE bytes - left of storage in it, so it's OK to read directly into it. - (We'll be overwriting above, after we've encoded it into the - runoff.) */ - read_size = Lstream_read (str->other_end, data, size); - if (read_size < 0) - { - error_occurred = 1; - break; - } - if (read_size == 0) - /* There might be some more end data produced in the translation. - See the comment above. */ - str->flags |= CODING_STATE_END; - mule_encode (stream, data, str->runoff, read_size); - } - - if (data == orig_data) - return error_occurred ? -1 : 0; - else - return data - orig_data; -} - -static int -encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size) -{ - struct encoding_stream *str = ENCODING_STREAM_DATA (stream); - int retval; - - /* Encode all our data into the runoff, and then attempt to write - it all out to the other end. Remove whatever chunk we succeeded - in writing. */ - mule_encode (stream, data, str->runoff, size); - retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0), - Dynarr_length (str->runoff)); - if (retval > 0) - Dynarr_delete_many (str->runoff, 0, retval); - /* Do NOT return retval. The return value indicates how much - of the incoming data was written, not how many bytes were - written. */ - return size; -} - -static void -reset_encoding_stream (struct encoding_stream *str) -{ - switch (CODING_SYSTEM_TYPE (str->codesys)) - { - case CODESYS_ISO2022: - { - int i; - - for (i = 0; i < 4; i++) - { - str->iso2022.charset[i] = - CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i); - str->iso2022.force_charset_on_output[i] = - CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i); - } - str->iso2022.register_left = 0; - str->iso2022.register_right = 1; - str->iso2022.current_charset = Qnil; - str->iso2022.current_half = 0; - str->iso2022.current_char_boundary = 1; - break; - } - case CODESYS_CCL: - setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys)); - break; - default: - break; - } - - str->flags = str->ch = 0; -} - -static int -encoding_rewinder (Lstream *stream) -{ - struct encoding_stream *str = ENCODING_STREAM_DATA (stream); - reset_encoding_stream (str); - Dynarr_reset (str->runoff); - return Lstream_rewind (str->other_end); -} - -static int -encoding_seekable_p (Lstream *stream) -{ - struct encoding_stream *str = ENCODING_STREAM_DATA (stream); - return Lstream_seekable_p (str->other_end); -} - -static int -encoding_flusher (Lstream *stream) -{ - struct encoding_stream *str = ENCODING_STREAM_DATA (stream); - return Lstream_flush (str->other_end); -} - -static int -encoding_closer (Lstream *stream) -{ - struct encoding_stream *str = ENCODING_STREAM_DATA (stream); - if (stream->flags & LSTREAM_FL_WRITE) - { - str->flags |= CODING_STATE_END; - encoding_writer (stream, 0, 0); - } - Dynarr_free (str->runoff); - return Lstream_close (str->other_end); -} - -Lisp_Object -encoding_stream_coding_system (Lstream *stream) -{ - Lisp_Object coding_system; - struct encoding_stream *str = ENCODING_STREAM_DATA (stream); - - XSETCODING_SYSTEM (coding_system, str->codesys); - return coding_system; -} - -void -set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) -{ - struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); - struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); - str->codesys = cs; - reset_encoding_stream (str); -} - -static Lisp_Object -make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys, - CONST char *mode) -{ - Lstream *lstr = Lstream_new (lstream_encoding, mode); - struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); - Lisp_Object obj; - - xzero (*str); - str->runoff = Dynarr_new (unsigned_char); - str->other_end = stream; - set_encoding_stream_coding_system (lstr, codesys); - XSETLSTREAM (obj, lstr); - return obj; -} - -Lisp_Object -make_encoding_input_stream (Lstream *stream, Lisp_Object codesys) -{ - return make_encoding_stream_1 (stream, codesys, "r"); -} - -Lisp_Object -make_encoding_output_stream (Lstream *stream, Lisp_Object codesys) -{ - return make_encoding_stream_1 (stream, codesys, "w"); -} - -/* Convert N bytes of internally-formatted data stored in SRC to an - external format, according to the encoding stream ENCODING. - Store the encoded data into DST. */ - -static void -mule_encode (Lstream *encoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); - - switch (CODING_SYSTEM_TYPE (str->codesys)) - { -#ifdef DEBUG_XEMACS - case CODESYS_INTERNAL: - Dynarr_add_many (dst, src, n); - break; -#endif - case CODESYS_AUTODETECT: - /* If we got this far and still haven't decided on the coding - system, then do no conversion. */ - case CODESYS_NO_CONVERSION: - encode_coding_no_conversion (encoding, src, dst, n); - break; - case CODESYS_SHIFT_JIS: - encode_coding_sjis (encoding, src, dst, n); - break; - case CODESYS_BIG5: - encode_coding_big5 (encoding, src, dst, n); - break; - case CODESYS_CCL: - ccl_driver (&str->ccl, src, dst, n, 0); - break; - case CODESYS_ISO2022: - encode_coding_iso2022 (encoding, src, dst, n); - break; - default: - abort (); - } -} - -DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /* -Encode the text between START and END using CODING-SYSTEM. -This will, for example, convert Japanese characters into stuff such as -"^[$B! [ENCODE AS SPECIFIED] - ------> [DECODE AS BINARY] - ------> [BUFFER] - */ - while (1) - { - char tempbuf[1024]; /* some random amount */ - Bufpos newpos, even_newer_pos; - Bufpos oldpos = lisp_buffer_stream_startpos (istr); - int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); - - if (!size_in_bytes) - break; - newpos = lisp_buffer_stream_startpos (istr); - Lstream_write (ostr, tempbuf, size_in_bytes); - even_newer_pos = lisp_buffer_stream_startpos (istr); - buffer_delete_range (buf, even_newer_pos - (newpos - oldpos), - even_newer_pos, 0); - } - - { - Charcount retlen = - lisp_buffer_stream_startpos (XLSTREAM (instream)) - b; - Lstream_close (istr); - Lstream_close (ostr); - UNGCPRO; - Lstream_delete (istr); - Lstream_delete (ostr); - Lstream_delete (XLSTREAM (de_outstream)); - Lstream_delete (XLSTREAM (lb_outstream)); - return make_int (retlen); - } -} - - -/************************************************************************/ -/* Shift-JIS methods */ -/************************************************************************/ - -/* Shift-JIS is a coding system encoding three character sets: ASCII, right - half of JISX0201-Kana, and JISX0208. An ASCII character is encoded - as is. A character of JISX0201-Kana (TYPE94 character set) is - encoded by "position-code + 0x80". A character of JISX0208 - (TYPE94x94 character set) is encoded in 2-byte but two - position-codes are divided and shifted so that it fit in the range - below. - - --- CODE RANGE of Shift-JIS --- - (character set) (range) - ASCII 0x00 .. 0x7F - JISX0201-Kana 0xA0 .. 0xDF - JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF - (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC - ------------------------------- - -*/ - -/* Is this the first byte of a Shift-JIS two-byte char? */ - -#define BYTE_SJIS_TWO_BYTE_1_P(c) \ - (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF)) - -/* Is this the second byte of a Shift-JIS two-byte char? */ - -#define BYTE_SJIS_TWO_BYTE_2_P(c) \ - (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC)) - -#define BYTE_SJIS_KATAKANA_P(c) \ - ((c) >= 0xA1 && (c) <= 0xDF) - -static int -detect_coding_sjis (struct detection_state *st, CONST unsigned char *src, - unsigned int n) -{ - int c; - - while (n--) - { - c = *src++; - if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) - return 0; - if (st->shift_jis.in_second_byte) - { - st->shift_jis.in_second_byte = 0; - if (c < 0x40) - return 0; - } - else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0) - st->shift_jis.in_second_byte = 1; - } - return CODING_CATEGORY_SHIFT_JIS_MASK; -} - -/* Convert Shift-JIS data to internal format. */ - -static void -decode_coding_sjis (Lstream *decoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - unsigned char c; - unsigned int flags, ch; - enum eol_type eol_type; - struct decoding_stream *str = DECODING_STREAM_DATA (decoding); - - CODING_STREAM_DECOMPOSE (str, flags, ch); - eol_type = str->eol_type; - - while (n--) - { - c = *src++; - - if (ch) - { - /* Previous character was first byte of Shift-JIS Kanji char. */ - if (BYTE_SJIS_TWO_BYTE_2_P (c)) - { - unsigned char e1, e2; - - Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); - DECODE_SJIS (ch, c, e1, e2); - Dynarr_add (dst, e1); - Dynarr_add (dst, e2); - } - else - { - DECODE_ADD_BINARY_CHAR (ch, dst); - DECODE_ADD_BINARY_CHAR (c, dst); - } - ch = 0; - } - else - { - DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); - if (BYTE_SJIS_TWO_BYTE_1_P (c)) - ch = c; - else if (BYTE_SJIS_KATAKANA_P (c)) - { - Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201); - Dynarr_add (dst, c); - } - else - DECODE_ADD_BINARY_CHAR (c, dst); - } - label_continue_loop:; - } - - DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); - - CODING_STREAM_COMPOSE (str, flags, ch); -} - -/* Convert internally-formatted data to Shift-JIS. */ - -static void -encode_coding_sjis (Lstream *encoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - unsigned char c; - struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); - unsigned int flags, ch; - enum eol_type eol_type; - - CODING_STREAM_DECOMPOSE (str, flags, ch); - eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); - - while (n--) - { - c = *src++; - if (c == '\n') - { - if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) - Dynarr_add (dst, '\r'); - if (eol_type != EOL_CR) - Dynarr_add (dst, '\n'); - ch = 0; - } - else if (BYTE_ASCII_P (c)) - { - Dynarr_add (dst, c); - ch = 0; - } - else if (BUFBYTE_LEADING_BYTE_P (c)) - ch = (c == LEADING_BYTE_KATAKANA_JISX0201 || - c == LEADING_BYTE_JAPANESE_JISX0208_1978 || - c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0; - else if (ch) - { - if (ch == LEADING_BYTE_KATAKANA_JISX0201) - { - Dynarr_add (dst, c); - ch = 0; - } - else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 || - ch == LEADING_BYTE_JAPANESE_JISX0208) - ch = c; - else - { - unsigned char j1, j2; - ENCODE_SJIS (ch, c, j1, j2); - Dynarr_add (dst, j1); - Dynarr_add (dst, j2); - ch = 0; - } - } - } - - CODING_STREAM_COMPOSE (str, flags, ch); -} - -DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /* -Decode a JISX0208 character of Shift-JIS coding-system. -CODE is the character code in Shift-JIS as a cons of type bytes. -Return the corresponding character. -*/ - (code)) -{ - unsigned char c1, c2, s1, s2; - - CHECK_CONS (code); - CHECK_INT (XCAR (code)); - CHECK_INT (XCDR (code)); - s1 = XINT (XCAR (code)); - s2 = XINT (XCDR (code)); - if (BYTE_SJIS_TWO_BYTE_1_P (s1) && - BYTE_SJIS_TWO_BYTE_2_P (s2)) - { - DECODE_SJIS (s1, s2, c1, c2); - return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208, - c1 & 0x7F, c2 & 0x7F)); - } - else - return Qnil; -} - -DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /* -Encode a JISX0208 character CHAR to SHIFT-JIS coding-system. -Return the corresponding character code in SHIFT-JIS as a cons of two bytes. -*/ - (ch)) -{ - Lisp_Object charset; - int c1, c2, s1, s2; - - CHECK_CHAR_COERCE_INT (ch); - BREAKUP_CHAR (XCHAR (ch), charset, c1, c2); - if (EQ (charset, Vcharset_japanese_jisx0208)) - { - ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2); - return Fcons (make_int (s1), make_int (s2)); - } - else - return Qnil; -} - - -/************************************************************************/ -/* Big5 methods */ -/************************************************************************/ - -/* BIG5 is a coding system encoding two character sets: ASCII and - Big5. An ASCII character is encoded as is. Big5 is a two-byte - character set and is encoded in two-byte. - - --- CODE RANGE of BIG5 --- - (character set) (range) - ASCII 0x00 .. 0x7F - Big5 (1st byte) 0xA1 .. 0xFE - (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE - -------------------------- - - Since the number of characters in Big5 is larger than maximum - characters in Emacs' charset (96x96), it can't be handled as one - charset. So, in Emacs, Big5 is divided into two: `charset-big5-1' - and `charset-big5-2'. Both s are TYPE94x94. The former - contains frequently used characters and the latter contains less - frequently used characters. */ - -#define BYTE_BIG5_TWO_BYTE_1_P(c) \ - ((c) >= 0xA1 && (c) <= 0xFE) - -/* Is this the second byte of a Shift-JIS two-byte char? */ - -#define BYTE_BIG5_TWO_BYTE_2_P(c) \ - (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE)) - -/* Number of Big5 characters which have the same code in 1st byte. */ - -#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40) - -/* Code conversion macros. These are macros because they are used in - inner loops during code conversion. - - Note that temporary variables in macros introduce the classic - dynamic-scoping problems with variable names. We use capital- - lettered variables in the assumption that XEmacs does not use - capital letters in variables except in a very formalized way - (e.g. Qstring). */ - -/* Convert Big5 code (b1, b2) into its internal string representation - (lb, c1, c2). */ - -/* There is a much simpler way to split the Big5 charset into two. - For the moment I'm going to leave the algorithm as-is because it - claims to separate out the most-used characters into a single - charset, which perhaps will lead to optimizations in various - places. - - The way the algorithm works is something like this: - - Big5 can be viewed as a 94x157 charset, where the row is - encoded into the bytes 0xA1 .. 0xFE and the column is encoded - into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency, - the split between low and high column numbers is apparently - meaningless; ascending rows produce less and less frequent chars. - Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to - the first charset, and the upper half (0xC9 .. 0xFE) to the - second. To do the conversion, we convert the character into - a single number where 0 .. 156 is the first row, 157 .. 313 - is the second, etc. That way, the characters are ordered by - decreasing frequency. Then we just chop the space in two - and coerce the result into a 94x94 space. - */ - -#define DECODE_BIG5(b1, b2, lb, c1, c2) do \ -{ \ - int B1 = b1, B2 = b2; \ - unsigned int I \ - = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \ - \ - if (B1 < 0xC9) \ - { \ - lb = LEADING_BYTE_CHINESE_BIG5_1; \ - } \ - else \ - { \ - lb = LEADING_BYTE_CHINESE_BIG5_2; \ - I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \ - } \ - c1 = I / (0xFF - 0xA1) + 0xA1; \ - c2 = I % (0xFF - 0xA1) + 0xA1; \ -} while (0) - -/* Convert the internal string representation of a Big5 character - (lb, c1, c2) into Big5 code (b1, b2). */ - -#define ENCODE_BIG5(lb, c1, c2, b1, b2) do \ -{ \ - unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \ - \ - if (lb == LEADING_BYTE_CHINESE_BIG5_2) \ - { \ - I += BIG5_SAME_ROW * (0xC9 - 0xA1); \ - } \ - b1 = I / BIG5_SAME_ROW + 0xA1; \ - b2 = I % BIG5_SAME_ROW; \ - b2 += b2 < 0x3F ? 0x40 : 0x62; \ -} while (0) - -static int -detect_coding_big5 (struct detection_state *st, CONST unsigned char *src, - unsigned int n) -{ - int c; - - while (n--) - { - c = *src++; - if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO || - (c >= 0x80 && c <= 0xA0)) - return 0; - if (st->big5.in_second_byte) - { - st->big5.in_second_byte = 0; - if (c < 0x40 || (c >= 0x80 && c <= 0xA0)) - return 0; - } - else if (c >= 0xA1) - st->big5.in_second_byte = 1; - } - return CODING_CATEGORY_BIG5_MASK; -} - -/* Convert Big5 data to internal format. */ - -static void -decode_coding_big5 (Lstream *decoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - unsigned char c; - unsigned int flags, ch; - enum eol_type eol_type; - struct decoding_stream *str = DECODING_STREAM_DATA (decoding); - - CODING_STREAM_DECOMPOSE (str, flags, ch); - eol_type = str->eol_type; - - while (n--) - { - c = *src++; - if (ch) - { - /* Previous character was first byte of Big5 char. */ - if (BYTE_BIG5_TWO_BYTE_2_P (c)) - { - unsigned char b1, b2, b3; - DECODE_BIG5 (ch, c, b1, b2, b3); - Dynarr_add (dst, b1); - Dynarr_add (dst, b2); - Dynarr_add (dst, b3); - } - else - { - DECODE_ADD_BINARY_CHAR (ch, dst); - DECODE_ADD_BINARY_CHAR (c, dst); - } - ch = 0; - } - else - { - DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); - if (BYTE_BIG5_TWO_BYTE_1_P (c)) - ch = c; - else - DECODE_ADD_BINARY_CHAR (c, dst); - } - label_continue_loop:; - } - - DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); - - CODING_STREAM_COMPOSE (str, flags, ch); -} - -/* Convert internally-formatted data to Big5. */ - -static void -encode_coding_big5 (Lstream *encoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - unsigned char c; - struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); - unsigned int flags, ch; - enum eol_type eol_type; - - CODING_STREAM_DECOMPOSE (str, flags, ch); - eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); - - while (n--) - { - c = *src++; - if (c == '\n') - { - if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) - Dynarr_add (dst, '\r'); - if (eol_type != EOL_CR) - Dynarr_add (dst, '\n'); - } - else if (BYTE_ASCII_P (c)) - { - /* ASCII. */ - Dynarr_add (dst, c); - } - else if (BUFBYTE_LEADING_BYTE_P (c)) - { - if (c == LEADING_BYTE_CHINESE_BIG5_1 || - c == LEADING_BYTE_CHINESE_BIG5_2) - { - /* A recognized leading byte. */ - ch = c; - continue; /* not done with this character. */ - } - /* otherwise just ignore this character. */ - } - else if (ch == LEADING_BYTE_CHINESE_BIG5_1 || - ch == LEADING_BYTE_CHINESE_BIG5_2) - { - /* Previous char was a recognized leading byte. */ - ch = (ch << 8) | c; - continue; /* not done with this character. */ - } - else if (ch) - { - /* Encountering second byte of a Big5 character. */ - unsigned char b1, b2; - - ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2); - Dynarr_add (dst, b1); - Dynarr_add (dst, b2); - } - - ch = 0; - } - - CODING_STREAM_COMPOSE (str, flags, ch); -} - - -DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /* -Decode a Big5 character CODE of BIG5 coding-system. -CODE is the character code in BIG5, a cons of two integers. -Return the corresponding character. -*/ - (code)) -{ - unsigned char c1, c2, b1, b2; - - CHECK_CONS (code); - CHECK_INT (XCAR (code)); - CHECK_INT (XCDR (code)); - b1 = XINT (XCAR (code)); - b2 = XINT (XCDR (code)); - if (BYTE_BIG5_TWO_BYTE_1_P (b1) && - BYTE_BIG5_TWO_BYTE_2_P (b2)) - { - int leading_byte; - Lisp_Object charset; - DECODE_BIG5 (b1, b2, leading_byte, c1, c2); - charset = CHARSET_BY_LEADING_BYTE (leading_byte); - return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F)); - } - else - return Qnil; -} - -DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /* -Encode the Big5 character CH to BIG5 coding-system. -Return the corresponding character code in Big5. -*/ - (ch)) -{ - Lisp_Object charset; - int c1, c2, b1, b2; - - CHECK_CHAR_COERCE_INT (ch); - BREAKUP_CHAR (XCHAR (ch), charset, c1, c2); - if (EQ (charset, Vcharset_chinese_big5_1) || - EQ (charset, Vcharset_chinese_big5_2)) - { - ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80, - b1, b2); - return Fcons (make_int (b1), make_int (b2)); - } - else - return Qnil; -} - - -/************************************************************************/ -/* ISO2022 methods */ -/************************************************************************/ - -/* The following note describes the coding system ISO2022 briefly. - Since the intention of this note is to help understanding of the - programs in this file, some parts are NOT ACCURATE or OVERLY - SIMPLIFIED. For thorough understanding, please refer to the - original document of ISO2022. - - ISO2022 provides many mechanisms to encode several character sets - in 7-bit and 8-bit environments. If one chooses 7-bit environment, - all text is encoded by codes of less than 128. This may make the - encoded text a little bit longer, but the text get more stability - to pass through several gateways (some of them strip off MSB). - - There are two kind of character sets: control character set and - graphic character set. The former contains control characters such - as `newline' and `escape' to provide control functions (control - functions are provided also by escape sequence). The latter - contains graphic characters such as 'A' and '-'. Emacs recognizes - two control character sets and many graphic character sets. - - Graphic character sets are classified into one of four types, - according to the dimension and number of characters in the set: - TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each - character set is assigned an identification byte, unique for each - type, called "final character" (denoted as hereafter). The - of each character set is decided by ECMA(*) when it is registered - in ISO. Code range of is 0x30..0x7F (0x30..0x3F are for - private use only). - - Note (*): ECMA = European Computer Manufacturers Association - - Here are examples of graphic character set [NAME()]: - o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ... - o TYPE96 -- right-half-of-ISO8859-1('A'), ... - o TYPE94x94 -- GB2312('A'), JISX0208('B'), ... - o TYPE96x96 -- none for the moment - - A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR. - C0 [0x00..0x1F] -- control character plane 0 - GL [0x20..0x7F] -- graphic character plane 0 - C1 [0x80..0x9F] -- control character plane 1 - GR [0xA0..0xFF] -- graphic character plane 1 - - A control character set is directly designated and invoked to C0 or - C1 by an escape sequence. The most common case is that: - - ISO646's control character set is designated/invoked to C0, and - - ISO6429's control character set is designated/invoked to C1, - and usually these designations/invocations are omitted in encoded - text. In a 7-bit environment, only C0 can be used, and a control - character for C1 is encoded by an appropriate escape sequence to - fit into the environment. All control characters for C1 are - defined to have corresponding escape sequences. - - A graphic character set is at first designated to one of four - graphic registers (G0 through G3), then these graphic registers are - invoked to GL or GR. These designations and invocations can be - done independently. The most common case is that G0 is invoked to - GL, G1 is invoked to GR, and ASCII is designated to G0. Usually - these invocations and designations are omitted in encoded text. - In a 7-bit environment, only GL can be used. - - When a graphic character set of TYPE94 or TYPE94x94 is invoked to - GL, codes 0x20 and 0x7F of the GL area work as control characters - SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area - should not be used. - - There are two ways of invocation: locking-shift and single-shift. - With locking-shift, the invocation lasts until the next different - invocation, whereas with single-shift, the invocation works only - for the following character and doesn't affect locking-shift. - Invocations are done by the following control characters or escape - sequences. - - ---------------------------------------------------------------------- - abbrev function cntrl escape seq description - ---------------------------------------------------------------------- - SI/LS0 (shift-in) 0x0F none invoke G0 into GL - SO/LS1 (shift-out) 0x0E none invoke G1 into GL - LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR - LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL - LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR - LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL - LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR - SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char - SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char - ---------------------------------------------------------------------- - The first four are for locking-shift. Control characters for these - functions are defined by macros ISO_CODE_XXX in `coding.h'. - - Designations are done by the following escape sequences. - ---------------------------------------------------------------------- - escape sequence description - ---------------------------------------------------------------------- - ESC '(' designate TYPE94 to G0 - ESC ')' designate TYPE94 to G1 - ESC '*' designate TYPE94 to G2 - ESC '+' designate TYPE94 to G3 - ESC ',' designate TYPE96 to G0 (*) - ESC '-' designate TYPE96 to G1 - ESC '.' designate TYPE96 to G2 - ESC '/' designate TYPE96 to G3 - ESC '$' '(' designate TYPE94x94 to G0 (**) - ESC '$' ')' designate TYPE94x94 to G1 - ESC '$' '*' designate TYPE94x94 to G2 - ESC '$' '+' designate TYPE94x94 to G3 - ESC '$' ',' designate TYPE96x96 to G0 (*) - ESC '$' '-' designate TYPE96x96 to G1 - ESC '$' '.' designate TYPE96x96 to G2 - ESC '$' '/' designate TYPE96x96 to G3 - ---------------------------------------------------------------------- - In this list, "TYPE94" means a graphic character set of type TYPE94 - and final character , and etc. - - Note (*): Although these designations are not allowed in ISO2022, - Emacs accepts them on decoding, and produces them on encoding - TYPE96 or TYPE96x96 character set in a coding system which is - characterized as 7-bit environment, non-locking-shift, and - non-single-shift. - - Note (**): If is '@', 'A', or 'B', the intermediate character - '(' can be omitted. We call this as "short-form" here after. - - Now you may notice that there are a lot of ways for encoding the - same multilingual text in ISO2022. Actually, there exist many - coding systems such as Compound Text (used in X's inter client - communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR - (used in Korean internet), EUC (Extended UNIX Code, used in Asian - localized platforms), and all of these are variants of ISO2022. - - In addition to the above, Emacs handles two more kinds of escape - sequences: ISO6429's direction specification and Emacs' private - sequence for specifying character composition. - - ISO6429's direction specification takes the following format: - o CSI ']' -- end of the current direction - o CSI '0' ']' -- end of the current direction - o CSI '1' ']' -- start of left-to-right text - o CSI '2' ']' -- start of right-to-left text - The control character CSI (0x9B: control sequence introducer) is - abbreviated to the escape sequence ESC '[' in 7-bit environment. - - Character composition specification takes the following format: - o ESC '0' -- start character composition - o ESC '1' -- end character composition - Since these are not standard escape sequences of any ISO, the use - of them for these meanings is restricted to Emacs only. */ - -static void -reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (!NILP (coding_system)) - iso->charset[i] = - XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i); - else - iso->charset[i] = Qt; - iso->invalid_designated[i] = 0; - } - iso->esc = ISO_ESC_NOTHING; - iso->esc_bytes_index = 0; - iso->register_left = 0; - iso->register_right = 1; - iso->switched_dir_and_no_valid_charset_yet = 0; - iso->invalid_switch_dir = 0; - iso->output_direction_sequence = 0; - iso->output_literally = 0; - if (iso->composite_chars) - Dynarr_reset (iso->composite_chars); -} - -static int -fit_to_be_escape_quoted (unsigned char c) -{ - switch (c) - { - case ISO_CODE_ESC: - case ISO_CODE_CSI: - case ISO_CODE_SS2: - case ISO_CODE_SS3: - case ISO_CODE_SO: - case ISO_CODE_SI: - return 1; - - default: - return 0; - } -} - -/* Parse one byte of an ISO2022 escape sequence. - If the result is an invalid escape sequence, return 0 and - do not change anything in STR. Otherwise, if the result is - an incomplete escape sequence, update ISO2022.ESC and - ISO2022.ESC_BYTES and return -1. Otherwise, update - all the state variables (but not ISO2022.ESC_BYTES) and - return 1. - - If CHECK_INVALID_CHARSETS is non-zero, check for designation - or invocation of an invalid character set and treat that as - an unrecognized escape sequence. */ - -static int -parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso, - unsigned char c, unsigned int *flags, - int check_invalid_charsets) -{ - /* (1) If we're at the end of a designation sequence, CS is the - charset being designated and REG is the register to designate - it to. - - (2) If we're at the end of a locking-shift sequence, REG is - the register to invoke and HALF (0 == left, 1 == right) is - the half to invoke it into. - - (3) If we're at the end of a single-shift sequence, REG is - the register to invoke. */ - Lisp_Object cs = Qnil; - int reg, half; - - /* NOTE: This code does goto's all over the fucking place. - The reason for this is that we're basically implementing - a state machine here, and hierarchical languages like C - don't really provide a clean way of doing this. */ - - if (! (*flags & CODING_STATE_ESCAPE)) - /* At beginning of escape sequence; we need to reset our - escape-state variables. */ - iso->esc = ISO_ESC_NOTHING; - - iso->output_literally = 0; - iso->output_direction_sequence = 0; - - switch (iso->esc) - { - case ISO_ESC_NOTHING: - iso->esc_bytes_index = 0; - switch (c) - { - case ISO_CODE_ESC: /* Start escape sequence */ - *flags |= CODING_STATE_ESCAPE; - iso->esc = ISO_ESC; - goto not_done; - - case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */ - *flags |= CODING_STATE_ESCAPE; - iso->esc = ISO_ESC_5_11; - goto not_done; - - case ISO_CODE_SO: /* locking shift 1 */ - reg = 1; half = 0; - goto locking_shift; - case ISO_CODE_SI: /* locking shift 0 */ - reg = 0; half = 0; - goto locking_shift; - - case ISO_CODE_SS2: /* single shift */ - reg = 2; - goto single_shift; - case ISO_CODE_SS3: /* single shift */ - reg = 3; - goto single_shift; - - default: /* Other control characters */ - return 0; - } - - case ISO_ESC: - switch (c) - { - /**** single shift ****/ - - case 'N': /* single shift 2 */ - reg = 2; - goto single_shift; - case 'O': /* single shift 3 */ - reg = 3; - goto single_shift; - - /**** locking shift ****/ - - case '~': /* locking shift 1 right */ - reg = 1; half = 1; - goto locking_shift; - case 'n': /* locking shift 2 */ - reg = 2; half = 0; - goto locking_shift; - case '}': /* locking shift 2 right */ - reg = 2; half = 1; - goto locking_shift; - case 'o': /* locking shift 3 */ - reg = 3; half = 0; - goto locking_shift; - case '|': /* locking shift 3 right */ - reg = 3; half = 1; - goto locking_shift; - - /**** composite ****/ - - case '0': - iso->esc = ISO_ESC_START_COMPOSITE; - *flags = (*flags & CODING_STATE_ISO2022_LOCK) | - CODING_STATE_COMPOSITE; - return 1; - - case '1': - iso->esc = ISO_ESC_END_COMPOSITE; - *flags = (*flags & CODING_STATE_ISO2022_LOCK) & - ~CODING_STATE_COMPOSITE; - return 1; - - /**** directionality ****/ - - case '[': - iso->esc = ISO_ESC_5_11; - goto not_done; - - /**** designation ****/ - - case '$': /* multibyte charset prefix */ - iso->esc = ISO_ESC_2_4; - goto not_done; - - default: - if (0x28 <= c && c <= 0x2F) - { - iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8); - goto not_done; - } - - /* This function is called with CODESYS equal to nil when - doing coding-system detection. */ - if (!NILP (codesys) - && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) - && fit_to_be_escape_quoted (c)) - { - iso->esc = ISO_ESC_LITERAL; - *flags &= CODING_STATE_ISO2022_LOCK; - return 1; - } - - /* bzzzt! */ - return 0; - } - - - - /**** directionality ****/ - - case ISO_ESC_5_11: /* ISO6429 direction control */ - if (c == ']') - { - *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); - goto directionality; - } - if (c == '0') iso->esc = ISO_ESC_5_11_0; - else if (c == '1') iso->esc = ISO_ESC_5_11_1; - else if (c == '2') iso->esc = ISO_ESC_5_11_2; - else return 0; - goto not_done; - - case ISO_ESC_5_11_0: - if (c == ']') - { - *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); - goto directionality; - } - return 0; - - case ISO_ESC_5_11_1: - if (c == ']') - { - *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); - goto directionality; - } - return 0; - - case ISO_ESC_5_11_2: - if (c == ']') - { - *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L; - goto directionality; - } - return 0; - - directionality: - iso->esc = ISO_ESC_DIRECTIONALITY; - /* Various junk here to attempt to preserve the direction sequences - literally in the text if they would otherwise be swallowed due - to invalid designations that don't show up as actual charset - changes in the text. */ - if (iso->invalid_switch_dir) - { - /* We already inserted a direction switch literally into the - text. We assume (#### this may not be right) that the - next direction switch is the one going the other way, - and we need to output that literally as well. */ - iso->output_literally = 1; - iso->invalid_switch_dir = 0; - } - else - { - int jj; - - /* If we are in the thrall of an invalid designation, - then stick the directionality sequence literally into the - output stream so it ends up in the original text again. */ - for (jj = 0; jj < 4; jj++) - if (iso->invalid_designated[jj]) - break; - if (jj < 4) - { - iso->output_literally = 1; - iso->invalid_switch_dir = 1; - } - else - /* Indicate that we haven't yet seen a valid designation, - so that if a switch-dir is directly followed by an - invalid designation, both get inserted literally. */ - iso->switched_dir_and_no_valid_charset_yet = 1; - } - return 1; - - - /**** designation ****/ - - case ISO_ESC_2_4: - if (0x28 <= c && c <= 0x2F) - { - iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8); - goto not_done; - } - if (0x40 <= c && c <= 0x42) - { - cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c, - *flags & CODING_STATE_R2L ? - CHARSET_RIGHT_TO_LEFT : - CHARSET_LEFT_TO_RIGHT); - reg = 0; - goto designated; - } - return 0; - - default: - { - int type =-1; - - if (c < '0' || c > '~') - return 0; /* bad final byte */ - - if (iso->esc >= ISO_ESC_2_8 && - iso->esc <= ISO_ESC_2_15) - { - type = ((iso->esc >= ISO_ESC_2_12) ? - CHARSET_TYPE_96 : CHARSET_TYPE_94); - reg = (iso->esc - ISO_ESC_2_8) & 3; - } - else if (iso->esc >= ISO_ESC_2_4_8 && - iso->esc <= ISO_ESC_2_4_15) - { - type = ((iso->esc >= ISO_ESC_2_4_12) ? - CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94); - reg = (iso->esc - ISO_ESC_2_4_8) & 3; - } - else - { - /* Can this ever be reached? -slb */ - abort(); - } - - cs = CHARSET_BY_ATTRIBUTES (type, c, - *flags & CODING_STATE_R2L ? - CHARSET_RIGHT_TO_LEFT : - CHARSET_LEFT_TO_RIGHT); - goto designated; - } - } - - not_done: - iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c; - return -1; - - single_shift: - if (check_invalid_charsets && !CHARSETP (iso->charset[reg])) - /* can't invoke something that ain't there. */ - return 0; - iso->esc = ISO_ESC_SINGLE_SHIFT; - *flags &= CODING_STATE_ISO2022_LOCK; - if (reg == 2) - *flags |= CODING_STATE_SS2; - else - *flags |= CODING_STATE_SS3; - return 1; - - locking_shift: - if (check_invalid_charsets && - !CHARSETP (iso->charset[reg])) - /* can't invoke something that ain't there. */ - return 0; - if (half) - iso->register_right = reg; - else - iso->register_left = reg; - *flags &= CODING_STATE_ISO2022_LOCK; - iso->esc = ISO_ESC_LOCKING_SHIFT; - return 1; - - designated: - if (NILP (cs) && check_invalid_charsets) - { - iso->invalid_designated[reg] = 1; - iso->charset[reg] = Vcharset_ascii; - iso->esc = ISO_ESC_DESIGNATE; - *flags &= CODING_STATE_ISO2022_LOCK; - iso->output_literally = 1; - if (iso->switched_dir_and_no_valid_charset_yet) - { - /* We encountered a switch-direction followed by an - invalid designation. Ensure that the switch-direction - gets outputted; otherwise it will probably get eaten - when the text is written out again. */ - iso->switched_dir_and_no_valid_charset_yet = 0; - iso->output_direction_sequence = 1; - /* And make sure that the switch-dir going the other - way gets outputted, as well. */ - iso->invalid_switch_dir = 1; - } - return 1; - } - /* This function is called with CODESYS equal to nil when - doing coding-system detection. */ - if (!NILP (codesys)) - { - charset_conversion_spec_dynarr *dyn = - XCODING_SYSTEM (codesys)->iso2022.input_conv; - - if (dyn) - { - int i; - - for (i = 0; i < Dynarr_length (dyn); i++) - { - struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); - if (EQ (cs, spec->from_charset)) - cs = spec->to_charset; - } - } - } - - iso->charset[reg] = cs; - iso->esc = ISO_ESC_DESIGNATE; - *flags &= CODING_STATE_ISO2022_LOCK; - if (iso->invalid_designated[reg]) - { - iso->invalid_designated[reg] = 0; - iso->output_literally = 1; - } - if (iso->switched_dir_and_no_valid_charset_yet) - iso->switched_dir_and_no_valid_charset_yet = 0; - return 1; -} - -static int -detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src, - unsigned int n) -{ - int c; - int mask; - - /* #### There are serious deficiencies in the recognition mechanism - here. This needs to be much smarter if it's going to cut it. */ - - if (!st->iso2022.initted) - { - reset_iso2022 (Qnil, &st->iso2022.iso); - st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK | - CODING_CATEGORY_ISO_8_DESIGNATE_MASK | - CODING_CATEGORY_ISO_8_1_MASK | - CODING_CATEGORY_ISO_8_2_MASK | - CODING_CATEGORY_ISO_LOCK_SHIFT_MASK); - st->iso2022.flags = 0; - st->iso2022.high_byte_count = 0; - st->iso2022.saw_single_shift = 0; - st->iso2022.initted = 1; - } - - mask = st->iso2022.mask; - - while (n--) - { - c = *src++; - if (c >= 0xA0) - { - mask &= ~CODING_CATEGORY_ISO_7_MASK; - st->iso2022.high_byte_count++; - } - else - { - if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift) - { - if (st->iso2022.high_byte_count & 1) - /* odd number of high bytes; assume not iso-8-2 */ - mask &= ~CODING_CATEGORY_ISO_8_2_MASK; - } - st->iso2022.high_byte_count = 0; - st->iso2022.saw_single_shift = 0; - if (c > 0x80) - mask &= ~CODING_CATEGORY_ISO_7_MASK; - } - if (!(st->iso2022.flags & CODING_STATE_ESCAPE) - && (BYTE_C0_P (c) || BYTE_C1_P (c))) - { /* control chars */ - switch (c) - { - /* Allow and ignore control characters that you might - reasonably see in a text file */ - case '\r': - case '\n': - case '\t': - case 7: /* bell */ - case 8: /* backspace */ - case 11: /* vertical tab */ - case 12: /* form feed */ - case 26: /* MS-DOS C-z junk */ - case 31: /* '^_' -- for info */ - goto label_continue_loop; - - default: - break; - } - } - - if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c) - || BYTE_C1_P (c)) - { - if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c, - &st->iso2022.flags, 0)) - { - switch (st->iso2022.iso.esc) - { - case ISO_ESC_DESIGNATE: - mask &= ~CODING_CATEGORY_ISO_8_1_MASK; - mask &= ~CODING_CATEGORY_ISO_8_2_MASK; - break; - case ISO_ESC_LOCKING_SHIFT: - mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK; - goto ran_out_of_chars; - case ISO_ESC_SINGLE_SHIFT: - mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK; - st->iso2022.saw_single_shift = 1; - break; - default: - break; - } - } - else - { - mask = 0; - goto ran_out_of_chars; - } - } - label_continue_loop:; - } - - ran_out_of_chars: - - return mask; -} - -static int -postprocess_iso2022_mask (int mask) -{ - /* #### kind of cheesy */ - /* If seven-bit ISO is allowed, then assume that the encoding is - entirely seven-bit and turn off the eight-bit ones. */ - if (mask & CODING_CATEGORY_ISO_7_MASK) - mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK | - CODING_CATEGORY_ISO_8_1_MASK | - CODING_CATEGORY_ISO_8_2_MASK); - return mask; -} - -/* If FLAGS is a null pointer or specifies right-to-left motion, - output a switch-dir-to-left-to-right sequence to DST. - Also update FLAGS if it is not a null pointer. - If INTERNAL_P is set, we are outputting in internal format and - need to handle the CSI differently. */ - -static void -restore_left_to_right_direction (struct Lisp_Coding_System *codesys, - unsigned_char_dynarr *dst, - unsigned int *flags, - int internal_p) -{ - if (!flags || (*flags & CODING_STATE_R2L)) - { - if (CODING_SYSTEM_ISO2022_SEVEN (codesys)) - { - Dynarr_add (dst, ISO_CODE_ESC); - Dynarr_add (dst, '['); - } - else if (internal_p) - DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst); - else - Dynarr_add (dst, ISO_CODE_CSI); - Dynarr_add (dst, '0'); - Dynarr_add (dst, ']'); - if (flags) - *flags &= ~CODING_STATE_R2L; - } -} - -/* If FLAGS is a null pointer or specifies a direction different from - DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or - CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape - sequence to DST. Also update FLAGS if it is not a null pointer. - If INTERNAL_P is set, we are outputting in internal format and - need to handle the CSI differently. */ - -static void -ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys, - unsigned_char_dynarr *dst, unsigned int *flags, - int internal_p) -{ - if ((!flags || (*flags & CODING_STATE_R2L)) && - direction == CHARSET_LEFT_TO_RIGHT) - restore_left_to_right_direction (codesys, dst, flags, internal_p); - else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys) - && (!flags || !(*flags & CODING_STATE_R2L)) && - direction == CHARSET_RIGHT_TO_LEFT) - { - if (CODING_SYSTEM_ISO2022_SEVEN (codesys)) - { - Dynarr_add (dst, ISO_CODE_ESC); - Dynarr_add (dst, '['); - } - else if (internal_p) - DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst); - else - Dynarr_add (dst, ISO_CODE_CSI); - Dynarr_add (dst, '2'); - Dynarr_add (dst, ']'); - if (flags) - *flags |= CODING_STATE_R2L; - } -} - -/* Convert ISO2022-format data to internal format. */ - -static void -decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - unsigned char c; - unsigned int flags, ch; - enum eol_type eol_type; - struct decoding_stream *str = DECODING_STREAM_DATA (decoding); - Lisp_Object coding_system; - unsigned_char_dynarr *real_dst = dst; - - CODING_STREAM_DECOMPOSE (str, flags, ch); - eol_type = str->eol_type; - XSETCODING_SYSTEM (coding_system, str->codesys); - - if (flags & CODING_STATE_COMPOSITE) - dst = str->iso2022.composite_chars; - - while (n--) - { - c = *src++; - if (flags & CODING_STATE_ESCAPE) - { /* Within ESC sequence */ - int retval = parse_iso2022_esc (coding_system, &str->iso2022, - c, &flags, 1); - - if (retval) - { - switch (str->iso2022.esc) - { - case ISO_ESC_START_COMPOSITE: - if (str->iso2022.composite_chars) - Dynarr_reset (str->iso2022.composite_chars); - else - str->iso2022.composite_chars = Dynarr_new (unsigned_char); - dst = str->iso2022.composite_chars; - break; - case ISO_ESC_END_COMPOSITE: - { - Bufbyte comstr[MAX_EMCHAR_LEN]; - Bytecount len; - Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0), - Dynarr_length (dst)); - dst = real_dst; - len = set_charptr_emchar (comstr, emch); - Dynarr_add_many (dst, comstr, len); - break; - } - - case ISO_ESC_LITERAL: - DECODE_ADD_BINARY_CHAR (c, dst); - break; - - default: - /* Everything else handled already */ - break; - } - } - - /* Attempted error recovery. */ - if (str->iso2022.output_direction_sequence) - ensure_correct_direction (flags & CODING_STATE_R2L ? - CHARSET_RIGHT_TO_LEFT : - CHARSET_LEFT_TO_RIGHT, - str->codesys, dst, 0, 1); - /* More error recovery. */ - if (!retval || str->iso2022.output_literally) - { - /* Output the (possibly invalid) sequence */ - int i; - for (i = 0; i < str->iso2022.esc_bytes_index; i++) - DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst); - flags &= CODING_STATE_ISO2022_LOCK; - if (!retval) - n++, src--;/* Repeat the loop with the same character. */ - else - { - /* No sense in reprocessing the final byte of the - escape sequence; it could mess things up anyway. - Just add it now. */ - DECODE_ADD_BINARY_CHAR (c, dst); - } - } - ch = 0; - } - else if (BYTE_C0_P (c) || BYTE_C1_P (c)) - { /* Control characters */ - - /***** Error-handling *****/ - - /* If we were in the middle of a character, dump out the - partial character. */ - DECODE_OUTPUT_PARTIAL_CHAR (ch); - - /* If we just saw a single-shift character, dump it out. - This may dump out the wrong sort of single-shift character, - but least it will give an indication that something went - wrong. */ - if (flags & CODING_STATE_SS2) - { - DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst); - flags &= ~CODING_STATE_SS2; - } - if (flags & CODING_STATE_SS3) - { - DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst); - flags &= ~CODING_STATE_SS3; - } - - /***** Now handle the control characters. *****/ - - /* Handle CR/LF */ - DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); - - flags &= CODING_STATE_ISO2022_LOCK; - - if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1)) - DECODE_ADD_BINARY_CHAR (c, dst); - } - else - { /* Graphic characters */ - Lisp_Object charset; - int lb; - int reg; - - DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); - - /* Now determine the charset. */ - reg = ((flags & CODING_STATE_SS2) ? 2 - : (flags & CODING_STATE_SS3) ? 3 - : !BYTE_ASCII_P (c) ? str->iso2022.register_right - : str->iso2022.register_left); - charset = str->iso2022.charset[reg]; - - /* Error checking: */ - if (NILP (charset) || str->iso2022.invalid_designated[reg] - || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL) - && XCHARSET_CHARS (charset) == 94)) - /* Mrmph. We are trying to invoke a register that has no - or an invalid charset in it, or trying to add a character - outside the range of the charset. Insert that char literally - to preserve it for the output. */ - { - DECODE_OUTPUT_PARTIAL_CHAR (ch); - DECODE_ADD_BINARY_CHAR (c, dst); - } - - else - { - /* Things are probably hunky-dorey. */ - - /* Fetch reverse charset, maybe. */ - if (((flags & CODING_STATE_R2L) && - XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT) - || - (!(flags & CODING_STATE_R2L) && - XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT)) - { - Lisp_Object new_charset = - XCHARSET_REVERSE_DIRECTION_CHARSET (charset); - if (!NILP (new_charset)) - charset = new_charset; - } - - lb = XCHARSET_LEADING_BYTE (charset); - switch (XCHARSET_REP_BYTES (charset)) - { - case 1: /* ASCII */ - DECODE_OUTPUT_PARTIAL_CHAR (ch); - Dynarr_add (dst, c & 0x7F); - break; - - case 2: /* one-byte official */ - DECODE_OUTPUT_PARTIAL_CHAR (ch); - Dynarr_add (dst, lb); - Dynarr_add (dst, c | 0x80); - break; - - case 3: /* one-byte private or two-byte official */ - if (XCHARSET_PRIVATE_P (charset)) - { - DECODE_OUTPUT_PARTIAL_CHAR (ch); - Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1); - Dynarr_add (dst, lb); - Dynarr_add (dst, c | 0x80); - } - else - { - if (ch) - { - Dynarr_add (dst, lb); - Dynarr_add (dst, ch | 0x80); - Dynarr_add (dst, c | 0x80); - ch = 0; - } - else - ch = c; - } - break; - - default: /* two-byte private */ - if (ch) - { - Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2); - Dynarr_add (dst, lb); - Dynarr_add (dst, ch | 0x80); - Dynarr_add (dst, c | 0x80); - ch = 0; - } - else - ch = c; - } - } - - if (!ch) - flags &= CODING_STATE_ISO2022_LOCK; - } - - label_continue_loop:; - } - - if (flags & CODING_STATE_END) - DECODE_OUTPUT_PARTIAL_CHAR (ch); - - CODING_STREAM_COMPOSE (str, flags, ch); -} - - -/***** ISO2022 encoder *****/ - -/* Designate CHARSET into register REG. */ - -static void -iso2022_designate (Lisp_Object charset, unsigned char reg, - struct encoding_stream *str, unsigned_char_dynarr *dst) -{ - CONST char *inter94 = "()*+", *inter96= ",-./"; - unsigned int type; - unsigned char final; - Lisp_Object old_charset = str->iso2022.charset[reg]; - - str->iso2022.charset[reg] = charset; - if (!CHARSETP (charset)) - /* charset might be an initial nil or t. */ - return; - type = XCHARSET_TYPE (charset); - final = XCHARSET_FINAL (charset); - if (!str->iso2022.force_charset_on_output[reg] && - CHARSETP (old_charset) && - XCHARSET_TYPE (old_charset) == type && - XCHARSET_FINAL (old_charset) == final) - return; - - str->iso2022.force_charset_on_output[reg] = 0; - - { - charset_conversion_spec_dynarr *dyn = - str->codesys->iso2022.output_conv; - - if (dyn) - { - int i; - - for (i = 0; i < Dynarr_length (dyn); i++) - { - struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); - if (EQ (charset, spec->from_charset)) - charset = spec->to_charset; - } - } - } - - Dynarr_add (dst, ISO_CODE_ESC); - switch (type) - { - case CHARSET_TYPE_94: - Dynarr_add (dst, inter94[reg]); - break; - case CHARSET_TYPE_96: - Dynarr_add (dst, inter96[reg]); - break; - case CHARSET_TYPE_94X94: - Dynarr_add (dst, '$'); - if (reg != 0 - || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys)) - || final < '@' - || final > 'B') - Dynarr_add (dst, inter94[reg]); - break; - case CHARSET_TYPE_96X96: - Dynarr_add (dst, '$'); - Dynarr_add (dst, inter96[reg]); - break; - } - Dynarr_add (dst, final); -} - -static void -ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst) -{ - if (str->iso2022.register_left != 0) - { - Dynarr_add (dst, ISO_CODE_SI); - str->iso2022.register_left = 0; - } -} - -static void -ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst) -{ - if (str->iso2022.register_left != 1) - { - Dynarr_add (dst, ISO_CODE_SO); - str->iso2022.register_left = 1; - } -} - -/* Convert internally-formatted data to ISO2022 format. */ - -static void -encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - unsigned char charmask, c; - unsigned int flags, ch; - enum eol_type eol_type; - unsigned char char_boundary; - struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); - struct Lisp_Coding_System *codesys = str->codesys; - int i; - Lisp_Object charset; - int half; - - /* flags for handling composite chars. We do a little switcharoo - on the source while we're outputting the composite char. */ - unsigned int saved_n = 0; - CONST unsigned char *saved_src = NULL; - int in_composite = 0; - - CODING_STREAM_DECOMPOSE (str, flags, ch); - eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); - char_boundary = str->iso2022.current_char_boundary; - charset = str->iso2022.current_charset; - half = str->iso2022.current_half; - - back_to_square_n: - while (n--) - { - c = *src++; - - if (BYTE_ASCII_P (c)) - { /* Processing ASCII character */ - ch = 0; - - restore_left_to_right_direction (codesys, dst, &flags, 0); - - /* Make sure G0 contains ASCII */ - if ((c > ' ' && c < ISO_CODE_DEL) || - !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys)) - { - ensure_normal_shift (str, dst); - iso2022_designate (Vcharset_ascii, 0, str, dst); - } - - /* If necessary, restore everything to the default state - at end-of-line */ - if (c == '\n' && - !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys))) - { - restore_left_to_right_direction (codesys, dst, &flags, 0); - - ensure_normal_shift (str, dst); - - for (i = 0; i < 4; i++) - { - Lisp_Object initial_charset = - CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); - iso2022_designate (initial_charset, i, str, dst); - } - } - if (c == '\n') - { - if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) - Dynarr_add (dst, '\r'); - if (eol_type != EOL_CR) - Dynarr_add (dst, c); - } - else - { - if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) - && fit_to_be_escape_quoted (c)) - Dynarr_add (dst, ISO_CODE_ESC); - Dynarr_add (dst, c); - } - char_boundary = 1; - } - - else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch)) - { /* Processing Leading Byte */ - ch = 0; - charset = CHARSET_BY_LEADING_BYTE (c); - if (LEADING_BYTE_PREFIX_P(c)) - ch = c; - else if (!EQ (charset, Vcharset_control_1) - && !EQ (charset, Vcharset_composite)) - { - int reg; - - ensure_correct_direction (XCHARSET_DIRECTION (charset), - codesys, dst, &flags, 0); - - /* Now determine which register to use. */ - reg = -1; - for (i = 0; i < 4; i++) - { - if (EQ (charset, str->iso2022.charset[i]) || - EQ (charset, - CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))) - { - reg = i; - break; - } - } - - if (reg == -1) - { - if (XCHARSET_GRAPHIC (charset) != 0) - { - if (!NILP (str->iso2022.charset[1]) && - (!CODING_SYSTEM_ISO2022_SEVEN (codesys) || - CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys))) - reg = 1; - else if (!NILP (str->iso2022.charset[2])) - reg = 2; - else if (!NILP (str->iso2022.charset[3])) - reg = 3; - else - reg = 0; - } - else - reg = 0; - } - - iso2022_designate (charset, reg, str, dst); - - /* Now invoke that register. */ - switch (reg) - { - case 0: - ensure_normal_shift (str, dst); - half = 0; - break; - - case 1: - if (CODING_SYSTEM_ISO2022_SEVEN (codesys)) - { - ensure_shift_out (str, dst); - half = 0; - } - else - half = 1; - break; - - case 2: - if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys)) - { - Dynarr_add (dst, ISO_CODE_ESC); - Dynarr_add (dst, 'N'); - half = 0; - } - else - { - Dynarr_add (dst, ISO_CODE_SS2); - half = 1; - } - break; - - case 3: - if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys)) - { - Dynarr_add (dst, ISO_CODE_ESC); - Dynarr_add (dst, 'O'); - half = 0; - } - else - { - Dynarr_add (dst, ISO_CODE_SS3); - half = 1; - } - break; - - default: - abort (); - } - } - char_boundary = 0; - } - else - { /* Processing Non-ASCII character */ - charmask = (half == 0 ? 0x7F : 0xFF); - char_boundary = 1; - if (EQ (charset, Vcharset_control_1)) - { - if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) - && fit_to_be_escape_quoted (c)) - Dynarr_add (dst, ISO_CODE_ESC); - /* you asked for it ... */ - Dynarr_add (dst, c - 0x20); - } - else - { - switch (XCHARSET_REP_BYTES (charset)) - { - case 2: - Dynarr_add (dst, c & charmask); - break; - case 3: - if (XCHARSET_PRIVATE_P (charset)) - { - Dynarr_add (dst, c & charmask); - ch = 0; - } - else if (ch) - { - if (EQ (charset, Vcharset_composite)) - { - if (in_composite) - { - /* #### Bother! We don't know how to - handle this yet. */ - Dynarr_add (dst, '~'); - } - else - { - Emchar emch = MAKE_CHAR (Vcharset_composite, - ch & 0x7F, c & 0x7F); - Lisp_Object lstr = composite_char_string (emch); - saved_n = n; - saved_src = src; - in_composite = 1; - src = XSTRING_DATA (lstr); - n = XSTRING_LENGTH (lstr); - Dynarr_add (dst, ISO_CODE_ESC); - Dynarr_add (dst, '0'); /* start composing */ - } - } - else - { - Dynarr_add (dst, ch & charmask); - Dynarr_add (dst, c & charmask); - } - ch = 0; - } - else - { - ch = c; - char_boundary = 0; - } - break; - case 4: - if (ch) - { - Dynarr_add (dst, ch & charmask); - Dynarr_add (dst, c & charmask); - ch = 0; - } - else - { - ch = c; - char_boundary = 0; - } - break; - default: - abort (); - } - } - } - } - - if (in_composite) - { - n = saved_n; - src = saved_src; - in_composite = 0; - Dynarr_add (dst, ISO_CODE_ESC); - Dynarr_add (dst, '1'); /* end composing */ - goto back_to_square_n; /* Wheeeeeeeee ..... */ - } - - if (char_boundary && flags & CODING_STATE_END) - { - restore_left_to_right_direction (codesys, dst, &flags, 0); - ensure_normal_shift (str, dst); - for (i = 0; i < 4; i++) - { - Lisp_Object initial_charset = - CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); - iso2022_designate (initial_charset, i, str, dst); - } - } - - CODING_STREAM_COMPOSE (str, flags, ch); - str->iso2022.current_char_boundary = char_boundary; - str->iso2022.current_charset = charset; - str->iso2022.current_half = half; - - /* Verbum caro factum est! */ -} - - -/************************************************************************/ -/* No-conversion methods */ -/************************************************************************/ - -/* This is used when reading in "binary" files -- i.e. files that may - contain all 256 possible byte values and that are not to be - interpreted as being in any particular decoding. */ -static void -decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - unsigned char c; - unsigned int flags, ch; - enum eol_type eol_type; - struct decoding_stream *str = DECODING_STREAM_DATA (decoding); - - CODING_STREAM_DECOMPOSE (str, flags, ch); - eol_type = str->eol_type; - - while (n--) - { - c = *src++; - - DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); - DECODE_ADD_BINARY_CHAR (c, dst); - label_continue_loop:; - } - - DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); - - CODING_STREAM_COMPOSE (str, flags, ch); -} - -static void -encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src, - unsigned_char_dynarr *dst, unsigned int n) -{ - unsigned char c; - struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); - unsigned int flags, ch; - enum eol_type eol_type; - - CODING_STREAM_DECOMPOSE (str, flags, ch); - eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); - - while (n--) - { - c = *src++; - if (c == '\n') - { - if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) - Dynarr_add (dst, '\r'); - if (eol_type != EOL_CR) - Dynarr_add (dst, '\n'); - ch = 0; - } - else if (BYTE_ASCII_P (c)) - { - assert (ch == 0); - Dynarr_add (dst, c); - } - else if (BUFBYTE_LEADING_BYTE_P (c)) - { - assert (ch == 0); - if (c == LEADING_BYTE_LATIN_ISO8859_1 || - c == LEADING_BYTE_CONTROL_1) - ch = c; - else - Dynarr_add (dst, '~'); /* untranslatable character */ - } - else - { - if (ch == LEADING_BYTE_LATIN_ISO8859_1) - Dynarr_add (dst, c); - else if (ch == LEADING_BYTE_CONTROL_1) - { - assert (c < 0xC0); - Dynarr_add (dst, c - 0x20); - } - /* else it should be the second or third byte of an - untranslatable character, so ignore it */ - ch = 0; - } - } - - CODING_STREAM_COMPOSE (str, flags, ch); -} - - -/************************************************************************/ -/* Simple internal/external functions */ -/************************************************************************/ - -static Extbyte_dynarr *conversion_out_dynarr; -static Bufbyte_dynarr *conversion_in_dynarr; - -/* Determine coding system from coding format */ - -/* #### not correct for all values of `fmt'! */ -static Lisp_Object -external_data_format_to_coding_system (enum external_data_format fmt) -{ - switch (fmt) - { - case FORMAT_FILENAME: - case FORMAT_TERMINAL: - if (EQ (Vfile_name_coding_system, Qnil) || - EQ (Vfile_name_coding_system, Qbinary)) - return Qnil; - else - return Fget_coding_system (Vfile_name_coding_system); - case FORMAT_CTEXT: - return Fget_coding_system (Qctext); - default: - return Qnil; - } -} - -CONST Extbyte * -convert_to_external_format (CONST Bufbyte *ptr, - Bytecount len, - Extcount *len_out, - enum external_data_format fmt) -{ - Lisp_Object coding_system = external_data_format_to_coding_system (fmt); - - if (!conversion_out_dynarr) - conversion_out_dynarr = Dynarr_new (Extbyte); - else - Dynarr_reset (conversion_out_dynarr); - - if (NILP (coding_system)) - { - CONST Bufbyte *end = ptr + len; - - for (; ptr < end;) - { - Bufbyte c = - (BYTE_ASCII_P (*ptr)) ? *ptr : - (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) : - (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) : - '~'; - - Dynarr_add (conversion_out_dynarr, (Extbyte) c); - INC_CHARPTR (ptr); - } - -#ifdef ERROR_CHECK_BUFPOS - assert (ptr == end); -#endif - } - else - { - Lisp_Object instream, outstream, da_outstream; - Lstream *istr, *ostr; - struct gcpro gcpro1, gcpro2, gcpro3; - char tempbuf[1024]; /* some random amount */ - - instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len); - da_outstream = make_dynarr_output_stream - ((unsigned_char_dynarr *) conversion_out_dynarr); - outstream = - make_encoding_output_stream (XLSTREAM (da_outstream), coding_system); - istr = XLSTREAM (instream); - ostr = XLSTREAM (outstream); - GCPRO3 (instream, outstream, da_outstream); - while (1) - { - int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); - if (!size_in_bytes) - break; - Lstream_write (ostr, tempbuf, size_in_bytes); - } - Lstream_close (istr); - Lstream_close (ostr); - UNGCPRO; - Lstream_delete (istr); - Lstream_delete (ostr); - Lstream_delete (XLSTREAM (da_outstream)); - } - - *len_out = Dynarr_length (conversion_out_dynarr); - Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */ - return Dynarr_atp (conversion_out_dynarr, 0); -} - -CONST Bufbyte * -convert_from_external_format (CONST Extbyte *ptr, - Extcount len, - Bytecount *len_out, - enum external_data_format fmt) -{ - Lisp_Object coding_system = external_data_format_to_coding_system (fmt); - - if (!conversion_in_dynarr) - conversion_in_dynarr = Dynarr_new (Bufbyte); - else - Dynarr_reset (conversion_in_dynarr); - - if (NILP (coding_system)) - { - CONST Extbyte *end = ptr + len; - for (; ptr < end; ptr++) - { - Extbyte c = *ptr; - DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr); - } - } - else - { - Lisp_Object instream, outstream, da_outstream; - Lstream *istr, *ostr; - struct gcpro gcpro1, gcpro2, gcpro3; - char tempbuf[1024]; /* some random amount */ - - instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len); - da_outstream = make_dynarr_output_stream - ((unsigned_char_dynarr *) conversion_in_dynarr); - outstream = - make_decoding_output_stream (XLSTREAM (da_outstream), coding_system); - istr = XLSTREAM (instream); - ostr = XLSTREAM (outstream); - GCPRO3 (instream, outstream, da_outstream); - while (1) - { - int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); - if (!size_in_bytes) - break; - Lstream_write (ostr, tempbuf, size_in_bytes); - } - Lstream_close (istr); - Lstream_close (ostr); - UNGCPRO; - Lstream_delete (istr); - Lstream_delete (ostr); - Lstream_delete (XLSTREAM (da_outstream)); - } - - *len_out = Dynarr_length (conversion_in_dynarr); - Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */ - return Dynarr_atp (conversion_in_dynarr, 0); -} - - -/************************************************************************/ -/* Initialization */ -/************************************************************************/ - -void -syms_of_mule_coding (void) -{ - defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system"); - deferror (&Qcoding_system_error, "coding-system-error", - "Coding-system error", Qio_error); - - DEFSUBR (Fcoding_system_p); - DEFSUBR (Ffind_coding_system); - DEFSUBR (Fget_coding_system); - DEFSUBR (Fcoding_system_list); - DEFSUBR (Fcoding_system_name); - DEFSUBR (Fmake_coding_system); - DEFSUBR (Fcopy_coding_system); - DEFSUBR (Fsubsidiary_coding_system); - - DEFSUBR (Fcoding_system_type); - DEFSUBR (Fcoding_system_doc_string); - DEFSUBR (Fcoding_system_charset); - DEFSUBR (Fcoding_system_property); - - DEFSUBR (Fcoding_category_list); - DEFSUBR (Fset_coding_priority_list); - DEFSUBR (Fcoding_priority_list); - DEFSUBR (Fset_coding_category_system); - DEFSUBR (Fcoding_category_system); - - DEFSUBR (Fdetect_coding_region); - DEFSUBR (Fdecode_coding_region); - DEFSUBR (Fencode_coding_region); - DEFSUBR (Fdecode_shift_jis_char); - DEFSUBR (Fencode_shift_jis_char); - DEFSUBR (Fdecode_big5_char); - DEFSUBR (Fencode_big5_char); - - defsymbol (&Qcoding_system_p, "coding-system-p"); - - defsymbol (&Qbig5, "big5"); - defsymbol (&Qshift_jis, "shift-jis"); - defsymbol (&Qno_conversion, "no-conversion"); - defsymbol (&Qccl, "ccl"); - defsymbol (&Qiso2022, "iso2022"); - - defsymbol (&Qmnemonic, "mnemonic"); - defsymbol (&Qeol_type, "eol-type"); - defsymbol (&Qpost_read_conversion, "post-read-conversion"); - defsymbol (&Qpre_write_conversion, "pre-write-conversion"); - - defsymbol (&Qcr, "cr"); - defsymbol (&Qlf, "lf"); - defsymbol (&Qcrlf, "crlf"); - defsymbol (&Qeol_cr, "eol-cr"); - defsymbol (&Qeol_lf, "eol-lf"); - defsymbol (&Qeol_crlf, "eol-crlf"); - - defsymbol (&Qcharset_g0, "charset-g0"); - defsymbol (&Qcharset_g1, "charset-g1"); - defsymbol (&Qcharset_g2, "charset-g2"); - defsymbol (&Qcharset_g3, "charset-g3"); - defsymbol (&Qforce_g0_on_output, "force-g0-on-output"); - defsymbol (&Qforce_g1_on_output, "force-g1-on-output"); - defsymbol (&Qforce_g2_on_output, "force-g2-on-output"); - defsymbol (&Qforce_g3_on_output, "force-g3-on-output"); - defsymbol (&Qshort, "short"); - defsymbol (&Qno_ascii_eol, "no-ascii-eol"); - defsymbol (&Qno_ascii_cntl, "no-ascii-cntl"); - defsymbol (&Qseven, "seven"); - defsymbol (&Qlock_shift, "lock-shift"); - defsymbol (&Qno_iso6429, "no-iso6429"); - defsymbol (&Qescape_quoted, "escape-quoted"); - defsymbol (&Qinput_charset_conversion, "input-charset-conversion"); - defsymbol (&Qoutput_charset_conversion, "output-charset-conversion"); - - defsymbol (&Qencode, "encode"); - defsymbol (&Qdecode, "decode"); - - defsymbol (&Qctext, "ctext"); - - defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS], - "shift-jis"); - defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7], - "iso-7"); - defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE], - "iso-8-designate"); - defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1], - "iso-8-1"); - defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2], - "iso-8-2"); - defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT], - "iso-lock-shift"); - defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5], - "big5"); - defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION], - "no-conversion"); -} - -void -lstream_type_create_mule_coding (void) -{ - LSTREAM_HAS_METHOD (decoding, reader); - LSTREAM_HAS_METHOD (decoding, writer); - LSTREAM_HAS_METHOD (decoding, rewinder); - LSTREAM_HAS_METHOD (decoding, seekable_p); - LSTREAM_HAS_METHOD (decoding, flusher); - LSTREAM_HAS_METHOD (decoding, closer); - LSTREAM_HAS_METHOD (decoding, marker); - - LSTREAM_HAS_METHOD (encoding, reader); - LSTREAM_HAS_METHOD (encoding, writer); - LSTREAM_HAS_METHOD (encoding, rewinder); - LSTREAM_HAS_METHOD (encoding, seekable_p); - LSTREAM_HAS_METHOD (encoding, flusher); - LSTREAM_HAS_METHOD (encoding, closer); - LSTREAM_HAS_METHOD (encoding, marker); -} - -void -vars_of_mule_coding (void) -{ - int i; - - /* Initialize to something reasonable ... */ - for (i = 0; i <= CODING_CATEGORY_LAST; i++) - { - coding_category_system[i] = Qnil; - coding_category_by_priority[i] = i; - } - - DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /* -Coding system used for TTY keyboard input. -Not used under a windowing system. -*/ ); - Vkeyboard_coding_system = Qnil; - - DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /* -Coding system used for TTY display output. -Not used under a windowing system. -*/ ); - Vterminal_coding_system = Qnil; - - DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /* -Overriding coding system used when writing a file or process. -You should *bind* this, not set it. If this is non-nil, it specifies -the coding system that will be used when a file or process is read -in, and overrides `buffer-file-coding-system-for-read', -`insert-file-contents-pre-hook', etc. Use those variables instead of -this one for permanent changes to the environment. -*/ ); - Vcoding_system_for_read = Qnil; - - DEFVAR_LISP ("coding-system-for-write", - &Vcoding_system_for_write /* -Overriding coding system used when writing a file or process. -You should *bind* this, not set it. If this is non-nil, it specifies -the coding system that will be used when a file or process is wrote -in, and overrides `buffer-file-coding-system', -`write-region-pre-hook', etc. Use those variables instead of this one -for permanent changes to the environment. -*/ ); - Vcoding_system_for_write = Qnil; - - DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /* -Coding system used to convert pathnames when accessing files. -*/ ); - Vfile_name_coding_system = Qnil; - - DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /* -Non-nil means the buffer contents are regarded as multi-byte form -of characters, not a binary code. This affects the display, file I/O, -and behaviors of various editing commands. - -Setting this to nil does not do anything. -*/ ); - enable_multibyte_characters = 1; -} - -void -complex_vars_of_mule_coding (void) -{ - staticpro (&Vcoding_system_hash_table); - Vcoding_system_hash_table = - make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ); - - the_codesys_prop_dynarr = Dynarr_new (codesys_prop); - -#define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \ -{ \ - struct codesys_prop csp; \ - csp.sym = (Sym); \ - csp.prop_type = (Prop_Type); \ - Dynarr_add (the_codesys_prop_dynarr, csp); \ -} while (0) - - DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic); - DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type); - DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr); - DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf); - DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf); - DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion); - DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion); - - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion); - DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion); - - DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode); - DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode); - - /* Need to create this here or we're really screwed. */ - Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"), - list2 (Qmnemonic, build_string ("Noconv"))); - - Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf), - Qbinary); - - /* Need this for bootstrapping */ - coding_category_system[CODING_CATEGORY_NO_CONVERSION] = - Fget_coding_system (Qno_conversion); -} - -#endif diff --git a/src/mule-coding.h b/src/mule-coding.h deleted file mode 100644 index 7c631d9..0000000 --- a/src/mule-coding.h +++ /dev/null @@ -1,450 +0,0 @@ -/* Header for code conversion stuff - Copyright (C) 1991, 1995 Free Software Foundation, Inc. - Copyright (C) 1995 Sun Microsystems, Inc. - -This file is part of XEmacs. - -XEmacs is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 2, or (at your option) any -later version. - -XEmacs is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with XEmacs; see the file COPYING. If not, write to -the Free Software Foundation, Inc., 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA. */ - -/* Synched up with: Mule 2.3. Not in FSF. */ - -/* 91.10.09 written by K.Handa */ -/* Rewritten by Ben Wing . */ - -#ifndef _XEMACS_MULE_CODING_H_ -#define _XEMACS_MULE_CODING_H_ - -struct decoding_stream; -struct encoding_stream; - -/* Coding system types. These go into the TYPE field of a - struct Lisp_Coding_System. */ - -enum coding_system_type -{ - CODESYS_AUTODETECT, /* Automatic conversion. */ - CODESYS_SHIFT_JIS, /* Shift-JIS; Hankaku (half-width) KANA - is also supported. */ - CODESYS_ISO2022, /* Any ISO2022-compliant coding system. - Includes JIS, EUC, CTEXT */ - CODESYS_BIG5, /* BIG5 (used for Taiwanese). */ - CODESYS_CCL, /* Converter written in CCL. */ - CODESYS_NO_CONVERSION /* "No conversion"; used for binary files. - We use quotes because there really - is some conversion being applied, - but it appears to the user as if - the text is read in without conversion. */ -#ifdef DEBUG_XEMACS - ,CODESYS_INTERNAL /* Raw (internally-formatted) data. */ -#endif -}; - -enum eol_type -{ - EOL_AUTODETECT, - EOL_LF, - EOL_CRLF, - EOL_CR -}; - -typedef struct charset_conversion_spec charset_conversion_spec; -struct charset_conversion_spec -{ - Lisp_Object from_charset; - Lisp_Object to_charset; -}; - -typedef struct -{ - Dynarr_declare (charset_conversion_spec); -} charset_conversion_spec_dynarr; - -struct Lisp_Coding_System -{ - struct lcrecord_header header; - - /* Name and doc string of this coding system. */ - Lisp_Object name, doc_string; - - /* This is the major type of the coding system -- one of Big5, ISO2022, - Shift-JIS, etc. See the constants above. */ - enum coding_system_type type; - - /* Mnemonic string displayed in the modeline when this coding - system is active for a particular buffer. */ - Lisp_Object mnemonic; - - Lisp_Object post_read_conversion, pre_write_conversion; - - enum eol_type eol_type; - - /* Subsidiary coding systems that specify a particular type of EOL - marking, rather than autodetecting it. These will only be non-nil - if (eol_type == EOL_AUTODETECT). */ - Lisp_Object eol_lf, eol_crlf, eol_cr; - - struct - { - /* What are the charsets to be initially designated to G0, G1, - G2, G3? If t, no charset is initially designated. If nil, - no charset is initially designated and no charset is allowed - to be designated. */ - Lisp_Object initial_charset[4]; - - /* If true, a designation escape sequence needs to be sent on output - for the charset in G[0-3] before that charset is used. */ - unsigned char force_charset_on_output[4]; - - charset_conversion_spec_dynarr *input_conv; - charset_conversion_spec_dynarr *output_conv; - - unsigned int shoort :1; /* C makes you speak Dutch */ - unsigned int no_ascii_eol :1; - unsigned int no_ascii_cntl :1; - unsigned int seven :1; - unsigned int lock_shift :1; - unsigned int no_iso6429 :1; - unsigned int escape_quoted :1; - } iso2022; - - struct - { - /* For a CCL coding system, these specify the CCL programs used for - decoding (input) and encoding (output). */ - Lisp_Object decode, encode; - } ccl; -}; - -DECLARE_LRECORD (coding_system, struct Lisp_Coding_System); -#define XCODING_SYSTEM(x) XRECORD (x, coding_system, struct Lisp_Coding_System) -#define XSETCODING_SYSTEM(x, p) XSETRECORD (x, p, coding_system) -#define CODING_SYSTEMP(x) RECORDP (x, coding_system) -#define GC_CODING_SYSTEMP(x) GC_RECORDP (x, coding_system) -#define CHECK_CODING_SYSTEM(x) CHECK_RECORD (x, coding_system) -#define CONCHECK_CODING_SYSTEM(x) CONCHECK_RECORD (x, coding_system) - -#define CODING_SYSTEM_NAME(codesys) ((codesys)->name) -#define CODING_SYSTEM_DOC_STRING(codesys) ((codesys)->doc_string) -#define CODING_SYSTEM_TYPE(codesys) ((codesys)->type) -#define CODING_SYSTEM_MNEMONIC(codesys) ((codesys)->mnemonic) -#define CODING_SYSTEM_POST_READ_CONVERSION(codesys) \ - ((codesys)->post_read_conversion) -#define CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) \ - ((codesys)->pre_write_conversion) -#define CODING_SYSTEM_EOL_TYPE(codesys) ((codesys)->eol_type) -#define CODING_SYSTEM_EOL_LF(codesys) ((codesys)->eol_lf) -#define CODING_SYSTEM_EOL_CRLF(codesys) ((codesys)->eol_crlf) -#define CODING_SYSTEM_EOL_CR(codesys) ((codesys)->eol_cr) -#define CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g) \ - ((codesys)->iso2022.initial_charset[g]) -#define CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g) \ - ((codesys)->iso2022.force_charset_on_output[g]) -#define CODING_SYSTEM_ISO2022_SHORT(codesys) ((codesys)->iso2022.shoort) -#define CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys) \ - ((codesys)->iso2022.no_ascii_eol) -#define CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys) \ - ((codesys)->iso2022.no_ascii_cntl) -#define CODING_SYSTEM_ISO2022_SEVEN(codesys) ((codesys)->iso2022.seven) -#define CODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys) \ - ((codesys)->iso2022.lock_shift) -#define CODING_SYSTEM_ISO2022_NO_ISO6429(codesys) \ - ((codesys)->iso2022.no_iso6429) -#define CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys) \ - ((codesys)->iso2022.escape_quoted) -#define CODING_SYSTEM_CCL_DECODE(codesys) ((codesys)->ccl.decode) -#define CODING_SYSTEM_CCL_ENCODE(codesys) ((codesys)->ccl.encode) - -#define XCODING_SYSTEM_NAME(codesys) \ - CODING_SYSTEM_NAME (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_DOC_STRING(codesys) \ - CODING_SYSTEM_DOC_STRING (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_TYPE(codesys) \ - CODING_SYSTEM_TYPE (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_MNEMONIC(codesys) \ - CODING_SYSTEM_MNEMONIC (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_POST_READ_CONVERSION(codesys) \ - CODING_SYSTEM_POST_READ_CONVERSION (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) \ - CODING_SYSTEM_PRE_WRITE_CONVERSION (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_EOL_TYPE(codesys) \ - CODING_SYSTEM_EOL_TYPE (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_EOL_LF(codesys) \ - CODING_SYSTEM_EOL_LF (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_EOL_CRLF(codesys) \ - CODING_SYSTEM_EOL_CRLF (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_EOL_CR(codesys) \ - CODING_SYSTEM_EOL_CR (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g) \ - CODING_SYSTEM_ISO2022_INITIAL_CHARSET (XCODING_SYSTEM (codesys), g) -#define XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g) \ - CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (XCODING_SYSTEM (codesys), g) -#define XCODING_SYSTEM_ISO2022_SHORT(codesys) \ - CODING_SYSTEM_ISO2022_SHORT (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys) \ - CODING_SYSTEM_ISO2022_NO_ASCII_EOL (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys) \ - CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_ISO2022_SEVEN(codesys) \ - CODING_SYSTEM_ISO2022_SEVEN (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys) \ - CODING_SYSTEM_ISO2022_LOCK_SHIFT (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_ISO2022_NO_ISO6429(codesys) \ - CODING_SYSTEM_ISO2022_NO_ISO6429 (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys) \ - CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_CCL_DECODE(codesys) \ - CODING_SYSTEM_CCL_DECODE (XCODING_SYSTEM (codesys)) -#define XCODING_SYSTEM_CCL_ENCODE(codesys) \ - CODING_SYSTEM_CCL_ENCODE (XCODING_SYSTEM (codesys)) - -extern Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error; - -extern Lisp_Object Vkeyboard_coding_system; -extern Lisp_Object Vterminal_coding_system; -extern Lisp_Object Vcoding_system_for_read; -extern Lisp_Object Vcoding_system_for_write; -extern Lisp_Object Vpathname_coding_system; - -extern Lisp_Object Qescape_quoted; - -/* Flags indicating current state while converting code. */ - -/* Used by everyone. */ - -#define CODING_STATE_END (1 << 0) /* If set, this is the last chunk of - data being processed. When this - is finished, output any necessary - terminating control characters, - escape sequences, etc. */ -#define CODING_STATE_CR (1 << 1) /* If set, we just saw a CR. */ - - -/* Used by Big 5 on output. */ - -#define CODING_STATE_BIG5_1 (1 << 2) /* If set, we just encountered - LEADING_BYTE_BIG5_1. */ -#define CODING_STATE_BIG5_2 (1 << 3) /* If set, we just encountered - LEADING_BYTE_BIG5_2. */ - - -/* Used by ISO2022 on input and output. */ - -#define CODING_STATE_R2L (1 << 4) /* If set, the current - directionality is right-to-left. - Otherwise, it's left-to-right. */ - - -/* Used by ISO2022 on input. */ - -#define CODING_STATE_ESCAPE (1 << 5) /* If set, we're currently parsing - an escape sequence and the upper - 16 bits should be looked at to - indicate what partial escape - sequence we've seen so far. - Otherwise, we're running - through actual text. */ -#define CODING_STATE_SS2 (1 << 6) /* If set, G2 is invoked into GL, but - only for the next character. */ -#define CODING_STATE_SS3 (1 << 7) /* If set, G3 is invoked into GL, - but only for the next character. - If both CODING_STATE_SS2 and - CODING_STATE_SS3 are set, - CODING_STATE_SS2 overrides; but - this probably indicates an error - in the text encoding. */ -#define CODING_STATE_COMPOSITE (1 << 8) /* If set, we're currently processing - a composite character (i.e. a - character constructed by - overstriking two or more - characters). */ - - -/* CODING_STATE_ISO2022_LOCK is the mask of flags that remain on until - explicitly turned off when in the ISO2022 encoder/decoder. Other flags are - turned off at the end of processing each character or escape sequence. */ -# define CODING_STATE_ISO2022_LOCK \ - (CODING_STATE_END | CODING_STATE_COMPOSITE | CODING_STATE_R2L) -#define CODING_STATE_BIG5_LOCK \ - CODING_STATE_END - -/* Flags indicating what we've seen so far when parsing an - ISO2022 escape sequence. */ -enum iso_esc_flag -{ - /* Partial sequences */ - ISO_ESC_NOTHING, /* Nothing has been seen. */ - ISO_ESC, /* We've seen ESC. */ - ISO_ESC_2_4, /* We've seen ESC $. This indicates - that we're designating a multi-byte, rather - than a single-byte, character set. */ - ISO_ESC_2_8, /* We've seen ESC 0x28, i.e. ESC (. - This means designate a 94-character - character set into G0. */ - ISO_ESC_2_9, /* We've seen ESC 0x29 -- designate a - 94-character character set into G1. */ - ISO_ESC_2_10, /* We've seen ESC 0x2A. */ - ISO_ESC_2_11, /* We've seen ESC 0x2B. */ - ISO_ESC_2_12, /* We've seen ESC 0x2C -- designate a - 96-character character set into G0. - (This is not ISO2022-standard. - The following 96-character - control sequences are standard, - though.) */ - ISO_ESC_2_13, /* We've seen ESC 0x2D -- designate a - 96-character character set into G1. - */ - ISO_ESC_2_14, /* We've seen ESC 0x2E. */ - ISO_ESC_2_15, /* We've seen ESC 0x2F. */ - ISO_ESC_2_4_8, /* We've seen ESC $ 0x28 -- designate - a 94^N character set into G0. */ - ISO_ESC_2_4_9, /* We've seen ESC $ 0x29. */ - ISO_ESC_2_4_10, /* We've seen ESC $ 0x2A. */ - ISO_ESC_2_4_11, /* We've seen ESC $ 0x2B. */ - ISO_ESC_2_4_12, /* We've seen ESC $ 0x2C. */ - ISO_ESC_2_4_13, /* We've seen ESC $ 0x2D. */ - ISO_ESC_2_4_14, /* We've seen ESC $ 0x2E. */ - ISO_ESC_2_4_15, /* We've seen ESC $ 0x2F. */ - ISO_ESC_5_11, /* We've seen ESC [ or 0x9B. This - starts a directionality-control - sequence. The next character - must be 0, 1, 2, or ]. */ - ISO_ESC_5_11_0, /* We've seen 0x9B 0. The next - character must be ]. */ - ISO_ESC_5_11_1, /* We've seen 0x9B 1. The next - character must be ]. */ - ISO_ESC_5_11_2, /* We've seen 0x9B 2. The next - character must be ]. */ - - /* Full sequences. */ - ISO_ESC_START_COMPOSITE, /* Private usage for START COMPOSING */ - ISO_ESC_END_COMPOSITE, /* Private usage for END COMPOSING */ - ISO_ESC_SINGLE_SHIFT, /* We've seen a complete single-shift sequence. */ - ISO_ESC_LOCKING_SHIFT,/* We've seen a complete locking-shift sequence. */ - ISO_ESC_DESIGNATE, /* We've seen a complete designation sequence. */ - ISO_ESC_DIRECTIONALITY,/* We've seen a complete ISO6429 directionality - sequence. */ - ISO_ESC_LITERAL /* We've seen a literal character ala - escape-quoting. */ -}; - -/* Macros to define code of control characters for ISO2022's functions. */ - /* code */ /* function */ -#define ISO_CODE_LF 0x0A /* line-feed */ -#define ISO_CODE_CR 0x0D /* carriage-return */ -#define ISO_CODE_SO 0x0E /* shift-out */ -#define ISO_CODE_SI 0x0F /* shift-in */ -#define ISO_CODE_ESC 0x1B /* escape */ -#define ISO_CODE_DEL 0x7F /* delete */ -#define ISO_CODE_SS2 0x8E /* single-shift-2 */ -#define ISO_CODE_SS3 0x8F /* single-shift-3 */ -#define ISO_CODE_CSI 0x9B /* control-sequence-introduce */ - -/* Macros to access an encoding stream or decoding stream */ - -#define CODING_STREAM_DECOMPOSE(str, flags, ch) \ -do { \ - flags = (str)->flags; \ - ch = (str)->ch; \ -} while (0) - -#define CODING_STREAM_COMPOSE(str, flags, ch) \ -do { \ - (str)->flags = flags; \ - (str)->ch = ch; \ -} while (0) - - -/* For detecting the encoding of text */ -enum coding_category_type -{ - CODING_CATEGORY_SHIFT_JIS, - CODING_CATEGORY_ISO_7, /* ISO2022 system using only seven-bit bytes, - no locking shift */ - CODING_CATEGORY_ISO_8_DESIGNATE, /* ISO2022 system using eight-bit bytes, - no locking shift, no single shift, - using designation to switch charsets */ - CODING_CATEGORY_ISO_8_1, /* ISO2022 system using eight-bit bytes, - no locking shift, no designation sequences, - one-dimension characters in the upper half. */ - CODING_CATEGORY_ISO_8_2, /* ISO2022 system using eight-bit bytes, - no locking shift, no designation sequences, - two-dimension characters in the upper half. */ - CODING_CATEGORY_ISO_LOCK_SHIFT, /* ISO2022 system using locking shift */ - CODING_CATEGORY_BIG5, - CODING_CATEGORY_NO_CONVERSION -}; - -#define CODING_CATEGORY_LAST CODING_CATEGORY_NO_CONVERSION - -#define CODING_CATEGORY_SHIFT_JIS_MASK \ - (1 << CODING_CATEGORY_SHIFT_JIS) -#define CODING_CATEGORY_ISO_7_MASK \ - (1 << CODING_CATEGORY_ISO_7) -#define CODING_CATEGORY_ISO_8_DESIGNATE_MASK \ - (1 << CODING_CATEGORY_ISO_8_DESIGNATE) -#define CODING_CATEGORY_ISO_8_1_MASK \ - (1 << CODING_CATEGORY_ISO_8_1) -#define CODING_CATEGORY_ISO_8_2_MASK \ - (1 << CODING_CATEGORY_ISO_8_2) -#define CODING_CATEGORY_ISO_LOCK_SHIFT_MASK \ - (1 << CODING_CATEGORY_ISO_LOCK_SHIFT) -#define CODING_CATEGORY_BIG5_MASK \ - (1 << CODING_CATEGORY_BIG5) -#define CODING_CATEGORY_NO_CONVERSION_MASK \ - (1 << CODING_CATEGORY_NO_CONVERSION) -#define CODING_CATEGORY_NOT_FINISHED_MASK \ - (1 << 30) - -/* Convert shift-JIS code (sj1, sj2) into internal string - representation (c1, c2). (The leading byte is assumed.) */ - -#define DECODE_SJIS(sj1, sj2, c1, c2) \ -do { \ - int I1 = sj1, I2 = sj2; \ - if (I2 >= 0x9f) \ - c1 = (I1 << 1) - ((I1 >= 0xe0) ? 0xe0 : 0x60), \ - c2 = I2 + 2; \ - else \ - c1 = (I1 << 1) - ((I1 >= 0xe0) ? 0xe1 : 0x61), \ - c2 = I2 + ((I2 >= 0x7f) ? 0x60 : 0x61); \ -} while (0) - -/* Convert the internal string representation of a Shift-JIS character - (c1, c2) into Shift-JIS code (sj1, sj2). The leading byte is - assumed. */ - -#define ENCODE_SJIS(c1, c2, sj1, sj2) \ -do { \ - int I1 = c1, I2 = c2; \ - if (I1 & 1) \ - sj1 = (I1 >> 1) + ((I1 < 0xdf) ? 0x31 : 0x71), \ - sj2 = I2 - ((I2 >= 0xe0) ? 0x60 : 0x61); \ - else \ - sj1 = (I1 >> 1) + ((I1 < 0xdf) ? 0x30 : 0x70), \ - sj2 = I2 - 2; \ -} while (0) - -Lisp_Object make_decoding_input_stream (Lstream *stream, Lisp_Object codesys); -Lisp_Object make_encoding_input_stream (Lstream *stream, Lisp_Object codesys); -Lisp_Object make_decoding_output_stream (Lstream *stream, Lisp_Object codesys); -Lisp_Object make_encoding_output_stream (Lstream *stream, Lisp_Object codesys); -Lisp_Object decoding_stream_coding_system (Lstream *stream); -Lisp_Object encoding_stream_coding_system (Lstream *stream); -void set_decoding_stream_coding_system (Lstream *stream, Lisp_Object codesys); -void set_encoding_stream_coding_system (Lstream *stream, Lisp_Object codesys); -void determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, - enum eol_type *eol_type_in_out); -#endif /* _XEMACS_MULE_CODING_H_ */