1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
35 #include "file-coding.h"
/* File-scope Lisp objects and tables used by the coding-system code in
   this file.  Several of the Q-prefixed objects are compared with EQ
   against the TYPE argument and the property keys handled in
   Fmake_coding_system and Fcoding_system_property.  The V-prefixed
   objects other than Vcoding_system_hashtable, and the category tables,
   are only declared (not used) in this part of the file. */
37 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
39 Lisp_Object Vkeyboard_coding_system;
40 Lisp_Object Vterminal_coding_system;
41 Lisp_Object Vcoding_system_for_read;
42 Lisp_Object Vcoding_system_for_write;
43 Lisp_Object Vfile_name_coding_system;
45 /* Table of symbols identifying each coding category. */
46 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
48 /* Coding system currently associated with each coding category. */
49 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
51 /* Table of all coding categories in decreasing order of priority.
52 This describes a permutation of the possible coding categories. */
53 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
55 Lisp_Object Qcoding_system_p;
57 Lisp_Object Qno_conversion, Qccl, Qiso2022;
58 /* Qinternal in general.c */
60 Lisp_Object Qmnemonic, Qeol_type;
61 Lisp_Object Qcr, Qcrlf, Qlf;
62 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
63 Lisp_Object Qpost_read_conversion;
64 Lisp_Object Qpre_write_conversion;
67 Lisp_Object Qbig5, Qshift_jis;
68 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
69 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
70 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
71 Lisp_Object Qno_iso6429;
72 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
73 Lisp_Object Qctext, Qescape_quoted;
74 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
76 Lisp_Object Qencode, Qdecode;
/* Maps coding-system name symbols to coding-system objects: written with
   Fputhash by Fmake_coding_system and Fcopy_coding_system, read with
   Fgethash by Ffind_coding_system. */
78 Lisp_Object Vcoding_system_hashtable;
80 int enable_multibyte_characters;
/* NOTE(review): this definition is incomplete in this listing.  The
   opening and closing braces are absent, the member announced by the
   "Index for next byte" comment is not shown, and the comment that
   introduces the direction-switching bit-fields has lost its final
   line(s).  Confirm against the complete source before editing. */
83 /* Additional information used by the ISO2022 decoder and detector. */
84 struct iso2022_decoder
86 /* CHARSET holds the character sets currently assigned to the G0
87 through G3 variables. It is initialized from the array
88 INITIAL_CHARSET in CODESYS. */
89 Lisp_Object charset[4];
91 /* Which registers are currently invoked into the left (GL) and
92 right (GR) halves of the 8-bit encoding space? */
93 int register_left, register_right;
95 /* ISO_ESC holds a value indicating part of an escape sequence
96 that has already been seen. */
97 enum iso_esc_flag esc;
99 /* This records the bytes we've seen so far in an escape sequence,
100 in case the sequence is invalid (we spit out the bytes unchanged). */
101 unsigned char esc_bytes[8];
103 /* Index for next byte to store in ISO escape sequence. */
106 /* Stuff seen so far when composing a string. */
107 unsigned_char_dynarr *composite_chars;
109 /* If we saw an invalid designation sequence for a particular
110 register, we flag it here and switch to ASCII. The next time we
111 see a valid designation for this register, we turn off the flag
112 and do the designation normally, but pretend the sequence was
113 invalid. The effect of all this is that (most of the time) the
114 escape sequences for both the switch to the unknown charset, and
115 the switch back to the known charset, get inserted literally into
116 the buffer and saved out as such. The hope is that we can
117 preserve the escape sequences so that the resulting written out
118 file makes sense. If we don't do any of this, the designation
119 to the invalid charset will be preserved but that switch back
120 to the known charset will probably get eaten because it was
121 the same charset that was already present in the register. */
122 unsigned char invalid_designated[4];
124 /* We try to do similar things as above for direction-switching
125 sequences. If we encountered a direction switch while an
126 invalid designation was present, or an invalid designation
127 just after a direction switch (i.e. no valid designation
128 encountered yet), we insert the direction-switch escape
129 sequence literally into the output stream, and later on
130 insert the corresponding direction-restoring escape sequence
132 unsigned int switched_dir_and_no_valid_charset_yet :1;
133 unsigned int invalid_switch_dir :1;
135 /* Tells the decoder to output the escape sequence literally
136 even though it was valid. Used in the games we play to
137 avoid lossage when we encounter invalid designations. */
138 unsigned int output_literally :1;
139 /* We encountered a direction switch followed by an invalid
140 designation. We didn't output the direction switch
141 literally because we didn't know about the invalid designation;
142 but we have to do so now. */
143 unsigned int output_direction_sequence :1;
/* Forward declarations.  Fcopy_coding_system is declared here because
   setup_eol_coding_systems calls it before its DEFUN appears.  The
   static detect/decode/encode routines declared below are not defined
   in this part of the file.
   NOTE(review): several prototypes have lost their final parameter line
   in this listing (the ones whose last visible line ends in a comma). */
146 EXFUN (Fcopy_coding_system, 2);
148 struct detection_state;
149 static int detect_coding_sjis (struct detection_state *st,
150 CONST unsigned char *src,
152 static void decode_coding_sjis (Lstream *decoding,
153 CONST unsigned char *src,
154 unsigned_char_dynarr *dst,
156 static void encode_coding_sjis (Lstream *encoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static int detect_coding_big5 (struct detection_state *st,
161 CONST unsigned char *src,
163 static void decode_coding_big5 (Lstream *decoding,
164 CONST unsigned char *src,
165 unsigned_char_dynarr *dst, unsigned int n);
166 static void encode_coding_big5 (Lstream *encoding,
167 CONST unsigned char *src,
168 unsigned_char_dynarr *dst, unsigned int n);
169 static int postprocess_iso2022_mask (int mask);
170 static void reset_iso2022 (Lisp_Object coding_system,
171 struct iso2022_decoder *iso);
172 static int detect_coding_iso2022 (struct detection_state *st,
173 CONST unsigned char *src,
175 static void decode_coding_iso2022 (Lstream *decoding,
176 CONST unsigned char *src,
177 unsigned_char_dynarr *dst, unsigned int n);
178 static void encode_coding_iso2022 (Lstream *encoding,
179 CONST unsigned char *src,
180 unsigned_char_dynarr *dst, unsigned int n);
182 static void decode_coding_no_conversion (Lstream *decoding,
183 CONST unsigned char *src,
184 unsigned_char_dynarr *dst,
186 static void encode_coding_no_conversion (Lstream *encoding,
187 CONST unsigned char *src,
188 unsigned_char_dynarr *dst,
190 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
191 unsigned_char_dynarr *dst, unsigned int n);
192 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
193 unsigned_char_dynarr *dst, unsigned int n);
/* Property table support: a dynarr of codesys_prop entries plus the enum
   used to classify each property.  Fcoding_system_property scans
   the_codesys_prop_dynarr and reads the `sym' and `prop_type' members of
   each entry.
   NOTE(review): the struct body, the dynarr typedef head and most of the
   enum (Fcoding_system_property also uses CODESYS_PROP_ALL_OK and
   CODESYS_PROP_CCL) are missing from this listing. */
195 typedef struct codesys_prop codesys_prop;
204 Dynarr_declare (codesys_prop);
205 } codesys_prop_dynarr;
207 codesys_prop_dynarr *the_codesys_prop_dynarr;
209 enum codesys_prop_enum
212 CODESYS_PROP_ISO2022,
217 /************************************************************************/
218 /* Coding system functions */
219 /************************************************************************/
/* Prototypes for the three lrecord methods (mark, print, finalize) that
   are handed to DEFINE_LRECORD_IMPLEMENTATION just below for the
   "coding-system" record type backed by struct Lisp_Coding_System. */
221 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
222 static void print_coding_system (Lisp_Object, Lisp_Object, int);
223 static void finalize_coding_system (void *header, int for_disksave);
225 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
226 mark_coding_system, print_coding_system,
227 finalize_coding_system,
228 0, 0, struct Lisp_Coding_System);
/* Mark method for coding-system records.

   Calls MARKOBJ on the Lisp_Object slots of OBJ: name, doc string,
   mnemonic, the three EOL subsidiary slots, the type-specific slots
   (ISO2022 initial charsets and both charset-conversion tables; CCL
   decode/encode programs) and the pre-write-conversion slot.  The
   post-read-conversion slot is returned to the caller instead of being
   passed to MARKOBJ here.

   NOTE(review): the return-type line, the braces, the declaration of
   `i' and at least one `case' label (the one guarding the CCL slots)
   are absent from this listing. */
231 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
233 struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
235 (markobj) (CODING_SYSTEM_NAME (codesys));
236 (markobj) (CODING_SYSTEM_DOC_STRING (codesys));
237 (markobj) (CODING_SYSTEM_MNEMONIC (codesys));
238 (markobj) (CODING_SYSTEM_EOL_LF (codesys));
239 (markobj) (CODING_SYSTEM_EOL_CRLF (codesys));
240 (markobj) (CODING_SYSTEM_EOL_CR (codesys));
242 switch (CODING_SYSTEM_TYPE (codesys))
246 case CODESYS_ISO2022:
/* One initial charset per G0..G3 register. */
247 for (i = 0; i < 4; i++)
248 (markobj) (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
/* The conversion tables are optional; walk each only when it exists. */
249 if (codesys->iso2022.input_conv)
251 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
253 struct charset_conversion_spec *ccs =
254 Dynarr_atp (codesys->iso2022.input_conv, i);
255 (markobj) (ccs->from_charset);
256 (markobj) (ccs->to_charset);
259 if (codesys->iso2022.output_conv)
261 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
263 struct charset_conversion_spec *ccs =
264 Dynarr_atp (codesys->iso2022.output_conv, i);
265 (markobj) (ccs->from_charset);
266 (markobj) (ccs->to_charset);
272 (markobj) (CODING_SYSTEM_CCL_DECODE (codesys));
273 (markobj) (CODING_SYSTEM_CCL_ENCODE (codesys));
280 (markobj) (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
/* The last slot is handed back rather than marked in this function. */
281 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding-system records: writes "#<coding_system ",
   the name of OBJ (via print_internal), and ">" to PRINTCHARFUN.

   NOTE(review): the third parameter, the condition guarding the error()
   call and the argument for its %x conversion are not visible in this
   listing; the message indicates it reports an attempt to print this
   object readably -- confirm against the complete source. */
285 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
288 struct Lisp_Coding_System *c = XCODING_SYSTEM (obj);
290 error ("printing unreadable object #<coding_system 0x%x>",
293 write_c_string ("#<coding_system ", printcharfun);
294 print_internal (c->name, printcharfun, 1);
295 write_c_string (">", printcharfun);
/* Finalize method for coding-system records.

   HEADER is the record being finalized.  When FOR_DISKSAVE is zero and
   the record is of ISO2022 type, the input and output charset-conversion
   dynarrs are released with Dynarr_free and their pointers reset to 0 so
   a later pass sees them as absent.  Nothing is done when FOR_DISKSAVE
   is non-zero. */
299 finalize_coding_system (void *header, int for_disksave)
301 struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header;
302 /* Since coding systems never go away, this function is not
303 necessary. But it would be necessary if we changed things
304 so that coding systems could go away. */
305 if (!for_disksave) /* see comment in lstream.c */
307 switch (CODING_SYSTEM_TYPE (c))
310 case CODESYS_ISO2022:
311 if (c->iso2022.input_conv)
313 Dynarr_free (c->iso2022.input_conv);
314 c->iso2022.input_conv = 0;
316 if (c->iso2022.output_conv)
318 Dynarr_free (c->iso2022.output_conv);
319 c->iso2022.output_conv = 0;
/* Translate an eol-type SYMBOL into the matching enum value.

   CHECK_SYMBOL is applied to SYMBOL first.  Mapping:
     nil   -> EOL_AUTODETECT
     Qlf   -> EOL_LF
     Qcrlf -> EOL_CRLF
     Qcr   -> EOL_CR
   Any other symbol is reported with signal_simple_error
   ("Unrecognized eol type"). */
330 symbol_to_eol_type (Lisp_Object symbol)
332 CHECK_SYMBOL (symbol);
333 if (NILP (symbol)) return EOL_AUTODETECT;
334 if (EQ (symbol, Qlf)) return EOL_LF;
335 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
336 if (EQ (symbol, Qcr)) return EOL_CR;
338 signal_simple_error ("Unrecognized eol type", symbol);
339 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: map TYPE to Qlf, Qcrlf, Qcr, or Qnil
   (for EOL_AUTODETECT).  Any other value of TYPE calls abort ().
   NOTE(review): the `switch' head and braces are absent from this
   listing. */
343 eol_type_to_symbol (enum eol_type type)
347 case EOL_LF: return Qlf;
348 case EOL_CRLF: return Qcrlf;
349 case EOL_CR: return Qcr;
350 case EOL_AUTODETECT: return Qnil;
351 default: abort (); return Qnil; /* not reached */
/* Create the three EOL-specific subsidiaries of CODESYS.

   For each of "unix"/EOL_LF, "dos"/EOL_CRLF and "mac"/EOL_CR the
   DEFINE_SUB_CODESYS macro:
     - builds the name "<parent-name>-<op_sys>" and interns it;
     - copies the parent with Fcopy_coding_system under that name;
     - sets the copy's EOL type;
     - gives the copy a mnemonic made of the parent mnemonic followed by
       the suffix ("", ":T" or ":t");
     - stores the copy in the parent's matching CODING_SYSTEM_EOL_* slot.

   NOTE(review): `mlen' is used but its declaration is not in this
   listing, and the macro has lost some continuation lines.  As shown,
   codesys_mnemonic stays 0 when the parent mnemonic is not a string,
   so the strcpy/build_string on it presumably sit behind a guard that
   is not visible here -- confirm against the complete source. */
356 setup_eol_coding_systems (struct Lisp_Coding_System *codesys)
358 Lisp_Object codesys_obj;
359 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* len + 7 bytes hold the parent name plus the longest suffix appended
   below ("-unix", 5 bytes) and its terminating NUL. */
360 char *codesys_name = (char *) alloca (len + 7);
362 char *codesys_mnemonic=0;
364 Lisp_Object codesys_name_sym, sub_codesys_obj;
368 XSETCODING_SYSTEM (codesys_obj, codesys);
/* memcpy does not NUL-terminate; the strcpy at offset `len' inside the
   macro supplies the terminator. */
370 memcpy (codesys_name,
371 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
373 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
375 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
376 codesys_mnemonic = (char *) alloca (mlen + 7);
377 memcpy (codesys_mnemonic,
378 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
381 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
382 strcpy (codesys_name + len, "-" op_sys); \
384 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
385 codesys_name_sym = intern (codesys_name); \
386 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
387 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
389 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
390 build_string (codesys_mnemonic); \
391 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
394 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
395 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
396 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp primitive `coding-system-p' (exactly one argument).  The body
   returns Qt when OBJECT satisfies CODING_SYSTEMP and Qnil otherwise.
   NOTE(review): the end of the doc comment, the argument list and the
   braces are absent from this listing. */
399 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
400 Return t if OBJECT is a coding system.
401 A coding system is an object that defines how text containing multiple
402 character sets is encoded into a stream of (typically 8-bit) bytes.
403 The coding system is used to decode the stream into a series of
404 characters (which may be from multiple charsets) when the text is read
405 from a file or process, and is used to encode the text back into the
406 same format when it is written out to a file or process.
408 For example, many ISO2022-compliant coding systems (such as Compound
409 Text, which is used for inter-client data under the X Window System)
410 use escape sequences to switch between different charsets -- Japanese
411 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
412 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
413 `make-coding-system' for more information.
415 Coding systems are normally identified using a symbol, and the
416 symbol is accepted in place of the actual coding system object whenever
417 a coding system is called for. (This is similar to how faces work.)
421 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* Lisp primitive `find-coding-system' (exactly one argument).
   A coding-system object is returned unchanged.  Otherwise nil is
   replaced by Qbinary, the argument is checked with CHECK_SYMBOL, and
   the symbol is looked up in Vcoding_system_hashtable with Qnil as the
   not-found value.
   NOTE(review): the doc comment above documents neither the nil ->
   Qbinary substitution nor the error for non-symbol arguments. */
424 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
425 Retrieve the coding system of the given name.
427 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
428 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
429 If there is no such coding system, nil is returned. Otherwise the
430 associated coding system object is returned.
432 (coding_system_or_name))
434 if (CODING_SYSTEMP (coding_system_or_name))
435 return coding_system_or_name;
437 if (NILP (coding_system_or_name))
438 coding_system_or_name = Qbinary;
440 CHECK_SYMBOL (coding_system_or_name);
442 return Fgethash (coding_system_or_name, Vcoding_system_hashtable, Qnil);
/* Lisp primitive `get-coding-system' (exactly one argument).
   Delegates to Ffind_coding_system and signals "No such coding system"
   with NAME when that lookup yields nil; otherwise returns the
   coding-system object.  Most accessors in this file call this first to
   normalize their coding-system argument. */
445 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
446 Retrieve the coding system of the given name.
447 Same as `find-coding-system' except that if there is no such
448 coding system, an error is signaled instead of returning nil.
452 Lisp_Object coding_system = Ffind_coding_system (name);
454 if (NILP (coding_system))
455 signal_simple_error ("No such coding system", name);
456 return coding_system;
459 /* We store the coding systems in hash tables with the names as the key and the
460 actual coding system object as the value. Occasionally we need to use them
461 in a list format. These routines provide us with that. */
/* Closure handed to elisp_maphash by Fcoding_system_list and unpacked
   by add_coding_system_to_list_mapper. */
462 struct coding_system_list_closure
/* Address of the caller's list variable; the mapper conses each name
   onto the list through this pointer. */
464 Lisp_Object *coding_system_list;
/* Hash-table mapper used by Fcoding_system_list.

   HASH_CONTENTS is decoded into a coding-system object, and that
   object's name is consed onto the front of the list referenced by the
   closure.  HASH_KEY is decoded into `key' but not otherwise used in
   the lines shown.
   NOTE(review): the return type and the return statement are absent
   from this listing. */
468 add_coding_system_to_list_mapper (CONST void *hash_key, void *hash_contents,
469 void *coding_system_list_closure)
471 /* This function can GC */
472 Lisp_Object key, contents;
473 Lisp_Object *coding_system_list;
474 struct coding_system_list_closure *cscl =
475 (struct coding_system_list_closure *) coding_system_list_closure;
476 CVOID_TO_LISP (key, hash_key);
477 VOID_TO_LISP (contents, hash_contents);
478 coding_system_list = cscl->coding_system_list;
480 *coding_system_list = Fcons (XCODING_SYSTEM (contents)->name,
481 *coding_system_list);
/* Lisp primitive `coding-system-list' (no arguments).
   Starts from an empty list protected with GCPRO1, runs
   add_coding_system_to_list_mapper over Vcoding_system_hashtable with a
   closure pointing at that list, and returns the list of names.
   NOTE(review): a struct gcpro declaration and the matching UNGCPRO are
   presumably among the lines absent from this listing -- confirm. */
485 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
486 Return a list of the names of all defined coding systems.
490 Lisp_Object coding_system_list = Qnil;
492 struct coding_system_list_closure coding_system_list_closure;
494 GCPRO1 (coding_system_list);
495 coding_system_list_closure.coding_system_list = &coding_system_list;
496 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hashtable,
497 &coding_system_list_closure);
500 return coding_system_list;
/* Lisp primitive `coding-system-name' (exactly one argument).
   Normalizes the argument with Fget_coding_system (which signals an
   error for an unknown name) and returns the object's name slot. */
503 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
504 Return the name of the given coding system.
508 coding_system = Fget_coding_system (coding_system);
509 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a coding-system record of TYPE named NAME.

   The record is allocated with alloc_lcrecord_type and cleared with
   zero_lcrecord; its Lisp_Object slots are then explicitly set to Qnil
   (pre-write/post-read conversion, the three EOL subsidiaries, the
   mnemonic, and -- depending on TYPE -- the four ISO2022 initial
   charsets or the CCL decode/encode programs).  The EOL type starts as
   EOL_AUTODETECT.  The doc-string slot is not assigned here;
   Fmake_coding_system sets it after calling this function.

   NOTE(review): the braces, the declaration of `i' and the return
   statement are absent from this listing. */
512 static struct Lisp_Coding_System *
513 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
515 struct Lisp_Coding_System *codesys =
516 alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system);
518 zero_lcrecord (codesys);
519 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
520 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
521 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
522 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
523 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
524 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
525 CODING_SYSTEM_TYPE (codesys) = type;
526 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
/* Type-specific Lisp_Object slots. */
528 if (type == CODESYS_ISO2022)
531 for (i = 0; i < 4; i++)
532 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
534 else if (type == CODESYS_CCL)
536 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
537 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
540 CODING_SYSTEM_NAME (codesys) = name;
546 /* Given a list of charset conversion specs as specified in a Lisp
547 program, parse it into STORE_HERE. */
/* Each element of SPEC_LIST must be a proper two-element list
   (FROM TO); anything else signals "Invalid charset conversion spec".
   Both elements are resolved with Fget_charset, and their charset types
   must be equal, otherwise "Attempted conversion between different
   charset types" is signaled.  Accepted pairs are appended to
   STORE_HERE in list order.
   NOTE(review): the return type, the declaration of `rest' and the
   arguments of signal_simple_error_2 are absent from this listing. */
550 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
551 Lisp_Object spec_list)
555 EXTERNAL_LIST_LOOP (rest, spec_list)
557 Lisp_Object car = XCAR (rest);
558 Lisp_Object from, to;
559 struct charset_conversion_spec spec;
/* Shape check: exactly two elements. */
561 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
562 signal_simple_error ("Invalid charset conversion spec", car);
563 from = Fget_charset (XCAR (car));
564 to = Fget_charset (XCAR (XCDR (car)));
565 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
566 signal_simple_error_2
567 ("Attempted conversion between different charset types",
569 spec.from_charset = from;
570 spec.to_charset = to;
572 Dynarr_add (store_here, spec);
576 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
577 specs, return the equivalent as the Lisp programmer would see it.
579 If LOAD_HERE is 0, return Qnil. */
/* NOTE(review): the return type, the local declarations and the
   null-LOAD_HERE check promised by the comment above are not visible in
   this listing. */
582 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
/* Each entry becomes a (FROM TO) list consed onto the front of
   `result', so the list is reversed once at the end to restore the
   dynarr order. */
589 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
591 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
592 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
595 return Fnreverse (result);
/* Lisp primitive `make-coding-system' (2 to 4 arguments).

   Outline of the body as shown:
     1. TYPE is mapped to an enum coding_system_type; nil and Qundecided
	both select CODESYS_AUTODETECT, and an unknown TYPE signals
	"Invalid coding system type".
     2. A record is obtained from allocate_coding_system.
     3. DOC-STRING defaults to "" when nil, must otherwise pass
	CHECK_STRING, and is stored in the record.
     4. PROPS is walked as a property list.  The generic keys (mnemonic,
	eol-type, post-read-conversion, pre-write-conversion) are handled
	first, then the ISO2022-only or CCL-only keys according to TYPE.
	Any key not handled signals "Unrecognized property".
     5. Subsidiary EOL systems are created with setup_eol_coding_systems
	unless an eol-type property with a non-nil value was supplied.
     6. The record is stored in Vcoding_system_hashtable under NAME.

   NOTE(review): every occurrence of the input-charset-conversion or
   output-charset-conversion key assigns a fresh Dynarr_new result to
   the slot without releasing a previous one, so a repeated key would
   presumably leak the earlier dynarr -- confirm.
   NOTE(review): many lines (doc-comment entries, braces, `else'
   keywords, preprocessor conditionals, the return statement) are absent
   from this listing. */
600 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
601 Register symbol NAME as a coding system.
603 TYPE describes the conversion method used and should be one of
606 Automatic conversion. XEmacs attempts to detect the coding system
609 No conversion. Use this for binary files and such. On output,
610 graphic characters that are not in ASCII or Latin-1 will be
611 replaced by a ?. (For a no-conversion-encoded buffer, these
612 characters will only be present if you explicitly insert them.)
614 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
616 Any ISO2022-compliant encoding. Among other things, this includes
617 JIS (the Japanese encoding commonly used for e-mail), EUC (the
618 standard Unix encoding for Japanese and other languages), and
619 Compound Text (the encoding used in X11). You can specify more
620 specific information about the conversion with the FLAGS argument.
622 Big5 (the encoding commonly used for Taiwanese).
624 The conversion is performed using a user-written pseudo-code
625 program. CCL (Code Conversion Language) is the name of this
628 Write out or read in the raw contents of the memory representing
629 the buffer's text. This is primarily useful for debugging
630 purposes, and is only enabled when XEmacs has been compiled with
631 DEBUG_XEMACS defined (via the --debug configure option).
632 WARNING: Reading in a file using 'internal conversion can result
633 in an internal inconsistency in the memory representing a
634 buffer's text, which will produce unpredictable results and may
635 cause XEmacs to crash. Under normal circumstances you should
636 never use 'internal conversion.
638 DOC-STRING is a string describing the coding system.
640 PROPS is a property list, describing the specific nature of the
641 character set. Recognized properties are:
644 String to be displayed in the modeline when this coding system is
648 End-of-line conversion to be used. It should be one of
651 Automatically detect the end-of-line type (LF, CRLF,
652 or CR). Also generate subsidiary coding systems named
653 `NAME-unix', `NAME-dos', and `NAME-mac', that are
654 identical to this coding system but have an EOL-TYPE
655 value of 'lf, 'crlf, and 'cr, respectively.
657 The end of a line is marked externally using ASCII LF.
658 Since this is also the way that XEmacs represents an
659 end-of-line internally, specifying this option results
660 in no end-of-line conversion. This is the standard
661 format for Unix text files.
663 The end of a line is marked externally using ASCII
664 CRLF. This is the standard format for MS-DOS text
667 The end of a line is marked externally using ASCII CR.
668 This is the standard format for Macintosh text files.
670 Automatically detect the end-of-line type but do not
671 generate subsidiary coding systems. (This value is
672 converted to nil when stored internally, and
673 `coding-system-property' will return nil.)
675 'post-read-conversion
676 Function called after a file has been read in, to perform the
677 decoding. Called with two arguments, BEG and END, denoting
678 a region of the current buffer to be decoded.
680 'pre-write-conversion
681 Function called before a file is written out, to perform the
682 encoding. Called with two arguments, BEG and END, denoting
683 a region of the current buffer to be encoded.
686 The following additional properties are recognized if TYPE is 'iso2022:
692 The character set initially designated to the G0 - G3 registers.
693 The value should be one of
695 -- A charset object (designate that character set)
696 -- nil (do not ever use this register)
697 -- t (no character set is initially designated to
698 the register, but may be later on; this automatically
699 sets the corresponding `force-g*-on-output' property)
705 If non-nil, send an explicit designation sequence on output before
706 using the specified register.
709 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
710 "ESC $ B" on output in place of the full designation sequences
711 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
714 If non-nil, don't designate ASCII to G0 at each end of line on output.
715 Setting this to non-nil also suppresses other state-resetting that
716 normally happens at the end of a line.
719 If non-nil, don't designate ASCII to G0 before control chars on output.
722 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
726 If non-nil, use locking-shift (SO/SI) instead of single-shift
727 or designation by escape sequence.
730 If non-nil, don't use ISO6429's direction specification.
733 If non-nil, literal control characters that are the same as
734 the beginning of a recognized ISO2022 or ISO6429 escape sequence
735 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
736 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
737 so that they can be properly distinguished from an escape sequence.
738 (Note that doing this results in a non-portable encoding.) This
739 encoding flag is used for byte-compiled files. Note that ESC
740 is a good choice for a quoting character because there are no
741 escape sequences whose second byte is a character from the Control-0
742 or Control-1 character sets; this is explicitly disallowed by the
745 'input-charset-conversion
746 A list of conversion specifications, specifying conversion of
747 characters in one charset to another when decoding is performed.
748 Each specification is a list of two elements: the source charset,
749 and the destination charset.
751 'output-charset-conversion
752 A list of conversion specifications, specifying conversion of
753 characters in one charset to another when encoding is performed.
754 The form of each specification is the same as for
755 'input-charset-conversion.
758 The following additional properties are recognized (and required)
762 CCL program used for decoding (converting to internal format).
765 CCL program used for encoding (converting to external format).
767 (name, type, doc_string, props))
769 struct Lisp_Coding_System *codesys;
770 Lisp_Object rest, key, value;
771 enum coding_system_type ty;
772 int need_to_setup_eol_systems = 1;
774 /* Convert type to constant */
775 if (NILP (type) || EQ (type, Qundecided))
776 { ty = CODESYS_AUTODETECT; }
778 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
779 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
780 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
781 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
783 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
785 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
788 signal_simple_error ("Invalid coding system type", type);
792 codesys = allocate_coding_system (ty, name);
794 if (NILP (doc_string))
795 doc_string = build_string ("");
797 CHECK_STRING (doc_string);
798 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
800 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
802 if (EQ (key, Qmnemonic))
805 CHECK_STRING (value);
806 CODING_SYSTEM_MNEMONIC (codesys) = value;
809 else if (EQ (key, Qeol_type))
811 need_to_setup_eol_systems = NILP (value);
814 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
817 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
818 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
820 else if (ty == CODESYS_ISO2022)
822 #define FROB_INITIAL_CHARSET(charset_num) \
823 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
824 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
826 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
827 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
828 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
829 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
831 #define FROB_FORCE_CHARSET(charset_num) \
832 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
834 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
835 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
836 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
837 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
839 #define FROB_BOOLEAN_PROPERTY(prop) \
840 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
842 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
843 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
844 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
845 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
846 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
847 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
848 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
850 else if (EQ (key, Qinput_charset_conversion))
852 codesys->iso2022.input_conv =
853 Dynarr_new (charset_conversion_spec);
854 parse_charset_conversion_specs (codesys->iso2022.input_conv,
857 else if (EQ (key, Qoutput_charset_conversion))
859 codesys->iso2022.output_conv =
860 Dynarr_new (charset_conversion_spec);
861 parse_charset_conversion_specs (codesys->iso2022.output_conv,
865 signal_simple_error ("Unrecognized property", key);
867 else if (EQ (type, Qccl))
869 if (EQ (key, Qdecode))
871 CHECK_VECTOR (value);
872 CODING_SYSTEM_CCL_DECODE (codesys) = value;
874 else if (EQ (key, Qencode))
876 CHECK_VECTOR (value);
877 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
880 signal_simple_error ("Unrecognized property", key);
884 signal_simple_error ("Unrecognized property", key);
887 if (need_to_setup_eol_systems)
888 setup_eol_coding_systems (codesys);
891 Lisp_Object codesys_obj;
892 XSETCODING_SYSTEM (codesys_obj, codesys);
893 Fputhash (name, codesys_obj, Vcoding_system_hashtable);
/* Lisp primitive `copy-coding-system' (exactly two arguments).

   OLD-CODING-SYSTEM is normalized with Fget_coding_system.  If NEW-NAME
   does not yet name a coding system, a record of the same type is
   allocated and registered in Vcoding_system_hashtable.  Every byte of
   the source record after its header is then memcpy'd over the
   destination record, and the destination object is returned.

   NOTE(review): the memcpy is a shallow copy, so for an ISO2022 source
   the iso2022.input_conv / iso2022.output_conv dynarr pointers appear
   to end up shared by both records, while finalize_coding_system frees
   them per record -- confirm whether that can double-free.
   NOTE(review): the memcpy also overwrites the destination's name slot;
   no line restoring NEW-NAME afterwards is visible in this listing. */
898 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
899 Copy OLD-CODING-SYSTEM to NEW-NAME.
900 If NEW-NAME does not name an existing coding system, a new one will
903 (old_coding_system, new_name))
905 Lisp_Object new_coding_system;
906 old_coding_system = Fget_coding_system (old_coding_system);
907 new_coding_system = Ffind_coding_system (new_name);
908 if (NILP (new_coding_system))
910 XSETCODING_SYSTEM (new_coding_system,
911 allocate_coding_system
912 (XCODING_SYSTEM_TYPE (old_coding_system),
914 Fputhash (new_name, new_coding_system, Vcoding_system_hashtable);
918 struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
919 struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
920 memcpy (((char *) to ) + sizeof (to->header),
921 ((char *) from) + sizeof (from->header),
922 sizeof (*from) - sizeof (from->header));
925 return new_coding_system;
/* Return the variant of CODING_SYSTEM to use for eol type TYPE.

   CODING_SYSTEM itself is returned when its own eol type is not
   EOL_AUTODETECT, when TYPE is EOL_AUTODETECT, or when the selected
   subsidiary slot is nil.  Otherwise the object stored in the
   EOL_LF / EOL_CR / EOL_CRLF slot matching TYPE is returned.
   NOTE(review): the return type, the `switch' head, its default branch
   and the braces are absent from this listing. */
929 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
931 struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
932 Lisp_Object new_coding_system;
/* A system whose eol type is already fixed is returned unchanged. */
934 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
935 return coding_system;
939 case EOL_AUTODETECT: return coding_system;
940 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
941 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
942 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
/* Fall back to the parent when the chosen slot holds nil. */
946 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* Lisp primitive `subsidiary-coding-system' (exactly two arguments).
   Normalizes CODING-SYSTEM with Fget_coding_system, converts EOL-TYPE
   with symbol_to_eol_type (which signals on an unrecognized symbol) and
   returns the result of subsidiary_coding_system. */
949 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
950 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
952 (coding_system, eol_type))
954 coding_system = Fget_coding_system (coding_system);
956 return subsidiary_coding_system (coding_system,
957 symbol_to_eol_type (eol_type));
961 /************************************************************************/
962 /* Coding system accessors */
963 /************************************************************************/
/* Lisp primitive `coding-system-doc-string' (exactly one argument).
   Normalizes the argument with Fget_coding_system and returns the
   object's doc-string slot. */
965 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
966 Return the doc string for CODING-SYSTEM.
970 coding_system = Fget_coding_system (coding_system);
971 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* Lisp primitive `coding-system-type' (exactly one argument).
   Normalizes the argument with Fget_coding_system and maps the record's
   type enum back to the symbol accepted by Fmake_coding_system
   (CODESYS_AUTODETECT yields Qundecided).
   NOTE(review): the handling of any other enum value (lines between the
   last `case' and the final return) is not visible in this listing. */
974 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
975 Return the type of CODING-SYSTEM.
979 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
981 case CODESYS_AUTODETECT: return Qundecided;
983 case CODESYS_SHIFT_JIS: return Qshift_jis;
984 case CODESYS_ISO2022: return Qiso2022;
985 case CODESYS_BIG5: return Qbig5;
986 case CODESYS_CCL: return Qccl;
988 case CODESYS_NO_CONVERSION: return Qno_conversion;
990 case CODESYS_INTERNAL: return Qinternal;
996 return Qnil; /* not reached */
/* Return the name of the charset initially designated to register GNUM
   of CODING_SYSTEM, or Qnil when that slot does not hold a charset
   object (the slot may also hold t or nil, see Fmake_coding_system).
   GNUM is used directly as the slot index; no range check is made here.
   NOTE(review): the declaration of `cs' is split across a line that is
   absent from this listing. */
1001 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1004 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1006 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp primitive `coding-system-charset' (exactly two arguments).
   Normalizes CODING-SYSTEM with Fget_coding_system and returns
   coding_system_charset for the integer value of GNUM.
   NOTE(review): in the lines shown there is no check that GNUM lies in
   0..3 (the initial-charset slots are indexed 0..3 elsewhere in this
   file) nor that the coding system is of ISO2022 type before the slot
   is read; any type check on GNUM would be on a line absent from this
   listing -- confirm against the complete source. */
1009 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1010 Return initial charset of CODING-SYSTEM designated to GNUM.
1013 (coding_system, gnum))
1015 coding_system = Fget_coding_system (coding_system);
1018 return coding_system_charset (coding_system, XINT (gnum));
1022 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1023 Return the PROP property of CODING-SYSTEM.
1025 (coding_system, prop))
1028 enum coding_system_type type;
1030 coding_system = Fget_coding_system (coding_system);
1031 CHECK_SYMBOL (prop);
1032 type = XCODING_SYSTEM_TYPE (coding_system);
1034 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1035 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1038 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1040 case CODESYS_PROP_ALL_OK:
1043 case CODESYS_PROP_ISO2022:
1044 if (type != CODESYS_ISO2022)
1046 ("Property only valid in ISO2022 coding systems",
1050 case CODESYS_PROP_CCL:
1051 if (type != CODESYS_CCL)
1053 ("Property only valid in CCL coding systems",
1063 signal_simple_error ("Unrecognized property", prop);
1065 if (EQ (prop, Qname))
1066 return XCODING_SYSTEM_NAME (coding_system);
1067 else if (EQ (prop, Qtype))
1068 return Fcoding_system_type (coding_system);
1069 else if (EQ (prop, Qdoc_string))
1070 return XCODING_SYSTEM_DOC_STRING (coding_system);
1071 else if (EQ (prop, Qmnemonic))
1072 return XCODING_SYSTEM_MNEMONIC (coding_system);
1073 else if (EQ (prop, Qeol_type))
1074 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1075 else if (EQ (prop, Qeol_lf))
1076 return XCODING_SYSTEM_EOL_LF (coding_system);
1077 else if (EQ (prop, Qeol_crlf))
1078 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1079 else if (EQ (prop, Qeol_cr))
1080 return XCODING_SYSTEM_EOL_CR (coding_system);
1081 else if (EQ (prop, Qpost_read_conversion))
1082 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1083 else if (EQ (prop, Qpre_write_conversion))
1084 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1086 else if (type == CODESYS_ISO2022)
1088 if (EQ (prop, Qcharset_g0))
1089 return coding_system_charset (coding_system, 0);
1090 else if (EQ (prop, Qcharset_g1))
1091 return coding_system_charset (coding_system, 1);
1092 else if (EQ (prop, Qcharset_g2))
1093 return coding_system_charset (coding_system, 2);
1094 else if (EQ (prop, Qcharset_g3))
1095 return coding_system_charset (coding_system, 3);
1097 #define FORCE_CHARSET(charset_num) \
1098 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1099 (coding_system, charset_num) ? Qt : Qnil)
1101 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1102 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1103 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1104 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1106 #define LISP_BOOLEAN(prop) \
1107 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1109 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1110 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1111 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1112 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1113 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1114 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1115 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1117 else if (EQ (prop, Qinput_charset_conversion))
1119 unparse_charset_conversion_specs
1120 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1121 else if (EQ (prop, Qoutput_charset_conversion))
1123 unparse_charset_conversion_specs
1124 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1128 else if (type == CODESYS_CCL)
1130 if (EQ (prop, Qdecode))
1131 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1132 else if (EQ (prop, Qencode))
1133 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1141 return Qnil; /* not reached */
1145 /************************************************************************/
1146 /* Coding category functions */
1147 /************************************************************************/
1150 decode_coding_category (Lisp_Object symbol)
1154 CHECK_SYMBOL (symbol);
1155 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1156 if (EQ (coding_category_symbol[i], symbol))
1159 signal_simple_error ("Unrecognized coding category", symbol);
1160 return 0; /* not reached */
1163 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1164 Return a list of all recognized coding categories.
1169 Lisp_Object list = Qnil;
1171 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1172 list = Fcons (coding_category_symbol[i], list);
1176 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1177 Change the priority order of the coding categories.
1178 LIST should be a list of coding categories, in descending order of
1179 priority. Unspecified coding categories will be lower in priority
1180 than all specified ones, in the same relative order they were in
1185 int category_to_priority[CODING_CATEGORY_LAST + 1];
1189 /* First generate a list that maps coding categories to priorities. */
1191 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1192 category_to_priority[i] = -1;
1194 /* Highest priority comes from the specified list. */
1196 EXTERNAL_LIST_LOOP (rest, list)
1198 int cat = decode_coding_category (XCAR (rest));
1200 if (category_to_priority[cat] >= 0)
1201 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1202 category_to_priority[cat] = i++;
1205 /* Now go through the existing categories by priority to retrieve
1206 the categories not yet specified and preserve their priority
1208 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1210 int cat = coding_category_by_priority[j];
1211 if (category_to_priority[cat] < 0)
1212 category_to_priority[cat] = i++;
1215 /* Now we need to construct the inverse of the mapping we just
1218 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1219 coding_category_by_priority[category_to_priority[i]] = i;
1221 /* Phew! That was confusing. */
1225 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1226 Return a list of coding categories in descending order of priority.
1231 Lisp_Object list = Qnil;
1233 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1234 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1239 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1240 Change the coding system associated with a coding category.
1242 (coding_category, coding_system))
1244 int cat = decode_coding_category (coding_category);
1246 coding_system = Fget_coding_system (coding_system);
1247 coding_category_system[cat] = coding_system;
1251 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1252 Return the coding system associated with a coding category.
1256 int cat = decode_coding_category (coding_category);
1257 Lisp_Object sys = coding_category_system[cat];
1260 return XCODING_SYSTEM_NAME (sys);
1265 /************************************************************************/
1266 /* Detecting the encoding of data */
1267 /************************************************************************/
1269 struct detection_state
1271 enum eol_type eol_type;
1293 struct iso2022_decoder iso;
1295 int high_byte_count;
1296 unsigned int saw_single_shift:1;
1309 acceptable_control_char_p (int c)
1313 /* Allow and ignore control characters that you might
1314 reasonably see in a text file */
1319 case 8: /* backspace */
1320 case 11: /* vertical tab */
1321 case 12: /* form feed */
1322 case 26: /* MS-DOS C-z junk */
1323 case 31: /* '^_' -- for info */
1331 mask_has_at_most_one_bit_p (int mask)
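1333 /* Perhaps the only thing useful you learn from intensive Microsoft
1334 technical interviews: MASK & (MASK - 1) clears the lowest set bit, so the result is zero only when MASK has at most one bit set. */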
1335 return (mask & (mask - 1)) == 0;
1338 static enum eol_type
1339 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1348 st->eol.just_saw_cr = 1;
1353 if (st->eol.just_saw_cr)
1355 else if (st->eol.seen_anything)
1358 else if (st->eol.just_saw_cr)
1360 st->eol.just_saw_cr = 0;
1362 st->eol.seen_anything = 1;
1365 return EOL_AUTODETECT;
1368 /* Attempt to determine the encoding and EOL type of the given text.
1369 Before calling this function for the first time, you must initialize
1370 st->eol_type as appropriate and initialize st->mask to ~0.
1372 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1375 st->mask holds the determined coding category mask, or ~0 if only
1376 ASCII has been seen so far.
1380 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1381 is present in st->mask
1382 1 == definitive answers are here for both st->eol_type and st->mask
1386 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1387 unsigned int n, int just_do_eol)
1391 if (st->eol_type == EOL_AUTODETECT)
1392 st->eol_type = detect_eol_type (st, src, n);
1395 return st->eol_type != EOL_AUTODETECT;
1397 if (!st->seen_non_ascii)
1399 for (; n; n--, src++)
1402 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1404 st->seen_non_ascii = 1;
1406 st->shift_jis.mask = ~0;
1408 st->iso2022.mask = ~0;
1418 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1419 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1420 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1421 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1422 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1423 st->big5.mask = detect_coding_big5 (st, src, n);
1425 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask;
1428 int retval = mask_has_at_most_one_bit_p (st->mask);
1429 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1430 return retval && st->eol_type != EOL_AUTODETECT;
1435 coding_system_from_mask (int mask)
1439 /* If the file was entirely or basically ASCII, use the
1440 default value of `buffer-file-coding-system'. */
1441 Lisp_Object retval =
1442 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1445 retval = Ffind_coding_system (retval);
1449 (Qbad_variable, Qwarning,
1450 "Invalid `default-buffer-file-coding-system', set to nil");
1451 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1455 retval = Fget_coding_system (Qno_conversion);
1463 mask = postprocess_iso2022_mask (mask);
1465 /* Look through the coding categories by priority and find
1466 the first one that is allowed. */
1467 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1469 cat = coding_category_by_priority[i];
1470 if ((mask & (1 << cat)) &&
1471 !NILP (coding_category_system[cat]))
1475 return coding_category_system[cat];
1477 return Fget_coding_system (Qno_conversion);
1481 /* Given a seekable read stream and potential coding system and EOL type
1482 as specified, do any autodetection that is called for. If the
1483 coding system and/or EOL type are not autodetect, they will be left
1484 alone; but this function will never return an autodetect coding system
1487 This function does not automatically fetch subsidiary coding systems;
1488 that should be unnecessary with the explicit eol-type argument. */
1491 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1492 enum eol_type *eol_type_in_out)
1494 struct detection_state decst;
1496 if (*eol_type_in_out == EOL_AUTODETECT)
1497 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1500 decst.eol_type = *eol_type_in_out;
1503 /* If autodetection is called for, do it now. */
1504 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1505 *eol_type_in_out == EOL_AUTODETECT)
1510 unsigned char random_buffer[4096];
1513 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1516 if (detect_coding_type (&decst, random_buffer, nread,
1517 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1518 CODESYS_AUTODETECT))
1522 *eol_type_in_out = decst.eol_type;
1523 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1524 *codesys_in_out = coding_system_from_mask (decst.mask);
1527 /* If we absolutely can't determine the EOL type, just assume LF. */
1528 if (*eol_type_in_out == EOL_AUTODETECT)
1529 *eol_type_in_out = EOL_LF;
1531 Lstream_rewind (stream);
1534 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1535 Detect coding system of the text in the region between START and END.
1536 Return a list of possible coding systems ordered by priority.
1537 If only ASCII characters are found, it returns 'undecided or one of
1538 its subsidiary coding systems according to a detected end-of-line
1539 type. Optional arg BUFFER defaults to the current buffer.
1541 (start, end, buffer))
1543 Lisp_Object val = Qnil;
1544 struct buffer *buf = decode_buffer (buffer, 0);
1546 Lisp_Object instream, lb_instream;
1547 Lstream *istr, *lb_istr;
1548 struct detection_state decst;
1549 struct gcpro gcpro1, gcpro2;
1551 get_buffer_range_char (buf, start, end, &b, &e, 0);
1552 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1553 lb_istr = XLSTREAM (lb_instream);
1554 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1555 istr = XLSTREAM (instream);
1556 GCPRO2 (instream, lb_instream);
1558 decst.eol_type = EOL_AUTODETECT;
1562 unsigned char random_buffer[4096];
1563 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1567 if (detect_coding_type (&decst, random_buffer, nread, 0))
1571 if (decst.mask == ~0)
1572 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1580 decst.mask = postprocess_iso2022_mask (decst.mask);
1582 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1584 int sys = coding_category_by_priority[i];
1585 if (decst.mask & (1 << sys))
1587 Lisp_Object codesys = coding_category_system[sys];
1588 if (!NILP (codesys))
1589 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1590 val = Fcons (codesys, val);
1594 Lstream_close (istr);
1596 Lstream_delete (istr);
1597 Lstream_delete (lb_istr);
1602 /************************************************************************/
1603 /* Converting to internal Mule format ("decoding") */
1604 /************************************************************************/
1606 /* A decoding stream is a stream used for decoding text (i.e.
1607 converting from some external format to internal format).
1608 The decoding-stream object keeps track of the actual coding
1609 stream, the stream that is at the other end, and data that
1610 needs to be persistent across the lifetime of the stream. */
1612 /* Handle the EOL stuff related to just-read-in character C.
1613 EOL_TYPE is the EOL type of the coding stream.
1614 FLAGS is the current value of FLAGS in the coding stream, and may
1615 be modified by this macro. (The macro only looks at the
1616 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1617 bytes are to be written. You need to also define a local goto
1618 label "label_continue_loop" that is at the end of the main
1619 character-reading loop.
1621 If C is a CR character, then this macro handles it entirely and
1622 jumps to label_continue_loop. Otherwise, this macro does not add
1623 anything to DST, and continues normally. You should continue
1624 processing C normally after this macro. */
1626 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1630 if (eol_type == EOL_CR) \
1631 Dynarr_add (dst, '\n'); \
1632 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1633 Dynarr_add (dst, c); \
1635 flags |= CODING_STATE_CR; \
1636 goto label_continue_loop; \
1638 else if (flags & CODING_STATE_CR) \
1639 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1641 Dynarr_add (dst, '\r'); \
1642 flags &= ~CODING_STATE_CR; \
1646 /* C should be a binary character in the range 0 - 255; convert
1647 to internal format and add to Dynarr DST. */
1649 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1651 if (BYTE_ASCII_P (c)) \
1652 Dynarr_add (dst, c); \
1653 else if (BYTE_C1_P (c)) \
1655 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1656 Dynarr_add (dst, c + 0x20); \
1660 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1661 Dynarr_add (dst, c); \
1665 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1669 DECODE_ADD_BINARY_CHAR (ch, dst); \
1674 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1676 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1677 if ((flags & CODING_STATE_END) && \
1678 (flags & CODING_STATE_CR)) \
1679 Dynarr_add (dst, '\r'); \
1682 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1684 struct decoding_stream
1686 /* Coding system that governs the conversion. */
1687 struct Lisp_Coding_System *codesys;
1689 /* Stream that we read the encoded data from or
1690 write the decoded data to. */
1693 /* If we are reading, then we can return only a fixed amount of
1694 data, so if the conversion resulted in too much data, we store it
1695 here for retrieval the next time around. */
1696 unsigned_char_dynarr *runoff;
1698 /* FLAGS holds flags indicating the current state of the decoding.
1699 Some of these flags are dependent on the coding system. */
1702 /* CH holds a partially built-up character. Since we only deal
1703 with one- and two-byte characters at the moment, we only use
1704 this to store the first byte of a two-byte character. */
1707 /* EOL_TYPE specifies the type of end-of-line conversion that
1708 currently applies. We need to keep this separate from the
1709 EOL type stored in CODESYS because the latter might indicate
1710 automatic EOL-type detection while the former will always
1711 indicate a particular EOL type. */
1712 enum eol_type eol_type;
1714 /* Additional ISO2022 information. We define the structure above
1715 because it's also needed by the detection routines. */
1716 struct iso2022_decoder iso2022;
1718 /* Additional information (the state of the running CCL program)
1719 used by the CCL decoder. */
1720 struct ccl_program ccl;
1722 struct detection_state decst;
1725 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1726 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1727 static int decoding_rewinder (Lstream *stream);
1728 static int decoding_seekable_p (Lstream *stream);
1729 static int decoding_flusher (Lstream *stream);
1730 static int decoding_closer (Lstream *stream);
1732 static Lisp_Object decoding_marker (Lisp_Object stream,
1733 void (*markobj) (Lisp_Object));
1735 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1736 sizeof (struct decoding_stream));
1739 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1741 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1742 Lisp_Object str_obj;
1744 /* We do not need to mark the coding systems or charsets stored
1745 within the stream because they are stored in a global list
1746 and automatically marked. */
1748 XSETLSTREAM (str_obj, str);
1749 (markobj) (str_obj);
1750 if (str->imp->marker)
1751 return (str->imp->marker) (str_obj, markobj);
1756 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1757 so we read data from the other end, decode it, and store it into DATA. */
1760 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1762 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1763 unsigned char *orig_data = data;
1765 int error_occurred = 0;
1767 /* We need to interface to mule_decode(), which expects to take some
1768 amount of data and store the result into a Dynarr. We have
1769 mule_decode() store into str->runoff, and take data from there
1772 /* We loop until we have enough data, reading chunks from the other
1773 end and decoding it. */
1776 /* Take data from the runoff if we can. Make sure to take at
1777 most SIZE bytes, and delete the data from the runoff. */
1778 if (Dynarr_length (str->runoff) > 0)
1780 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1781 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1782 Dynarr_delete_many (str->runoff, 0, chunk);
1788 break; /* No more room for data */
1790 if (str->flags & CODING_STATE_END)
1791 /* This means that on the previous iteration, we hit the EOF on
1792 the other end. We loop once more so that mule_decode() can
1793 output any final stuff it may be holding, or any "go back
1794 to a sane state" escape sequences. (This latter makes sense
1795 during encoding.) */
1798 /* Exhausted the runoff, so get some more. DATA has at least
1799 SIZE bytes left of storage in it, so it's OK to read directly
1800 into it. (We'll be overwriting above, after we've decoded it
1801 into the runoff.) */
1802 read_size = Lstream_read (str->other_end, data, size);
1809 /* There might be some more end data produced in the translation.
1810 See the comment above. */
1811 str->flags |= CODING_STATE_END;
1812 mule_decode (stream, data, str->runoff, read_size);
1815 if (data - orig_data == 0)
1816 return error_occurred ? -1 : 0;
1818 return data - orig_data;
1822 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1824 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1827 /* Decode all our data into the runoff, and then attempt to write
1828 it all out to the other end. Remove whatever chunk we succeeded
1830 mule_decode (stream, data, str->runoff, size);
1831 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1832 Dynarr_length (str->runoff));
1834 Dynarr_delete_many (str->runoff, 0, retval);
1835 /* Do NOT return retval. The return value indicates how much
1836 of the incoming data was written, not how many bytes were
1842 reset_decoding_stream (struct decoding_stream *str)
1845 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1847 Lisp_Object coding_system;
1848 XSETCODING_SYSTEM (coding_system, str->codesys);
1849 reset_iso2022 (coding_system, &str->iso2022);
1851 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1853 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1856 str->flags = str->ch = 0;
1860 decoding_rewinder (Lstream *stream)
1862 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1863 reset_decoding_stream (str);
1864 Dynarr_reset (str->runoff);
1865 return Lstream_rewind (str->other_end);
1869 decoding_seekable_p (Lstream *stream)
1871 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1872 return Lstream_seekable_p (str->other_end);
1876 decoding_flusher (Lstream *stream)
1878 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1879 return Lstream_flush (str->other_end);
1883 decoding_closer (Lstream *stream)
1885 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1886 if (stream->flags & LSTREAM_FL_WRITE)
1888 str->flags |= CODING_STATE_END;
1889 decoding_writer (stream, 0, 0);
1891 Dynarr_free (str->runoff);
1893 if (str->iso2022.composite_chars)
1894 Dynarr_free (str->iso2022.composite_chars);
1896 return Lstream_close (str->other_end);
1900 decoding_stream_coding_system (Lstream *stream)
1902 Lisp_Object coding_system;
1903 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1905 XSETCODING_SYSTEM (coding_system, str->codesys);
1906 return subsidiary_coding_system (coding_system, str->eol_type);
1910 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1912 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1913 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1915 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1916 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1917 reset_decoding_stream (str);
1920 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
1921 stream for writing, no automatic code detection will be performed.
1922 The reason for this is that automatic code detection requires a
1923 seekable input. Things will also fail if you open a decoding
1924 stream for reading using a non-fully-specified coding system and
1925 a non-seekable input stream. */
1928 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1931 Lstream *lstr = Lstream_new (lstream_decoding, mode);
1932 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1936 str->other_end = stream;
1937 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1938 str->eol_type = EOL_AUTODETECT;
1939 if (!strcmp (mode, "r")
1940 && Lstream_seekable_p (stream))
1941 /* We can determine the coding system now. */
1942 determine_real_coding_system (stream, &codesys, &str->eol_type);
1943 set_decoding_stream_coding_system (lstr, codesys);
1944 str->decst.eol_type = str->eol_type;
1945 str->decst.mask = ~0;
1946 XSETLSTREAM (obj, lstr);
1951 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
1953 return make_decoding_stream_1 (stream, codesys, "r");
1957 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
1959 return make_decoding_stream_1 (stream, codesys, "w");
1962 /* Note: the decode_coding_* functions all take the same
1963 arguments as mule_decode(), which is to say some SRC data of
1964 size N, which is to be stored into dynamic array DST.
1965 DECODING is the stream within which the decoding is
1966 taking place, but no data is actually read from or
1967 written to that stream; that is handled in decoding_reader()
1968 or decoding_writer(). This allows the same functions to
1969 be used for both reading and writing. */
1972 mule_decode (Lstream *decoding, CONST unsigned char *src,
1973 unsigned_char_dynarr *dst, unsigned int n)
1975 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
1977 /* If necessary, do encoding-detection now. We do this when
1978 we're a writing stream or a non-seekable reading stream,
1979 meaning that we can't just process the whole input,
1980 rewind, and start over. */
1982 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
1983 str->eol_type == EOL_AUTODETECT)
1985 Lisp_Object codesys;
1987 XSETCODING_SYSTEM (codesys, str->codesys);
1988 detect_coding_type (&str->decst, src, n,
1989 CODING_SYSTEM_TYPE (str->codesys) !=
1990 CODESYS_AUTODETECT);
1991 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
1992 str->decst.mask != ~0)
1993 /* #### This is cheesy. What we really ought to do is
1994 buffer up a certain amount of data so as to get a
1995 less random result. */
1996 codesys = coding_system_from_mask (str->decst.mask);
1997 str->eol_type = str->decst.eol_type;
1998 if (XCODING_SYSTEM (codesys) != str->codesys)
2000 /* Preserve the CODING_STATE_END flag in case it was set.
2001 If we erase it, bad things might happen. */
2002 int was_end = str->flags & CODING_STATE_END;
2003 set_decoding_stream_coding_system (decoding, codesys);
2005 str->flags |= CODING_STATE_END;
2009 switch (CODING_SYSTEM_TYPE (str->codesys))
2012 case CODESYS_INTERNAL:
2013 Dynarr_add_many (dst, src, n);
2016 case CODESYS_AUTODETECT:
2017 /* If we got this far and still haven't decided on the coding
2018 system, then do no conversion. */
2019 case CODESYS_NO_CONVERSION:
2020 decode_coding_no_conversion (decoding, src, dst, n);
2023 case CODESYS_SHIFT_JIS:
2024 decode_coding_sjis (decoding, src, dst, n);
2027 decode_coding_big5 (decoding, src, dst, n);
2030 ccl_driver (&str->ccl, src, dst, n, 0);
2032 case CODESYS_ISO2022:
2033 decode_coding_iso2022 (decoding, src, dst, n);
2041 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2042 Decode the text between START and END which is encoded in CODING-SYSTEM.
2043 This is useful if you've read in encoded text from a file without decoding
2044 it (e.g. you read in a JIS-formatted file but used the `binary' or
2045 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2046 Return length of decoded text.
2047 BUFFER defaults to the current buffer if unspecified.
2049 (start, end, coding_system, buffer))
2052 struct buffer *buf = decode_buffer (buffer, 0);
2053 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2054 Lstream *istr, *ostr;
2055 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2057 get_buffer_range_char (buf, start, end, &b, &e, 0);
2059 barf_if_buffer_read_only (buf, b, e);
2061 coding_system = Fget_coding_system (coding_system);
2062 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2063 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2064 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2066 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2067 Fget_coding_system (Qbinary));
2068 istr = XLSTREAM (instream);
2069 ostr = XLSTREAM (outstream);
2070 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2072 /* The chain of streams looks like this:
2074 [BUFFER] <----- send through
2075 ------> [ENCODE AS BINARY]
2076 ------> [DECODE AS SPECIFIED]
2082 char tempbuf[1024]; /* some random amount */
2083 Bufpos newpos, even_newer_pos;
2084 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2085 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2089 newpos = lisp_buffer_stream_startpos (istr);
2090 Lstream_write (ostr, tempbuf, size_in_bytes);
2091 even_newer_pos = lisp_buffer_stream_startpos (istr);
2092 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2095 Lstream_close (istr);
2096 Lstream_close (ostr);
2098 Lstream_delete (istr);
2099 Lstream_delete (ostr);
2100 Lstream_delete (XLSTREAM (de_outstream));
2101 Lstream_delete (XLSTREAM (lb_outstream));
2106 /************************************************************************/
2107 /* Converting to an external encoding ("encoding") */
2108 /************************************************************************/
2110 /* An encoding stream is an output stream. When you create the
2111 stream, you specify the coding system that governs the encoding
2112 and another stream that the resulting encoded data is to be
2113 sent to, and then start sending data to it. */
2115 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2117 struct encoding_stream
2119 /* Coding system that governs the conversion. */
2120 struct Lisp_Coding_System *codesys;
2122 /* Stream that we read the data to be encoded from or
2123 write the encoded data to. */
2126 /* If we are reading, then we can return only a fixed amount of
2127 data, so if the conversion resulted in too much data, we store it
2128 here for retrieval the next time around. */
2129 unsigned_char_dynarr *runoff;
2131 /* FLAGS holds flags indicating the current state of the encoding.
2132 Some of these flags are dependent on the coding system. */
2135 /* CH holds a partially built-up character. Since we only deal
2136 with one- and two-byte characters at the moment, we only use
2137 this to store the first byte of a two-byte character. */
2140 /* Additional information used by the ISO2022 encoder. */
2143 /* CHARSET holds the character sets currently assigned to the G0
2144 through G3 registers. It is initialized from the array
2145 INITIAL_CHARSET in CODESYS. */
2146 Lisp_Object charset[4];
2148 /* Which registers are currently invoked into the left (GL) and
2149 right (GR) halves of the 8-bit encoding space? */
2150 int register_left, register_right;
2152 /* Whether we need to explicitly designate the charset in the
2153 G? register before using it. It is initialized from the
2154 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2155 unsigned char force_charset_on_output[4];
2157 /* Other state variables that need to be preserved across
2159 Lisp_Object current_charset;
2161 int current_char_boundary;
2164 /* Additional information (the state of the running CCL program)
2165 used by the CCL encoder. */
2166 struct ccl_program ccl;
2170 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2171 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2173 static int encoding_rewinder (Lstream *stream);
2174 static int encoding_seekable_p (Lstream *stream);
2175 static int encoding_flusher (Lstream *stream);
2176 static int encoding_closer (Lstream *stream);
2178 static Lisp_Object encoding_marker (Lisp_Object stream,
2179 void (*markobj) (Lisp_Object));
2181 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2182 sizeof (struct encoding_stream));
2185 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2187 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2188 Lisp_Object str_obj;
2190 /* We do not need to mark the coding systems or charsets stored
2191 within the stream because they are stored in a global list
2192 and automatically marked. */
2194 XSETLSTREAM (str_obj, str);
2195 (markobj) (str_obj);
2196 if (str->imp->marker)
2197 return (str->imp->marker) (str_obj, markobj);
2202 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2203 so we read data from the other end, encode it, and store it into DATA. */
2206 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2208 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2209 unsigned char *orig_data = data;
2211 int error_occurred = 0;
2213 /* We need to interface to mule_encode(), which expects to take some
2214 amount of data and store the result into a Dynarr. We have
2215 mule_encode() store into str->runoff, and take data from there
2218 /* We loop until we have enough data, reading chunks from the other
2219 end and encoding it. */
2222 /* Take data from the runoff if we can. Make sure to take at
2223 most SIZE bytes, and delete the data from the runoff. */
2224 if (Dynarr_length (str->runoff) > 0)
2226 int chunk = min ((int) size, Dynarr_length (str->runoff));
2227 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2228 Dynarr_delete_many (str->runoff, 0, chunk);
2234 break; /* No more room for data */
2236 if (str->flags & CODING_STATE_END)
2237 /* This means that on the previous iteration, we hit the EOF on
2238 the other end. We loop once more so that mule_encode() can
2239 output any final stuff it may be holding, or any "go back
2240 to a sane state" escape sequences. (This latter makes sense
2241 during encoding.) */
2244 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2245 left of storage in it, so it's OK to read directly into it.
2246 (We'll be overwriting above, after we've encoded it into the
2248 read_size = Lstream_read (str->other_end, data, size);
2255 /* There might be some more end data produced in the translation.
2256 See the comment above. */
2257 str->flags |= CODING_STATE_END;
2258 mule_encode (stream, data, str->runoff, read_size);
2261 if (data == orig_data)
2262 return error_occurred ? -1 : 0;
2264 return data - orig_data;
/* Writer method of the encoding stream.  Runs SIZE bytes of DATA through
   mule_encode() into the runoff buffer, hands the whole runoff to
   Lstream_write() on the other end, and removes the bytes counted by
   RETVAL from the front of the runoff. */
2268 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2270 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2273 /* Encode all our data into the runoff, and then attempt to write
2274 it all out to the other end. Remove whatever chunk we succeeded
2276 mule_encode (stream, data, str->runoff, size);
2277 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2278 Dynarr_length (str->runoff));
2280 Dynarr_delete_many (str->runoff, 0, retval);
2281 /* Do NOT return retval. The return value indicates how much
2282 of the incoming data was written, not how many bytes were
/* Put the encoder state in STR back to its starting values for the
   coding system currently stored in STR->codesys. */
2288 reset_encoding_stream (struct encoding_stream *str)
2291 switch (CODING_SYSTEM_TYPE (str->codesys))
2293 case CODESYS_ISO2022:
/* Reload all four registers from the coding system's initial charsets
   and its force-charset-on-output settings. */
2297 for (i = 0; i < 4; i++)
2299 str->iso2022.charset[i] =
2300 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2301 str->iso2022.force_charset_on_output[i] =
2302 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
/* Register 0 is invoked into the left half and register 1 into the
   right half; no charset is current. */
2304 str->iso2022.register_left = 0;
2305 str->iso2022.register_right = 1;
2306 str->iso2022.current_charset = Qnil;
2307 str->iso2022.current_half = 0;
2308 str->iso2022.current_char_boundary = 1;
/* Load the coding system's CCL encode program into the stream's CCL
   state.  NOTE(review): presumably this belongs to a CCL case of the
   switch -- confirm against the full source. */
2312 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
/* Clear the saved flags and the pending character. */
2319 str->flags = str->ch = 0;
/* Rewinder method: reset the encoder state, discard any encoded bytes
   still waiting in the runoff, and rewind the stream at the other end,
   returning the result of that rewind. */
2323 encoding_rewinder (Lstream *stream)
2325 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2326 reset_encoding_stream (str);
2327 Dynarr_reset (str->runoff);
2328 return Lstream_rewind (str->other_end);
/* Seekable-p method: an encoding stream reports whatever
   Lstream_seekable_p() says about the stream at its other end. */
2332 encoding_seekable_p (Lstream *stream)
2334 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2335 return Lstream_seekable_p (str->other_end);
/* Flusher method: flush the stream at the other end and return the
   result of Lstream_flush(). */
2339 encoding_flusher (Lstream *stream)
2341 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2342 return Lstream_flush (str->other_end);
/* Closer method.  For a stream opened for writing, sets CODING_STATE_END
   and calls the writer with no data, so the encoder gets one last call
   with the end flag set.  Then frees the runoff buffer and closes the
   stream at the other end, returning the result of that close. */
2346 encoding_closer (Lstream *stream)
2348 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2349 if (stream->flags & LSTREAM_FL_WRITE)
2351 str->flags |= CODING_STATE_END;
2352 encoding_writer (stream, 0, 0);
2354 Dynarr_free (str->runoff);
2355 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system that the encoding stream
   STREAM is currently using (STR->codesys). */
2359 encoding_stream_coding_system (Lstream *stream)
2361 Lisp_Object coding_system;
2362 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2364 XSETCODING_SYSTEM (coding_system, str->codesys);
2365 return coding_system;
/* Unwrap CODESYS into its struct Lisp_Coding_System and reset the encoder
   state of the encoding stream LSTR with reset_encoding_stream().
   NOTE(review): presumably CS is also stored into the stream before the
   reset -- confirm against the full source. */
2369 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2371 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2372 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2374 reset_encoding_stream (str);
/* Common constructor for encoding streams.  Creates a new lstream of the
   `lstream_encoding' implementation in mode MODE, gives it an empty
   runoff buffer, attaches STREAM as its other end, installs CODESYS, and
   wraps the new lstream in the Lisp object OBJ. */
2378 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2381 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2382 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2386 str->runoff = Dynarr_new (unsigned_char);
2387 str->other_end = stream;
2388 set_encoding_stream_coding_system (lstr, codesys);
2389 XSETLSTREAM (obj, lstr);
/* Create an encoding stream on top of STREAM using CODESYS, opened with
   mode "r". */
2394 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2396 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream on top of STREAM using CODESYS, opened with
   mode "w". */
2400 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2402 return make_encoding_stream_1 (stream, codesys, "w");
2405 /* Convert N bytes of internally-formatted data stored in SRC to an
2406 external format, according to the encoding stream ENCODING.
2407 Store the encoded data into DST. */
2410 mule_encode (Lstream *encoding, CONST unsigned char *src,
2411 unsigned_char_dynarr *dst, unsigned int n)
2413 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
/* Dispatch on the type of the stream's coding system. */
2415 switch (CODING_SYSTEM_TYPE (str->codesys))
/* Internal format: the N source bytes are appended to DST as they are. */
2418 case CODESYS_INTERNAL:
2419 Dynarr_add_many (dst, src, n);
2422 case CODESYS_AUTODETECT:
2423 /* If we got this far and still haven't decided on the coding
2424 system, then do no conversion. */
2425 case CODESYS_NO_CONVERSION:
2426 encode_coding_no_conversion (encoding, src, dst, n);
2429 case CODESYS_SHIFT_JIS:
2430 encode_coding_sjis (encoding, src, dst, n);
2433 encode_coding_big5 (encoding, src, dst, n);
/* CCL: run the CCL program state kept in the stream over the data. */
2436 ccl_driver (&str->ccl, src, dst, n, 0);
2438 case CODESYS_ISO2022:
2439 encode_coding_iso2022 (encoding, src, dst, n);
/* Lisp primitive `encode-coding-region'.  Reads the region through a
   buffer input stream and writes it back into the same buffer through an
   encoding stream chained to a binary decoding stream and a buffer
   output stream. */
2447 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2448 Encode the text between START and END using CODING-SYSTEM.
2449 This will, for example, convert Japanese characters into stuff such as
2450 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2451 text. BUFFER defaults to the current buffer if unspecified.
2453 (start, end, coding_system, buffer))
2456 struct buffer *buf = decode_buffer (buffer, 0);
2457 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2458 Lstream *istr, *ostr;
2459 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2461 get_buffer_range_char (buf, start, end, &b, &e, 0);
2463 barf_if_buffer_read_only (buf, b, e);
2465 coding_system = Fget_coding_system (coding_system);
2466 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2467 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2468 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2469 Fget_coding_system (Qbinary));
2470 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2472 istr = XLSTREAM (instream);
2473 ostr = XLSTREAM (outstream);
2474 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2475 /* The chain of streams looks like this:
2477 [BUFFER] <----- send through
2478 ------> [ENCODE AS SPECIFIED]
2479 ------> [DECODE AS BINARY]
2484 char tempbuf[1024]; /* some random amount */
2485 Bufpos newpos, even_newer_pos;
2486 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
/* Pull up to sizeof (tempbuf) bytes of buffer text through ISTR. */
2487 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
/* The input stream's start position is sampled before and after the
   write, which goes through OSTR into the same buffer. */
2491 newpos = lisp_buffer_stream_startpos (istr);
2492 Lstream_write (ostr, tempbuf, size_in_bytes);
2493 even_newer_pos = lisp_buffer_stream_startpos (istr);
/* NEWPOS - OLDPOS is how far the read advanced; the deleted range
   starts that far before EVEN_NEWER_POS. */
2494 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2500 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
/* Close both ends of the chain, then delete all four stream objects. */
2501 Lstream_close (istr);
2502 Lstream_close (ostr);
2504 Lstream_delete (istr);
2505 Lstream_delete (ostr);
2506 Lstream_delete (XLSTREAM (de_outstream));
2507 Lstream_delete (XLSTREAM (lb_outstream));
2508 return make_int (retlen);
2514 /************************************************************************/
2515 /* Shift-JIS methods */
2516 /************************************************************************/
2518 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2519 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2520 as is. A character of JISX0201-Kana (TYPE94 character set) is
2521 encoded by "position-code + 0x80". A character of JISX0208
2522 (TYPE94x94 character set) is encoded in 2-byte but two
2523 position-codes are divided and shifted so that they fit in the ranges below.
2526 --- CODE RANGE of Shift-JIS ---
2527 (character set) (range)
2529 JISX0201-Kana 0xA0 .. 0xDF
2530 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2531 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2532 -------------------------------
2536 /* Is this the first byte of a Shift-JIS two-byte char? */
/* NOTE(review): the code range table in the comment above lists 0x80 as
   the lowest first byte, whereas this test starts at 0x81 -- confirm
   which is intended. */
2538 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2539 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2541 /* Is this the second byte of a Shift-JIS two-byte char? */
2543 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2544 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this byte in 0xA1 .. 0xDF?  decode_coding_sjis() decodes such a
   byte as a single-byte JISX0201 katakana character. */
2546 #define BYTE_SJIS_KATAKANA_P(c) \
2547 ((c) >= 0xA1 && (c) <= 0xDF)
/* Detection routine for the Shift-JIS coding category.  Scans bytes of
   SRC, tracking in ST->shift_jis.in_second_byte whether the previous
   byte looked like the first byte of a two-byte character, and returns
   CODING_CATEGORY_SHIFT_JIS_MASK at the end.
   NOTE(review): ESC, SI and SO are tested for specially; presumably they
   rule the category out -- confirm against the full source. */
2550 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2558 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2560 if (st->shift_jis.in_second_byte)
2562 st->shift_jis.in_second_byte = 0;
/* A byte in 0x80 .. 0x9F or at 0xE0 and above starts a two-byte
   character. */
2566 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2567 st->shift_jis.in_second_byte = 1;
2569 return CODING_CATEGORY_SHIFT_JIS_MASK;
2572 /* Convert Shift-JIS data to internal format. */
/* Takes N bytes from SRC and appends the decoded bytes to DST.  FLAGS
   and CH are unpacked from the decoding stream on entry and packed back
   into it on exit. */
2575 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2576 unsigned_char_dynarr *dst, unsigned int n)
2579 unsigned int flags, ch;
2580 enum eol_type eol_type;
2581 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2583 CODING_STREAM_DECOMPOSE (str, flags, ch);
2584 eol_type = str->eol_type;
2592 /* Previous character was first byte of Shift-JIS Kanji char. */
2593 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2595 unsigned char e1, e2;
/* Valid pair: emit the JISX0208 leading byte followed by the two
   bytes produced by DECODE_SJIS. */
2597 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2598 DECODE_SJIS (ch, c, e1, e2);
2599 Dynarr_add (dst, e1);
2600 Dynarr_add (dst, e2);
/* Otherwise both the saved byte and this byte are passed through as
   binary characters. */
2604 DECODE_ADD_BINARY_CHAR (ch, dst);
2605 DECODE_ADD_BINARY_CHAR (c, dst);
2611 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2612 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2614 else if (BYTE_SJIS_KATAKANA_P (c))
/* Single-byte katakana: emit the JISX0201 katakana leading byte and
   then the byte itself. */
2616 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2617 Dynarr_add (dst, c);
2620 DECODE_ADD_BINARY_CHAR (c, dst);
2622 label_continue_loop:;
2625 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2627 CODING_STREAM_COMPOSE (str, flags, ch);
2630 /* Convert internally-formatted data to Shift-JIS. */
/* Takes N bytes from SRC and appends the encoded bytes to DST.  FLAGS
   and CH are unpacked from the encoding stream on entry and packed back
   into it on exit. */
2633 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2634 unsigned_char_dynarr *dst, unsigned int n)
2637 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2638 unsigned int flags, ch;
2639 enum eol_type eol_type;
2641 CODING_STREAM_DECOMPOSE (str, flags, ch);
2642 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* End of line: CR is written unless the EOL type is LF or autodetect,
   and LF is written unless the EOL type is CR. */
2649 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2650 Dynarr_add (dst, '\r');
2651 if (eol_type != EOL_CR)
2652 Dynarr_add (dst, '\n');
2655 else if (BYTE_ASCII_P (c))
2657 Dynarr_add (dst, c);
/* Leading byte: keep it in CH only when it names JISX0201 katakana or
   one of the two JISX0208 charsets; any other leading byte sets CH
   to 0. */
2660 else if (BUFBYTE_LEADING_BYTE_P (c))
2661 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2662 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2663 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
/* The byte after a katakana leading byte is written out unchanged. */
2666 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2668 Dynarr_add (dst, c);
2671 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2672 ch == LEADING_BYTE_JAPANESE_JISX0208)
2676 unsigned char j1, j2;
/* Convert the saved byte CH and the current byte C into the two
   Shift-JIS bytes J1 and J2. */
2677 ENCODE_SJIS (ch, c, j1, j2);
2678 Dynarr_add (dst, j1);
2679 Dynarr_add (dst, j2);
2685 CODING_STREAM_COMPOSE (str, flags, ch);
/* Lisp primitive `decode-shift-jis-char'.  The car and cdr of CODE are
   taken as the two Shift-JIS bytes; when both pass the
   BYTE_SJIS_TWO_BYTE_*_P range checks they are converted with
   DECODE_SJIS and returned as a JISX0208 character. */
2688 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2689 Decode a JISX0208 character of Shift-JIS coding-system.
2690 CODE is the character code in Shift-JIS as a cons of two bytes.
2691 Return the corresponding character.
2695 unsigned char c1, c2, s1, s2;
2698 CHECK_INT (XCAR (code));
2699 CHECK_INT (XCDR (code));
2700 s1 = XINT (XCAR (code));
2701 s2 = XINT (XCDR (code));
2702 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2703 BYTE_SJIS_TWO_BYTE_2_P (s2))
2705 DECODE_SJIS (s1, s2, c1, c2);
2706 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2707 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-shift-jis-char'.  Splits CH into its charset
   and position codes with BREAKUP_CHAR; when the charset is JISX0208,
   runs ENCODE_SJIS on the position codes with the high bit set and
   returns the two resulting bytes as a cons of integers. */
2713 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2714 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2715 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2719 Lisp_Object charset;
2722 CHECK_CHAR_COERCE_INT (ch);
2723 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2724 if (EQ (charset, Vcharset_japanese_jisx0208))
2726 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2727 return Fcons (make_int (s1), make_int (s2));
2734 /************************************************************************/
2736 /************************************************************************/
2738 /* BIG5 is a coding system encoding two character sets: ASCII and
2739 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2740 character set and is encoded in two-byte.
2742 --- CODE RANGE of BIG5 ---
2743 (character set) (range)
2745 Big5 (1st byte) 0xA1 .. 0xFE
2746 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2747 --------------------------
2749 Since the number of characters in Big5 is larger than maximum
2750 characters in Emacs' charset (96x96), it can't be handled as one
2751 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2752 and `charset-big5-2'. Both <type>s are TYPE94x94. The former
2753 contains frequently used characters and the latter contains less
2754 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char? */
2756 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2757 ((c) >= 0xA1 && (c) <= 0xFE)
2759 /* Is this the second byte of a Big5 two-byte char? */
2761 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2762 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2764 /* Number of Big5 characters which have the same code in 1st byte.
   The expression below is (0xFF - 0xA1) + (0x7F - 0x40) = 94 + 63 = 157,
   the count of second bytes in 0xA1 .. 0xFE plus those in 0x40 .. 0x7E. */
2766 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
/* DECODE_BIG5 takes the two Big5 bytes B1 and B2 as input and assigns
   its results to LB (one of the two Big5 leading bytes) and to C1 and C2
   (the two position bytes, each offset by 0xA1). */
2768 /* Code conversion macros. These are macros because they are used in
2769 inner loops during code conversion.
2771 Note that temporary variables in macros introduce the classic
2772 dynamic-scoping problems with variable names. We use capital-
2773 lettered variables in the assumption that XEmacs does not use
2774 capital letters in variables except in a very formalized way
2777 /* Convert Big5 code (b1, b2) into its internal string representation
2780 /* There is a much simpler way to split the Big5 charset into two.
2781 For the moment I'm going to leave the algorithm as-is because it
2782 claims to separate out the most-used characters into a single
2783 charset, which perhaps will lead to optimizations in various
2786 The way the algorithm works is something like this:
2788 Big5 can be viewed as a 94x157 charset, where the row is
2789 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2790 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2791 the split between low and high column numbers is apparently
2792 meaningless; ascending rows produce less and less frequent chars.
2793 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2794 the first charset, and the upper half (0xC9 .. 0xFE) to the
2795 second. To do the conversion, we convert the character into
2796 a single number where 0 .. 156 is the first row, 157 .. 313
2797 is the second, etc. That way, the characters are ordered by
2798 decreasing frequency. Then we just chop the space in two
2799 and coerce the result into a 94x94 space.
2802 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2804 int B1 = b1, B2 = b2; \
2806 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2810 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2814 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2815 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2817 c1 = I / (0xFF - 0xA1) + 0xA1; \
2818 c2 = I % (0xFF - 0xA1) + 0xA1; \
2821 /* Convert the internal string representation of a Big5 character
2822 (lb, c1, c2) into Big5 code (b1, b2). */
/* C1 and C2 are first turned into one index into a 94-column grid.  For
   the second Big5 charset that index is moved past the rows
   0xA1 .. 0xC8.  B1 is then the row in 157-column terms plus 0xA1, and
   B2 the column, with 0x40 added for columns below 0x3F and 0x62 added
   for the rest so that they land in 0xA1 .. 0xFE. */
2824 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2826 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2828 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2830 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2832 b1 = I / BIG5_SAME_ROW + 0xA1; \
2833 b2 = I % BIG5_SAME_ROW; \
2834 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Detection routine for the Big5 coding category.  Scans bytes of SRC,
   tracking in ST->big5.in_second_byte whether a second byte is expected,
   and returns CODING_CATEGORY_BIG5_MASK at the end.
   NOTE(review): ESC, SI, SO and bytes in 0x80 .. 0xA0 are tested for
   specially, as is a second byte below 0x40 or in 0x80 .. 0xA0;
   presumably these rule the category out -- confirm against the full
   source. */
2838 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2846 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2847 (c >= 0x80 && c <= 0xA0))
2849 if (st->big5.in_second_byte)
2851 st->big5.in_second_byte = 0;
2852 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2856 st->big5.in_second_byte = 1;
2858 return CODING_CATEGORY_BIG5_MASK;
2861 /* Convert Big5 data to internal format. */
/* Takes N bytes from SRC and appends the decoded bytes to DST.  FLAGS
   and CH are unpacked from the decoding stream on entry and packed back
   into it on exit. */
2864 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2865 unsigned_char_dynarr *dst, unsigned int n)
2868 unsigned int flags, ch;
2869 enum eol_type eol_type;
2870 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2872 CODING_STREAM_DECOMPOSE (str, flags, ch);
2873 eol_type = str->eol_type;
2880 /* Previous character was first byte of Big5 char. */
2881 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2883 unsigned char b1, b2, b3;
/* Valid pair: DECODE_BIG5 yields the leading byte in B1 and the two
   position bytes in B2 and B3, which are emitted in that order. */
2884 DECODE_BIG5 (ch, c, b1, b2, b3);
2885 Dynarr_add (dst, b1);
2886 Dynarr_add (dst, b2);
2887 Dynarr_add (dst, b3);
/* Otherwise both the saved byte and this byte are passed through as
   binary characters. */
2891 DECODE_ADD_BINARY_CHAR (ch, dst);
2892 DECODE_ADD_BINARY_CHAR (c, dst);
2898 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2899 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2902 DECODE_ADD_BINARY_CHAR (c, dst);
2904 label_continue_loop:;
2907 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2909 CODING_STREAM_COMPOSE (str, flags, ch);
2912 /* Convert internally-formatted data to Big5. */
/* Takes N bytes from SRC and appends the encoded bytes to DST.  FLAGS
   and CH are unpacked from the encoding stream on entry and packed back
   into it on exit. */
2915 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2916 unsigned_char_dynarr *dst, unsigned int n)
2919 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2920 unsigned int flags, ch;
2921 enum eol_type eol_type;
2923 CODING_STREAM_DECOMPOSE (str, flags, ch);
2924 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* End of line: CR is written unless the EOL type is LF or autodetect,
   and LF is written unless the EOL type is CR. */
2931 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2932 Dynarr_add (dst, '\r');
2933 if (eol_type != EOL_CR)
2934 Dynarr_add (dst, '\n');
2936 else if (BYTE_ASCII_P (c))
2939 Dynarr_add (dst, c);
2941 else if (BUFBYTE_LEADING_BYTE_P (c))
2943 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
2944 c == LEADING_BYTE_CHINESE_BIG5_2)
2946 /* A recognized leading byte. */
2948 continue; /* not done with this character. */
2950 /* otherwise just ignore this character. */
2952 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
2953 ch == LEADING_BYTE_CHINESE_BIG5_2)
2955 /* Previous char was a recognized leading byte. */
2957 continue; /* not done with this character. */
2961 /* Encountering second byte of a Big5 character. */
2962 unsigned char b1, b2;
/* At this point CH carries the leading byte in the bits above the low
   eight and the first position byte in the low eight bits. */
2964 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
2965 Dynarr_add (dst, b1);
2966 Dynarr_add (dst, b2);
2972 CODING_STREAM_COMPOSE (str, flags, ch);
/* Lisp primitive `decode-big5-char'.  The car and cdr of CODE are taken
   as the two Big5 bytes; when both pass the BYTE_BIG5_TWO_BYTE_*_P range
   checks they are converted with DECODE_BIG5, the charset is looked up
   from the resulting leading byte, and the character is returned. */
2976 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
2977 Decode a Big5 character CODE of BIG5 coding-system.
2978 CODE is the character code in BIG5, a cons of two integers.
2979 Return the corresponding character.
2983 unsigned char c1, c2, b1, b2;
2986 CHECK_INT (XCAR (code));
2987 CHECK_INT (XCDR (code));
2988 b1 = XINT (XCAR (code));
2989 b2 = XINT (XCDR (code));
2990 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
2991 BYTE_BIG5_TWO_BYTE_2_P (b2))
2994 Lisp_Object charset;
2995 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
2996 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
2997 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-big5-char'.  Splits CH into its charset and
   position codes with BREAKUP_CHAR; when the charset is one of the two
   Big5 charsets, runs ENCODE_BIG5 on the charset's leading byte and the
   position codes with the high bit set, and returns the two resulting
   bytes as a cons of integers. */
3003 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3004 Encode the Big5 character CH to BIG5 coding-system.
3005 Return the corresponding character code in Big5.
3009 Lisp_Object charset;
3012 CHECK_CHAR_COERCE_INT (ch);
3013 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3014 if (EQ (charset, Vcharset_chinese_big5_1) ||
3015 EQ (charset, Vcharset_chinese_big5_2))
3017 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3019 return Fcons (make_int (b1), make_int (b2));
3026 /************************************************************************/
3027 /* ISO2022 methods */
3028 /************************************************************************/
3030 /* The following note describes the coding system ISO2022 briefly.
3031 Since the intention of this note is to help understanding of the
3032 programs in this file, some parts are NOT ACCURATE or OVERLY
3033 SIMPLIFIED. For thorough understanding, please refer to the
3034 original document of ISO2022.
3036 ISO2022 provides many mechanisms to encode several character sets
3037 in 7-bit and 8-bit environments. If one chooses 7-bit environment,
3038 all text is encoded by codes of less than 128. This may make the
3039 encoded text a little bit longer, but the text is more likely to
3040 survive passing through several gateways (some of them strip off MSB).
3042 There are two kinds of character sets: control character set and
3043 graphic character set. The former contains control characters such
3044 as `newline' and `escape' to provide control functions (control
3045 functions are provided also by escape sequence). The latter
3046 contains graphic characters such as 'A' and '-'. Emacs recognizes
3047 two control character sets and many graphic character sets.
3049 Graphic character sets are classified into one of four types,
3050 according to the dimension and number of characters in the set:
3051 TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each
3052 character set is assigned an identification byte, unique for each
3053 type, called "final character" (denoted as <F> hereafter). The <F>
3054 of each character set is decided by ECMA(*) when it is registered
3055 in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
3058 Note (*): ECMA = European Computer Manufacturers Association
3060 Here are examples of graphic character set [NAME(<F>)]:
3061 o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3062 o TYPE96 -- right-half-of-ISO8859-1('A'), ...
3063 o TYPE94x94 -- GB2312('A'), JISX0208('B'), ...
3064 o TYPE96x96 -- none for the moment
3066 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
3067 C0 [0x00..0x1F] -- control character plane 0
3068 GL [0x20..0x7F] -- graphic character plane 0
3069 C1 [0x80..0x9F] -- control character plane 1
3070 GR [0xA0..0xFF] -- graphic character plane 1
3072 A control character set is directly designated and invoked to C0 or
3073 C1 by an escape sequence. The most common case is that:
3074 - ISO646's control character set is designated/invoked to C0, and
3075 - ISO6429's control character set is designated/invoked to C1,
3076 and usually these designations/invocations are omitted in encoded
3077 text. In a 7-bit environment, only C0 can be used, and a control
3078 character for C1 is encoded by an appropriate escape sequence to
3079 fit into the environment. All control characters for C1 are
3080 defined to have corresponding escape sequences.
3082 A graphic character set is at first designated to one of four
3083 graphic registers (G0 through G3), then these graphic registers are
3084 invoked to GL or GR. These designations and invocations can be
3085 done independently. The most common case is that G0 is invoked to
3086 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3087 these invocations and designations are omitted in encoded text.
3088 In a 7-bit environment, only GL can be used.
3090 When a graphic character set of TYPE94 or TYPE94x94 is invoked to
3091 GL, codes 0x20 and 0x7F of the GL area work as control characters
3092 SPACE and DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used.
3095 There are two ways of invocation: locking-shift and single-shift.
3096 With locking-shift, the invocation lasts until the next different
3097 invocation, whereas with single-shift, the invocation works only
3098 for the following character and doesn't affect locking-shift.
3099 Invocations are done by the following control characters or escape sequences:
3102 ----------------------------------------------------------------------
3103 abbrev function cntrl escape seq description
3104 ----------------------------------------------------------------------
3105 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3106 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3107 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR
3108 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3109 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR
3110 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3111 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR
3112 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3113 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3114 ----------------------------------------------------------------------
3115 The first seven are for locking-shift. Control characters for these
3116 functions are defined by macros ISO_CODE_XXX in `coding.h'.
3118 Designations are done by the following escape sequences.
3119 ----------------------------------------------------------------------
3120 escape sequence description
3121 ----------------------------------------------------------------------
3122 ESC '(' <F> designate TYPE94<F> to G0
3123 ESC ')' <F> designate TYPE94<F> to G1
3124 ESC '*' <F> designate TYPE94<F> to G2
3125 ESC '+' <F> designate TYPE94<F> to G3
3126 ESC ',' <F> designate TYPE96<F> to G0 (*)
3127 ESC '-' <F> designate TYPE96<F> to G1
3128 ESC '.' <F> designate TYPE96<F> to G2
3129 ESC '/' <F> designate TYPE96<F> to G3
3130 ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**)
3131 ESC '$' ')' <F> designate TYPE94x94<F> to G1
3132 ESC '$' '*' <F> designate TYPE94x94<F> to G2
3133 ESC '$' '+' <F> designate TYPE94x94<F> to G3
3134 ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*)
3135 ESC '$' '-' <F> designate TYPE96x96<F> to G1
3136 ESC '$' '.' <F> designate TYPE96x96<F> to G2
3137 ESC '$' '/' <F> designate TYPE96x96<F> to G3
3138 ----------------------------------------------------------------------
3139 In this list, "TYPE94<F>" means a graphic character set of type TYPE94
3140 and final character <F>, and etc.
3142 Note (*): Although these designations are not allowed in ISO2022,
3143 Emacs accepts them on decoding, and produces them on encoding
3144 TYPE96 or TYPE96x96 character set in a coding system which is
3145 characterized as 7-bit environment, non-locking-shift, and non-single-shift.
3148 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3149 '(' can be omitted. We call this the "short-form" hereafter.
3151 Now you may notice that there are a lot of ways for encoding the
3152 same multilingual text in ISO2022. Actually, there exist many
3153 coding systems such as Compound Text (used in X's inter client
3154 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3155 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3156 localized platforms), and all of these are variants of ISO2022.
3158 In addition to the above, Emacs handles two more kinds of escape
3159 sequences: ISO6429's direction specification and Emacs' private
3160 sequence for specifying character composition.
3162 ISO6429's direction specification takes the following format:
3163 o CSI ']' -- end of the current direction
3164 o CSI '0' ']' -- end of the current direction
3165 o CSI '1' ']' -- start of left-to-right text
3166 o CSI '2' ']' -- start of right-to-left text
3167 The control character CSI (0x9B: control sequence introducer) is
3168 abbreviated to the escape sequence ESC '[' in 7-bit environment.
3170 Character composition specification takes the following format:
3171 o ESC '0' -- start character composition
3172 o ESC '1' -- end character composition
3173 Since these are not standard escape sequences of any ISO, the use
3174 of them for these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back to its starting values.
   CODING_SYSTEM may be nil; for each of the four registers the code
   tests for that, reads the coding system's initial charset when one is
   given, and uses Qt as the charset otherwise.
   NOTE(review): presumably the initial charset is what gets stored into
   ISO->charset[i] in the non-nil case -- confirm against the full
   source. */
3177 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3181 for (i = 0; i < 4; i++)
3183 if (!NILP (coding_system))
3185 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3187 iso->charset[i] = Qt;
3188 iso->invalid_designated[i] = 0;
/* No escape sequence is in progress; register 0 is in the left half and
   register 1 in the right half; all direction and literal-output flags
   are cleared. */
3190 iso->esc = ISO_ESC_NOTHING;
3191 iso->esc_bytes_index = 0;
3192 iso->register_left = 0;
3193 iso->register_right = 1;
3194 iso->switched_dir_and_no_valid_charset_yet = 0;
3195 iso->invalid_switch_dir = 0;
3196 iso->output_direction_sequence = 0;
3197 iso->output_literally = 0;
/* Empty the composite-character buffer if one has been allocated. */
3198 if (iso->composite_chars)
3199 Dynarr_reset (iso->composite_chars);
3203 fit_to_be_escape_quoted (unsigned char c)
3220 /* Parse one byte of an ISO2022 escape sequence.
3221 If the result is an invalid escape sequence, return 0 and
3222 do not change anything in STR. Otherwise, if the result is
3223 an incomplete escape sequence, update ISO2022.ESC and
3224 ISO2022.ESC_BYTES and return -1. Otherwise, update
3225 all the state variables (but not ISO2022.ESC_BYTES) and
3228 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3229 or invocation of an invalid character set and treat that as
3230 an unrecognized escape sequence. */
3233 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3234 unsigned char c, unsigned int *flags,
3235 int check_invalid_charsets)
3237 /* (1) If we're at the end of a designation sequence, CS is the
3238 charset being designated and REG is the register to designate
3241 (2) If we're at the end of a locking-shift sequence, REG is
3242 the register to invoke and HALF (0 == left, 1 == right) is
3243 the half to invoke it into.
3245 (3) If we're at the end of a single-shift sequence, REG is
3246 the register to invoke. */
3247 Lisp_Object cs = Qnil;
3250 /* NOTE: This code uses goto's all over the place.
3251 The reason for this is that we're basically implementing
3252 a state machine here, and hierarchical languages like C
3253 don't really provide a clean way of doing this. */
3255 if (! (*flags & CODING_STATE_ESCAPE))
3256 /* At beginning of escape sequence; we need to reset our
3257 escape-state variables. */
3258 iso->esc = ISO_ESC_NOTHING;
3260 iso->output_literally = 0;
3261 iso->output_direction_sequence = 0;
3265 case ISO_ESC_NOTHING:
3266 iso->esc_bytes_index = 0;
3269 case ISO_CODE_ESC: /* Start escape sequence */
3270 *flags |= CODING_STATE_ESCAPE;
3274 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3275 *flags |= CODING_STATE_ESCAPE;
3276 iso->esc = ISO_ESC_5_11;
3279 case ISO_CODE_SO: /* locking shift 1 */
3282 case ISO_CODE_SI: /* locking shift 0 */
3286 case ISO_CODE_SS2: /* single shift */
3289 case ISO_CODE_SS3: /* single shift */
3293 default: /* Other control characters */
3300 /**** single shift ****/
3302 case 'N': /* single shift 2 */
3305 case 'O': /* single shift 3 */
3309 /**** locking shift ****/
3311 case '~': /* locking shift 1 right */
3314 case 'n': /* locking shift 2 */
3317 case '}': /* locking shift 2 right */
3320 case 'o': /* locking shift 3 */
3323 case '|': /* locking shift 3 right */
3327 /**** composite ****/
3330 iso->esc = ISO_ESC_START_COMPOSITE;
3331 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3332 CODING_STATE_COMPOSITE;
3336 iso->esc = ISO_ESC_END_COMPOSITE;
3337 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3338 ~CODING_STATE_COMPOSITE;
3341 /**** directionality ****/
3344 iso->esc = ISO_ESC_5_11;
3347 /**** designation ****/
3349 case '$': /* multibyte charset prefix */
3350 iso->esc = ISO_ESC_2_4;
3354 if (0x28 <= c && c <= 0x2F)
3356 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3360 /* This function is called with CODESYS equal to nil when
3361 doing coding-system detection. */
3363 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3364 && fit_to_be_escape_quoted (c))
3366 iso->esc = ISO_ESC_LITERAL;
3367 *flags &= CODING_STATE_ISO2022_LOCK;
3377 /**** directionality ****/
3379 case ISO_ESC_5_11: /* ISO6429 direction control */
3382 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3383 goto directionality;
3385 if (c == '0') iso->esc = ISO_ESC_5_11_0;
3386 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3387 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3391 case ISO_ESC_5_11_0:
3394 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3395 goto directionality;
3399 case ISO_ESC_5_11_1:
3402 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3403 goto directionality;
3407 case ISO_ESC_5_11_2:
3410 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
3411 goto directionality;
3416 iso->esc = ISO_ESC_DIRECTIONALITY;
3417 /* Various junk here to attempt to preserve the direction sequences
3418 literally in the text if they would otherwise be swallowed due
3419 to invalid designations that don't show up as actual charset
3420 changes in the text. */
3421 if (iso->invalid_switch_dir)
3423 /* We already inserted a direction switch literally into the
3424 text. We assume (#### this may not be right) that the
3425 next direction switch is the one going the other way,
3426 and we need to output that literally as well. */
3427 iso->output_literally = 1;
3428 iso->invalid_switch_dir = 0;
3434 /* If we are in the thrall of an invalid designation,
3435 then stick the directionality sequence literally into the
3436 output stream so it ends up in the original text again. */
3437 for (jj = 0; jj < 4; jj++)
3438 if (iso->invalid_designated[jj])
3442 iso->output_literally = 1;
3443 iso->invalid_switch_dir = 1;
3446 /* Indicate that we haven't yet seen a valid designation,
3447 so that if a switch-dir is directly followed by an
3448 invalid designation, both get inserted literally. */
3449 iso->switched_dir_and_no_valid_charset_yet = 1;
3454 /**** designation ****/
3457 if (0x28 <= c && c <= 0x2F)
3459 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
3462 if (0x40 <= c && c <= 0x42)
3464 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
3465 *flags & CODING_STATE_R2L ?
3466 CHARSET_RIGHT_TO_LEFT :
3467 CHARSET_LEFT_TO_RIGHT);
3477 if (c < '0' || c > '~')
3478 return 0; /* bad final byte */
3480 if (iso->esc >= ISO_ESC_2_8 &&
3481 iso->esc <= ISO_ESC_2_15)
3483 type = ((iso->esc >= ISO_ESC_2_12) ?
3484 CHARSET_TYPE_96 : CHARSET_TYPE_94);
3485 reg = (iso->esc - ISO_ESC_2_8) & 3;
3487 else if (iso->esc >= ISO_ESC_2_4_8 &&
3488 iso->esc <= ISO_ESC_2_4_15)
3490 type = ((iso->esc >= ISO_ESC_2_4_12) ?
3491 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
3492 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
3496 /* Can this ever be reached? -slb */
3500 cs = CHARSET_BY_ATTRIBUTES (type, c,
3501 *flags & CODING_STATE_R2L ?
3502 CHARSET_RIGHT_TO_LEFT :
3503 CHARSET_LEFT_TO_RIGHT);
3509 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
3513 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
3514 /* can't invoke something that ain't there. */
3516 iso->esc = ISO_ESC_SINGLE_SHIFT;
3517 *flags &= CODING_STATE_ISO2022_LOCK;
3519 *flags |= CODING_STATE_SS2;
3521 *flags |= CODING_STATE_SS3;
3525 if (check_invalid_charsets &&
3526 !CHARSETP (iso->charset[reg]))
3527 /* can't invoke something that ain't there. */
3530 iso->register_right = reg;
3532 iso->register_left = reg;
3533 *flags &= CODING_STATE_ISO2022_LOCK;
3534 iso->esc = ISO_ESC_LOCKING_SHIFT;
3538 if (NILP (cs) && check_invalid_charsets)
3540 iso->invalid_designated[reg] = 1;
3541 iso->charset[reg] = Vcharset_ascii;
3542 iso->esc = ISO_ESC_DESIGNATE;
3543 *flags &= CODING_STATE_ISO2022_LOCK;
3544 iso->output_literally = 1;
3545 if (iso->switched_dir_and_no_valid_charset_yet)
3547 /* We encountered a switch-direction followed by an
3548 invalid designation. Ensure that the switch-direction
3549 gets outputted; otherwise it will probably get eaten
3550 when the text is written out again. */
3551 iso->switched_dir_and_no_valid_charset_yet = 0;
3552 iso->output_direction_sequence = 1;
3553 /* And make sure that the switch-dir going the other
3554 way gets outputted, as well. */
3555 iso->invalid_switch_dir = 1;
3559 /* This function is called with CODESYS equal to nil when
3560 doing coding-system detection. */
3561 if (!NILP (codesys))
3563 charset_conversion_spec_dynarr *dyn =
3564 XCODING_SYSTEM (codesys)->iso2022.input_conv;
3570 for (i = 0; i < Dynarr_length (dyn); i++)
3572 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
3573 if (EQ (cs, spec->from_charset))
3574 cs = spec->to_charset;
3579 iso->charset[reg] = cs;
3580 iso->esc = ISO_ESC_DESIGNATE;
3581 *flags &= CODING_STATE_ISO2022_LOCK;
3582 if (iso->invalid_designated[reg])
3584 iso->invalid_designated[reg] = 0;
3585 iso->output_literally = 1;
3587 if (iso->switched_dir_and_no_valid_charset_yet)
3588 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Examine input bytes and narrow the set of ISO2022 coding categories
   that are still considered possible.  The running state (candidate
   mask, escape-parser flags, count of high bytes, whether a single
   shift was seen) lives in ST->iso2022 and is initialized the first
   time a given ST is used.
   NOTE(review): the remaining parameters, the loop header and the
   return statements are not visible in this listing -- confirm against
   the full definition. */
3593 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
3599 /* #### There are serious deficiencies in the recognition mechanism
3600 here. This needs to be much smarter if it's going to cut it. */
/* First use of this detection state: reset the escape parser and start
   with all five ISO categories enabled. */
3602 if (!st->iso2022.initted)
3604 reset_iso2022 (Qnil, &st->iso2022.iso);
3605 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
3606 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3607 CODING_CATEGORY_ISO_8_1_MASK |
3608 CODING_CATEGORY_ISO_8_2_MASK |
3609 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
3610 st->iso2022.flags = 0;
3611 st->iso2022.high_byte_count = 0;
3612 st->iso2022.saw_single_shift = 0;
3613 st->iso2022.initted = 1;
/* Work on a local copy of the candidate mask. */
3616 mask = st->iso2022.mask;
/* Rule out the seven-bit category and count the byte as a high byte.
   NOTE(review): the guarding condition is not visible here. */
3623 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3624 st->iso2022.high_byte_count++;
/* A run of high bytes that was not introduced by a single shift has
   ended; use its length as evidence, then reset the run counters. */
3628 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
3630 if (st->iso2022.high_byte_count & 1)
3631 /* odd number of high bytes; assume not iso-8-2 */
3632 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3634 st->iso2022.high_byte_count = 0;
3635 st->iso2022.saw_single_shift = 0;
3637 mask &= ~CODING_CATEGORY_ISO_7_MASK;
/* Outside an escape sequence, C0/C1 control bytes get special
   treatment. */
3639 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
3640 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
3641 { /* control chars */
3644 /* Allow and ignore control characters that you might
3645 reasonably see in a text file */
3650 case 8: /* backspace */
3651 case 11: /* vertical tab */
3652 case 12: /* form feed */
3653 case 26: /* MS-DOS C-z junk */
3654 case 31: /* '^_' -- for info */
3655 goto label_continue_loop;
3662 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
/* Feed the byte to the escape-sequence parser (with a nil coding
   system and invalid-charset checking turned off) and react to the
   escape state it reports. */
3665 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
3666 &st->iso2022.flags, 0))
3668 switch (st->iso2022.iso.esc)
/* A designation rules out both iso-8-1 and iso-8-2. */
3670 case ISO_ESC_DESIGNATE:
3671 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
3672 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
/* A locking shift settles the question: only the lock-shift category
   remains and scanning stops. */
3674 case ISO_ESC_LOCKING_SHIFT:
3675 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
3676 goto ran_out_of_chars;
/* A single shift rules out iso-8-designate; remember it so the next
   run of high bytes is not used for the odd/even test above. */
3677 case ISO_ESC_SINGLE_SHIFT:
3678 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
3679 st->iso2022.saw_single_shift = 1;
3688 goto ran_out_of_chars;
3691 label_continue_loop:;
/* Post-process MASK, a set of CODING_CATEGORY_ISO_*_MASK bits: if the
   seven-bit ISO category is still set, clear the three eight-bit ISO
   categories so that the seven-bit one wins.
   NOTE(review): the return statement is not visible in this listing;
   presumably the adjusted mask is returned. */
3700 postprocess_iso2022_mask (int mask)
3702 /* #### kind of cheesy */
3703 /* If seven-bit ISO is allowed, then assume that the encoding is
3704 entirely seven-bit and turn off the eight-bit ones. */
3705 if (mask & CODING_CATEGORY_ISO_7_MASK)
3706 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3707 CODING_CATEGORY_ISO_8_1_MASK |
3708 CODING_CATEGORY_ISO_8_2_MASK);
3712 /* If FLAGS is a null pointer or specifies right-to-left motion,
3713 output a switch-dir-to-left-to-right sequence to DST.
3714 Also update FLAGS if it is not a null pointer.
3715 If INTERNAL_P is set, we are outputting in internal format and
3716 need to handle the CSI differently. */
/* Parameters, as used below:
     CODESYS     coding system; only CODING_SYSTEM_ISO2022_SEVEN is
                 consulted here.
     DST         dynarr that receives the bytes of the sequence.
     FLAGS       coding-state flags (CODING_STATE_R2L is the bit of
                 interest), or a null pointer.
     INTERNAL_P  nonzero to add the CSI byte with
                 DECODE_ADD_BINARY_CHAR instead of Dynarr_add. */
3719 restore_left_to_right_direction (struct Lisp_Coding_System *codesys,
3720 unsigned_char_dynarr *dst,
3721 unsigned int *flags,
3724 if (!flags || (*flags & CODING_STATE_R2L))
/* Seven-bit coding systems spell the CSI introducer as the two bytes
   ESC '['. */
3726 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3728 Dynarr_add (dst, ISO_CODE_ESC);
3729 Dynarr_add (dst, '[');
3731 else if (internal_p)
3732 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3734 Dynarr_add (dst, ISO_CODE_CSI);
/* The introducer is followed by '0' ']'. */
3735 Dynarr_add (dst, '0');
3736 Dynarr_add (dst, ']');
/* Record that the direction is now left-to-right.
   NOTE(review): FLAGS may be null per the header comment; no null
   guard for this store is visible in this listing -- confirm one
   exists in the full source. */
3738 *flags &= ~CODING_STATE_R2L;
3742 /* If FLAGS is a null pointer or specifies a direction different from
3743 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
3744 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
3745 sequence to DST. Also update FLAGS if it is not a null pointer.
3746 If INTERNAL_P is set, we are outputting in internal format and
3747 need to handle the CSI differently. */
3750 ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys,
3751 unsigned_char_dynarr *dst, unsigned int *flags,
/* Switching to left-to-right is delegated to
   restore_left_to_right_direction. */
3754 if ((!flags || (*flags & CODING_STATE_R2L)) &&
3755 direction == CHARSET_LEFT_TO_RIGHT)
3756 restore_left_to_right_direction (codesys, dst, flags, internal_p);
/* Switching to right-to-left is only done when the coding system does
   not have CODING_SYSTEM_ISO2022_NO_ISO6429 set. */
3757 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
3758 && (!flags || !(*flags & CODING_STATE_R2L)) &&
3759 direction == CHARSET_RIGHT_TO_LEFT)
/* Same introducer choice as in restore_left_to_right_direction: ESC '['
   for seven-bit coding systems, otherwise the single CSI byte. */
3761 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3763 Dynarr_add (dst, ISO_CODE_ESC);
3764 Dynarr_add (dst, '[');
3766 else if (internal_p)
3767 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3769 Dynarr_add (dst, ISO_CODE_CSI);
/* '2' ']' is the right-to-left counterpart of the '0' ']' written by
   restore_left_to_right_direction. */
3770 Dynarr_add (dst, '2');
3771 Dynarr_add (dst, ']');
/* Record that the direction is now right-to-left.
   NOTE(review): FLAGS may be null per the header comment; no null
   guard for this store is visible in this listing -- confirm one
   exists in the full source. */
3773 *flags |= CODING_STATE_R2L;
3777 /* Convert ISO2022-format data to internal format. */
/* DECODING is the decoding lstream whose per-stream state
   (DECODING_STREAM_DATA) carries the flags, the partial character and
   the ISO2022 parser state between calls.  SRC is the input and DST the
   dynarr receiving the decoded bytes.
   NOTE(review): N is presumably the number of bytes at SRC; the loop
   header that consumes it is not visible in this listing. */
3780 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
3781 unsigned_char_dynarr *dst, unsigned int n)
3784 unsigned int flags, ch;
3785 enum eol_type eol_type;
3786 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3787 Lisp_Object coding_system;
3788 unsigned_char_dynarr *real_dst = dst;
/* Load FLAGS and the partial character CH from the stream state; they
   are stored back with CODING_STREAM_COMPOSE at the end. */
3790 CODING_STREAM_DECOMPOSE (str, flags, ch);
3791 eol_type = str->eol_type;
3792 XSETCODING_SYSTEM (coding_system, str->codesys);
/* While a composite sequence is open, output is collected in the
   composite_chars dynarr rather than in the real destination. */
3794 if (flags & CODING_STATE_COMPOSITE)
3795 dst = str->iso2022.composite_chars;
3800 if (flags & CODING_STATE_ESCAPE)
3801 { /* Within ESC sequence */
3802 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
3807 switch (str->iso2022.esc)
/* Start of a composite sequence: reuse (or create) the collecting
   dynarr and redirect output to it. */
3809 case ISO_ESC_START_COMPOSITE:
3810 if (str->iso2022.composite_chars)
3811 Dynarr_reset (str->iso2022.composite_chars);
3813 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
3814 dst = str->iso2022.composite_chars;
/* End of a composite sequence: look up the composite character for the
   collected bytes and emit its internal representation.
   NOTE(review): DST is presumably switched back to real_dst before the
   Dynarr_add_many below; that line is not visible in this listing. */
3816 case ISO_ESC_END_COMPOSITE:
3818 Bufbyte comstr[MAX_EMCHAR_LEN];
3820 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
3821 Dynarr_length (dst));
3823 len = set_charptr_emchar (comstr, emch);
3824 Dynarr_add_many (dst, comstr, len);
3828 case ISO_ESC_LITERAL:
3829 DECODE_ADD_BINARY_CHAR (c, dst);
3833 /* Everything else handled already */
3838 /* Attempted error recovery. */
/* FLAGS is passed as a null pointer so the direction sequence is
   written unconditionally and no state is updated; INTERNAL_P is 1
   because DST holds internal-format text. */
3839 if (str->iso2022.output_direction_sequence)
3840 ensure_correct_direction (flags & CODING_STATE_R2L ?
3841 CHARSET_RIGHT_TO_LEFT :
3842 CHARSET_LEFT_TO_RIGHT,
3843 str->codesys, dst, 0, 1);
3844 /* More error recovery. */
3845 if (!retval || str->iso2022.output_literally)
3847 /* Output the (possibly invalid) sequence */
3849 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
3850 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
3851 flags &= CODING_STATE_ISO2022_LOCK;
3853 n++, src--;/* Repeat the loop with the same character. */
3856 /* No sense in reprocessing the final byte of the
3857 escape sequence; it could mess things up anyway.
3859 DECODE_ADD_BINARY_CHAR (c, dst);
3864 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
3865 { /* Control characters */
3867 /***** Error-handling *****/
3869 /* If we were in the middle of a character, dump out the
3870 partial character. */
3871 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3873 /* If we just saw a single-shift character, dump it out.
3874 This may dump out the wrong sort of single-shift character,
3875 but at least it will give an indication that something went
3877 if (flags & CODING_STATE_SS2)
3879 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
3880 flags &= ~CODING_STATE_SS2;
3882 if (flags & CODING_STATE_SS3)
3884 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
3885 flags &= ~CODING_STATE_SS3;
3888 /***** Now handle the control characters. *****/
3891 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3893 flags &= CODING_STATE_ISO2022_LOCK;
/* Let the escape parser look at the control character; if it reports
   failure, pass the byte through unchanged. */
3895 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
3896 DECODE_ADD_BINARY_CHAR (c, dst);
3899 { /* Graphic characters */
3900 Lisp_Object charset;
3904 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3906 /* Now determine the charset. */
/* A pending single shift selects register 2 or 3; otherwise non-ASCII
   bytes use register_right and ASCII-range bytes use register_left. */
3907 reg = ((flags & CODING_STATE_SS2) ? 2
3908 : (flags & CODING_STATE_SS3) ? 3
3909 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
3910 : str->iso2022.register_left);
3911 charset = str->iso2022.charset[reg];
3913 /* Error checking: */
3914 if (NILP (charset) || str->iso2022.invalid_designated[reg]
3915 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
3916 && XCHARSET_CHARS (charset) == 94))
3917 /* Mrmph. We are trying to invoke a register that has no
3918 or an invalid charset in it, or trying to add a character
3919 outside the range of the charset. Insert that char literally
3920 to preserve it for the output. */
3922 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3923 DECODE_ADD_BINARY_CHAR (c, dst);
3928 /* Things are probably hunky-dory. */
3930 /* Fetch reverse charset, maybe. */
3931 if (((flags & CODING_STATE_R2L) &&
3932 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
3934 (!(flags & CODING_STATE_R2L) &&
3935 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
3937 Lisp_Object new_charset =
3938 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
3939 if (!NILP (new_charset))
3940 charset = new_charset;
/* Emit the character in internal format; the layout depends on the
   charset's representation length (XCHARSET_REP_BYTES). */
3943 lb = XCHARSET_LEADING_BYTE (charset);
3944 switch (XCHARSET_REP_BYTES (charset))
3947 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3948 Dynarr_add (dst, c & 0x7F);
3951 case 2: /* one-byte official */
3952 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3953 Dynarr_add (dst, lb);
3954 Dynarr_add (dst, c | 0x80);
3957 case 3: /* one-byte private or two-byte official */
3958 if (XCHARSET_PRIVATE_P (charset))
3960 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3961 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
3962 Dynarr_add (dst, lb);
3963 Dynarr_add (dst, c | 0x80);
3969 Dynarr_add (dst, lb);
3970 Dynarr_add (dst, ch | 0x80);
3971 Dynarr_add (dst, c | 0x80);
3979 default: /* two-byte private */
3982 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
3983 Dynarr_add (dst, lb);
3984 Dynarr_add (dst, ch | 0x80);
3985 Dynarr_add (dst, c | 0x80);
/* Keep only the bits covered by CODING_STATE_ISO2022_LOCK; every other
   flag bit is cleared. */
3994 flags &= CODING_STATE_ISO2022_LOCK;
3997 label_continue_loop:;
/* At end of data, flush any partial character. */
4000 if (flags & CODING_STATE_END)
4001 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4003 CODING_STREAM_COMPOSE (str, flags, ch);
4007 /***** ISO2022 encoder *****/
4009 /* Designate CHARSET into register REG. */
/* The new charset is always recorded in STR->iso2022.charset[REG].  When
   CHARSET is a real charset, a designation escape sequence (ESC,
   intermediate byte(s), final byte) is appended to DST. */
4012 iso2022_designate (Lisp_Object charset, unsigned char reg,
4013 struct encoding_stream *str, unsigned_char_dynarr *dst)
/* Intermediate bytes, indexed by register number, for 94-type and
   96-type designations respectively. */
4015 CONST char *inter94 = "()*+", *inter96= ",-./";
4017 unsigned char final;
4018 Lisp_Object old_charset = str->iso2022.charset[reg];
4020 str->iso2022.charset[reg] = charset;
4021 if (!CHARSETP (charset))
4022 /* charset might be an initial nil or t. */
4024 type = XCHARSET_TYPE (charset);
4025 final = XCHARSET_FINAL (charset);
/* The register already held a charset with the same type and final
   byte, and output is not being forced.
   NOTE(review): the statement guarded by this test is not visible in
   this listing; presumably the function returns without emitting
   anything. */
4026 if (!str->iso2022.force_charset_on_output[reg] &&
4027 CHARSETP (old_charset) &&
4028 XCHARSET_TYPE (old_charset) == type &&
4029 XCHARSET_FINAL (old_charset) == final)
/* The designation is about to be written, so forcing is no longer
   needed for this register. */
4032 str->iso2022.force_charset_on_output[reg] = 0;
/* Apply the coding system's output charset conversions: a matching
   from_charset is replaced by its to_charset. */
4035 charset_conversion_spec_dynarr *dyn =
4036 str->codesys->iso2022.output_conv;
4042 for (i = 0; i < Dynarr_length (dyn); i++)
4044 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4045 if (EQ (charset, spec->from_charset))
4046 charset = spec->to_charset;
/* Emit the designation sequence itself. */
4051 Dynarr_add (dst, ISO_CODE_ESC);
4054 case CHARSET_TYPE_94:
4055 Dynarr_add (dst, inter94[reg]);
4057 case CHARSET_TYPE_96:
4058 Dynarr_add (dst, inter96[reg]);
/* Multibyte charsets get a '$' before the register's intermediate
   byte. */
4060 case CHARSET_TYPE_94X94:
4061 Dynarr_add (dst, '$');
4063 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4066 Dynarr_add (dst, inter94[reg]);
4068 case CHARSET_TYPE_96X96:
4069 Dynarr_add (dst, '$');
4070 Dynarr_add (dst, inter96[reg]);
4073 Dynarr_add (dst, final);
/* Make register 0 the one recorded in STR->iso2022.register_left: if it
   is not already, append ISO_CODE_SI to DST and update the record. */
4077 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4079 if (str->iso2022.register_left != 0)
4081 Dynarr_add (dst, ISO_CODE_SI);
4082 str->iso2022.register_left = 0;
/* Make register 1 the one recorded in STR->iso2022.register_left: if it
   is not already, append ISO_CODE_SO to DST and update the record. */
4087 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4089 if (str->iso2022.register_left != 1)
4091 Dynarr_add (dst, ISO_CODE_SO);
4092 str->iso2022.register_left = 1;
4096 /* Convert internally-formatted data to ISO2022 format. */
/* ENCODING is the encoding lstream whose per-stream state
   (ENCODING_STREAM_DATA) carries the flags, partial character, current
   charset, current half and character-boundary indicator between
   calls.  SRC is the input and DST the dynarr receiving the encoded
   bytes.
   NOTE(review): N is presumably the number of bytes at SRC; the loop
   header that consumes it is not visible in this listing. */
4099 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4100 unsigned_char_dynarr *dst, unsigned int n)
4102 unsigned char charmask, c;
4103 unsigned int flags, ch;
4104 enum eol_type eol_type;
4105 unsigned char char_boundary;
4106 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4107 struct Lisp_Coding_System *codesys = str->codesys;
4109 Lisp_Object charset;
4112 /* flags for handling composite chars. We do a little switcharoo
4113 on the source while we're outputting the composite char. */
4114 unsigned int saved_n = 0;
4115 CONST unsigned char *saved_src = NULL;
4116 int in_composite = 0;
/* Load the state saved by the previous call; it is stored back at the
   end of the function. */
4118 CODING_STREAM_DECOMPOSE (str, flags, ch);
4119 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4120 char_boundary = str->iso2022.current_char_boundary;
4121 charset = str->iso2022.current_charset;
4122 half = str->iso2022.current_half;
4129 if (BYTE_ASCII_P (c))
4130 { /* Processing ASCII character */
4133 restore_left_to_right_direction (codesys, dst, &flags, 0);
4135 /* Make sure G0 contains ASCII */
4136 if ((c > ' ' && c < ISO_CODE_DEL) ||
4137 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4139 ensure_normal_shift (str, dst);
4140 iso2022_designate (Vcharset_ascii, 0, str, dst);
4143 /* If necessary, restore everything to the default state
4146 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4148 restore_left_to_right_direction (codesys, dst, &flags, 0);
4150 ensure_normal_shift (str, dst);
4152 for (i = 0; i < 4; i++)
4154 Lisp_Object initial_charset =
4155 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4156 iso2022_designate (initial_charset, i, str, dst);
4161 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4162 Dynarr_add (dst, '\r');
4163 if (eol_type != EOL_CR)
4164 Dynarr_add (dst, c);
4168 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4169 && fit_to_be_escape_quoted (c))
4170 Dynarr_add (dst, ISO_CODE_ESC);
4171 Dynarr_add (dst, c);
4176 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4177 { /* Processing Leading Byte */
4179 charset = CHARSET_BY_LEADING_BYTE (c);
4180 if (LEADING_BYTE_PREFIX_P(c))
4182 else if (!EQ (charset, Vcharset_control_1)
4183 && !EQ (charset, Vcharset_composite))
/* Emit a direction switch first if the charset's direction differs
   from the current one. */
4187 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4188 codesys, dst, &flags, 0);
4190 /* Now determine which register to use. */
4192 for (i = 0; i < 4; i++)
4194 if (EQ (charset, str->iso2022.charset[i]) ||
4196 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4205 if (XCHARSET_GRAPHIC (charset) != 0)
4207 if (!NILP (str->iso2022.charset[1]) &&
4208 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4209 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4211 else if (!NILP (str->iso2022.charset[2]))
4213 else if (!NILP (str->iso2022.charset[3]))
4222 iso2022_designate (charset, reg, str, dst);
4224 /* Now invoke that register. */
4228 ensure_normal_shift (str, dst);
4233 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4235 ensure_shift_out (str, dst);
/* Single shift 2: ESC 'N' for seven-bit coding systems, otherwise the
   single ISO_CODE_SS2 byte. */
4243 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4245 Dynarr_add (dst, ISO_CODE_ESC);
4246 Dynarr_add (dst, 'N');
4251 Dynarr_add (dst, ISO_CODE_SS2);
/* Single shift 3: ESC 'O' for seven-bit coding systems, otherwise the
   single ISO_CODE_SS3 byte. */
4257 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4259 Dynarr_add (dst, ISO_CODE_ESC);
4260 Dynarr_add (dst, 'O');
4265 Dynarr_add (dst, ISO_CODE_SS3);
4277 { /* Processing Non-ASCII character */
/* When HALF is 0 the high bit is masked off each output byte; otherwise
   all eight bits are kept. */
4278 charmask = (half == 0 ? 0x7F : 0xFF);
4280 if (EQ (charset, Vcharset_control_1))
4282 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4283 && fit_to_be_escape_quoted (c))
4284 Dynarr_add (dst, ISO_CODE_ESC);
4285 /* you asked for it ... */
4286 Dynarr_add (dst, c - 0x20);
4290 switch (XCHARSET_REP_BYTES (charset))
4293 Dynarr_add (dst, c & charmask);
4296 if (XCHARSET_PRIVATE_P (charset))
4298 Dynarr_add (dst, c & charmask);
4303 if (EQ (charset, Vcharset_composite))
4307 /* #### Bother! We don't know how to
4309 Dynarr_add (dst, '~');
4313 Emchar emch = MAKE_CHAR (Vcharset_composite,
4314 ch & 0x7F, c & 0x7F);
4315 Lisp_Object lstr = composite_char_string (emch);
4319 src = XSTRING_DATA (lstr);
4320 n = XSTRING_LENGTH (lstr);
4321 Dynarr_add (dst, ISO_CODE_ESC);
4322 Dynarr_add (dst, '0'); /* start composing */
4327 Dynarr_add (dst, ch & charmask);
4328 Dynarr_add (dst, c & charmask);
4341 Dynarr_add (dst, ch & charmask);
4342 Dynarr_add (dst, c & charmask);
4363 Dynarr_add (dst, ISO_CODE_ESC);
4364 Dynarr_add (dst, '1'); /* end composing */
4365 goto back_to_square_n; /* Wheeeeeeeee ..... */
/* At end of data, on a character boundary, put the output back in its
   initial state: left-to-right direction, normal shift, and the coding
   system's initial charsets designated into all four registers. */
4368 if (char_boundary && flags & CODING_STATE_END)
4370 restore_left_to_right_direction (codesys, dst, &flags, 0);
4371 ensure_normal_shift (str, dst);
4372 for (i = 0; i < 4; i++)
4374 Lisp_Object initial_charset =
4375 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4376 iso2022_designate (initial_charset, i, str, dst);
/* Save the state for the next call. */
4380 CODING_STREAM_COMPOSE (str, flags, ch);
4381 str->iso2022.current_char_boundary = char_boundary;
4382 str->iso2022.current_charset = charset;
4383 str->iso2022.current_half = half;
4385 /* Verbum caro factum est! */
4389 /************************************************************************/
4390 /* No-conversion methods */
4391 /************************************************************************/
4393 /* This is used when reading in "binary" files -- i.e. files that may
4394 contain all 256 possible byte values and that are not to be
4395 interpreted as being in any particular decoding. */
/* Each input byte goes through end-of-line handling
   (DECODE_HANDLE_EOL_TYPE, driven by the stream's eol_type) and is then
   added to DST with DECODE_ADD_BINARY_CHAR.  FLAGS and CH are loaded
   from and stored back to the stream state around the loop.
   NOTE(review): the loop header is not visible in this listing. */
4397 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4398 unsigned_char_dynarr *dst, unsigned int n)
4401 unsigned int flags, ch;
4402 enum eol_type eol_type;
4403 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4405 CODING_STREAM_DECOMPOSE (str, flags, ch);
4406 eol_type = str->eol_type;
4412 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4413 DECODE_ADD_BINARY_CHAR (c, dst);
4414 label_continue_loop:;
4417 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4419 CODING_STREAM_COMPOSE (str, flags, ch);
/* Encode internal-format data from SRC into DST without charset
   conversion.  ASCII bytes pass through; characters whose leading byte
   is LEADING_BYTE_LATIN_ISO8859_1 or LEADING_BYTE_CONTROL_1 become a
   single byte; other leading bytes produce '~' and their trailing bytes
   are dropped.  Line endings follow the coding system's EOL type.
   NOTE(review): the loop header and the test that selects the
   line-ending branch are not visible in this listing. */
4423 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4424 unsigned_char_dynarr *dst, unsigned int n)
4427 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4428 unsigned int flags, ch;
4429 enum eol_type eol_type;
4431 CODING_STREAM_DECOMPOSE (str, flags, ch);
4432 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Write the line ending: a CR unless the type is LF or autodetect, then
   an LF unless the type is CR. */
4439 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4440 Dynarr_add (dst, '\r');
4441 if (eol_type != EOL_CR)
4442 Dynarr_add (dst, '\n');
4445 else if (BYTE_ASCII_P (c))
4448 Dynarr_add (dst, c);
4450 else if (BUFBYTE_LEADING_BYTE_P (c))
4453 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
4454 c == LEADING_BYTE_CONTROL_1)
4457 Dynarr_add (dst, '~'); /* untranslatable character */
/* Trailing byte: what to write depends on the leading byte remembered
   in CH.  Latin-1 bytes are written as they are; control-1 bytes are
   written with 0x20 subtracted. */
4461 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
4462 Dynarr_add (dst, c);
4463 else if (ch == LEADING_BYTE_CONTROL_1)
4466 Dynarr_add (dst, c - 0x20);
4468 /* else it should be the second or third byte of an
4469 untranslatable character, so ignore it */
4474 CODING_STREAM_COMPOSE (str, flags, ch);
4478 /************************************************************************/
4479 /* Simple internal/external functions */
4480 /************************************************************************/
/* Scratch buffers shared by convert_to_external_format and
   convert_from_external_format.  Each is created on first use and reset
   at the start of every call, and the functions return pointers into
   them, so a returned pointer is only valid until the next call of the
   same function. */
4482 static Extbyte_dynarr *conversion_out_dynarr;
4483 static Bufbyte_dynarr *conversion_in_dynarr;
4485 /* Determine coding system from coding format */
/* FILE_NAME_CODING_SYSTEM evaluates to Qnil when
   Vfile_name_coding_system is nil or Qbinary, and otherwise to the
   result of Fget_coding_system on it. */
4487 #define FILE_NAME_CODING_SYSTEM \
4488 ((NILP (Vfile_name_coding_system) || \
4489 (EQ ((Vfile_name_coding_system), Qbinary))) ? \
4490 Qnil : Fget_coding_system (Vfile_name_coding_system))
4492 /* #### not correct for all values of `fmt'! */
4494 #define FMT_CODING_SYSTEM(fmt) \
4495 (((fmt) == FORMAT_FILENAME) ? FILE_NAME_CODING_SYSTEM : \
4496 ((fmt) == FORMAT_CTEXT ) ? Fget_coding_system (Qctext) : \
4497 ((fmt) == FORMAT_TERMINAL) ? FILE_NAME_CODING_SYSTEM : \
4500 #define FMT_CODING_SYSTEM(fmt) \
4501 (((fmt) == FORMAT_FILENAME) ? FILE_NAME_CODING_SYSTEM : \
4502 ((fmt) == FORMAT_TERMINAL) ? FILE_NAME_CODING_SYSTEM : \
/* Convert LEN bytes of internal-format text at PTR to the external
   format selected by FMT.  The result is built in the shared
   conversion_out_dynarr: its length, not counting the terminating zero
   byte, is stored in *LEN_OUT and a pointer to its first byte is
   returned.  The buffer is reset on every call, so the result must be
   used or copied before the next call. */
4507 convert_to_external_format (CONST Bufbyte *ptr,
4510 enum external_data_format fmt)
4512 Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
4514 if (!conversion_out_dynarr)
4515 conversion_out_dynarr = Dynarr_new (Extbyte);
4517 Dynarr_reset (conversion_out_dynarr);
/* No coding system for this format: convert character by character.
   ASCII is copied, and control-1 / Latin-1 characters are reduced to
   the single byte they stand for.
   NOTE(review): the loop header and the fallback value for other
   characters are not visible in this listing. */
4519 if (NILP (coding_system))
4521 CONST Bufbyte *end = ptr + len;
4526 (BYTE_ASCII_P (*ptr)) ? *ptr :
4527 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
4528 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
4531 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
4535 #ifdef ERROR_CHECK_BUFPOS
4536 assert (ptr == end);
/* Otherwise pump the text through an encoding stream that writes into
   the dynarr: fixed-buffer input stream -> encoding stream -> dynarr
   output stream. */
4541 Lisp_Object instream, outstream, da_outstream;
4542 Lstream *istr, *ostr;
4543 struct gcpro gcpro1, gcpro2, gcpro3;
4544 char tempbuf[1024]; /* some random amount */
4546 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4547 da_outstream = make_dynarr_output_stream
4548 ((unsigned_char_dynarr *) conversion_out_dynarr);
4550 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
4551 istr = XLSTREAM (instream);
4552 ostr = XLSTREAM (outstream);
4553 GCPRO3 (instream, outstream, da_outstream);
/* Copy through TEMPBUF in chunks of at most sizeof (tempbuf) bytes. */
4556 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4559 Lstream_write (ostr, tempbuf, size_in_bytes);
4561 Lstream_close (istr);
4562 Lstream_close (ostr);
4564 Lstream_delete (istr);
4565 Lstream_delete (ostr);
4566 Lstream_delete (XLSTREAM (da_outstream));
/* The length is taken before the terminator is added, so *LEN_OUT does
   not include the trailing zero byte. */
4569 *len_out = Dynarr_length (conversion_out_dynarr);
4570 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
4571 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert LEN bytes of external-format data at PTR, in the format
   selected by FMT, to internal format.  The result is built in the
   shared conversion_in_dynarr: its length, not counting the terminating
   zero byte, is stored in *LEN_OUT and a pointer to its first byte is
   returned.  The buffer is reset on every call, so the result must be
   used or copied before the next call. */
4575 convert_from_external_format (CONST Extbyte *ptr,
4578 enum external_data_format fmt)
4580 Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
4582 if (!conversion_in_dynarr)
4583 conversion_in_dynarr = Dynarr_new (Bufbyte);
4585 Dynarr_reset (conversion_in_dynarr);
/* No coding system for this format: add every byte to the result with
   DECODE_ADD_BINARY_CHAR. */
4587 if (NILP (coding_system))
4589 CONST Extbyte *end = ptr + len;
4590 for (; ptr < end; ptr++)
4593 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
/* Otherwise pump the data through a decoding stream that writes into
   the dynarr: fixed-buffer input stream -> decoding stream -> dynarr
   output stream. */
4598 Lisp_Object instream, outstream, da_outstream;
4599 Lstream *istr, *ostr;
4600 struct gcpro gcpro1, gcpro2, gcpro3;
4601 char tempbuf[1024]; /* some random amount */
4603 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4604 da_outstream = make_dynarr_output_stream
4605 ((unsigned_char_dynarr *) conversion_in_dynarr);
4607 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
4608 istr = XLSTREAM (instream);
4609 ostr = XLSTREAM (outstream);
4610 GCPRO3 (instream, outstream, da_outstream);
/* Copy through TEMPBUF in chunks of at most sizeof (tempbuf) bytes. */
4613 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4616 Lstream_write (ostr, tempbuf, size_in_bytes);
4618 Lstream_close (istr);
4619 Lstream_close (ostr);
4621 Lstream_delete (istr);
4622 Lstream_delete (ostr);
4623 Lstream_delete (XLSTREAM (da_outstream));
/* The length is taken before the terminator is added, so *LEN_OUT does
   not include the trailing zero byte. */
4626 *len_out = Dynarr_length (conversion_in_dynarr);
4627 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
4628 return Dynarr_atp (conversion_in_dynarr, 0);
4632 /************************************************************************/
4633 /* Initialization */
4634 /************************************************************************/
/* Define the Lisp symbols, the coding-system-error condition and the
   Lisp primitives (DEFSUBR) provided by this file. */
4637 syms_of_mule_coding (void)
4639 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
/* coding-system-error is declared with Qio_error as its parent
   condition. */
4640 deferror (&Qcoding_system_error, "coding-system-error",
4641 "Coding-system error", Qio_error);
/* Lisp-visible primitives. */
4643 DEFSUBR (Fcoding_system_p);
4644 DEFSUBR (Ffind_coding_system);
4645 DEFSUBR (Fget_coding_system);
4646 DEFSUBR (Fcoding_system_list);
4647 DEFSUBR (Fcoding_system_name);
4648 DEFSUBR (Fmake_coding_system);
4649 DEFSUBR (Fcopy_coding_system);
4650 DEFSUBR (Fsubsidiary_coding_system);
4652 DEFSUBR (Fcoding_system_type);
4653 DEFSUBR (Fcoding_system_doc_string);
4655 DEFSUBR (Fcoding_system_charset);
4657 DEFSUBR (Fcoding_system_property);
4659 DEFSUBR (Fcoding_category_list);
4660 DEFSUBR (Fset_coding_priority_list);
4661 DEFSUBR (Fcoding_priority_list);
4662 DEFSUBR (Fset_coding_category_system);
4663 DEFSUBR (Fcoding_category_system);
4665 DEFSUBR (Fdetect_coding_region);
4666 DEFSUBR (Fdecode_coding_region);
4667 DEFSUBR (Fencode_coding_region);
4669 DEFSUBR (Fdecode_shift_jis_char);
4670 DEFSUBR (Fencode_shift_jis_char);
4671 DEFSUBR (Fdecode_big5_char);
4672 DEFSUBR (Fencode_big5_char);
/* Symbols; each C variable is bound to the Lisp symbol whose name is
   given as the string. */
4674 defsymbol (&Qcoding_system_p, "coding-system-p");
4675 defsymbol (&Qno_conversion, "no-conversion");
4677 defsymbol (&Qbig5, "big5");
4678 defsymbol (&Qshift_jis, "shift-jis");
4679 defsymbol (&Qccl, "ccl");
4680 defsymbol (&Qiso2022, "iso2022");
4682 defsymbol (&Qmnemonic, "mnemonic");
4683 defsymbol (&Qeol_type, "eol-type");
4684 defsymbol (&Qpost_read_conversion, "post-read-conversion");
4685 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
4687 defsymbol (&Qcr, "cr");
4688 defsymbol (&Qlf, "lf");
4689 defsymbol (&Qcrlf, "crlf");
4690 defsymbol (&Qeol_cr, "eol-cr");
4691 defsymbol (&Qeol_lf, "eol-lf");
4692 defsymbol (&Qeol_crlf, "eol-crlf");
4694 defsymbol (&Qcharset_g0, "charset-g0");
4695 defsymbol (&Qcharset_g1, "charset-g1");
4696 defsymbol (&Qcharset_g2, "charset-g2");
4697 defsymbol (&Qcharset_g3, "charset-g3");
4698 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
4699 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
4700 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
4701 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
4702 defsymbol (&Qno_iso6429, "no-iso6429");
4703 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
4704 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
4706 defsymbol (&Qshort, "short");
4707 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
4708 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
4709 defsymbol (&Qseven, "seven");
4710 defsymbol (&Qlock_shift, "lock-shift");
4711 defsymbol (&Qescape_quoted, "escape-quoted");
4713 defsymbol (&Qencode, "encode");
4714 defsymbol (&Qdecode, "decode");
4717 defsymbol (&Qctext, "ctext");
/* One symbol per coding category, stored in coding_category_symbol[]
   and indexed by the CODING_CATEGORY_* constant.
   NOTE(review): the symbol-name arguments of these calls are not
   visible in this listing. */
4718 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
4720 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
4722 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
4724 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
4726 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
4728 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
4730 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
4733 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare, via LSTREAM_HAS_METHOD, the same seven methods (reader,
   writer, rewinder, seekable_p, flusher, closer, marker) for both the
   `decoding' and the `encoding' lstream types. */
4738 lstream_type_create_mule_coding (void)
4740 LSTREAM_HAS_METHOD (decoding, reader);
4741 LSTREAM_HAS_METHOD (decoding, writer);
4742 LSTREAM_HAS_METHOD (decoding, rewinder);
4743 LSTREAM_HAS_METHOD (decoding, seekable_p);
4744 LSTREAM_HAS_METHOD (decoding, flusher);
4745 LSTREAM_HAS_METHOD (decoding, closer);
4746 LSTREAM_HAS_METHOD (decoding, marker);
4748 LSTREAM_HAS_METHOD (encoding, reader);
4749 LSTREAM_HAS_METHOD (encoding, writer);
4750 LSTREAM_HAS_METHOD (encoding, rewinder);
4751 LSTREAM_HAS_METHOD (encoding, seekable_p);
4752 LSTREAM_HAS_METHOD (encoding, flusher);
4753 LSTREAM_HAS_METHOD (encoding, closer);
4754 LSTREAM_HAS_METHOD (encoding, marker);
/* Initialize the coding-category tables, provide the `file-coding'
   feature, and define the Lisp variables of this file together with
   their default values. */
4758 vars_of_mule_coding (void)
4762 /* Initialize to something reasonable ... */
/* No coding system is associated with any category yet, and the
   priority order starts out as the identity permutation. */
4763 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
4765 coding_category_system[i] = Qnil;
4766 coding_category_by_priority[i] = i;
4769 Fprovide (intern ("file-coding"));
4771 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
4772 Coding system used for TTY keyboard input.
4773 Not used under a windowing system.
4775 Vkeyboard_coding_system = Qnil;
4777 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
4778 Coding system used for TTY display output.
4779 Not used under a windowing system.
4781 Vterminal_coding_system = Qnil;
4783 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
4784 Overriding coding system used when reading from a file or process.
4785 You should *bind* this, not set it. If this is non-nil, it specifies
4786 the coding system that will be used when a file or process is read
4787 in, and overrides `buffer-file-coding-system-for-read',
4788 `insert-file-contents-pre-hook', etc. Use those variables instead of
4789 this one for permanent changes to the environment.
4791 Vcoding_system_for_read = Qnil;
4793 DEFVAR_LISP ("coding-system-for-write",
4794 &Vcoding_system_for_write /*
4795 Overriding coding system used when writing a file or process.
4796 You should *bind* this, not set it. If this is non-nil, it specifies
4797 the coding system that will be used when a file or process is written
4798 out, and overrides `buffer-file-coding-system',
4799 `write-region-pre-hook', etc. Use those variables instead of this one
4800 for permanent changes to the environment.
4802 Vcoding_system_for_write = Qnil;
4804 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
4805 Coding system used to convert pathnames when accessing files.
4807 Vfile_name_coding_system = Qnil;
4809 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
4810 Non-nil means the buffer contents are regarded as multi-byte form
4811 of characters, not a binary code. This affects the display, file I/O,
4812 and behaviors of various editing commands.
4814 Setting this to nil does not do anything.
4816 enable_multibyte_characters = 1;
/* Set up the coding-system state that needs Lisp object allocation.

   In order, this:
   - staticpro's Vcoding_system_hashtable and creates it as a non-weak
     Lisp hashtable (size argument 50);
   - creates the_codesys_prop_dynarr and fills it with one entry per
     known coding-system property, in three groups tagged
     CODESYS_PROP_ALL_OK, CODESYS_PROP_ISO2022 and CODESYS_PROP_CCL;
   - creates the `no-conversion' coding system (doc string
     "No conversion", mnemonic "Noconv") and makes a copy of the coding
     system stored in its eol-lf property;
   - records `no-conversion' as the coding system for
     CODING_CATEGORY_NO_CONVERSION.

   NOTE(review): the trailing arguments of make_lisp_hashtable and
   Fcopy_coding_system are not visible in this section -- confirm them
   against the full file. */
4820 complex_vars_of_mule_coding (void)
4822 staticpro (&Vcoding_system_hashtable);
4823 Vcoding_system_hashtable = make_lisp_hashtable (50, HASHTABLE_NONWEAK,
4826 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
/* DEFINE_CODESYS_PROP builds a struct codesys_prop whose prop_type is
   Prop_Type and appends it to the_codesys_prop_dynarr.
   NOTE(review): the line that stores Sym in the struct is not visible
   here -- confirm.  The tag names suggest ALL_OK properties apply to
   every coding-system type while ISO2022 and CCL properties apply only
   to those types; verify against the code that reads prop_type. */
4828 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
4830 struct codesys_prop csp; \
4832 csp.prop_type = (Prop_Type); \
4833 Dynarr_add (the_codesys_prop_dynarr, csp); \
4836 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
4837 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
4838 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
4839 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
4840 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
4841 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
4842 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
/* Properties registered with the CODESYS_PROP_ISO2022 tag. */
4844 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
4845 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
4846 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
4847 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
4848 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
4849 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
4850 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
4851 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
4852 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
4853 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
4854 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
4855 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
4856 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
4857 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
4858 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
4859 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
4860 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
/* Properties registered with the CODESYS_PROP_CCL tag. */
4862 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
4863 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
4865 /* Need to create this here or we're really screwed. */
4866 Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
4867 list2 (Qmnemonic, build_string ("Noconv")));
4869 Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
4872 /* Need this for bootstrapping */
4873 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
4874 Fget_coding_system (Qno_conversion);