/* Code conversion functions.
   Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   Copyright (C) 1995 Sun Microsystems, Inc.

This file is part of XEmacs.

XEmacs is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

XEmacs is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with XEmacs; see the file COPYING.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.  */

/* Synched up with: Mule 2.3.   Not in FSF. */

/* Rewritten by Ben Wing <ben@xemacs.org>. */
35 #include "file-coding.h"
37 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
39 Lisp_Object Vkeyboard_coding_system;
40 Lisp_Object Vterminal_coding_system;
41 Lisp_Object Vcoding_system_for_read;
42 Lisp_Object Vcoding_system_for_write;
43 Lisp_Object Vfile_name_coding_system;
45 /* Table of symbols identifying each coding category. */
46 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
48 /* Coding system currently associated with each coding category. */
49 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
51 /* Table of all coding categories in decreasing order of priority.
52 This describes a permutation of the possible coding categories. */
53 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
55 Lisp_Object Qcoding_system_p;
57 Lisp_Object Qno_conversion, Qccl, Qiso2022;
58 /* Qinternal in general.c */
60 Lisp_Object Qmnemonic, Qeol_type;
61 Lisp_Object Qcr, Qcrlf, Qlf;
62 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
63 Lisp_Object Qpost_read_conversion;
64 Lisp_Object Qpre_write_conversion;
67 Lisp_Object Qbig5, Qshift_jis;
68 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
69 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
70 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
71 Lisp_Object Qno_iso6429;
72 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
73 Lisp_Object Qctext, Qescape_quoted;
74 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
76 Lisp_Object Qencode, Qdecode;
78 Lisp_Object Vcoding_system_hash_table;
80 int enable_multibyte_characters;
83 /* Additional information used by the ISO2022 decoder and detector. */
84 struct iso2022_decoder
86 /* CHARSET holds the character sets currently assigned to the G0
87 through G3 variables. It is initialized from the array
88 INITIAL_CHARSET in CODESYS. */
89 Lisp_Object charset[4];
91 /* Which registers are currently invoked into the left (GL) and
92 right (GR) halves of the 8-bit encoding space? */
93 int register_left, register_right;
95 /* ISO_ESC holds a value indicating part of an escape sequence
96 that has already been seen. */
97 enum iso_esc_flag esc;
99 /* This records the bytes we've seen so far in an escape sequence,
100 in case the sequence is invalid (we spit out the bytes unchanged). */
101 unsigned char esc_bytes[8];
103 /* Index for next byte to store in ISO escape sequence. */
106 /* Stuff seen so far when composing a string. */
107 unsigned_char_dynarr *composite_chars;
109 /* If we saw an invalid designation sequence for a particular
110 register, we flag it here and switch to ASCII. The next time we
111 see a valid designation for this register, we turn off the flag
112 and do the designation normally, but pretend the sequence was
113 invalid. The effect of all this is that (most of the time) the
114 escape sequences for both the switch to the unknown charset, and
115 the switch back to the known charset, get inserted literally into
116 the buffer and saved out as such. The hope is that we can
117 preserve the escape sequences so that the resulting written out
118 file makes sense. If we don't do any of this, the designation
119 to the invalid charset will be preserved but that switch back
120 to the known charset will probably get eaten because it was
121 the same charset that was already present in the register. */
122 unsigned char invalid_designated[4];
124 /* We try to do similar things as above for direction-switching
125 sequences. If we encountered a direction switch while an
126 invalid designation was present, or an invalid designation
127 just after a direction switch (i.e. no valid designation
128 encountered yet), we insert the direction-switch escape
129 sequence literally into the output stream, and later on
130 insert the corresponding direction-restoring escape sequence
132 unsigned int switched_dir_and_no_valid_charset_yet :1;
133 unsigned int invalid_switch_dir :1;
135 /* Tells the decoder to output the escape sequence literally
136 even though it was valid. Used in the games we play to
137 avoid lossage when we encounter invalid designations. */
138 unsigned int output_literally :1;
139 /* We encountered a direction switch followed by an invalid
140 designation. We didn't output the direction switch
141 literally because we didn't know about the invalid designation;
142 but we have to do so now. */
143 unsigned int output_direction_sequence :1;
146 EXFUN (Fcopy_coding_system, 2);
148 struct detection_state;
149 static int detect_coding_sjis (struct detection_state *st,
150 CONST unsigned char *src,
152 static void decode_coding_sjis (Lstream *decoding,
153 CONST unsigned char *src,
154 unsigned_char_dynarr *dst,
156 static void encode_coding_sjis (Lstream *encoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static int detect_coding_big5 (struct detection_state *st,
161 CONST unsigned char *src,
163 static void decode_coding_big5 (Lstream *decoding,
164 CONST unsigned char *src,
165 unsigned_char_dynarr *dst, unsigned int n);
166 static void encode_coding_big5 (Lstream *encoding,
167 CONST unsigned char *src,
168 unsigned_char_dynarr *dst, unsigned int n);
169 static int postprocess_iso2022_mask (int mask);
170 static void reset_iso2022 (Lisp_Object coding_system,
171 struct iso2022_decoder *iso);
172 static int detect_coding_iso2022 (struct detection_state *st,
173 CONST unsigned char *src,
175 static void decode_coding_iso2022 (Lstream *decoding,
176 CONST unsigned char *src,
177 unsigned_char_dynarr *dst, unsigned int n);
178 static void encode_coding_iso2022 (Lstream *encoding,
179 CONST unsigned char *src,
180 unsigned_char_dynarr *dst, unsigned int n);
182 static void decode_coding_no_conversion (Lstream *decoding,
183 CONST unsigned char *src,
184 unsigned_char_dynarr *dst,
186 static void encode_coding_no_conversion (Lstream *encoding,
187 CONST unsigned char *src,
188 unsigned_char_dynarr *dst,
190 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
191 unsigned_char_dynarr *dst, unsigned int n);
192 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
193 unsigned_char_dynarr *dst, unsigned int n);
195 typedef struct codesys_prop codesys_prop;
204 Dynarr_declare (codesys_prop);
205 } codesys_prop_dynarr;
207 codesys_prop_dynarr *the_codesys_prop_dynarr;
209 enum codesys_prop_enum
212 CODESYS_PROP_ISO2022,
217 /************************************************************************/
218 /* Coding system functions */
219 /************************************************************************/
221 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
222 static void print_coding_system (Lisp_Object, Lisp_Object, int);
223 static void finalize_coding_system (void *header, int for_disksave);
225 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
226 mark_coding_system, print_coding_system,
227 finalize_coding_system,
228 0, 0, struct Lisp_Coding_System);
231 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
233 struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
235 markobj (CODING_SYSTEM_NAME (codesys));
236 markobj (CODING_SYSTEM_DOC_STRING (codesys));
237 markobj (CODING_SYSTEM_MNEMONIC (codesys));
238 markobj (CODING_SYSTEM_EOL_LF (codesys));
239 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
240 markobj (CODING_SYSTEM_EOL_CR (codesys));
242 switch (CODING_SYSTEM_TYPE (codesys))
246 case CODESYS_ISO2022:
247 for (i = 0; i < 4; i++)
248 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
249 if (codesys->iso2022.input_conv)
251 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
253 struct charset_conversion_spec *ccs =
254 Dynarr_atp (codesys->iso2022.input_conv, i);
255 markobj (ccs->from_charset);
256 markobj (ccs->to_charset);
259 if (codesys->iso2022.output_conv)
261 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
263 struct charset_conversion_spec *ccs =
264 Dynarr_atp (codesys->iso2022.output_conv, i);
265 markobj (ccs->from_charset);
266 markobj (ccs->to_charset);
272 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
273 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
280 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
281 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
285 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
288 struct Lisp_Coding_System *c = XCODING_SYSTEM (obj);
290 error ("printing unreadable object #<coding_system 0x%x>",
293 write_c_string ("#<coding_system ", printcharfun);
294 print_internal (c->name, printcharfun, 1);
295 write_c_string (">", printcharfun);
299 finalize_coding_system (void *header, int for_disksave)
301 struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header;
302 /* Since coding systems never go away, this function is not
303 necessary. But it would be necessary if we changed things
304 so that coding systems could go away. */
305 if (!for_disksave) /* see comment in lstream.c */
307 switch (CODING_SYSTEM_TYPE (c))
310 case CODESYS_ISO2022:
311 if (c->iso2022.input_conv)
313 Dynarr_free (c->iso2022.input_conv);
314 c->iso2022.input_conv = 0;
316 if (c->iso2022.output_conv)
318 Dynarr_free (c->iso2022.output_conv);
319 c->iso2022.output_conv = 0;
330 symbol_to_eol_type (Lisp_Object symbol)
332 CHECK_SYMBOL (symbol);
333 if (NILP (symbol)) return EOL_AUTODETECT;
334 if (EQ (symbol, Qlf)) return EOL_LF;
335 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
336 if (EQ (symbol, Qcr)) return EOL_CR;
338 signal_simple_error ("Unrecognized eol type", symbol);
339 return EOL_AUTODETECT; /* not reached */
343 eol_type_to_symbol (enum eol_type type)
348 case EOL_LF: return Qlf;
349 case EOL_CRLF: return Qcrlf;
350 case EOL_CR: return Qcr;
351 case EOL_AUTODETECT: return Qnil;
356 setup_eol_coding_systems (struct Lisp_Coding_System *codesys)
358 Lisp_Object codesys_obj;
359 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
360 char *codesys_name = (char *) alloca (len + 7);
362 char *codesys_mnemonic=0;
364 Lisp_Object codesys_name_sym, sub_codesys_obj;
368 XSETCODING_SYSTEM (codesys_obj, codesys);
370 memcpy (codesys_name,
371 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
373 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
375 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
376 codesys_mnemonic = (char *) alloca (mlen + 7);
377 memcpy (codesys_mnemonic,
378 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
381 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
382 strcpy (codesys_name + len, "-" op_sys); \
384 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
385 codesys_name_sym = intern (codesys_name); \
386 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
387 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
389 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
390 build_string (codesys_mnemonic); \
391 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
394 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
395 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
396 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
399 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
400 Return t if OBJECT is a coding system.
401 A coding system is an object that defines how text containing multiple
402 character sets is encoded into a stream of (typically 8-bit) bytes.
403 The coding system is used to decode the stream into a series of
404 characters (which may be from multiple charsets) when the text is read
405 from a file or process, and is used to encode the text back into the
406 same format when it is written out to a file or process.
408 For example, many ISO2022-compliant coding systems (such as Compound
409 Text, which is used for inter-client data under the X Window System)
410 use escape sequences to switch between different charsets -- Japanese
411 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
412 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
413 `make-coding-system' for more information.
415 Coding systems are normally identified using a symbol, and the
416 symbol is accepted in place of the actual coding system object whenever
417 a coding system is called for. (This is similar to how faces work.)
421 return CODING_SYSTEMP (object) ? Qt : Qnil;
424 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
425 Retrieve the coding system of the given name.
427 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
428 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
429 If there is no such coding system, nil is returned. Otherwise the
430 associated coding system object is returned.
432 (coding_system_or_name))
434 if (CODING_SYSTEMP (coding_system_or_name))
435 return coding_system_or_name;
437 if (NILP (coding_system_or_name))
438 coding_system_or_name = Qbinary;
440 CHECK_SYMBOL (coding_system_or_name);
442 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
445 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
446 Retrieve the coding system of the given name.
447 Same as `find-coding-system' except that if there is no such
448 coding system, an error is signaled instead of returning nil.
452 Lisp_Object coding_system = Ffind_coding_system (name);
454 if (NILP (coding_system))
455 signal_simple_error ("No such coding system", name);
456 return coding_system;
459 /* We store the coding systems in hash tables with the names as the key and the
460 actual coding system object as the value. Occasionally we need to use them
461 in a list format. These routines provide us with that. */
462 struct coding_system_list_closure
464 Lisp_Object *coding_system_list;
468 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
469 void *coding_system_list_closure)
471 /* This function can GC */
472 struct coding_system_list_closure *cscl =
473 (struct coding_system_list_closure *) coding_system_list_closure;
474 Lisp_Object *coding_system_list = cscl->coding_system_list;
476 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
477 *coding_system_list);
481 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
482 Return a list of the names of all defined coding systems.
486 Lisp_Object coding_system_list = Qnil;
488 struct coding_system_list_closure coding_system_list_closure;
490 GCPRO1 (coding_system_list);
491 coding_system_list_closure.coding_system_list = &coding_system_list;
492 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
493 &coding_system_list_closure);
496 return coding_system_list;
499 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
500 Return the name of the given coding system.
504 coding_system = Fget_coding_system (coding_system);
505 return XCODING_SYSTEM_NAME (coding_system);
508 static struct Lisp_Coding_System *
509 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
511 struct Lisp_Coding_System *codesys =
512 alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system);
514 zero_lcrecord (codesys);
515 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
516 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
517 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
518 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
519 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
520 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
521 CODING_SYSTEM_TYPE (codesys) = type;
522 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
524 if (type == CODESYS_ISO2022)
527 for (i = 0; i < 4; i++)
528 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
530 else if (type == CODESYS_CCL)
532 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
533 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
536 CODING_SYSTEM_NAME (codesys) = name;
542 /* Given a list of charset conversion specs as specified in a Lisp
543 program, parse it into STORE_HERE. */
546 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
547 Lisp_Object spec_list)
551 EXTERNAL_LIST_LOOP (rest, spec_list)
553 Lisp_Object car = XCAR (rest);
554 Lisp_Object from, to;
555 struct charset_conversion_spec spec;
557 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
558 signal_simple_error ("Invalid charset conversion spec", car);
559 from = Fget_charset (XCAR (car));
560 to = Fget_charset (XCAR (XCDR (car)));
561 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
562 signal_simple_error_2
563 ("Attempted conversion between different charset types",
565 spec.from_charset = from;
566 spec.to_charset = to;
568 Dynarr_add (store_here, spec);
572 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
573 specs, return the equivalent as the Lisp programmer would see it.
575 If LOAD_HERE is 0, return Qnil. */
578 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
585 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
587 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
588 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
591 return Fnreverse (result);
596 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
597 Register symbol NAME as a coding system.
599 TYPE describes the conversion method used and should be one of
602 Automatic conversion. XEmacs attempts to detect the coding system
605 No conversion. Use this for binary files and such. On output,
606 graphic characters that are not in ASCII or Latin-1 will be
607 replaced by a ?. (For a no-conversion-encoded buffer, these
608 characters will only be present if you explicitly insert them.)
610 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
612 Any ISO2022-compliant encoding. Among other things, this includes
613 JIS (the Japanese encoding commonly used for e-mail), EUC (the
614 standard Unix encoding for Japanese and other languages), and
615 Compound Text (the encoding used in X11). You can specify more
616 specific information about the conversion with the FLAGS argument.
618 Big5 (the encoding commonly used for Taiwanese).
620 The conversion is performed using a user-written pseudo-code
621 program. CCL (Code Conversion Language) is the name of this
624 Write out or read in the raw contents of the memory representing
625 the buffer's text. This is primarily useful for debugging
626 purposes, and is only enabled when XEmacs has been compiled with
627 DEBUG_XEMACS defined (via the --debug configure option).
628 WARNING: Reading in a file using 'internal conversion can result
629 in an internal inconsistency in the memory representing a
630 buffer's text, which will produce unpredictable results and may
631 cause XEmacs to crash. Under normal circumstances you should
632 never use 'internal conversion.
634 DOC-STRING is a string describing the coding system.
636 PROPS is a property list, describing the specific nature of the
637 character set. Recognized properties are:
640 String to be displayed in the modeline when this coding system is
644 End-of-line conversion to be used. It should be one of
647 Automatically detect the end-of-line type (LF, CRLF,
648 or CR). Also generate subsidiary coding systems named
649 `NAME-unix', `NAME-dos', and `NAME-mac', that are
650 identical to this coding system but have an EOL-TYPE
651 value of 'lf, 'crlf, and 'cr, respectively.
653 The end of a line is marked externally using ASCII LF.
654 Since this is also the way that XEmacs represents an
655 end-of-line internally, specifying this option results
656 in no end-of-line conversion. This is the standard
657 format for Unix text files.
659 The end of a line is marked externally using ASCII
660 CRLF. This is the standard format for MS-DOS text
663 The end of a line is marked externally using ASCII CR.
664 This is the standard format for Macintosh text files.
666 Automatically detect the end-of-line type but do not
667 generate subsidiary coding systems. (This value is
668 converted to nil when stored internally, and
669 `coding-system-property' will return nil.)
671 'post-read-conversion
672 Function called after a file has been read in, to perform the
673 decoding. Called with two arguments, BEG and END, denoting
674 a region of the current buffer to be decoded.
676 'pre-write-conversion
677 Function called before a file is written out, to perform the
678 encoding. Called with two arguments, BEG and END, denoting
679 a region of the current buffer to be encoded.
682 The following additional properties are recognized if TYPE is 'iso2022:
688 The character set initially designated to the G0 - G3 registers.
689 The value should be one of
691 -- A charset object (designate that character set)
692 -- nil (do not ever use this register)
693 -- t (no character set is initially designated to
694 the register, but may be later on; this automatically
695 sets the corresponding `force-g*-on-output' property)
701 If non-nil, send an explicit designation sequence on output before
702 using the specified register.
705 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
706 "ESC $ B" on output in place of the full designation sequences
707 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
710 If non-nil, don't designate ASCII to G0 at each end of line on output.
711 Setting this to non-nil also suppresses other state-resetting that
712 normally happens at the end of a line.
715 If non-nil, don't designate ASCII to G0 before control chars on output.
718 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
722 If non-nil, use locking-shift (SO/SI) instead of single-shift
723 or designation by escape sequence.
726 If non-nil, don't use ISO6429's direction specification.
729 If non-nil, literal control characters that are the same as
730 the beginning of a recognized ISO2022 or ISO6429 escape sequence
731 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
732 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
733 so that they can be properly distinguished from an escape sequence.
734 (Note that doing this results in a non-portable encoding.) This
735 encoding flag is used for byte-compiled files. Note that ESC
736 is a good choice for a quoting character because there are no
737 escape sequences whose second byte is a character from the Control-0
738 or Control-1 character sets; this is explicitly disallowed by the
741 'input-charset-conversion
742 A list of conversion specifications, specifying conversion of
743 characters in one charset to another when decoding is performed.
744 Each specification is a list of two elements: the source charset,
745 and the destination charset.
747 'output-charset-conversion
748 A list of conversion specifications, specifying conversion of
749 characters in one charset to another when encoding is performed.
750 The form of each specification is the same as for
751 'input-charset-conversion.
754 The following additional properties are recognized (and required)
758 CCL program used for decoding (converting to internal format).
761 CCL program used for encoding (converting to external format).
763 (name, type, doc_string, props))
765 struct Lisp_Coding_System *codesys;
766 Lisp_Object rest, key, value;
767 enum coding_system_type ty;
768 int need_to_setup_eol_systems = 1;
770 /* Convert type to constant */
771 if (NILP (type) || EQ (type, Qundecided))
772 { ty = CODESYS_AUTODETECT; }
774 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
775 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
776 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
777 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
779 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
781 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
784 signal_simple_error ("Invalid coding system type", type);
788 codesys = allocate_coding_system (ty, name);
790 if (NILP (doc_string))
791 doc_string = build_string ("");
793 CHECK_STRING (doc_string);
794 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
796 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
798 if (EQ (key, Qmnemonic))
801 CHECK_STRING (value);
802 CODING_SYSTEM_MNEMONIC (codesys) = value;
805 else if (EQ (key, Qeol_type))
807 need_to_setup_eol_systems = NILP (value);
810 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
813 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
814 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
816 else if (ty == CODESYS_ISO2022)
818 #define FROB_INITIAL_CHARSET(charset_num) \
819 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
820 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
822 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
823 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
824 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
825 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
827 #define FROB_FORCE_CHARSET(charset_num) \
828 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
830 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
831 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
832 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
833 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
835 #define FROB_BOOLEAN_PROPERTY(prop) \
836 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
838 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
839 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
840 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
841 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
842 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
843 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
844 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
846 else if (EQ (key, Qinput_charset_conversion))
848 codesys->iso2022.input_conv =
849 Dynarr_new (charset_conversion_spec);
850 parse_charset_conversion_specs (codesys->iso2022.input_conv,
853 else if (EQ (key, Qoutput_charset_conversion))
855 codesys->iso2022.output_conv =
856 Dynarr_new (charset_conversion_spec);
857 parse_charset_conversion_specs (codesys->iso2022.output_conv,
861 signal_simple_error ("Unrecognized property", key);
863 else if (EQ (type, Qccl))
865 if (EQ (key, Qdecode))
867 CHECK_VECTOR (value);
868 CODING_SYSTEM_CCL_DECODE (codesys) = value;
870 else if (EQ (key, Qencode))
872 CHECK_VECTOR (value);
873 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
876 signal_simple_error ("Unrecognized property", key);
880 signal_simple_error ("Unrecognized property", key);
883 if (need_to_setup_eol_systems)
884 setup_eol_coding_systems (codesys);
887 Lisp_Object codesys_obj;
888 XSETCODING_SYSTEM (codesys_obj, codesys);
889 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
894 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
895 Copy OLD-CODING-SYSTEM to NEW-NAME.
896 If NEW-NAME does not name an existing coding system, a new one will
899 (old_coding_system, new_name))
901 Lisp_Object new_coding_system;
902 old_coding_system = Fget_coding_system (old_coding_system);
903 new_coding_system = Ffind_coding_system (new_name);
904 if (NILP (new_coding_system))
906 XSETCODING_SYSTEM (new_coding_system,
907 allocate_coding_system
908 (XCODING_SYSTEM_TYPE (old_coding_system),
910 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
914 struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
915 struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
916 memcpy (((char *) to ) + sizeof (to->header),
917 ((char *) from) + sizeof (from->header),
918 sizeof (*from) - sizeof (from->header));
921 return new_coding_system;
925 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
927 struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
928 Lisp_Object new_coding_system;
930 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
931 return coding_system;
935 case EOL_AUTODETECT: return coding_system;
936 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
937 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
938 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
942 return NILP (new_coding_system) ? coding_system : new_coding_system;
945 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
946 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
948 (coding_system, eol_type))
950 coding_system = Fget_coding_system (coding_system);
952 return subsidiary_coding_system (coding_system,
953 symbol_to_eol_type (eol_type));
957 /************************************************************************/
958 /* Coding system accessors */
959 /************************************************************************/
961 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
962 Return the doc string for CODING-SYSTEM.
966 coding_system = Fget_coding_system (coding_system);
967 return XCODING_SYSTEM_DOC_STRING (coding_system);
970 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
971 Return the type of CODING-SYSTEM.
975 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
978 case CODESYS_AUTODETECT: return Qundecided;
980 case CODESYS_SHIFT_JIS: return Qshift_jis;
981 case CODESYS_ISO2022: return Qiso2022;
982 case CODESYS_BIG5: return Qbig5;
983 case CODESYS_CCL: return Qccl;
985 case CODESYS_NO_CONVERSION: return Qno_conversion;
987 case CODESYS_INTERNAL: return Qinternal;
994 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
997 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
999 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1002 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1003 Return initial charset of CODING-SYSTEM designated to GNUM.
1006 (coding_system, gnum))
1008 coding_system = Fget_coding_system (coding_system);
1011 return coding_system_charset (coding_system, XINT (gnum));
1015 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1016 Return the PROP property of CODING-SYSTEM.
1018 (coding_system, prop))
1021 enum coding_system_type type;
1023 coding_system = Fget_coding_system (coding_system);
1024 CHECK_SYMBOL (prop);
1025 type = XCODING_SYSTEM_TYPE (coding_system);
1027 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1028 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1031 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1033 case CODESYS_PROP_ALL_OK:
1036 case CODESYS_PROP_ISO2022:
1037 if (type != CODESYS_ISO2022)
1039 ("Property only valid in ISO2022 coding systems",
1043 case CODESYS_PROP_CCL:
1044 if (type != CODESYS_CCL)
1046 ("Property only valid in CCL coding systems",
1056 signal_simple_error ("Unrecognized property", prop);
1058 if (EQ (prop, Qname))
1059 return XCODING_SYSTEM_NAME (coding_system);
1060 else if (EQ (prop, Qtype))
1061 return Fcoding_system_type (coding_system);
1062 else if (EQ (prop, Qdoc_string))
1063 return XCODING_SYSTEM_DOC_STRING (coding_system);
1064 else if (EQ (prop, Qmnemonic))
1065 return XCODING_SYSTEM_MNEMONIC (coding_system);
1066 else if (EQ (prop, Qeol_type))
1067 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1068 else if (EQ (prop, Qeol_lf))
1069 return XCODING_SYSTEM_EOL_LF (coding_system);
1070 else if (EQ (prop, Qeol_crlf))
1071 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1072 else if (EQ (prop, Qeol_cr))
1073 return XCODING_SYSTEM_EOL_CR (coding_system);
1074 else if (EQ (prop, Qpost_read_conversion))
1075 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1076 else if (EQ (prop, Qpre_write_conversion))
1077 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1079 else if (type == CODESYS_ISO2022)
1081 if (EQ (prop, Qcharset_g0))
1082 return coding_system_charset (coding_system, 0);
1083 else if (EQ (prop, Qcharset_g1))
1084 return coding_system_charset (coding_system, 1);
1085 else if (EQ (prop, Qcharset_g2))
1086 return coding_system_charset (coding_system, 2);
1087 else if (EQ (prop, Qcharset_g3))
1088 return coding_system_charset (coding_system, 3);
1090 #define FORCE_CHARSET(charset_num) \
1091 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1092 (coding_system, charset_num) ? Qt : Qnil)
1094 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1095 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1096 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1097 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1099 #define LISP_BOOLEAN(prop) \
1100 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1102 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1103 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1104 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1105 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1106 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1107 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1108 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1110 else if (EQ (prop, Qinput_charset_conversion))
1112 unparse_charset_conversion_specs
1113 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1114 else if (EQ (prop, Qoutput_charset_conversion))
1116 unparse_charset_conversion_specs
1117 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1121 else if (type == CODESYS_CCL)
1123 if (EQ (prop, Qdecode))
1124 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1125 else if (EQ (prop, Qencode))
1126 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1134 return Qnil; /* not reached */
1138 /************************************************************************/
1139 /* Coding category functions */
1140 /************************************************************************/
/* Look up the coding-category SYMBOL in coding_category_symbol[].
   SYMBOL is type-checked with CHECK_SYMBOL and then compared (EQ)
   against every table entry from 0 through CODING_CATEGORY_LAST.  A
   symbol that matches no entry is reported via signal_simple_error.
   NOTE(review): callers in this file use the result as an index into
   the per-category arrays, so the match presumably yields the index;
   the return statement for a match is not visible here -- confirm. */
1143 decode_coding_category (Lisp_Object symbol)
1147 CHECK_SYMBOL (symbol);
1148 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1149 if (EQ (coding_category_symbol[i], symbol))
1152 signal_simple_error ("Unrecognized coding category", symbol);
1153 return 0; /* not reached */
1156 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1157 Return a list of all recognized coding categories.
1162 Lisp_Object list = Qnil;
1164 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1165 list = Fcons (coding_category_symbol[i], list);
1169 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1170 Change the priority order of the coding categories.
1171 LIST should be a list of coding categories, in descending order of
1172 priority. Unspecified coding categories will be lower in priority
1173 than all specified ones, in the same relative order they were in
1178 int category_to_priority[CODING_CATEGORY_LAST + 1];
1182 /* First generate a list that maps coding categories to priorities. */
1184 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1185 category_to_priority[i] = -1;
1187 /* Highest priority comes from the specified list. */
1189 EXTERNAL_LIST_LOOP (rest, list)
1191 int cat = decode_coding_category (XCAR (rest));
1193 if (category_to_priority[cat] >= 0)
1194 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1195 category_to_priority[cat] = i++;
1198 /* Now go through the existing categories by priority to retrieve
1199 the categories not yet specified and preserve their priority
1201 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1203 int cat = coding_category_by_priority[j];
1204 if (category_to_priority[cat] < 0)
1205 category_to_priority[cat] = i++;
1208 /* Now we need to construct the inverse of the mapping we just
1211 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1212 coding_category_by_priority[category_to_priority[i]] = i;
1214 /* Phew! That was confusing. */
1218 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1219 Return a list of coding categories in descending order of priority.
1224 Lisp_Object list = Qnil;
1226 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1227 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1232 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1233 Change the coding system associated with a coding category.
1235 (coding_category, coding_system))
1237 int cat = decode_coding_category (coding_category);
1239 coding_system = Fget_coding_system (coding_system);
1240 coding_category_system[cat] = coding_system;
1244 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1245 Return the coding system associated with a coding category.
1249 int cat = decode_coding_category (coding_category);
1250 Lisp_Object sys = coding_category_system[cat];
1253 return XCODING_SYSTEM_NAME (sys);
1258 /************************************************************************/
1259 /* Detecting the encoding of data */
1260 /************************************************************************/
1262 struct detection_state
1264 enum eol_type eol_type;
1286 struct iso2022_decoder iso;
1288 int high_byte_count;
1289 unsigned int saw_single_shift:1;
1302 acceptable_control_char_p (int c)
1306 /* Allow and ignore control characters that you might
1307 reasonably see in a text file */
1312 case 8: /* backspace */
1313 case 11: /* vertical tab */
1314 case 12: /* form feed */
1315 case 26: /* MS-DOS C-z junk */
1316 case 31: /* '^_' -- for info */
/* Return nonzero if MASK has zero or exactly one bit set, and zero
   otherwise.

   Clearing the lowest set bit with `x & (x - 1)' leaves zero exactly
   when at most one bit was set.  The arithmetic is done on an unsigned
   copy of MASK: with a signed operand, `mask - 1' overflows (undefined
   behavior) when only the top bit is set, i.e. for INT_MIN.  Unsigned
   subtraction wraps, so every bit pattern, including ~0 and INT_MIN,
   is handled.  */
static int
mask_has_at_most_one_bit_p (int mask)
{
  unsigned int bits = (unsigned int) mask;

  return (bits & (bits - 1u)) == 0;
}
/* Examine SRC for evidence of the end-of-line convention in use.
   Progress is recorded in ST->eol (the just_saw_cr and seen_anything
   flags), so the state carries over from one call to the next.  The
   final statement returns EOL_AUTODETECT, i.e. no verdict.
   NOTE(review): the returns for a definite verdict are not visible
   in this excerpt -- confirm against the full function. */
1331 static enum eol_type
1332 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1341 st->eol.just_saw_cr = 1;
1346 if (st->eol.just_saw_cr)
1348 else if (st->eol.seen_anything)
1351 else if (st->eol.just_saw_cr)
1353 st->eol.just_saw_cr = 0;
1355 st->eol.seen_anything = 1;
1358 return EOL_AUTODETECT;
1361 /* Attempt to determine the encoding and EOL type of the given text.
1362 Before calling this function for the first time, you must initialize
1363 st->eol_type as appropriate and initialize st->mask to ~0.
1365 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1368 st->mask holds the determined coding category mask, or ~0 if only
1369 ASCII has been seen so far.
1373 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1374 is present in st->mask
1375 1 == definitive answers are here for both st->eol_type and st->mask
1379 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1380 unsigned int n, int just_do_eol)
1384 if (st->eol_type == EOL_AUTODETECT)
1385 st->eol_type = detect_eol_type (st, src, n);
1388 return st->eol_type != EOL_AUTODETECT;
1390 if (!st->seen_non_ascii)
1392 for (; n; n--, src++)
1395 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1397 st->seen_non_ascii = 1;
1399 st->shift_jis.mask = ~0;
1401 st->iso2022.mask = ~0;
1411 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1412 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1413 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1414 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1415 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1416 st->big5.mask = detect_coding_big5 (st, src, n);
1418 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask;
1421 int retval = mask_has_at_most_one_bit_p (st->mask);
1422 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1423 return retval && st->eol_type != EOL_AUTODETECT;
1428 coding_system_from_mask (int mask)
1432 /* If the file was entirely or basically ASCII, use the
1433 default value of `buffer-file-coding-system'. */
1434 Lisp_Object retval =
1435 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1438 retval = Ffind_coding_system (retval);
1442 (Qbad_variable, Qwarning,
1443 "Invalid `default-buffer-file-coding-system', set to nil");
1444 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
/* Use the `no-conversion' coding system as the result here. */
1448 retval = Fget_coding_system (Qno_conversion);
1456 mask = postprocess_iso2022_mask (mask);
1458 /* Look through the coding categories by priority and find
1459 the first one that is allowed. */
1460 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1462 cat = coding_category_by_priority[i];
/* A category is allowed when its bit (1 << cat) is set in MASK; it
   is only usable if a coding system has been assigned to it. */
1463 if ((mask & (1 << cat)) &&
1464 !NILP (coding_category_system[cat]))
1468 return coding_category_system[cat];
1470 return Fget_coding_system (Qno_conversion);
1474 /* Given a seekable read stream and potential coding system and EOL type
1475 as specified, do any autodetection that is called for. If the
1476 coding system and/or EOL type are not autodetect, they will be left
1477 alone; but this function will never return an autodetect coding system
1480 This function does not automatically fetch subsidiary coding systems;
1481 that should be unnecessary with the explicit eol-type argument. */
1484 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1485 enum eol_type *eol_type_in_out)
1487 struct detection_state decst;
/* No EOL type given by the caller: start from the one recorded in
   the coding system itself. */
1489 if (*eol_type_in_out == EOL_AUTODETECT)
1490 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
/* Seed the detector with what is known so far. */
1493 decst.eol_type = *eol_type_in_out;
1496 /* If autodetection is called for, do it now. */
1497 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1498 *eol_type_in_out == EOL_AUTODETECT)
/* Data is sampled from STREAM in chunks of at most 4096 bytes. */
1503 unsigned char random_buffer[4096];
1506 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
/* The last argument (just_do_eol) limits detection to the EOL type
   when the coding system itself is already decided. */
1509 if (detect_coding_type (&decst, random_buffer, nread,
1510 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1511 CODESYS_AUTODETECT))
/* Hand the detection results back through the in/out arguments. */
1515 *eol_type_in_out = decst.eol_type;
1516 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1517 *codesys_in_out = coding_system_from_mask (decst.mask);
1520 /* If we absolutely can't determine the EOL type, just assume LF. */
1521 if (*eol_type_in_out == EOL_AUTODETECT)
1522 *eol_type_in_out = EOL_LF;
/* Rewind STREAM so the data consumed for detection is not lost. */
1524 Lstream_rewind (stream);
1527 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1528 Detect coding system of the text in the region between START and END.
1529 Return a list of possible coding systems ordered by priority.
1530 If only ASCII characters are found, it returns 'undecided or one of
1531 its subsidiary coding systems according to a detected end-of-line
1532 type. Optional arg BUFFER defaults to the current buffer.
1534 (start, end, buffer))
1536 Lisp_Object val = Qnil;
1537 struct buffer *buf = decode_buffer (buffer, 0);
1539 Lisp_Object instream, lb_instream;
1540 Lstream *istr, *lb_istr;
1541 struct detection_state decst;
1542 struct gcpro gcpro1, gcpro2;
1544 get_buffer_range_char (buf, start, end, &b, &e, 0);
1545 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1546 lb_istr = XLSTREAM (lb_instream);
1547 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1548 istr = XLSTREAM (instream);
1549 GCPRO2 (instream, lb_instream);
1551 decst.eol_type = EOL_AUTODETECT;
1555 unsigned char random_buffer[4096];
1556 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1560 if (detect_coding_type (&decst, random_buffer, nread, 0))
1564 if (decst.mask == ~0)
1565 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1573 decst.mask = postprocess_iso2022_mask (decst.mask);
1575 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1577 int sys = coding_category_by_priority[i];
1578 if (decst.mask & (1 << sys))
1580 Lisp_Object codesys = coding_category_system[sys];
1581 if (!NILP (codesys))
1582 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1583 val = Fcons (codesys, val);
1587 Lstream_close (istr);
1589 Lstream_delete (istr);
1590 Lstream_delete (lb_istr);
1595 /************************************************************************/
1596 /* Converting to internal Mule format ("decoding") */
1597 /************************************************************************/
1599 /* A decoding stream is a stream used for decoding text (i.e.
1600 converting from some external format to internal format).
1601 The decoding-stream object keeps track of the actual coding
1602 stream, the stream that is at the other end, and data that
1603 needs to be persistent across the lifetime of the stream. */
1605 /* Handle the EOL stuff related to just-read-in character C.
1606 EOL_TYPE is the EOL type of the coding stream.
1607 FLAGS is the current value of FLAGS in the coding stream, and may
1608 be modified by this macro. (The macro only looks at the
1609 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1610 bytes are to be written. You need to also define a local goto
1611 label "label_continue_loop" that is at the end of the main
1612 character-reading loop.
1614 If C is a CR character, then this macro handles it entirely and
1615 jumps to label_continue_loop. Otherwise, this macro does not add
1616 anything to DST, and continues normally. You should continue
1617 processing C normally after this macro. */
1619 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1623 if (eol_type == EOL_CR) \
1624 Dynarr_add (dst, '\n'); \
1625 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1626 Dynarr_add (dst, c); \
1628 flags |= CODING_STATE_CR; \
1629 goto label_continue_loop; \
1631 else if (flags & CODING_STATE_CR) \
1632 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1634 Dynarr_add (dst, '\r'); \
1635 flags &= ~CODING_STATE_CR; \
1639 /* C should be a binary character in the range 0 - 255; convert
1640 to internal format and add to Dynarr DST. */
1642 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1644 if (BYTE_ASCII_P (c)) \
1645 Dynarr_add (dst, c); \
1646 else if (BYTE_C1_P (c)) \
1648 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1649 Dynarr_add (dst, c + 0x20); \
1653 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1654 Dynarr_add (dst, c); \
/* Emit the partially built character CH through
   DECODE_ADD_BINARY_CHAR.  Note that the expansion writes to a
   variable named `dst' taken from the invoking scope; it is not a
   macro argument. */
1658 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1662 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* End-of-conversion handling: output any partial character CH via
   DECODE_OUTPUT_PARTIAL_CHAR, then, if FLAGS has both
   CODING_STATE_END and CODING_STATE_CR set, add a '\r' to DST. */
1667 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1669 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1670 if ((flags & CODING_STATE_END) && \
1671 (flags & CODING_STATE_CR)) \
1672 Dynarr_add (dst, '\r'); \
1675 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Private data of a "decoding" lstream; reached from an Lstream
   through DECODING_STREAM_DATA. */
1677 struct decoding_stream
1679 /* Coding system that governs the conversion. */
1680 struct Lisp_Coding_System *codesys;
1682 /* Stream that we read the encoded data from or
1683 write the decoded data to. */
1686 /* If we are reading, then we can return only a fixed amount of
1687 data, so if the conversion resulted in too much data, we store it
1688 here for retrieval the next time around. */
1689 unsigned_char_dynarr *runoff;
1691 /* FLAGS holds flags indicating the current state of the decoding.
1692 Some of these flags are dependent on the coding system. */
1695 /* CH holds a partially built-up character. Since we only deal
1696 with one- and two-byte characters at the moment, we only use
1697 this to store the first byte of a two-byte character. */
1700 /* EOL_TYPE specifies the type of end-of-line conversion that
1701 currently applies. We need to keep this separate from the
1702 EOL type stored in CODESYS because the latter might indicate
1703 automatic EOL-type detection while the former will always
1704 indicate a particular EOL type. */
1705 enum eol_type eol_type;
1707 /* Additional ISO2022 information. We define the structure above
1708 because it's also needed by the detection routines. */
1709 struct iso2022_decoder iso2022;
1711 /* Additional information (the state of the running CCL program)
1712 used by the CCL decoder. */
1713 struct ccl_program ccl;
/* Autodetection state.  mule_decode() passes it to
   detect_coding_type() while the coding system or the EOL type of
   the stream is still undecided. */
1715 struct detection_state decst;
1718 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1719 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1720 static int decoding_rewinder (Lstream *stream);
1721 static int decoding_seekable_p (Lstream *stream);
1722 static int decoding_flusher (Lstream *stream);
1723 static int decoding_closer (Lstream *stream);
1725 static Lisp_Object decoding_marker (Lisp_Object stream,
1726 void (*markobj) (Lisp_Object));
1728 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1729 sizeof (struct decoding_stream));
/* Marker method of the decoding lstream.  The only object reachable
   from the stream data that is handled here is the stream at the
   other end; marking is forwarded to that stream's own marker
   method when its implementation defines one. */
1732 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1734 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1735 Lisp_Object str_obj;
1737 /* We do not need to mark the coding systems or charsets stored
1738 within the stream because they are stored in a global list
1739 and automatically marked. */
1741 XSETLSTREAM (str_obj, str);
1743 if (str->imp->marker)
1744 return (str->imp->marker) (str_obj, markobj);
1749 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1750 so we read data from the other end, decode it, and store it into DATA. */
1753 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1755 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1756 unsigned char *orig_data = data;
1758 int error_occurred = 0;
1760 /* We need to interface to mule_decode(), which expects to take some
1761 amount of data and store the result into a Dynarr. We have
1762 mule_decode() store into str->runoff, and take data from there
1765 /* We loop until we have enough data, reading chunks from the other
1766 end and decoding it. */
1769 /* Take data from the runoff if we can. Make sure to take at
1770 most SIZE bytes, and delete the data from the runoff. */
1771 if (Dynarr_length (str->runoff) > 0)
1773 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1774 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1775 Dynarr_delete_many (str->runoff, 0, chunk);
1781 break; /* No more room for data */
1783 if (str->flags & CODING_STATE_END)
1784 /* This means that on the previous iteration, we hit the EOF on
1785 the other end. We loop once more so that mule_decode() can
1786 output any final stuff it may be holding, or any "go back
1787 to a sane state" escape sequences. (This latter makes sense
1788 during encoding.) */
1791 /* Exhausted the runoff, so get some more. DATA has at least
1792 SIZE bytes left of storage in it, so it's OK to read directly
1793 into it. (We'll be overwriting above, after we've decoded it
1794 into the runoff.) */
1795 read_size = Lstream_read (str->other_end, data, size);
1802 /* There might be some more end data produced in the translation.
1803 See the comment above. */
1804 str->flags |= CODING_STATE_END;
1805 mule_decode (stream, data, str->runoff, read_size);
/* Nothing was stored into DATA: report -1 if an error occurred,
   otherwise 0.  In all other cases the result is the number of
   bytes stored. */
1808 if (data - orig_data == 0)
1809 return error_occurred ? -1 : 0;
1811 return data - orig_data;
1815 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1817 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1820 /* Decode all our data into the runoff, and then attempt to write
1821 it all out to the other end. Remove whatever chunk we succeeded
1823 mule_decode (stream, data, str->runoff, size);
1824 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1825 Dynarr_length (str->runoff));
1827 Dynarr_delete_many (str->runoff, 0, retval);
1828 /* Do NOT return retval. The return value indicates how much
1829 of the incoming data was written, not how many bytes were
1835 reset_decoding_stream (struct decoding_stream *str)
1838 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1840 Lisp_Object coding_system;
1841 XSETCODING_SYSTEM (coding_system, str->codesys);
1842 reset_iso2022 (coding_system, &str->iso2022);
1844 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1846 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1849 str->flags = str->ch = 0;
1853 decoding_rewinder (Lstream *stream)
1855 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1856 reset_decoding_stream (str);
1857 Dynarr_reset (str->runoff);
1858 return Lstream_rewind (str->other_end);
1862 decoding_seekable_p (Lstream *stream)
1864 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1865 return Lstream_seekable_p (str->other_end);
1869 decoding_flusher (Lstream *stream)
1871 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1872 return Lstream_flush (str->other_end);
1876 decoding_closer (Lstream *stream)
1878 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1879 if (stream->flags & LSTREAM_FL_WRITE)
1881 str->flags |= CODING_STATE_END;
1882 decoding_writer (stream, 0, 0);
1884 Dynarr_free (str->runoff);
1886 if (str->iso2022.composite_chars)
1887 Dynarr_free (str->iso2022.composite_chars);
1889 return Lstream_close (str->other_end);
1893 decoding_stream_coding_system (Lstream *stream)
1895 Lisp_Object coding_system;
1896 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1898 XSETCODING_SYSTEM (coding_system, str->codesys);
1899 return subsidiary_coding_system (coding_system, str->eol_type);
1903 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1905 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1906 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1908 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1909 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1910 reset_decoding_stream (str);
1913 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
1914 stream for writing, no automatic code detection will be performed.
1915 The reason for this is that automatic code detection requires a
1916 seekable input. Things will also fail if you open a decoding
1917 stream for reading using a non-fully-specified coding system and
1918 a non-seekable input stream. */
/* Create a decoding lstream in mode MODE whose other end is STREAM
   and whose conversion is governed by CODESYS.  The runoff buffer is
   allocated here and the EOL type starts out as EOL_AUTODETECT. */
1921 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1924 Lstream *lstr = Lstream_new (lstream_decoding, mode);
1925 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1929 str->other_end = stream;
1930 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1931 str->eol_type = EOL_AUTODETECT;
1932 if (!strcmp (mode, "r")
1933 && Lstream_seekable_p (stream))
1934 /* We can determine the coding system now. */
1935 determine_real_coding_system (stream, &codesys, &str->eol_type);
1936 set_decoding_stream_coding_system (lstr, codesys);
/* Initialize the detection state that mule_decode() consults while
   the coding system or EOL type is still undecided: the EOL type
   known so far, and a mask of ~0. */
1937 str->decst.eol_type = str->eol_type;
1938 str->decst.mask = ~0;
1939 XSETLSTREAM (obj, lstr);
/* Create a decoding stream for reading ("r" mode) whose other end is
   STREAM, governed by CODESYS. */
1944 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
1946 return make_decoding_stream_1 (stream, codesys, "r");
/* Create a decoding stream for writing ("w" mode) whose other end is
   STREAM, governed by CODESYS. */
1950 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
1952 return make_decoding_stream_1 (stream, codesys, "w");
1955 /* Note: the decode_coding_* functions all take the same
1956 arguments as mule_decode(), which is to say some SRC data of
1957 size N, which is to be stored into dynamic array DST.
1958 DECODING is the stream within which the decoding is
1959 taking place, but no data is actually read from or
1960 written to that stream; that is handled in decoding_reader()
1961 or decoding_writer(). This allows the same functions to
1962 be used for both reading and writing. */
/* Decode N bytes at SRC into the Dynarr DST, using the state kept in
   the decoding stream DECODING.  Detection of the coding system
   and/or EOL type is attempted first if either is still undecided;
   the data is then passed to the decoder for the coding system
   type. */
1965 mule_decode (Lstream *decoding, CONST unsigned char *src,
1966 unsigned_char_dynarr *dst, unsigned int n)
1968 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
1970 /* If necessary, do encoding-detection now. We do this when
1971 we're a writing stream or a non-seekable reading stream,
1972 meaning that we can't just process the whole input,
1973 rewind, and start over. */
1975 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
1976 str->eol_type == EOL_AUTODETECT)
1978 Lisp_Object codesys;
1980 XSETCODING_SYSTEM (codesys, str->codesys);
/* The last argument (just_do_eol) limits detection to the EOL type
   when the coding system itself is already decided. */
1981 detect_coding_type (&str->decst, src, n,
1982 CODING_SYSTEM_TYPE (str->codesys) !=
1983 CODESYS_AUTODETECT);
1984 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
1985 str->decst.mask != ~0)
1986 /* #### This is cheesy. What we really ought to do is
1987 buffer up a certain amount of data so as to get a
1988 less random result. */
1989 codesys = coding_system_from_mask (str->decst.mask);
1990 str->eol_type = str->decst.eol_type;
/* Detection chose a coding system other than the current one:
   switch the stream over to it. */
1991 if (XCODING_SYSTEM (codesys) != str->codesys)
1993 /* Preserve the CODING_STATE_END flag in case it was set.
1994 If we erase it, bad things might happen. */
1995 int was_end = str->flags & CODING_STATE_END;
1996 set_decoding_stream_coding_system (decoding, codesys);
1998 str->flags |= CODING_STATE_END;
/* Dispatch on the coding system type now in effect. */
2002 switch (CODING_SYSTEM_TYPE (str->codesys))
2005 case CODESYS_INTERNAL:
2006 Dynarr_add_many (dst, src, n);
2009 case CODESYS_AUTODETECT:
2010 /* If we got this far and still haven't decided on the coding
2011 system, then do no conversion. */
2012 case CODESYS_NO_CONVERSION:
2013 decode_coding_no_conversion (decoding, src, dst, n);
2016 case CODESYS_SHIFT_JIS:
2017 decode_coding_sjis (decoding, src, dst, n);
2020 decode_coding_big5 (decoding, src, dst, n);
2023 ccl_driver (&str->ccl, src, dst, n, 0);
2025 case CODESYS_ISO2022:
2026 decode_coding_iso2022 (decoding, src, dst, n);
2034 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2035 Decode the text between START and END which is encoded in CODING-SYSTEM.
2036 This is useful if you've read in encoded text from a file without decoding
2037 it (e.g. you read in a JIS-formatted file but used the `binary' or
2038 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2039 Return length of decoded text.
2040 BUFFER defaults to the current buffer if unspecified.
2042 (start, end, coding_system, buffer))
2045 struct buffer *buf = decode_buffer (buffer, 0);
2046 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2047 Lstream *istr, *ostr;
2048 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2050 get_buffer_range_char (buf, start, end, &b, &e, 0);
2052 barf_if_buffer_read_only (buf, b, e);
2054 coding_system = Fget_coding_system (coding_system);
2055 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2056 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2057 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2059 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2060 Fget_coding_system (Qbinary));
2061 istr = XLSTREAM (instream);
2062 ostr = XLSTREAM (outstream);
2063 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2065 /* The chain of streams looks like this:
2067 [BUFFER] <----- send through
2068 ------> [ENCODE AS BINARY]
2069 ------> [DECODE AS SPECIFIED]
2075 char tempbuf[1024]; /* some random amount */
2076 Bufpos newpos, even_newer_pos;
2077 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2078 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2082 newpos = lisp_buffer_stream_startpos (istr);
2083 Lstream_write (ostr, tempbuf, size_in_bytes);
2084 even_newer_pos = lisp_buffer_stream_startpos (istr);
2085 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2088 Lstream_close (istr);
2089 Lstream_close (ostr);
2091 Lstream_delete (istr);
2092 Lstream_delete (ostr);
2093 Lstream_delete (XLSTREAM (de_outstream));
2094 Lstream_delete (XLSTREAM (lb_outstream));
2099 /************************************************************************/
2100 /* Converting to an external encoding ("encoding") */
2101 /************************************************************************/
2103 /* An encoding stream is an output stream. When you create the
2104 stream, you specify the coding system that governs the encoding
2105 and another stream that the resulting encoded data is to be
2106 sent to, and then start sending data to it. */
2108 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
/* Private data of an "encoding" lstream; reached from an Lstream
   through ENCODING_STREAM_DATA. */
2110 struct encoding_stream
2112 /* Coding system that governs the conversion. */
2113 struct Lisp_Coding_System *codesys;
2115 /* Stream that we read the unencoded data from or
2116 write the encoded data to. */
2119 /* If we are reading, then we can return only a fixed amount of
2120 data, so if the conversion resulted in too much data, we store it
2121 here for retrieval the next time around. */
2122 unsigned_char_dynarr *runoff;
2124 /* FLAGS holds flags indicating the current state of the encoding.
2125 Some of these flags are dependent on the coding system. */
2128 /* CH holds a partially built-up character. Since we only deal
2129 with one- and two-byte characters at the moment, we only use
2130 this to store the first byte of a two-byte character. */
2133 /* Additional information used by the ISO2022 encoder. */
2136 /* CHARSET holds the character sets currently assigned to the G0
2137 through G3 registers. It is initialized from the array
2138 INITIAL_CHARSET in CODESYS. */
2139 Lisp_Object charset[4];
2141 /* Which registers are currently invoked into the left (GL) and
2142 right (GR) halves of the 8-bit encoding space? */
2143 int register_left, register_right;
2145 /* Whether we need to explicitly designate the charset in the
2146 G? register before using it. It is initialized from the
2147 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2148 unsigned char force_charset_on_output[4];
2150 /* Other state variables that need to be preserved across
2152 Lisp_Object current_charset;
2154 int current_char_boundary;
2157 /* Additional information (the state of the running CCL program)
2158 used by the CCL encoder. */
2159 struct ccl_program ccl;
2163 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2164 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2166 static int encoding_rewinder (Lstream *stream);
2167 static int encoding_seekable_p (Lstream *stream);
2168 static int encoding_flusher (Lstream *stream);
2169 static int encoding_closer (Lstream *stream);
2171 static Lisp_Object encoding_marker (Lisp_Object stream,
2172 void (*markobj) (Lisp_Object));
2174 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2175 sizeof (struct encoding_stream));
/* Marker method of the encoding lstream.  The only object reachable
   from the stream data that is handled here is the stream at the
   other end; marking is forwarded to that stream's own marker
   method when its implementation defines one. */
2178 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2180 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2181 Lisp_Object str_obj;
2183 /* We do not need to mark the coding systems or charsets stored
2184 within the stream because they are stored in a global list
2185 and automatically marked. */
2187 XSETLSTREAM (str_obj, str);
2189 if (str->imp->marker)
2190 return (str->imp->marker) (str_obj, markobj);
2195 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2196 so we read data from the other end, encode it, and store it into DATA. */
2199 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2201 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2202 unsigned char *orig_data = data;
2204 int error_occurred = 0;
2206 /* We need to interface to mule_encode(), which expects to take some
2207 amount of data and store the result into a Dynarr. We have
2208 mule_encode() store into str->runoff, and take data from there
2211 /* We loop until we have enough data, reading chunks from the other
2212 end and encoding it. */
2215 /* Take data from the runoff if we can. Make sure to take at
2216 most SIZE bytes, and delete the data from the runoff. */
2217 if (Dynarr_length (str->runoff) > 0)
2219 int chunk = min ((int) size, Dynarr_length (str->runoff));
2220 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2221 Dynarr_delete_many (str->runoff, 0, chunk);
2227 break; /* No more room for data */
2229 if (str->flags & CODING_STATE_END)
2230 /* This means that on the previous iteration, we hit the EOF on
2231 the other end. We loop once more so that mule_encode() can
2232 output any final stuff it may be holding, or any "go back
2233 to a sane state" escape sequences. (This latter makes sense
2234 during encoding.) */
2237 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2238 left of storage in it, so it's OK to read directly into it.
2239 (We'll be overwriting above, after we've encoded it into the
2241 read_size = Lstream_read (str->other_end, data, size);
2248 /* There might be some more end data produced in the translation.
2249 See the comment above. */
2250 str->flags |= CODING_STATE_END;
2251 mule_encode (stream, data, str->runoff, read_size);
/* Nothing was stored into DATA: report -1 if an error occurred,
   otherwise 0.  In all other cases the result is the number of
   bytes stored. */
2254 if (data == orig_data)
2255 return error_occurred ? -1 : 0;
2257 return data - orig_data;
/* Writer method for an encoding stream.  SIZE bytes from DATA are encoded
   into str->runoff by mule_encode(); the accumulated runoff is then
   offered to the stream at the other end with Lstream_write(), and the
   part reported as written (RETVAL) is removed from the runoff. */
2261 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2263 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2266 /* Encode all our data into the runoff, and then attempt to write
2267 it all out to the other end. Remove whatever chunk we succeeded
2269 mule_encode (stream, data, str->runoff, size);
2270 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2271 Dynarr_length (str->runoff));
2273 Dynarr_delete_many (str->runoff, 0, retval);
2274 /* Do NOT return retval. The return value indicates how much
2275 of the incoming data was written, not how many bytes were
/* Put STR back into its initial encoding state, according to the type of
   its coding system (str->codesys).  For ISO2022, the charset and the
   force-on-output flag of each of the four registers are reloaded from
   the coding system, register_left/register_right are set to 0 and 1,
   and the current charset, half and char-boundary fields are
   reinitialized.  The CCL program state (str->ccl) is set up from the
   coding system's CCL encode program (NOTE(review): presumably only
   under the CCL case -- confirm).  Finally str->flags and str->ch are
   cleared. */
2281 reset_encoding_stream (struct encoding_stream *str)
2284 switch (CODING_SYSTEM_TYPE (str->codesys))
2286 case CODESYS_ISO2022:
2290 for (i = 0; i < 4; i++)
2292 str->iso2022.charset[i] =
2293 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2294 str->iso2022.force_charset_on_output[i] =
2295 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2297 str->iso2022.register_left = 0;
2298 str->iso2022.register_right = 1;
2299 str->iso2022.current_charset = Qnil;
2300 str->iso2022.current_half = 0;
2301 str->iso2022.current_char_boundary = 1;
2305 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2312 str->flags = str->ch = 0;
/* Rewinder method: reset the encoding state, empty the runoff buffer,
   and rewind the stream at the other end.  The result of
   Lstream_rewind() is returned. */
2316 encoding_rewinder (Lstream *stream)
2318 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2319 reset_encoding_stream (str);
2320 Dynarr_reset (str->runoff);
2321 return Lstream_rewind (str->other_end);
/* Seekable-p method: an encoding stream reports whatever
   Lstream_seekable_p() reports for the stream at its other end. */
2325 encoding_seekable_p (Lstream *stream)
2327 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2328 return Lstream_seekable_p (str->other_end);
/* Flusher method: flush the stream at the other end and return the
   result of Lstream_flush(). */
2332 encoding_flusher (Lstream *stream)
2334 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2335 return Lstream_flush (str->other_end);
/* Closer method.  For a stream opened for writing, CODING_STATE_END is
   set and encoding_writer() is called one last time with no data
   (presumably so that the encoder can emit its final output -- compare
   the END handling in encoding_reader).  The runoff buffer is then freed
   and the stream at the other end is closed; the result of
   Lstream_close() is returned. */
2339 encoding_closer (Lstream *stream)
2341 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2342 if (stream->flags & LSTREAM_FL_WRITE)
2344 str->flags |= CODING_STATE_END;
2345 encoding_writer (stream, 0, 0);
2347 Dynarr_free (str->runoff);
2348 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system stored in the encoding
   stream STREAM (str->codesys). */
2352 encoding_stream_coding_system (Lstream *stream)
2354 Lisp_Object coding_system;
2355 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2357 XSETCODING_SYSTEM (coding_system, str->codesys);
2358 return coding_system;
/* Associate CODESYS with the encoding stream LSTR and reset the stream's
   encoding state.  NOTE(review): CS is presumably what gets stored as
   the stream's coding system before the reset -- confirm. */
2362 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2364 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2365 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2367 reset_encoding_stream (str);
/* Common constructor for encoding streams.  A new lstream of type
   lstream_encoding is created in mode MODE, given a fresh runoff buffer,
   attached to STREAM as its other end, and set up for CODESYS via
   set_encoding_stream_coding_system().  The new stream is wrapped in a
   Lisp object (OBJ). */
2371 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2374 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2375 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2379 str->runoff = Dynarr_new (unsigned_char);
2380 str->other_end = stream;
2381 set_encoding_stream_coding_system (lstr, codesys);
2382 XSETLSTREAM (obj, lstr);
/* Create an encoding stream in "r" mode on top of STREAM, using CODESYS. */
2387 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2389 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream in "w" mode on top of STREAM, using CODESYS. */
2393 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2395 return make_encoding_stream_1 (stream, codesys, "w");
2398 /* Convert N bytes of internally-formatted data stored in SRC to an
2399 external format, according to the encoding stream ENCODING.
2400 Store the encoded data into DST. */
/* The work is dispatched on the type of the stream's coding system.
   Internal data is appended to DST unchanged; a coding system still
   marked as autodetect is handled like no-conversion; the remaining
   types go to their encode_coding_* routine or to the CCL driver. */
2403 mule_encode (Lstream *encoding, CONST unsigned char *src,
2404 unsigned_char_dynarr *dst, unsigned int n)
2406 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2408 switch (CODING_SYSTEM_TYPE (str->codesys))
2411 case CODESYS_INTERNAL:
2412 Dynarr_add_many (dst, src, n);
2415 case CODESYS_AUTODETECT:
2416 /* If we got this far and still haven't decided on the coding
2417 system, then do no conversion. */
2418 case CODESYS_NO_CONVERSION:
2419 encode_coding_no_conversion (encoding, src, dst, n);
2422 case CODESYS_SHIFT_JIS:
2423 encode_coding_sjis (encoding, src, dst, n);
2426 encode_coding_big5 (encoding, src, dst, n);
2429 ccl_driver (&str->ccl, src, dst, n, 0);
2431 case CODESYS_ISO2022:
2432 encode_coding_iso2022 (encoding, src, dst, n);
/* Lisp primitive `encode-coding-region'.  The region is read through a
   buffer input stream and pushed, at most sizeof (tempbuf) bytes at a
   time, through an encoding stream (CODING-SYSTEM) and then a decoding
   stream (binary) into a buffer output stream positioned at the start of
   the region.  The text just consumed is removed with
   buffer_delete_range().  All four streams are GC-protected while in
   use, then closed and deleted; RETLEN is returned as a Lisp integer. */
2440 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2441 Encode the text between START and END using CODING-SYSTEM.
2442 This will, for example, convert Japanese characters into stuff such as
2443 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2444 text. BUFFER defaults to the current buffer if unspecified.
2446 (start, end, coding_system, buffer))
2449 struct buffer *buf = decode_buffer (buffer, 0);
2450 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2451 Lstream *istr, *ostr;
2452 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2454 get_buffer_range_char (buf, start, end, &b, &e, 0);
2456 barf_if_buffer_read_only (buf, b, e);
2458 coding_system = Fget_coding_system (coding_system);
2459 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2460 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2461 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2462 Fget_coding_system (Qbinary));
2463 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2465 istr = XLSTREAM (instream);
2466 ostr = XLSTREAM (outstream);
2467 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2468 /* The chain of streams looks like this:
2470 [BUFFER] <----- send through
2471 ------> [ENCODE AS SPECIFIED]
2472 ------> [DECODE AS BINARY]
2477 char tempbuf[1024]; /* some random amount */
2478 Bufpos newpos, even_newer_pos;
2479 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2480 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2484 newpos = lisp_buffer_stream_startpos (istr);
2485 Lstream_write (ostr, tempbuf, size_in_bytes);
2486 even_newer_pos = lisp_buffer_stream_startpos (istr);
2487 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2493 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2494 Lstream_close (istr);
2495 Lstream_close (ostr);
2497 Lstream_delete (istr);
2498 Lstream_delete (ostr);
2499 Lstream_delete (XLSTREAM (de_outstream));
2500 Lstream_delete (XLSTREAM (lb_outstream));
2501 return make_int (retlen);
2507 /************************************************************************/
2508 /* Shift-JIS methods */
2509 /************************************************************************/
2511 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2512 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2513 as is. A character of JISX0201-Kana (TYPE94 character set) is
2514 encoded by "position-code + 0x80". A character of JISX0208
2515 (TYPE94x94 character set) is encoded in two bytes, but the two
2516 position-codes are divided and shifted so that they fit in the range
2519 --- CODE RANGE of Shift-JIS ---
2520 (character set) (range)
2522 JISX0201-Kana 0xA1 .. 0xDF
2523 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2524 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2525 -------------------------------
2529 /* Is this the first byte of a Shift-JIS two-byte char? */
/* True for 0x81 .. 0x9F and 0xE0 .. 0xEF.  The argument is expanded up
   to four times, so it must not have side effects. */
2531 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2532 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2534 /* Is this the second byte of a Shift-JIS two-byte char? */
/* True for 0x40 .. 0x7E and 0x80 .. 0xFC.  The argument is expanded up
   to four times, so it must not have side effects. */
2536 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2537 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a single-byte JISX0201-Kana code in Shift-JIS, i.e. a byte in
   0xA1 .. 0xDF?  The argument is expanded twice, so it must not have
   side effects. */
2539 #define BYTE_SJIS_KATAKANA_P(c) \
2540 ((c) >= 0xA1 && (c) <= 0xDF)
/* Detection routine for Shift-JIS.  ESC, SI and SO are tested for
   explicitly (NOTE(review): presumably they rule Shift-JIS out --
   confirm).  st->shift_jis.in_second_byte records that the previous byte
   was in 0x80 .. 0x9F or >= 0xE0, so the current byte is the second half
   of a pair; it is cleared once that second byte has been seen.  The
   final return yields CODING_CATEGORY_SHIFT_JIS_MASK. */
2543 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2551 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2553 if (st->shift_jis.in_second_byte)
2555 st->shift_jis.in_second_byte = 0;
2559 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2560 st->shift_jis.in_second_byte = 1;
2562 return CODING_CATEGORY_SHIFT_JIS_MASK;
2565 /* Convert Shift-JIS data to internal format. */
/* FLAGS and CH are unpacked from the decoding stream on entry and packed
   back on exit, so a two-byte character may straddle calls.  When CH
   holds a pending first byte and the current byte is a valid second
   byte, the pair is emitted as LEADING_BYTE_JAPANESE_JISX0208 followed
   by the two bytes computed by DECODE_SJIS; otherwise both bytes are
   emitted through DECODE_ADD_BINARY_CHAR.  A JISX0201-Kana byte is
   emitted as LEADING_BYTE_KATAKANA_JISX0201 followed by the byte
   itself. */
2568 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2569 unsigned_char_dynarr *dst, unsigned int n)
2572 unsigned int flags, ch;
2573 enum eol_type eol_type;
2574 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2576 CODING_STREAM_DECOMPOSE (str, flags, ch);
2577 eol_type = str->eol_type;
2585 /* Previous character was first byte of Shift-JIS Kanji char. */
2586 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2588 unsigned char e1, e2;
2590 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2591 DECODE_SJIS (ch, c, e1, e2);
2592 Dynarr_add (dst, e1);
2593 Dynarr_add (dst, e2);
2597 DECODE_ADD_BINARY_CHAR (ch, dst);
2598 DECODE_ADD_BINARY_CHAR (c, dst);
2604 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2605 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2607 else if (BYTE_SJIS_KATAKANA_P (c))
2609 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2610 Dynarr_add (dst, c);
2613 DECODE_ADD_BINARY_CHAR (c, dst);
2615 label_continue_loop:;
2618 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2620 CODING_STREAM_COMPOSE (str, flags, ch);
2623 /* Convert internally-formatted data to Shift-JIS. */
/* FLAGS and CH are unpacked from the encoding stream on entry and packed
   back on exit.  For a line end, CR is emitted unless the EOL type is LF
   or autodetect, and LF is emitted unless the EOL type is CR.  ASCII
   bytes are copied through.  A leading byte is remembered in CH only if
   it is JISX0201-Kana or one of the two JISX0208 charsets; any other
   leading byte sets CH to 0 (NOTE(review): so characters of other
   charsets appear to produce no output -- confirm).  A JISX0208 pair is
   converted with ENCODE_SJIS. */
2626 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2627 unsigned_char_dynarr *dst, unsigned int n)
2630 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2631 unsigned int flags, ch;
2632 enum eol_type eol_type;
2634 CODING_STREAM_DECOMPOSE (str, flags, ch);
2635 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2642 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2643 Dynarr_add (dst, '\r');
2644 if (eol_type != EOL_CR)
2645 Dynarr_add (dst, '\n');
2648 else if (BYTE_ASCII_P (c))
2650 Dynarr_add (dst, c);
2653 else if (BUFBYTE_LEADING_BYTE_P (c))
2654 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2655 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2656 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2659 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2661 Dynarr_add (dst, c);
2664 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2665 ch == LEADING_BYTE_JAPANESE_JISX0208)
2669 unsigned char j1, j2;
2670 ENCODE_SJIS (ch, c, j1, j2);
2671 Dynarr_add (dst, j1);
2672 Dynarr_add (dst, j2);
2678 CODING_STREAM_COMPOSE (str, flags, ch);
/* Lisp primitive `decode-shift-jis-char'.  Both halves of CODE are
   checked to be integers and narrowed to unsigned char.  If they form a
   valid Shift-JIS first/second byte pair, they are converted with
   DECODE_SJIS and returned as a character of Vcharset_japanese_jisx0208
   (the position codes have their high bit masked off). */
2681 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2682 Decode a JISX0208 character of Shift-JIS coding-system.
2683 CODE is the character code in Shift-JIS as a cons of two bytes.
2684 Return the corresponding character.
2688 unsigned char c1, c2, s1, s2;
2691 CHECK_INT (XCAR (code));
2692 CHECK_INT (XCDR (code));
2693 s1 = XINT (XCAR (code));
2694 s2 = XINT (XCDR (code));
2695 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2696 BYTE_SJIS_TWO_BYTE_2_P (s2))
2698 DECODE_SJIS (s1, s2, c1, c2);
2699 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2700 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-shift-jis-char'.  CH is broken up into its
   charset and position codes.  For a character of
   Vcharset_japanese_jisx0208, the position codes (with the high bit set)
   are converted by ENCODE_SJIS and returned as a cons of two
   integers. */
2706 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2707 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2708 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2712 Lisp_Object charset;
2715 CHECK_CHAR_COERCE_INT (ch);
2716 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2717 if (EQ (charset, Vcharset_japanese_jisx0208))
2719 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2720 return Fcons (make_int (s1), make_int (s2));
2727 /************************************************************************/
2729 /************************************************************************/
2731 /* BIG5 is a coding system encoding two character sets: ASCII and
2732 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2733 character set and is encoded in two bytes.
2735 --- CODE RANGE of BIG5 ---
2736 (character set) (range)
2738 Big5 (1st byte) 0xA1 .. 0xFE
2739 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2740 --------------------------
2742 Since the number of characters in Big5 is larger than maximum
2743 characters in Emacs' charset (96x96), it can't be handled as one
2744 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2745 and `charset-big5-2'. Both <type>s are TYPE94x94. The former
2746 contains frequently used characters and the latter contains less
2747 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char (0xA1 .. 0xFE)?  The
   argument is expanded twice, so it must not have side effects. */
2749 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2750 ((c) >= 0xA1 && (c) <= 0xFE)
2752 /* Is this the second byte of a Big5 two-byte char? */
/* True for 0x40 .. 0x7E and 0xA1 .. 0xFE.  The argument is expanded up
   to four times, so it must not have side effects. */
2754 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2755 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2757 /* Number of Big5 characters which have the same code in 1st byte. */
/* That is, the number of valid second bytes: (0xFF - 0xA1) = 94 for the
   range 0xA1 .. 0xFE plus (0x7F - 0x40) = 63 for 0x40 .. 0x7E, 157 in
   total. */
2759 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2761 /* Code conversion macros. These are macros because they are used in
2762 inner loops during code conversion.
2764 Note that temporary variables in macros introduce the classic
2765 dynamic-scoping problems with variable names. We use capital-
2766 lettered variables in the assumption that XEmacs does not use
2767 capital letters in variables except in a very formalized way
2770 /* Convert Big5 code (b1, b2) into its internal string representation
2773 /* There is a much simpler way to split the Big5 charset into two.
2774 For the moment I'm going to leave the algorithm as-is because it
2775 claims to separate out the most-used characters into a single
2776 charset, which perhaps will lead to optimizations in various
2779 The way the algorithm works is something like this:
2781 Big5 can be viewed as a 94x157 charset, where the row is
2782 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2783 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2784 the split between low and high column numbers is apparently
2785 meaningless; ascending rows produce less and less frequent chars.
2786 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2787 the first charset, and the upper half (0xC9 .. 0xFE) to the
2788 second. To do the conversion, we convert the character into
2789 a single number where 0 .. 156 is the first row, 157 .. 313
2790 is the second, etc. That way, the characters are ordered by
2791 decreasing frequency. Then we just chop the space in two
2792 and coerce the result into a 94x94 space.
/* DECODE_BIG5 (b1, b2, lb, c1, c2): B1 and B2 are the two Big5 bytes
   (inputs, copied once into locals).  LB, C1 and C2 are assigned the
   leading byte (LEADING_BYTE_CHINESE_BIG5_1 or _2) and the two position
   bytes, so they must be lvalues.  The intermediate index I is the
   linear position of the character, rebased for the second charset. */
2795 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2797 int B1 = b1, B2 = b2; \
2799 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2803 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2807 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2808 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2810 c1 = I / (0xFF - 0xA1) + 0xA1; \
2811 c2 = I % (0xFF - 0xA1) + 0xA1; \
2814 /* Convert the internal string representation of a Big5 character
2815 (lb, c1, c2) into Big5 code (b1, b2). */
/* LB, C1 and C2 are inputs.  B1 and B2 receive the result and must be
   lvalues; B2 is read and modified more than once.  For a character of
   the second charset, the index is first moved past the rows that
   belong to the first charset (0xA1 .. 0xC8). */
2817 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2819 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2821 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2823 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2825 b1 = I / BIG5_SAME_ROW + 0xA1; \
2826 b2 = I % BIG5_SAME_ROW; \
2827 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Detection routine for Big5.  ESC, SI, SO and bytes in 0x80 .. 0xA0 are
   tested for explicitly (NOTE(review): presumably they rule Big5 out --
   confirm).  st->big5.in_second_byte records that the current byte is
   the second half of a pair; such a byte is additionally checked against
   < 0x40 and 0x80 .. 0xA0.  The final return yields
   CODING_CATEGORY_BIG5_MASK. */
2831 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2839 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2840 (c >= 0x80 && c <= 0xA0))
2842 if (st->big5.in_second_byte)
2844 st->big5.in_second_byte = 0;
2845 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2849 st->big5.in_second_byte = 1;
2851 return CODING_CATEGORY_BIG5_MASK;
2854 /* Convert Big5 data to internal format. */
/* FLAGS and CH are unpacked from the decoding stream on entry and packed
   back on exit, so a two-byte character may straddle calls.  When CH
   holds a pending first byte and the current byte is a valid second
   byte, DECODE_BIG5 produces three bytes (leading byte plus two position
   bytes), which are appended to DST; otherwise both bytes are emitted
   through DECODE_ADD_BINARY_CHAR. */
2857 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2858 unsigned_char_dynarr *dst, unsigned int n)
2861 unsigned int flags, ch;
2862 enum eol_type eol_type;
2863 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2865 CODING_STREAM_DECOMPOSE (str, flags, ch);
2866 eol_type = str->eol_type;
2873 /* Previous character was first byte of Big5 char. */
2874 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2876 unsigned char b1, b2, b3;
2877 DECODE_BIG5 (ch, c, b1, b2, b3);
2878 Dynarr_add (dst, b1);
2879 Dynarr_add (dst, b2);
2880 Dynarr_add (dst, b3);
2884 DECODE_ADD_BINARY_CHAR (ch, dst);
2885 DECODE_ADD_BINARY_CHAR (c, dst);
2891 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2892 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2895 DECODE_ADD_BINARY_CHAR (c, dst);
2897 label_continue_loop:;
2900 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2902 CODING_STREAM_COMPOSE (str, flags, ch);
2905 /* Convert internally-formatted data to Big5. */
/* FLAGS and CH are unpacked from the encoding stream on entry and packed
   back on exit.  For a line end, CR is emitted unless the EOL type is LF
   or autodetect, and LF is emitted unless the EOL type is CR.  ASCII
   bytes are copied through.  Only the two Big5 leading bytes are
   recognized.  By the time the last byte of a character arrives, CH is
   expected to hold the leading byte in its upper part and the first
   position byte in its low 8 bits, as the ENCODE_BIG5 call shows. */
2908 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2909 unsigned_char_dynarr *dst, unsigned int n)
2912 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2913 unsigned int flags, ch;
2914 enum eol_type eol_type;
2916 CODING_STREAM_DECOMPOSE (str, flags, ch);
2917 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2924 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2925 Dynarr_add (dst, '\r');
2926 if (eol_type != EOL_CR)
2927 Dynarr_add (dst, '\n');
2929 else if (BYTE_ASCII_P (c))
2932 Dynarr_add (dst, c);
2934 else if (BUFBYTE_LEADING_BYTE_P (c))
2936 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
2937 c == LEADING_BYTE_CHINESE_BIG5_2)
2939 /* A recognized leading byte. */
2941 continue; /* not done with this character. */
2943 /* otherwise just ignore this character. */
2945 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
2946 ch == LEADING_BYTE_CHINESE_BIG5_2)
2948 /* Previous char was a recognized leading byte. */
2950 continue; /* not done with this character. */
2954 /* Encountering second byte of a Big5 character. */
2955 unsigned char b1, b2;
2957 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
2958 Dynarr_add (dst, b1);
2959 Dynarr_add (dst, b2);
2965 CODING_STREAM_COMPOSE (str, flags, ch);
/* Lisp primitive `decode-big5-char'.  Both halves of CODE are checked to
   be integers and narrowed to unsigned char.  If they form a valid Big5
   first/second byte pair, DECODE_BIG5 yields a leading byte and two
   position codes; the charset is looked up from the leading byte and the
   character is built from the position codes with the high bit masked
   off. */
2969 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
2970 Decode a Big5 character CODE of BIG5 coding-system.
2971 CODE is the character code in BIG5, a cons of two integers.
2972 Return the corresponding character.
2976 unsigned char c1, c2, b1, b2;
2979 CHECK_INT (XCAR (code));
2980 CHECK_INT (XCDR (code));
2981 b1 = XINT (XCAR (code));
2982 b2 = XINT (XCDR (code));
2983 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
2984 BYTE_BIG5_TWO_BYTE_2_P (b2))
2987 Lisp_Object charset;
2988 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
2989 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
2990 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-big5-char'.  CH is broken up into its charset
   and position codes.  For a character of either Big5 charset, the
   charset's leading byte and the position codes (with the high bit set)
   are converted by ENCODE_BIG5 and returned as a cons of two
   integers. */
2996 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
2997 Encode the Big5 character CH to BIG5 coding-system.
2998 Return the corresponding character code in Big5.
3002 Lisp_Object charset;
3005 CHECK_CHAR_COERCE_INT (ch);
3006 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3007 if (EQ (charset, Vcharset_chinese_big5_1) ||
3008 EQ (charset, Vcharset_chinese_big5_2))
3010 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3012 return Fcons (make_int (b1), make_int (b2));
3019 /************************************************************************/
3020 /* ISO2022 methods */
3021 /************************************************************************/
3023 /* The following note describes the coding system ISO2022 briefly.
3024 Since the intention of this note is to help understanding of the
3025 programs in this file, some parts are NOT ACCURATE or OVERLY
3026 SIMPLIFIED. For thorough understanding, please refer to the
3027 original document of ISO2022.
3029 ISO2022 provides many mechanisms to encode several character sets
3030 in 7-bit and 8-bit environments. If one chooses 7-bit environment,
3031 all text is encoded by codes of less than 128. This may make the
3032 encoded text a little bit longer, but the text gets more stability
3033 when passing through several gateways (some of them strip off MSB).
3035 There are two kinds of character sets: control character set and
3036 graphic character set. The former contains control characters such
3037 as `newline' and `escape' to provide control functions (control
3038 functions are provided also by escape sequence). The latter
3039 contains graphic characters such as 'A' and '-'. Emacs recognizes
3040 two control character sets and many graphic character sets.
3042 Graphic character sets are classified into one of four types,
3043 according to the dimension and number of characters in the set:
3044 TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each
3045 character set is assigned an identification byte, unique for each
3046 type, called "final character" (denoted as <F> hereafter). The <F>
3047 of each character set is decided by ECMA(*) when it is registered
3048 in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for
3051 Note (*): ECMA = European Computer Manufacturers Association
3053 Here are examples of graphic character set [NAME(<F>)]:
3054 o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3055 o TYPE96 -- right-half-of-ISO8859-1('A'), ...
3056 o TYPE94x94 -- GB2312('A'), JISX0208('B'), ...
3057 o TYPE96x96 -- none for the moment
3059 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
3060 C0 [0x00..0x1F] -- control character plane 0
3061 GL [0x20..0x7F] -- graphic character plane 0
3062 C1 [0x80..0x9F] -- control character plane 1
3063 GR [0xA0..0xFF] -- graphic character plane 1
3065 A control character set is directly designated and invoked to C0 or
3066 C1 by an escape sequence. The most common case is that:
3067 - ISO646's control character set is designated/invoked to C0, and
3068 - ISO6429's control character set is designated/invoked to C1,
3069 and usually these designations/invocations are omitted in encoded
3070 text. In a 7-bit environment, only C0 can be used, and a control
3071 character for C1 is encoded by an appropriate escape sequence to
3072 fit into the environment. All control characters for C1 are
3073 defined to have corresponding escape sequences.
3075 A graphic character set is at first designated to one of four
3076 graphic registers (G0 through G3), then these graphic registers are
3077 invoked to GL or GR. These designations and invocations can be
3078 done independently. The most common case is that G0 is invoked to
3079 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3080 these invocations and designations are omitted in encoded text.
3081 In a 7-bit environment, only GL can be used.
3083 When a graphic character set of TYPE94 or TYPE94x94 is invoked to
3084 GL, codes 0x20 and 0x7F of the GL area work as control characters
3085 SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area
3088 There are two ways of invocation: locking-shift and single-shift.
3089 With locking-shift, the invocation lasts until the next different
3090 invocation, whereas with single-shift, the invocation works only
3091 for the following character and doesn't affect locking-shift.
3092 Invocations are done by the following control characters or escape
3095 ----------------------------------------------------------------------
3096 abbrev function cntrl escape seq description
3097 ----------------------------------------------------------------------
3098 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3099 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3100 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR
3101 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3102 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR
3103 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3104 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR
3105 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3106 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3107 ----------------------------------------------------------------------
3108 The first seven are for locking-shift. Control characters for these
3109 functions are defined by macros ISO_CODE_XXX in `coding.h'.
3111 Designations are done by the following escape sequences.
3112 ----------------------------------------------------------------------
3113 escape sequence description
3114 ----------------------------------------------------------------------
3115 ESC '(' <F> designate TYPE94<F> to G0
3116 ESC ')' <F> designate TYPE94<F> to G1
3117 ESC '*' <F> designate TYPE94<F> to G2
3118 ESC '+' <F> designate TYPE94<F> to G3
3119 ESC ',' <F> designate TYPE96<F> to G0 (*)
3120 ESC '-' <F> designate TYPE96<F> to G1
3121 ESC '.' <F> designate TYPE96<F> to G2
3122 ESC '/' <F> designate TYPE96<F> to G3
3123 ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**)
3124 ESC '$' ')' <F> designate TYPE94x94<F> to G1
3125 ESC '$' '*' <F> designate TYPE94x94<F> to G2
3126 ESC '$' '+' <F> designate TYPE94x94<F> to G3
3127 ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*)
3128 ESC '$' '-' <F> designate TYPE96x96<F> to G1
3129 ESC '$' '.' <F> designate TYPE96x96<F> to G2
3130 ESC '$' '/' <F> designate TYPE96x96<F> to G3
3131 ----------------------------------------------------------------------
3132 In this list, "TYPE94<F>" means a graphic character set of type TYPE94
3133 and final character <F>, and so on.
3135 Note (*): Although these designations are not allowed in ISO2022,
3136 Emacs accepts them on decoding, and produces them on encoding
3137 TYPE96 or TYPE96x96 character set in a coding system which is
3138 characterized as 7-bit environment, non-locking-shift, and
3141 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3142 '(' can be omitted. We call this "short-form" hereafter.
3144 Now you may notice that there are a lot of ways for encoding the
3145 same multilingual text in ISO2022. Actually, there exist many
3146 coding systems such as Compound Text (used in X's inter client
3147 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3148 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3149 localized platforms), and all of these are variants of ISO2022.
3151 In addition to the above, Emacs handles two more kinds of escape
3152 sequences: ISO6429's direction specification and Emacs' private
3153 sequence for specifying character composition.
3155 ISO6429's direction specification takes the following format:
3156 o CSI ']' -- end of the current direction
3157 o CSI '0' ']' -- end of the current direction
3158 o CSI '1' ']' -- start of left-to-right text
3159 o CSI '2' ']' -- start of right-to-left text
3160 The control character CSI (0x9B: control sequence introducer) is
3161 abbreviated to the escape sequence ESC '[' in 7-bit environment.
3163 Character composition specification takes the following format:
3164 o ESC '0' -- start character composition
3165 o ESC '1' -- end character composition
3166 Since these are not standard escape sequences of any ISO, the use
3167 of them for these meanings is restricted to Emacs only. */
/* Reset the ISO2022 decoder state ISO.  For each of the four registers,
   the charset comes from CODING_SYSTEM's initial charsets when
   CODING_SYSTEM is non-nil; Qt is stored otherwise (NOTE(review):
   presumably in the `else' branch -- confirm).  The per-register
   invalid-designation flags are cleared.  Escape-sequence parsing is
   returned to ISO_ESC_NOTHING with no bytes accumulated, the left/right
   registers are set to 0 and 1, the direction and literal-output flags
   are cleared, and any accumulated composite characters are
   discarded. */
3170 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3174 for (i = 0; i < 4; i++)
3176 if (!NILP (coding_system))
3178 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3180 iso->charset[i] = Qt;
3181 iso->invalid_designated[i] = 0;
3183 iso->esc = ISO_ESC_NOTHING;
3184 iso->esc_bytes_index = 0;
3185 iso->register_left = 0;
3186 iso->register_right = 1;
3187 iso->switched_dir_and_no_valid_charset_yet = 0;
3188 iso->invalid_switch_dir = 0;
3189 iso->output_direction_sequence = 0;
3190 iso->output_literally = 0;
3191 if (iso->composite_chars)
3192 Dynarr_reset (iso->composite_chars);
3196 fit_to_be_escape_quoted (unsigned char c)
3213 /* Parse one byte of an ISO2022 escape sequence.
3214 If the result is an invalid escape sequence, return 0 and
3215 do not change anything in STR. Otherwise, if the result is
3216 an incomplete escape sequence, update ISO2022.ESC and
3217 ISO2022.ESC_BYTES and return -1. Otherwise, update
3218 all the state variables (but not ISO2022.ESC_BYTES) and
3221 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3222 or invocation of an invalid character set and treat that as
3223 an unrecognized escape sequence. */
3226 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3227 unsigned char c, unsigned int *flags,
3228 int check_invalid_charsets)
3230 /* (1) If we're at the end of a designation sequence, CS is the
3231 charset being designated and REG is the register to designate
3234 (2) If we're at the end of a locking-shift sequence, REG is
3235 the register to invoke and HALF (0 == left, 1 == right) is
3236 the half to invoke it into.
3238 (3) If we're at the end of a single-shift sequence, REG is
3239 the register to invoke. */
3240 Lisp_Object cs = Qnil;
3243 /* NOTE: This code does goto's all over the fucking place.
3244 The reason for this is that we're basically implementing
3245 a state machine here, and hierarchical languages like C
3246 don't really provide a clean way of doing this. */
3248 if (! (*flags & CODING_STATE_ESCAPE))
3249 /* At beginning of escape sequence; we need to reset our
3250 escape-state variables. */
3251 iso->esc = ISO_ESC_NOTHING;
3253 iso->output_literally = 0;
3254 iso->output_direction_sequence = 0;
3258 case ISO_ESC_NOTHING:
3259 iso->esc_bytes_index = 0;
3262 case ISO_CODE_ESC: /* Start escape sequence */
3263 *flags |= CODING_STATE_ESCAPE;
3267 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3268 *flags |= CODING_STATE_ESCAPE;
3269 iso->esc = ISO_ESC_5_11;
3272 case ISO_CODE_SO: /* locking shift 1 */
3275 case ISO_CODE_SI: /* locking shift 0 */
3279 case ISO_CODE_SS2: /* single shift */
3282 case ISO_CODE_SS3: /* single shift */
3286 default: /* Other control characters */
3293 /**** single shift ****/
3295 case 'N': /* single shift 2 */
3298 case 'O': /* single shift 3 */
3302 /**** locking shift ****/
3304 case '~': /* locking shift 1 right */
3307 case 'n': /* locking shift 2 */
3310 case '}': /* locking shift 2 right */
3313 case 'o': /* locking shift 3 */
3316 case '|': /* locking shift 3 right */
3320 /**** composite ****/
3323 iso->esc = ISO_ESC_START_COMPOSITE;
3324 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3325 CODING_STATE_COMPOSITE;
3329 iso->esc = ISO_ESC_END_COMPOSITE;
3330 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3331 ~CODING_STATE_COMPOSITE;
3334 /**** directionality ****/
3337 iso->esc = ISO_ESC_5_11;
3340 /**** designation ****/
3342 case '$': /* multibyte charset prefix */
3343 iso->esc = ISO_ESC_2_4;
3347 if (0x28 <= c && c <= 0x2F)
3349 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3353 /* This function is called with CODESYS equal to nil when
3354 doing coding-system detection. */
3356 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3357 && fit_to_be_escape_quoted (c))
3359 iso->esc = ISO_ESC_LITERAL;
3360 *flags &= CODING_STATE_ISO2022_LOCK;
3370 /**** directionality ****/
3372 case ISO_ESC_5_11: /* ISO6429 direction control */
3375 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3376 goto directionality;
3378 if (c == '0') iso->esc = ISO_ESC_5_11_0;
3379 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3380 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3384 case ISO_ESC_5_11_0:
3387 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3388 goto directionality;
3392 case ISO_ESC_5_11_1:
3395 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3396 goto directionality;
3400 case ISO_ESC_5_11_2:
3403 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
3404 goto directionality;
3409 iso->esc = ISO_ESC_DIRECTIONALITY;
3410 /* Various junk here to attempt to preserve the direction sequences
3411 literally in the text if they would otherwise be swallowed due
3412 to invalid designations that don't show up as actual charset
3413 changes in the text. */
3414 if (iso->invalid_switch_dir)
3416 /* We already inserted a direction switch literally into the
3417 text. We assume (#### this may not be right) that the
3418 next direction switch is the one going the other way,
3419 and we need to output that literally as well. */
3420 iso->output_literally = 1;
3421 iso->invalid_switch_dir = 0;
3427 /* If we are in the thrall of an invalid designation,
3428 then stick the directionality sequence literally into the
3429 output stream so it ends up in the original text again. */
3430 for (jj = 0; jj < 4; jj++)
3431 if (iso->invalid_designated[jj])
3435 iso->output_literally = 1;
3436 iso->invalid_switch_dir = 1;
3439 /* Indicate that we haven't yet seen a valid designation,
3440 so that if a switch-dir is directly followed by an
3441 invalid designation, both get inserted literally. */
3442 iso->switched_dir_and_no_valid_charset_yet = 1;
3447 /**** designation ****/
3450 if (0x28 <= c && c <= 0x2F)
3452 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
3455 if (0x40 <= c && c <= 0x42)
3457 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
3458 *flags & CODING_STATE_R2L ?
3459 CHARSET_RIGHT_TO_LEFT :
3460 CHARSET_LEFT_TO_RIGHT);
3470 if (c < '0' || c > '~')
3471 return 0; /* bad final byte */
3473 if (iso->esc >= ISO_ESC_2_8 &&
3474 iso->esc <= ISO_ESC_2_15)
3476 type = ((iso->esc >= ISO_ESC_2_12) ?
3477 CHARSET_TYPE_96 : CHARSET_TYPE_94);
3478 reg = (iso->esc - ISO_ESC_2_8) & 3;
3480 else if (iso->esc >= ISO_ESC_2_4_8 &&
3481 iso->esc <= ISO_ESC_2_4_15)
3483 type = ((iso->esc >= ISO_ESC_2_4_12) ?
3484 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
3485 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
3489 /* Can this ever be reached? -slb */
3493 cs = CHARSET_BY_ATTRIBUTES (type, c,
3494 *flags & CODING_STATE_R2L ?
3495 CHARSET_RIGHT_TO_LEFT :
3496 CHARSET_LEFT_TO_RIGHT);
3502 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
3506 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
3507 /* can't invoke something that ain't there. */
3509 iso->esc = ISO_ESC_SINGLE_SHIFT;
3510 *flags &= CODING_STATE_ISO2022_LOCK;
3512 *flags |= CODING_STATE_SS2;
3514 *flags |= CODING_STATE_SS3;
3518 if (check_invalid_charsets &&
3519 !CHARSETP (iso->charset[reg]))
3520 /* can't invoke something that ain't there. */
3523 iso->register_right = reg;
3525 iso->register_left = reg;
3526 *flags &= CODING_STATE_ISO2022_LOCK;
3527 iso->esc = ISO_ESC_LOCKING_SHIFT;
3531 if (NILP (cs) && check_invalid_charsets)
3533 iso->invalid_designated[reg] = 1;
3534 iso->charset[reg] = Vcharset_ascii;
3535 iso->esc = ISO_ESC_DESIGNATE;
3536 *flags &= CODING_STATE_ISO2022_LOCK;
3537 iso->output_literally = 1;
3538 if (iso->switched_dir_and_no_valid_charset_yet)
3540 /* We encountered a switch-direction followed by an
3541 invalid designation. Ensure that the switch-direction
3542 gets outputted; otherwise it will probably get eaten
3543 when the text is written out again. */
3544 iso->switched_dir_and_no_valid_charset_yet = 0;
3545 iso->output_direction_sequence = 1;
3546 /* And make sure that the switch-dir going the other
3547 way gets outputted, as well. */
3548 iso->invalid_switch_dir = 1;
3552 /* This function is called with CODESYS equal to nil when
3553 doing coding-system detection. */
3554 if (!NILP (codesys))
3556 charset_conversion_spec_dynarr *dyn =
3557 XCODING_SYSTEM (codesys)->iso2022.input_conv;
3563 for (i = 0; i < Dynarr_length (dyn); i++)
3565 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
3566 if (EQ (cs, spec->from_charset))
3567 cs = spec->to_charset;
3572 iso->charset[reg] = cs;
3573 iso->esc = ISO_ESC_DESIGNATE;
3574 *flags &= CODING_STATE_ISO2022_LOCK;
3575 if (iso->invalid_designated[reg])
3577 iso->invalid_designated[reg] = 0;
3578 iso->output_literally = 1;
3580 if (iso->switched_dir_and_no_valid_charset_yet)
3581 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Detection pass for the ISO 2022 family.  Bytes taken from SRC are fed
   through the ISO 2022 escape parser, and the CODING_CATEGORY_ISO_* bits
   that the data rules out are cleared from a working copy of
   st->iso2022.mask.  Parser state, flags and the high-byte counters live
   in ST, which is initialized on the first call only.
   NOTE(review): the return type, the remaining parameters, the local
   declarations, the byte loop and the exit code are not visible in this
   listing -- confirm them against the full file. */
3586 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
3592 /* #### There are serious deficiencies in the recognition mechanism
3593 here. This needs to be much smarter if it's going to cut it. */
/* First call with this detection state: reset the parser and start out
   with every ISO 2022 category still possible. */
3595 if (!st->iso2022.initted)
3597 reset_iso2022 (Qnil, &st->iso2022.iso);
3598 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
3599 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3600 CODING_CATEGORY_ISO_8_1_MASK |
3601 CODING_CATEGORY_ISO_8_2_MASK |
3602 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
3603 st->iso2022.flags = 0;
3604 st->iso2022.high_byte_count = 0;
3605 st->iso2022.saw_single_shift = 0;
3606 st->iso2022.initted = 1;
3609 mask = st->iso2022.mask;
/* NOTE(review): the guard for the next two statements is not visible in
   this listing; presumably they run for bytes with the high bit set --
   confirm. */
3616 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3617 st->iso2022.high_byte_count++;
/* Judge the run of high bytes counted so far, unless a single shift was
   seen, and restart both counters. */
3621 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
3623 if (st->iso2022.high_byte_count & 1)
3624 /* odd number of high bytes; assume not iso-8-2 */
3625 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3627 st->iso2022.high_byte_count = 0;
3628 st->iso2022.saw_single_shift = 0;
3630 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3632 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
3633 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
3634 { /* control chars */
3637 /* Allow and ignore control characters that you might
3638 reasonably see in a text file */
3643 case 8: /* backspace */
3644 case 11: /* vertical tab */
3645 case 12: /* form feed */
3646 case 26: /* MS-DOS C-z junk */
3647 case 31: /* '^_' -- for info */
3648 goto label_continue_loop;
/* Inside an escape sequence, or at a control character: hand the byte to
   the escape parser.  Qnil is passed as the coding system and 0 as the
   last argument. */
3655 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
3658 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
3659 &st->iso2022.flags, 0))
3661 switch (st->iso2022.iso.esc)
/* A designation clears the iso-8-1 and iso-8-2 categories. */
3663 case ISO_ESC_DESIGNATE:
3664 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
3665 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
/* A locking shift leaves the lock-shift category as the only candidate
   and jumps straight to the exit label. */
3667 case ISO_ESC_LOCKING_SHIFT:
3668 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
3669 goto ran_out_of_chars;
/* A single shift clears iso-8-designate and is remembered so that the
   high-byte run check above is skipped. */
3670 case ISO_ESC_SINGLE_SHIFT:
3671 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
3672 st->iso2022.saw_single_shift = 1;
3681 goto ran_out_of_chars;
3684 label_continue_loop:;
/* Final adjustment of a detection MASK made of CODING_CATEGORY_ISO_* bits.
   If the seven-bit category is still set, the three eight-bit categories
   (ISO_8_DESIGNATE, ISO_8_1 and ISO_8_2) are cleared; no other bit is
   changed here.
   NOTE(review): the return statement is not visible in this listing;
   presumably the adjusted mask is returned -- confirm. */
3693 postprocess_iso2022_mask (int mask)
3695 /* #### kind of cheesy */
3696 /* If seven-bit ISO is allowed, then assume that the encoding is
3697 entirely seven-bit and turn off the eight-bit ones. */
3698 if (mask & CODING_CATEGORY_ISO_7_MASK)
3699 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3700 CODING_CATEGORY_ISO_8_1_MASK |
3701 CODING_CATEGORY_ISO_8_2_MASK);
3705 /* If FLAGS is a null pointer or specifies right-to-left motion,
3706 output a switch-dir-to-left-to-right sequence to DST.
3707 Also update FLAGS if it is not a null pointer.
3708 If INTERNAL_P is set, we are outputting in internal format and
3709 need to handle the CSI differently. */
/* The emitted sequence is an introducer followed by the bytes 0 and ].
   The introducer is ESC [ when CODING_SYSTEM_ISO2022_SEVEN holds for
   CODESYS; otherwise it is the CSI byte, which is added through
   DECODE_ADD_BINARY_CHAR when INTERNAL_P is set.
   NOTE(review): the declaration of INTERNAL_P and the return type are
   not visible in this listing. */
3712 restore_left_to_right_direction (struct Lisp_Coding_System *codesys,
3713 unsigned_char_dynarr *dst,
3714 unsigned int *flags,
3717 if (!flags || (*flags & CODING_STATE_R2L))
3719 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3721 Dynarr_add (dst, ISO_CODE_ESC);
3722 Dynarr_add (dst, '[');
3724 else if (internal_p)
3725 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3727 Dynarr_add (dst, ISO_CODE_CSI);
3728 Dynarr_add (dst, '0');
3729 Dynarr_add (dst, ']');
/* Record that the direction is left-to-right again.
   NOTE(review): FLAGS may be a null pointer on this path; the null guard
   for this store is not visible in this listing -- confirm it exists. */
3731 *flags &= ~CODING_STATE_R2L;
3735 /* If FLAGS is a null pointer or specifies a direction different from
3736 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
3737 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
3738 sequence to DST. Also update FLAGS if it is not a null pointer.
3739 If INTERNAL_P is set, we are outputting in internal format and
3740 need to handle the CSI differently. */
/* A switch to left-to-right is delegated to
   restore_left_to_right_direction.  A switch to right-to-left is emitted
   here, and only when CODING_SYSTEM_ISO2022_NO_ISO6429 does not hold for
   CODESYS; the sequence is the introducer (ESC [ for seven-bit coding
   systems, otherwise CSI) followed by the bytes 2 and ].
   NOTE(review): the declaration of INTERNAL_P and the return type are
   not visible in this listing. */
3743 ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys,
3744 unsigned_char_dynarr *dst, unsigned int *flags,
3747 if ((!flags || (*flags & CODING_STATE_R2L)) &&
3748 direction == CHARSET_LEFT_TO_RIGHT)
3749 restore_left_to_right_direction (codesys, dst, flags, internal_p);
3750 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
3751 && (!flags || !(*flags & CODING_STATE_R2L)) &&
3752 direction == CHARSET_RIGHT_TO_LEFT)
3754 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3756 Dynarr_add (dst, ISO_CODE_ESC);
3757 Dynarr_add (dst, '[');
3759 else if (internal_p)
3760 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3762 Dynarr_add (dst, ISO_CODE_CSI);
3763 Dynarr_add (dst, '2');
3764 Dynarr_add (dst, ']');
/* Record that the direction is now right-to-left.
   NOTE(review): FLAGS may be a null pointer on this path; the null guard
   for this store is not visible in this listing -- confirm it exists. */
3766 *flags |= CODING_STATE_R2L;
3770 /* Convert ISO2022-format data to internal format. */
/* DECODING is the decoding lstream whose private data holds the parser
   state, SRC and N give the input bytes, and the internal-format result
   is appended to DST.  Each input byte is handled in one of three ways:
   as part of an escape sequence, as a control character, or as a graphic
   character that is mapped through the charset of the invoked register.
   NOTE(review): the return type, some local declarations and the byte
   loop header are not visible in this listing. */
3773 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
3774 unsigned_char_dynarr *dst, unsigned int n)
3777 unsigned int flags, ch;
3778 enum eol_type eol_type;
3779 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3780 Lisp_Object coding_system;
3781 unsigned_char_dynarr *real_dst = dst;
/* Pick up FLAGS and the pending byte CH from the stream; they are stored
   back with CODING_STREAM_COMPOSE at the end. */
3783 CODING_STREAM_DECOMPOSE (str, flags, ch);
3784 eol_type = str->eol_type;
3785 XSETCODING_SYSTEM (coding_system, str->codesys);
/* While a composite character is being collected, output goes to the
   composite buffer instead of the caller-supplied DST. */
3787 if (flags & CODING_STATE_COMPOSITE)
3788 dst = str->iso2022.composite_chars;
3793 if (flags & CODING_STATE_ESCAPE)
3794 { /* Within ESC sequence */
3795 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
3800 switch (str->iso2022.esc)
/* Start of a composite: collect what follows in the composite buffer,
   which is emptied if it already exists and created otherwise. */
3802 case ISO_ESC_START_COMPOSITE:
3803 if (str->iso2022.composite_chars)
3804 Dynarr_reset (str->iso2022.composite_chars);
3806 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
3807 dst = str->iso2022.composite_chars;
/* End of a composite: look up the character for the collected bytes and
   emit its internal representation.
   NOTE(review): the switch of DST back to REAL_DST is not visible in this
   listing. */
3809 case ISO_ESC_END_COMPOSITE:
3811 Bufbyte comstr[MAX_EMCHAR_LEN];
3813 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
3814 Dynarr_length (dst));
3816 len = set_charptr_emchar (comstr, emch);
3817 Dynarr_add_many (dst, comstr, len);
3821 case ISO_ESC_LITERAL:
3822 DECODE_ADD_BINARY_CHAR (c, dst);
3826 /* Everything else handled already */
3831 /* Attempted error recovery. */
/* FLAGS is passed as a null pointer and INTERNAL_P as 1, so the direction
   sequence is written in internal format without touching the local
   flags. */
3832 if (str->iso2022.output_direction_sequence)
3833 ensure_correct_direction (flags & CODING_STATE_R2L ?
3834 CHARSET_RIGHT_TO_LEFT :
3835 CHARSET_LEFT_TO_RIGHT,
3836 str->codesys, dst, 0, 1);
3837 /* More error recovery. */
3838 if (!retval || str->iso2022.output_literally)
3840 /* Output the (possibly invalid) sequence */
3842 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
3843 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
3844 flags &= CODING_STATE_ISO2022_LOCK;
3846 n++, src--;/* Repeat the loop with the same character. */
3849 /* No sense in reprocessing the final byte of the
3850 escape sequence; it could mess things up anyway.
3852 DECODE_ADD_BINARY_CHAR (c, dst);
3857 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
3858 { /* Control characters */
3860 /***** Error-handling *****/
3862 /* If we were in the middle of a character, dump out the
3863 partial character. */
3864 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3866 /* If we just saw a single-shift character, dump it out.
3867 This may dump out the wrong sort of single-shift character,
3868 but at least it will give an indication that something went
3870 if (flags & CODING_STATE_SS2)
3872 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
3873 flags &= ~CODING_STATE_SS2;
3875 if (flags & CODING_STATE_SS3)
3877 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
3878 flags &= ~CODING_STATE_SS3;
3881 /***** Now handle the control characters. *****/
3884 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3886 flags &= CODING_STATE_ISO2022_LOCK;
/* Let the escape parser look at the control character; if it returns
   zero the byte is copied through as a binary character. */
3888 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
3889 DECODE_ADD_BINARY_CHAR (c, dst);
3892 { /* Graphic characters */
3893 Lisp_Object charset;
3897 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3899 /* Now determine the charset. */
/* A pending single shift selects register 2 or 3; otherwise a byte with
   the high bit set uses the right-hand register and an ASCII-range byte
   the left-hand one. */
3900 reg = ((flags & CODING_STATE_SS2) ? 2
3901 : (flags & CODING_STATE_SS3) ? 3
3902 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
3903 : str->iso2022.register_left);
3904 charset = str->iso2022.charset[reg];
3906 /* Error checking: */
3907 if (NILP (charset) || str->iso2022.invalid_designated[reg]
3908 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
3909 && XCHARSET_CHARS (charset) == 94))
3910 /* Mrmph. We are trying to invoke a register that has no
3911 or an invalid charset in it, or trying to add a character
3912 outside the range of the charset. Insert that char literally
3913 to preserve it for the output. */
3915 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3916 DECODE_ADD_BINARY_CHAR (c, dst);
3921 /* Things are probably hunky-dorey. */
3923 /* Fetch reverse charset, maybe. */
3924 if (((flags & CODING_STATE_R2L) &&
3925 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
3927 (!(flags & CODING_STATE_R2L) &&
3928 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
3930 Lisp_Object new_charset =
3931 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
3932 if (!NILP (new_charset))
3933 charset = new_charset;
/* LB is the leading byte written in front of the position bytes below;
   the layout depends on XCHARSET_REP_BYTES of the charset. */
3936 lb = XCHARSET_LEADING_BYTE (charset);
3937 switch (XCHARSET_REP_BYTES (charset))
3940 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3941 Dynarr_add (dst, c & 0x7F);
3944 case 2: /* one-byte official */
3945 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3946 Dynarr_add (dst, lb);
3947 Dynarr_add (dst, c | 0x80);
3950 case 3: /* one-byte private or two-byte official */
3951 if (XCHARSET_PRIVATE_P (charset))
3953 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3954 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
3955 Dynarr_add (dst, lb);
3956 Dynarr_add (dst, c | 0x80);
3962 Dynarr_add (dst, lb);
3963 Dynarr_add (dst, ch | 0x80);
3964 Dynarr_add (dst, c | 0x80);
3972 default: /* two-byte private */
3975 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
3976 Dynarr_add (dst, lb);
3977 Dynarr_add (dst, ch | 0x80);
3978 Dynarr_add (dst, c | 0x80);
/* Keep only the state bits covered by CODING_STATE_ISO2022_LOCK. */
3987 flags &= CODING_STATE_ISO2022_LOCK;
3990 label_continue_loop:;
/* When CODING_STATE_END is set, flush a partial character that is still
   pending. */
3993 if (flags & CODING_STATE_END)
3994 DECODE_OUTPUT_PARTIAL_CHAR (ch);
/* Store FLAGS and CH back into the stream for the next call. */
3996 CODING_STREAM_COMPOSE (str, flags, ch);
4000 /***** ISO2022 encoder *****/
4002 /* Designate CHARSET into register REG. */
/* Records CHARSET as the content of register REG in the encoder state STR
   and appends the designation escape sequence to DST: ESC, the
   intermediate byte or bytes chosen from the charset type and REG, then
   the final byte of the charset.
   NOTE(review): the return type, the declaration of TYPE and the bodies
   of the two early tests are not visible in this listing; presumably both
   tests return without emitting anything -- confirm. */
4005 iso2022_designate (Lisp_Object charset, unsigned char reg,
4006 struct encoding_stream *str, unsigned_char_dynarr *dst)
/* Intermediate bytes for 94-character and 96-character sets, indexed by
   register number 0 to 3. */
4008 CONST char *inter94 = "()*+", *inter96= ",-./";
4010 unsigned char final;
4011 Lisp_Object old_charset = str->iso2022.charset[reg];
4013 str->iso2022.charset[reg] = charset;
4014 if (!CHARSETP (charset))
4015 /* charset might be an initial nil or t. */
4017 type = XCHARSET_TYPE (charset);
4018 final = XCHARSET_FINAL (charset);
/* The register already held a charset with the same type and final byte
   and no forced designation is pending for it. */
4019 if (!str->iso2022.force_charset_on_output[reg] &&
4020 CHARSETP (old_charset) &&
4021 XCHARSET_TYPE (old_charset) == type &&
4022 XCHARSET_FINAL (old_charset) == final)
4025 str->iso2022.force_charset_on_output[reg] = 0;
/* Apply the output charset conversions of the coding system: CHARSET is
   replaced by the target of every matching entry.
   NOTE(review): TYPE and FINAL were read before this conversion and are
   not visibly refreshed afterwards, so the sequence emitted below appears
   to describe the unconverted charset -- confirm whether that is
   intended. */
4028 charset_conversion_spec_dynarr *dyn =
4029 str->codesys->iso2022.output_conv;
4035 for (i = 0; i < Dynarr_length (dyn); i++)
4037 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4038 if (EQ (charset, spec->from_charset))
4039 charset = spec->to_charset;
/* Emit the escape sequence itself. */
4044 Dynarr_add (dst, ISO_CODE_ESC);
4047 case CHARSET_TYPE_94:
4048 Dynarr_add (dst, inter94[reg]);
4050 case CHARSET_TYPE_96:
4051 Dynarr_add (dst, inter96[reg]);
/* Multibyte sets are introduced by a dollar sign.  For 94x94 sets the
   register byte is added under a condition that involves
   CODING_SYSTEM_ISO2022_SHORT; the rest of that condition is not visible
   in this listing. */
4053 case CHARSET_TYPE_94X94:
4054 Dynarr_add (dst, '$');
4056 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4059 Dynarr_add (dst, inter94[reg]);
4061 case CHARSET_TYPE_96X96:
4062 Dynarr_add (dst, '$');
4063 Dynarr_add (dst, inter96[reg]);
4066 Dynarr_add (dst, final);
/* If the register recorded in str->iso2022.register_left is not 0, append
   ISO_CODE_SI to DST and record register 0 there. */
4070 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4072 if (str->iso2022.register_left != 0)
4074 Dynarr_add (dst, ISO_CODE_SI);
4075 str->iso2022.register_left = 0;
/* If the register recorded in str->iso2022.register_left is not 1, append
   ISO_CODE_SO to DST and record register 1 there. */
4080 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4082 if (str->iso2022.register_left != 1)
4084 Dynarr_add (dst, ISO_CODE_SO);
4085 str->iso2022.register_left = 1;
4089 /* Convert internally-formatted data to ISO2022 format. */
/* ENCODING is the encoding lstream whose private data holds the encoder
   state, SRC and N give the internal-format input, and the ISO 2022
   output is appended to DST.  Input bytes fall into three classes: ASCII
   bytes, leading bytes (which select charset, direction and register),
   and the remaining bytes of a non-ASCII character.
   NOTE(review): the return type, some local declarations (half, reg, i)
   and the byte loop header are not visible in this listing. */
4092 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4093 unsigned_char_dynarr *dst, unsigned int n)
4095 unsigned char charmask, c;
4096 unsigned int flags, ch;
4097 enum eol_type eol_type;
4098 unsigned char char_boundary;
4099 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4100 struct Lisp_Coding_System *codesys = str->codesys;
4102 Lisp_Object charset;
4105 /* flags for handling composite chars. We do a little switcharoo
4106 on the source while we're outputting the composite char. */
4107 unsigned int saved_n = 0;
4108 CONST unsigned char *saved_src = NULL;
4109 int in_composite = 0;
/* Resume from the state stored in the stream by the previous call; it is
   written back at the end of this function. */
4111 CODING_STREAM_DECOMPOSE (str, flags, ch);
4112 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4113 char_boundary = str->iso2022.current_char_boundary;
4114 charset = str->iso2022.current_charset;
4115 half = str->iso2022.current_half;
4122 if (BYTE_ASCII_P (c))
4123 { /* Processing ASCII character */
4126 restore_left_to_right_direction (codesys, dst, &flags, 0);
4128 /* Make sure G0 contains ASCII */
4129 if ((c > ' ' && c < ISO_CODE_DEL) ||
4130 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4132 ensure_normal_shift (str, dst);
4133 iso2022_designate (Vcharset_ascii, 0, str, dst);
4136 /* If necessary, restore everything to the default state
4139 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4141 restore_left_to_right_direction (codesys, dst, &flags, 0);
4143 ensure_normal_shift (str, dst);
4145 for (i = 0; i < 4; i++)
4147 Lisp_Object initial_charset =
4148 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4149 iso2022_designate (initial_charset, i, str, dst);
4154 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4155 Dynarr_add (dst, '\r');
4156 if (eol_type != EOL_CR)
4157 Dynarr_add (dst, c);
4161 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4162 && fit_to_be_escape_quoted (c))
4163 Dynarr_add (dst, ISO_CODE_ESC);
4164 Dynarr_add (dst, c);
4169 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4170 { /* Processing Leading Byte */
4172 charset = CHARSET_BY_LEADING_BYTE (c);
4173 if (LEADING_BYTE_PREFIX_P(c))
4175 else if (!EQ (charset, Vcharset_control_1)
4176 && !EQ (charset, Vcharset_composite))
/* Switch the output direction to that of the charset if it differs. */
4180 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4181 codesys, dst, &flags, 0);
4183 /* Now determine which register to use. */
4185 for (i = 0; i < 4; i++)
4187 if (EQ (charset, str->iso2022.charset[i]) ||
4189 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4198 if (XCHARSET_GRAPHIC (charset) != 0)
4200 if (!NILP (str->iso2022.charset[1]) &&
4201 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4202 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4204 else if (!NILP (str->iso2022.charset[2]))
4206 else if (!NILP (str->iso2022.charset[3]))
4215 iso2022_designate (charset, reg, str, dst);
4217 /* Now invoke that register. */
4221 ensure_normal_shift (str, dst);
4226 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4228 ensure_shift_out (str, dst);
/* Single shift 2: ESC N for seven-bit coding systems; the SS2 byte is
   apparently the alternative (the else line is not visible here). */
4236 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4238 Dynarr_add (dst, ISO_CODE_ESC);
4239 Dynarr_add (dst, 'N');
4244 Dynarr_add (dst, ISO_CODE_SS2);
/* Single shift 3: ESC O for seven-bit coding systems; the SS3 byte is
   apparently the alternative (the else line is not visible here). */
4250 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4252 Dynarr_add (dst, ISO_CODE_ESC);
4253 Dynarr_add (dst, 'O');
4258 Dynarr_add (dst, ISO_CODE_SS3);
4270 { /* Processing Non-ASCII character */
/* Position bytes are written with 7 bits when HALF is 0 and with all 8
   bits otherwise. */
4271 charmask = (half == 0 ? 0x7F : 0xFF);
4273 if (EQ (charset, Vcharset_control_1))
4275 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4276 && fit_to_be_escape_quoted (c))
4277 Dynarr_add (dst, ISO_CODE_ESC);
4278 /* you asked for it ... */
4279 Dynarr_add (dst, c - 0x20);
4283 switch (XCHARSET_REP_BYTES (charset))
4286 Dynarr_add (dst, c & charmask);
4289 if (XCHARSET_PRIVATE_P (charset))
4291 Dynarr_add (dst, c & charmask);
4296 if (EQ (charset, Vcharset_composite))
4300 /* #### Bother! We don't know how to
4302 Dynarr_add (dst, '~');
4306 Emchar emch = MAKE_CHAR (Vcharset_composite,
4307 ch & 0x7F, c & 0x7F);
4308 Lisp_Object lstr = composite_char_string (emch);
4312 src = XSTRING_DATA (lstr);
4313 n = XSTRING_LENGTH (lstr);
4314 Dynarr_add (dst, ISO_CODE_ESC);
4315 Dynarr_add (dst, '0'); /* start composing */
4320 Dynarr_add (dst, ch & charmask);
4321 Dynarr_add (dst, c & charmask);
4334 Dynarr_add (dst, ch & charmask);
4335 Dynarr_add (dst, c & charmask);
/* End of a composite character: emit ESC 1 and jump back into the main
   loop.  NOTE(review): the restore of the saved source pointer and count
   is not visible in this listing. */
4356 Dynarr_add (dst, ISO_CODE_ESC);
4357 Dynarr_add (dst, '1'); /* end composing */
4358 goto back_to_square_n; /* Wheeeeeeeee ..... */
/* On a character boundary with CODING_STATE_END set: go back to
   left-to-right, to the normal shift state and to the initial charset of
   each of the four registers. */
4361 if (char_boundary && flags & CODING_STATE_END)
4363 restore_left_to_right_direction (codesys, dst, &flags, 0);
4364 ensure_normal_shift (str, dst);
4365 for (i = 0; i < 4; i++)
4367 Lisp_Object initial_charset =
4368 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4369 iso2022_designate (initial_charset, i, str, dst);
/* Store the state back into the stream for the next call. */
4373 CODING_STREAM_COMPOSE (str, flags, ch);
4374 str->iso2022.current_char_boundary = char_boundary;
4375 str->iso2022.current_charset = charset;
4376 str->iso2022.current_half = half;
4378 /* Verbum caro factum est! */
4382 /************************************************************************/
4383 /* No-conversion methods */
4384 /************************************************************************/
4386 /* This is used when reading in "binary" files -- i.e. files that may
4387 contain all 256 possible byte values and that are not to be
4388 interpreted as being in any particular decoding. */
/* DECODING is the decoding lstream, SRC and N give the input bytes, and
   the result is appended to DST.  Each byte goes through end-of-line
   handling for the EOL type of the stream and is then added with
   DECODE_ADD_BINARY_CHAR.
   NOTE(review): the return type, the declaration of C and the byte loop
   header are not visible in this listing. */
4390 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4391 unsigned_char_dynarr *dst, unsigned int n)
4394 unsigned int flags, ch;
4395 enum eol_type eol_type;
4396 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
/* Pick up FLAGS and CH from the stream; they are stored back at the
   end. */
4398 CODING_STREAM_DECOMPOSE (str, flags, ch);
4399 eol_type = str->eol_type;
4405 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4406 DECODE_ADD_BINARY_CHAR (c, dst);
4407 label_continue_loop:;
4410 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4412 CODING_STREAM_COMPOSE (str, flags, ch);
/* Encoder for the no-conversion case: internal-format text from SRC (N
   bytes) is appended to DST as raw bytes.  ASCII bytes are copied.  A
   byte that follows LEADING_BYTE_LATIN_ISO8859_1 is copied as is, and a
   byte that follows LEADING_BYTE_CONTROL_1 is written minus 0x20.  A
   tilde is written for an untranslatable character, whose remaining
   bytes are ignored.  The end-of-line branch writes CR and/or LF
   according to the EOL type of the coding system.
   NOTE(review): the return type, the declaration of C, the loop header
   and the test that selects the end-of-line branch are not visible in
   this listing. */
4416 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4417 unsigned_char_dynarr *dst, unsigned int n)
4420 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4421 unsigned int flags, ch;
4422 enum eol_type eol_type;
4424 CODING_STREAM_DECOMPOSE (str, flags, ch);
4425 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* CR is written for every EOL type except LF and autodetect, LF for every
   EOL type except CR. */
4432 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4433 Dynarr_add (dst, '\r');
4434 if (eol_type != EOL_CR)
4435 Dynarr_add (dst, '\n');
4438 else if (BYTE_ASCII_P (c))
4441 Dynarr_add (dst, c);
4443 else if (BUFBYTE_LEADING_BYTE_P (c))
4446 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
4447 c == LEADING_BYTE_CONTROL_1)
4450 Dynarr_add (dst, '~'); /* untranslatable character */
/* Here C is a non-leading byte and CH holds the leading byte seen
   before it. */
4454 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
4455 Dynarr_add (dst, c);
4456 else if (ch == LEADING_BYTE_CONTROL_1)
4459 Dynarr_add (dst, c - 0x20);
4461 /* else it should be the second or third byte of an
4462 untranslatable character, so ignore it */
4467 CODING_STREAM_COMPOSE (str, flags, ch);
4471 /************************************************************************/
4472 /* Simple internal/external functions */
4473 /************************************************************************/
/* Scratch buffers used by convert_to_external_format and
   convert_from_external_format respectively.  Each one is created on
   first use and reset at the start of every call, and the pointer the
   function returns points into it, so a result is overwritten by the
   next call of the same function. */
4475 static Extbyte_dynarr *conversion_out_dynarr;
4476 static Bufbyte_dynarr *conversion_in_dynarr;
4478 /* Determine coding system from coding format */
4480 /* #### not correct for all values of `fmt'! */
/* Maps FMT to the coding system used by the two convert functions below.
   FORMAT_FILENAME and FORMAT_TERMINAL both consult
   Vfile_name_coding_system; another branch yields the ctext coding
   system.
   NOTE(review): the return type, the switch header, the label of the
   ctext branch and the value produced when Vfile_name_coding_system is
   nil or binary are not visible in this listing.  The callers test the
   result with NILP, so that case presumably yields nil -- confirm. */
4482 external_data_format_to_coding_system (enum external_data_format fmt)
4486 case FORMAT_FILENAME:
4487 case FORMAT_TERMINAL:
4488 if (EQ (Vfile_name_coding_system, Qnil) ||
4489 EQ (Vfile_name_coding_system, Qbinary))
4492 return Fget_coding_system (Vfile_name_coding_system);
4495 return Fget_coding_system (Qctext);
/* Converts the internal-format text at PTR to the external format FMT.
   The result is built in conversion_out_dynarr; its length, not counting
   the zero byte appended at the end, is stored through LEN_OUT, and a
   pointer to the first byte of the dynarr is returned.
   NOTE(review): the return type and the declarations of LEN and LEN_OUT
   are not visible in this listing. */
4503 convert_to_external_format (CONST Bufbyte *ptr,
4506 enum external_data_format fmt)
4508 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the scratch buffer on first use, otherwise empty it. */
4510 if (!conversion_out_dynarr)
4511 conversion_out_dynarr = Dynarr_new (Extbyte);
4513 Dynarr_reset (conversion_out_dynarr);
/* No coding system: convert character by character without going
   through streams. */
4515 if (NILP (coding_system))
4517 CONST Bufbyte *end = ptr + len;
4522 (BYTE_ASCII_P (*ptr)) ? *ptr :
4523 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
4524 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
4527 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
4531 #ifdef ERROR_CHECK_BUFPOS
4532 assert (ptr == end);
/* Otherwise read the text from a fixed-buffer input stream and write it
   through an encoding stream that feeds the scratch dynarr. */
4537 Lisp_Object instream, outstream, da_outstream;
4538 Lstream *istr, *ostr;
4539 struct gcpro gcpro1, gcpro2, gcpro3;
4540 char tempbuf[1024]; /* some random amount */
4542 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4543 da_outstream = make_dynarr_output_stream
4544 ((unsigned_char_dynarr *) conversion_out_dynarr);
4546 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
4547 istr = XLSTREAM (instream);
4548 ostr = XLSTREAM (outstream);
4549 GCPRO3 (instream, outstream, da_outstream);
4552 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4555 Lstream_write (ostr, tempbuf, size_in_bytes);
4557 Lstream_close (istr);
4558 Lstream_close (ostr);
4560 Lstream_delete (istr);
4561 Lstream_delete (ostr);
4562 Lstream_delete (XLSTREAM (da_outstream));
/* The length is taken before the terminating zero byte is appended. */
4565 *len_out = Dynarr_length (conversion_out_dynarr);
4566 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
4567 return Dynarr_atp (conversion_out_dynarr, 0);
/* Converts the external-format data at PTR, in format FMT, to internal
   format.  The result is built in conversion_in_dynarr; its length, not
   counting the zero byte appended at the end, is stored through LEN_OUT,
   and a pointer to the first byte of the dynarr is returned.
   NOTE(review): the return type and the declarations of LEN, LEN_OUT and
   C are not visible in this listing. */
4571 convert_from_external_format (CONST Extbyte *ptr,
4574 enum external_data_format fmt)
4576 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the scratch buffer on first use, otherwise empty it. */
4578 if (!conversion_in_dynarr)
4579 conversion_in_dynarr = Dynarr_new (Bufbyte);
4581 Dynarr_reset (conversion_in_dynarr);
/* No coding system: add every input byte as a binary character. */
4583 if (NILP (coding_system))
4585 CONST Extbyte *end = ptr + len;
4586 for (; ptr < end; ptr++)
4589 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
/* Otherwise read the data from a fixed-buffer input stream and write it
   through a decoding stream that feeds the scratch dynarr. */
4594 Lisp_Object instream, outstream, da_outstream;
4595 Lstream *istr, *ostr;
4596 struct gcpro gcpro1, gcpro2, gcpro3;
4597 char tempbuf[1024]; /* some random amount */
4599 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4600 da_outstream = make_dynarr_output_stream
4601 ((unsigned_char_dynarr *) conversion_in_dynarr);
4603 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
4604 istr = XLSTREAM (instream);
4605 ostr = XLSTREAM (outstream);
4606 GCPRO3 (instream, outstream, da_outstream);
4609 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4612 Lstream_write (ostr, tempbuf, size_in_bytes);
4614 Lstream_close (istr);
4615 Lstream_close (ostr);
4617 Lstream_delete (istr);
4618 Lstream_delete (ostr);
4619 Lstream_delete (XLSTREAM (da_outstream));
/* The length is taken before the terminating zero byte is appended. */
4622 *len_out = Dynarr_length (conversion_in_dynarr);
4623 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
4624 return Dynarr_atp (conversion_in_dynarr, 0);
4628 /************************************************************************/
4629 /* Initialization */
4630 /************************************************************************/
/* Defines the symbols, the coding-system-error error symbol and the Lisp
   primitives used by this file.
   NOTE(review): the return type is not visible in this listing, and
   neither are the name strings of the coding category symbols at the end,
   which sit on continuation lines. */
4633 syms_of_mule_coding (void)
4635 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
/* coding-system-error is defined with Qio_error as its parent. */
4636 deferror (&Qcoding_system_error, "coding-system-error",
4637 "Coding-system error", Qio_error);
/* Lisp primitives. */
4639 DEFSUBR (Fcoding_system_p);
4640 DEFSUBR (Ffind_coding_system);
4641 DEFSUBR (Fget_coding_system);
4642 DEFSUBR (Fcoding_system_list);
4643 DEFSUBR (Fcoding_system_name);
4644 DEFSUBR (Fmake_coding_system);
4645 DEFSUBR (Fcopy_coding_system);
4646 DEFSUBR (Fsubsidiary_coding_system);
4648 DEFSUBR (Fcoding_system_type);
4649 DEFSUBR (Fcoding_system_doc_string);
4651 DEFSUBR (Fcoding_system_charset);
4653 DEFSUBR (Fcoding_system_property);
4655 DEFSUBR (Fcoding_category_list);
4656 DEFSUBR (Fset_coding_priority_list);
4657 DEFSUBR (Fcoding_priority_list);
4658 DEFSUBR (Fset_coding_category_system);
4659 DEFSUBR (Fcoding_category_system);
4661 DEFSUBR (Fdetect_coding_region);
4662 DEFSUBR (Fdecode_coding_region);
4663 DEFSUBR (Fencode_coding_region);
4665 DEFSUBR (Fdecode_shift_jis_char);
4666 DEFSUBR (Fencode_shift_jis_char);
4667 DEFSUBR (Fdecode_big5_char);
4668 DEFSUBR (Fencode_big5_char);
/* Plain symbols. */
4670 defsymbol (&Qcoding_system_p, "coding-system-p");
4671 defsymbol (&Qno_conversion, "no-conversion");
4673 defsymbol (&Qbig5, "big5");
4674 defsymbol (&Qshift_jis, "shift-jis");
4675 defsymbol (&Qccl, "ccl");
4676 defsymbol (&Qiso2022, "iso2022");
4678 defsymbol (&Qmnemonic, "mnemonic");
4679 defsymbol (&Qeol_type, "eol-type");
4680 defsymbol (&Qpost_read_conversion, "post-read-conversion");
4681 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
4683 defsymbol (&Qcr, "cr");
4684 defsymbol (&Qlf, "lf");
4685 defsymbol (&Qcrlf, "crlf");
4686 defsymbol (&Qeol_cr, "eol-cr");
4687 defsymbol (&Qeol_lf, "eol-lf");
4688 defsymbol (&Qeol_crlf, "eol-crlf");
4690 defsymbol (&Qcharset_g0, "charset-g0");
4691 defsymbol (&Qcharset_g1, "charset-g1");
4692 defsymbol (&Qcharset_g2, "charset-g2");
4693 defsymbol (&Qcharset_g3, "charset-g3");
4694 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
4695 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
4696 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
4697 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
4698 defsymbol (&Qno_iso6429, "no-iso6429");
4699 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
4700 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
4702 defsymbol (&Qshort, "short");
4703 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
4704 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
4705 defsymbol (&Qseven, "seven");
4706 defsymbol (&Qlock_shift, "lock-shift");
4707 defsymbol (&Qescape_quoted, "escape-quoted");
4709 defsymbol (&Qencode, "encode");
4710 defsymbol (&Qdecode, "decode");
4713 defsymbol (&Qctext, "ctext");
/* Symbols identifying the coding categories, stored in the
   coding_category_symbol table at the index of the category. */
4714 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
4716 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
4718 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
4720 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
4722 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
4724 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
4726 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
4729 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declares, through LSTREAM_HAS_METHOD, the methods of the decoding and
   encoding lstream types.  Both types get the same seven: reader, writer,
   rewinder, seekable_p, flusher, closer and marker.
   NOTE(review): the return type is not visible in this listing. */
4734 lstream_type_create_mule_coding (void)
4736 LSTREAM_HAS_METHOD (decoding, reader);
4737 LSTREAM_HAS_METHOD (decoding, writer);
4738 LSTREAM_HAS_METHOD (decoding, rewinder);
4739 LSTREAM_HAS_METHOD (decoding, seekable_p);
4740 LSTREAM_HAS_METHOD (decoding, flusher);
4741 LSTREAM_HAS_METHOD (decoding, closer);
4742 LSTREAM_HAS_METHOD (decoding, marker);
4744 LSTREAM_HAS_METHOD (encoding, reader);
4745 LSTREAM_HAS_METHOD (encoding, writer);
4746 LSTREAM_HAS_METHOD (encoding, rewinder);
4747 LSTREAM_HAS_METHOD (encoding, seekable_p);
4748 LSTREAM_HAS_METHOD (encoding, flusher);
4749 LSTREAM_HAS_METHOD (encoding, closer);
4750 LSTREAM_HAS_METHOD (encoding, marker);
/* Sets up the tables and Lisp variables owned by this file.  Every coding
   category starts with no coding system and with the identity priority
   order, the file-coding feature is provided, and the coding-system
   variables are defined and set to nil (enable-multibyte-characters is
   set to 1).
   NOTE(review): the return type, the declaration of I and the closing
   lines of the doc strings are not visible in this listing. */
4754 vars_of_mule_coding (void)
4758 /* Initialize to something reasonable ... */
4759 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
4761 coding_category_system[i] = Qnil;
4762 coding_category_by_priority[i] = i;
4765 Fprovide (intern ("file-coding"));
4767 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
4768 Coding system used for TTY keyboard input.
4769 Not used under a windowing system.
4771 Vkeyboard_coding_system = Qnil;
4773 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
4774 Coding system used for TTY display output.
4775 Not used under a windowing system.
4777 Vterminal_coding_system = Qnil;
4779 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
4780 Overriding coding system used when reading a file or process.
4781 You should *bind* this, not set it. If this is non-nil, it specifies
4782 the coding system that will be used when a file or process is read
4783 in, and overrides `buffer-file-coding-system-for-read',
4784 `insert-file-contents-pre-hook', etc. Use those variables instead of
4785 this one for permanent changes to the environment.
4787 Vcoding_system_for_read = Qnil;
4789 DEFVAR_LISP ("coding-system-for-write",
4790 &Vcoding_system_for_write /*
4791 Overriding coding system used when writing a file or process.
4792 You should *bind* this, not set it. If this is non-nil, it specifies
4793 the coding system that will be used when a file or process is written
4794 out, and overrides `buffer-file-coding-system',
4795 `write-region-pre-hook', etc. Use those variables instead of this one
4796 for permanent changes to the environment.
4798 Vcoding_system_for_write = Qnil;
4800 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
4801 Coding system used to convert pathnames when accessing files.
4803 Vfile_name_coding_system = Qnil;
4805 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
4806 Non-nil means the buffer contents are regarded as multi-byte form
4807 of characters, not a binary code. This affects the display, file I/O,
4808 and behaviors of various editing commands.
4810 Setting this to nil does not do anything.
4812 enable_multibyte_characters = 1;
4816 complex_vars_of_mule_coding (void)
4818 staticpro (&Vcoding_system_hash_table);
4819 Vcoding_system_hash_table =
4820 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
4822 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
4824 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
4826 struct codesys_prop csp; \
4828 csp.prop_type = (Prop_Type); \
4829 Dynarr_add (the_codesys_prop_dynarr, csp); \
4832 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
4833 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
4834 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
4835 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
4836 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
4837 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
4838 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
4840 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
4841 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
4842 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
4843 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
4844 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
4845 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
4846 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
4847 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
4848 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
4849 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
4850 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
4851 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
4852 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
4853 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
4854 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
4855 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
4856 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
4858 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
4859 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
4861 /* Need to create this here or we're really screwed. */
4862 Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
4863 list2 (Qmnemonic, build_string ("Noconv")));
4865 Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
4868 /* Need this for bootstrapping */
4869 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
4870 Fget_coding_system (Qno_conversion);