1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
35 #include "file-coding.h"
37 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
39 Lisp_Object Vkeyboard_coding_system;
40 Lisp_Object Vterminal_coding_system;
41 Lisp_Object Vcoding_system_for_read;
42 Lisp_Object Vcoding_system_for_write;
43 Lisp_Object Vfile_name_coding_system;
45 /* Table of symbols identifying each coding category. */
46 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
48 /* Coding system currently associated with each coding category. */
49 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
51 /* Table of all coding categories in decreasing order of priority.
52 This describes a permutation of the possible coding categories. */
53 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
55 Lisp_Object Qcoding_system_p;
57 Lisp_Object Qno_conversion, Qccl, Qiso2022;
58 /* Qinternal in general.c */
60 Lisp_Object Qmnemonic, Qeol_type;
61 Lisp_Object Qcr, Qcrlf, Qlf;
62 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
63 Lisp_Object Qpost_read_conversion;
64 Lisp_Object Qpre_write_conversion;
67 Lisp_Object Qbig5, Qshift_jis;
68 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
69 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
70 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
71 Lisp_Object Qno_iso6429;
72 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
73 Lisp_Object Qctext, Qescape_quoted;
74 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
76 Lisp_Object Qencode, Qdecode;
78 Lisp_Object Vcoding_system_hash_table;
80 int enable_multibyte_characters;
83 /* Additional information used by the ISO2022 decoder and detector. */
84 struct iso2022_decoder
86 /* CHARSET holds the character sets currently assigned to the G0
87 through G3 variables. It is initialized from the array
88 INITIAL_CHARSET in CODESYS. */
89 Lisp_Object charset[4];
91 /* Which registers are currently invoked into the left (GL) and
92 right (GR) halves of the 8-bit encoding space? */
93 int register_left, register_right;
95 /* ISO_ESC holds a value indicating part of an escape sequence
96 that has already been seen. */
97 enum iso_esc_flag esc;
99 /* This records the bytes we've seen so far in an escape sequence,
100 in case the sequence is invalid (we spit out the bytes unchanged). */
101 unsigned char esc_bytes[8];
103 /* Index for next byte to store in ISO escape sequence. */
106 /* Stuff seen so far when composing a string. */
107 unsigned_char_dynarr *composite_chars;
109 /* If we saw an invalid designation sequence for a particular
110 register, we flag it here and switch to ASCII. The next time we
111 see a valid designation for this register, we turn off the flag
112 and do the designation normally, but pretend the sequence was
113 invalid. The effect of all this is that (most of the time) the
114 escape sequences for both the switch to the unknown charset, and
115 the switch back to the known charset, get inserted literally into
116 the buffer and saved out as such. The hope is that we can
117 preserve the escape sequences so that the resulting written out
118 file makes sense. If we don't do any of this, the designation
119 to the invalid charset will be preserved but that switch back
120 to the known charset will probably get eaten because it was
121 the same charset that was already present in the register. */
122 unsigned char invalid_designated[4];
124 /* We try to do similar things as above for direction-switching
125 sequences. If we encountered a direction switch while an
126 invalid designation was present, or an invalid designation
127 just after a direction switch (i.e. no valid designation
128 encountered yet), we insert the direction-switch escape
129 sequence literally into the output stream, and later on
130 insert the corresponding direction-restoring escape sequence
132 unsigned int switched_dir_and_no_valid_charset_yet :1;
133 unsigned int invalid_switch_dir :1;
135 /* Tells the decoder to output the escape sequence literally
136 even though it was valid. Used in the games we play to
137 avoid lossage when we encounter invalid designations. */
138 unsigned int output_literally :1;
139 /* We encountered a direction switch followed by an invalid
140 designation. We didn't output the direction switch
141 literally because we didn't know about the invalid designation;
142 but we have to do so now. */
143 unsigned int output_direction_sequence :1;
146 EXFUN (Fcopy_coding_system, 2);
148 struct detection_state;
149 static int detect_coding_sjis (struct detection_state *st,
150 CONST unsigned char *src,
152 static void decode_coding_sjis (Lstream *decoding,
153 CONST unsigned char *src,
154 unsigned_char_dynarr *dst,
156 static void encode_coding_sjis (Lstream *encoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static int detect_coding_big5 (struct detection_state *st,
161 CONST unsigned char *src,
163 static void decode_coding_big5 (Lstream *decoding,
164 CONST unsigned char *src,
165 unsigned_char_dynarr *dst, unsigned int n);
166 static void encode_coding_big5 (Lstream *encoding,
167 CONST unsigned char *src,
168 unsigned_char_dynarr *dst, unsigned int n);
169 static int postprocess_iso2022_mask (int mask);
170 static void reset_iso2022 (Lisp_Object coding_system,
171 struct iso2022_decoder *iso);
172 static int detect_coding_iso2022 (struct detection_state *st,
173 CONST unsigned char *src,
175 static void decode_coding_iso2022 (Lstream *decoding,
176 CONST unsigned char *src,
177 unsigned_char_dynarr *dst, unsigned int n);
178 static void encode_coding_iso2022 (Lstream *encoding,
179 CONST unsigned char *src,
180 unsigned_char_dynarr *dst, unsigned int n);
182 static void decode_coding_no_conversion (Lstream *decoding,
183 CONST unsigned char *src,
184 unsigned_char_dynarr *dst,
186 static void encode_coding_no_conversion (Lstream *encoding,
187 CONST unsigned char *src,
188 unsigned_char_dynarr *dst,
190 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
191 unsigned_char_dynarr *dst, unsigned int n);
192 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
193 unsigned_char_dynarr *dst, unsigned int n);
195 typedef struct codesys_prop codesys_prop;
204 Dynarr_declare (codesys_prop);
205 } codesys_prop_dynarr;
207 codesys_prop_dynarr *the_codesys_prop_dynarr;
209 enum codesys_prop_enum
212 CODESYS_PROP_ISO2022,
217 /************************************************************************/
218 /* Coding system functions */
219 /************************************************************************/
221 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
222 static void print_coding_system (Lisp_Object, Lisp_Object, int);
223 static void finalize_coding_system (void *header, int for_disksave);
225 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
226 mark_coding_system, print_coding_system,
227 finalize_coding_system,
228 0, 0, struct Lisp_Coding_System);
231 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
233 struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
235 markobj (CODING_SYSTEM_NAME (codesys));
236 markobj (CODING_SYSTEM_DOC_STRING (codesys));
237 markobj (CODING_SYSTEM_MNEMONIC (codesys));
238 markobj (CODING_SYSTEM_EOL_LF (codesys));
239 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
240 markobj (CODING_SYSTEM_EOL_CR (codesys));
242 switch (CODING_SYSTEM_TYPE (codesys))
246 case CODESYS_ISO2022:
247 for (i = 0; i < 4; i++)
248 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
249 if (codesys->iso2022.input_conv)
251 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
253 struct charset_conversion_spec *ccs =
254 Dynarr_atp (codesys->iso2022.input_conv, i);
255 markobj (ccs->from_charset);
256 markobj (ccs->to_charset);
259 if (codesys->iso2022.output_conv)
261 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
263 struct charset_conversion_spec *ccs =
264 Dynarr_atp (codesys->iso2022.output_conv, i);
265 markobj (ccs->from_charset);
266 markobj (ccs->to_charset);
272 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
273 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
280 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
281 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
285 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
288 struct Lisp_Coding_System *c = XCODING_SYSTEM (obj);
290 error ("printing unreadable object #<coding_system 0x%x>",
293 write_c_string ("#<coding_system ", printcharfun);
294 print_internal (c->name, printcharfun, 1);
295 write_c_string (">", printcharfun);
299 finalize_coding_system (void *header, int for_disksave)
301 struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header;
302 /* Since coding systems never go away, this function is not
303 necessary. But it would be necessary if we changed things
304 so that coding systems could go away. */
305 if (!for_disksave) /* see comment in lstream.c */
307 switch (CODING_SYSTEM_TYPE (c))
310 case CODESYS_ISO2022:
311 if (c->iso2022.input_conv)
313 Dynarr_free (c->iso2022.input_conv);
314 c->iso2022.input_conv = 0;
316 if (c->iso2022.output_conv)
318 Dynarr_free (c->iso2022.output_conv);
319 c->iso2022.output_conv = 0;
330 symbol_to_eol_type (Lisp_Object symbol)
332 CHECK_SYMBOL (symbol);
333 if (NILP (symbol)) return EOL_AUTODETECT;
334 if (EQ (symbol, Qlf)) return EOL_LF;
335 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
336 if (EQ (symbol, Qcr)) return EOL_CR;
338 signal_simple_error ("Unrecognized eol type", symbol);
339 return EOL_AUTODETECT; /* not reached */
343 eol_type_to_symbol (enum eol_type type)
348 case EOL_LF: return Qlf;
349 case EOL_CRLF: return Qcrlf;
350 case EOL_CR: return Qcr;
351 case EOL_AUTODETECT: return Qnil;
356 setup_eol_coding_systems (struct Lisp_Coding_System *codesys)
358 Lisp_Object codesys_obj;
359 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
360 char *codesys_name = (char *) alloca (len + 7);
362 char *codesys_mnemonic=0;
364 Lisp_Object codesys_name_sym, sub_codesys_obj;
368 XSETCODING_SYSTEM (codesys_obj, codesys);
370 memcpy (codesys_name,
371 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
373 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
375 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
376 codesys_mnemonic = (char *) alloca (mlen + 7);
377 memcpy (codesys_mnemonic,
378 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
381 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
382 strcpy (codesys_name + len, "-" op_sys); \
384 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
385 codesys_name_sym = intern (codesys_name); \
386 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
387 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
389 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
390 build_string (codesys_mnemonic); \
391 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
394 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
395 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
396 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
399 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
400 Return t if OBJECT is a coding system.
401 A coding system is an object that defines how text containing multiple
402 character sets is encoded into a stream of (typically 8-bit) bytes.
403 The coding system is used to decode the stream into a series of
404 characters (which may be from multiple charsets) when the text is read
405 from a file or process, and is used to encode the text back into the
406 same format when it is written out to a file or process.
408 For example, many ISO2022-compliant coding systems (such as Compound
409 Text, which is used for inter-client data under the X Window System)
410 use escape sequences to switch between different charsets -- Japanese
411 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
412 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
413 `make-coding-system' for more information.
415 Coding systems are normally identified using a symbol, and the
416 symbol is accepted in place of the actual coding system object whenever
417 a coding system is called for. (This is similar to how faces work.)
421 return CODING_SYSTEMP (object) ? Qt : Qnil;
424 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
425 Retrieve the coding system of the given name.
427 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
428 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
429 If there is no such coding system, nil is returned. Otherwise the
430 associated coding system object is returned.
432 (coding_system_or_name))
434 if (CODING_SYSTEMP (coding_system_or_name))
435 return coding_system_or_name;
437 if (NILP (coding_system_or_name))
438 coding_system_or_name = Qbinary;
440 CHECK_SYMBOL (coding_system_or_name);
442 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
445 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
446 Retrieve the coding system of the given name.
447 Same as `find-coding-system' except that if there is no such
448 coding system, an error is signaled instead of returning nil.
452 Lisp_Object coding_system = Ffind_coding_system (name);
454 if (NILP (coding_system))
455 signal_simple_error ("No such coding system", name);
456 return coding_system;
459 /* We store the coding systems in hash tables with the names as the key and the
460 actual coding system object as the value. Occasionally we need to use them
461 in a list format. These routines provide us with that. */
462 struct coding_system_list_closure
464 Lisp_Object *coding_system_list;
468 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
469 void *coding_system_list_closure)
471 /* This function can GC */
472 struct coding_system_list_closure *cscl =
473 (struct coding_system_list_closure *) coding_system_list_closure;
474 Lisp_Object *coding_system_list = cscl->coding_system_list;
476 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
477 *coding_system_list);
481 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
482 Return a list of the names of all defined coding systems.
486 Lisp_Object coding_system_list = Qnil;
488 struct coding_system_list_closure coding_system_list_closure;
490 GCPRO1 (coding_system_list);
491 coding_system_list_closure.coding_system_list = &coding_system_list;
492 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
493 &coding_system_list_closure);
496 return coding_system_list;
499 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
500 Return the name of the given coding system.
504 coding_system = Fget_coding_system (coding_system);
505 return XCODING_SYSTEM_NAME (coding_system);
508 static struct Lisp_Coding_System *
509 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
511 struct Lisp_Coding_System *codesys =
512 alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system);
514 zero_lcrecord (codesys);
515 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
516 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
517 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
518 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
519 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
520 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
521 CODING_SYSTEM_TYPE (codesys) = type;
522 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
524 if (type == CODESYS_ISO2022)
527 for (i = 0; i < 4; i++)
528 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
530 else if (type == CODESYS_CCL)
532 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
533 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
536 CODING_SYSTEM_NAME (codesys) = name;
542 /* Given a list of charset conversion specs as specified in a Lisp
543 program, parse it into STORE_HERE. */
546 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
547 Lisp_Object spec_list)
551 EXTERNAL_LIST_LOOP (rest, spec_list)
553 Lisp_Object car = XCAR (rest);
554 Lisp_Object from, to;
555 struct charset_conversion_spec spec;
557 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
558 signal_simple_error ("Invalid charset conversion spec", car);
559 from = Fget_charset (XCAR (car));
560 to = Fget_charset (XCAR (XCDR (car)));
561 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
562 signal_simple_error_2
563 ("Attempted conversion between different charset types",
565 spec.from_charset = from;
566 spec.to_charset = to;
568 Dynarr_add (store_here, spec);
572 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
573 specs, return the equivalent as the Lisp programmer would see it.
575 If LOAD_HERE is 0, return Qnil. */
578 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
585 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
587 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
588 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
591 return Fnreverse (result);
596 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
597 Register symbol NAME as a coding system.
599 TYPE describes the conversion method used and should be one of
602 Automatic conversion. XEmacs attempts to detect the coding system
605 No conversion. Use this for binary files and such. On output,
606 graphic characters that are not in ASCII or Latin-1 will be
607 replaced by a ?. (For a no-conversion-encoded buffer, these
608 characters will only be present if you explicitly insert them.)
610 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
612 Any ISO2022-compliant encoding. Among other things, this includes
613 JIS (the Japanese encoding commonly used for e-mail), EUC (the
614 standard Unix encoding for Japanese and other languages), and
615 Compound Text (the encoding used in X11). You can specify more
616 specific information about the conversion with the FLAGS argument.
618 Big5 (the encoding commonly used for Taiwanese).
620 The conversion is performed using a user-written pseudo-code
621 program. CCL (Code Conversion Language) is the name of this
624 Write out or read in the raw contents of the memory representing
625 the buffer's text. This is primarily useful for debugging
626 purposes, and is only enabled when XEmacs has been compiled with
627 DEBUG_XEMACS defined (via the --debug configure option).
628 WARNING: Reading in a file using 'internal conversion can result
629 in an internal inconsistency in the memory representing a
630 buffer's text, which will produce unpredictable results and may
631 cause XEmacs to crash. Under normal circumstances you should
632 never use 'internal conversion.
634 DOC-STRING is a string describing the coding system.
636 PROPS is a property list, describing the specific nature of the
637 character set. Recognized properties are:
640 String to be displayed in the modeline when this coding system is
644 End-of-line conversion to be used. It should be one of
647 Automatically detect the end-of-line type (LF, CRLF,
648 or CR). Also generate subsidiary coding systems named
649 `NAME-unix', `NAME-dos', and `NAME-mac', that are
650 identical to this coding system but have an EOL-TYPE
651 value of 'lf, 'crlf, and 'cr, respectively.
653 The end of a line is marked externally using ASCII LF.
654 Since this is also the way that XEmacs represents an
655 end-of-line internally, specifying this option results
656 in no end-of-line conversion. This is the standard
657 format for Unix text files.
659 The end of a line is marked externally using ASCII
660 CRLF. This is the standard format for MS-DOS text
663 The end of a line is marked externally using ASCII CR.
664 This is the standard format for Macintosh text files.
666 Automatically detect the end-of-line type but do not
667 generate subsidiary coding systems. (This value is
668 converted to nil when stored internally, and
669 `coding-system-property' will return nil.)
671 'post-read-conversion
672 Function called after a file has been read in, to perform the
673 decoding. Called with two arguments, BEG and END, denoting
674 a region of the current buffer to be decoded.
676 'pre-write-conversion
677 Function called before a file is written out, to perform the
678 encoding. Called with two arguments, BEG and END, denoting
679 a region of the current buffer to be encoded.
682 The following additional properties are recognized if TYPE is 'iso2022:
688 The character set initially designated to the G0 - G3 registers.
689 The value should be one of
691 -- A charset object (designate that character set)
692 -- nil (do not ever use this register)
693 -- t (no character set is initially designated to
694 the register, but may be later on; this automatically
695 sets the corresponding `force-g*-on-output' property)
701 If non-nil, send an explicit designation sequence on output before
702 using the specified register.
705 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
706 "ESC $ B" on output in place of the full designation sequences
707 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
710 If non-nil, don't designate ASCII to G0 at each end of line on output.
711 Setting this to non-nil also suppresses other state-resetting that
712 normally happens at the end of a line.
715 If non-nil, don't designate ASCII to G0 before control chars on output.
718 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
722 If non-nil, use locking-shift (SO/SI) instead of single-shift
723 or designation by escape sequence.
726 If non-nil, don't use ISO6429's direction specification.
729 If non-nil, literal control characters that are the same as
730 the beginning of a recognized ISO2022 or ISO6429 escape sequence
731 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
732 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
733 so that they can be properly distinguished from an escape sequence.
734 (Note that doing this results in a non-portable encoding.) This
735 encoding flag is used for byte-compiled files. Note that ESC
736 is a good choice for a quoting character because there are no
737 escape sequences whose second byte is a character from the Control-0
738 or Control-1 character sets; this is explicitly disallowed by the
741 'input-charset-conversion
742 A list of conversion specifications, specifying conversion of
743 characters in one charset to another when decoding is performed.
744 Each specification is a list of two elements: the source charset,
745 and the destination charset.
747 'output-charset-conversion
748 A list of conversion specifications, specifying conversion of
749 characters in one charset to another when encoding is performed.
750 The form of each specification is the same as for
751 'input-charset-conversion.
754 The following additional properties are recognized (and required)
758 CCL program used for decoding (converting to internal format).
761 CCL program used for encoding (converting to external format).
763 (name, type, doc_string, props))
765 struct Lisp_Coding_System *codesys;
766 Lisp_Object rest, key, value;
767 enum coding_system_type ty;
768 int need_to_setup_eol_systems = 1;
770 /* Convert type to constant */
771 if (NILP (type) || EQ (type, Qundecided))
772 { ty = CODESYS_AUTODETECT; }
774 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
775 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
776 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
777 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
779 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
781 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
784 signal_simple_error ("Invalid coding system type", type);
788 codesys = allocate_coding_system (ty, name);
790 if (NILP (doc_string))
791 doc_string = build_string ("");
793 CHECK_STRING (doc_string);
794 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
796 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
798 if (EQ (key, Qmnemonic))
801 CHECK_STRING (value);
802 CODING_SYSTEM_MNEMONIC (codesys) = value;
805 else if (EQ (key, Qeol_type))
807 need_to_setup_eol_systems = NILP (value);
810 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
813 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
814 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
816 else if (ty == CODESYS_ISO2022)
818 #define FROB_INITIAL_CHARSET(charset_num) \
819 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
820 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
822 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
823 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
824 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
825 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
827 #define FROB_FORCE_CHARSET(charset_num) \
828 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
830 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
831 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
832 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
833 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
835 #define FROB_BOOLEAN_PROPERTY(prop) \
836 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
838 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
839 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
840 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
841 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
842 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
843 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
844 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
846 else if (EQ (key, Qinput_charset_conversion))
848 codesys->iso2022.input_conv =
849 Dynarr_new (charset_conversion_spec);
850 parse_charset_conversion_specs (codesys->iso2022.input_conv,
853 else if (EQ (key, Qoutput_charset_conversion))
855 codesys->iso2022.output_conv =
856 Dynarr_new (charset_conversion_spec);
857 parse_charset_conversion_specs (codesys->iso2022.output_conv,
861 signal_simple_error ("Unrecognized property", key);
863 else if (EQ (type, Qccl))
865 if (EQ (key, Qdecode))
867 CHECK_VECTOR (value);
868 CODING_SYSTEM_CCL_DECODE (codesys) = value;
870 else if (EQ (key, Qencode))
872 CHECK_VECTOR (value);
873 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
876 signal_simple_error ("Unrecognized property", key);
880 signal_simple_error ("Unrecognized property", key);
883 if (need_to_setup_eol_systems)
884 setup_eol_coding_systems (codesys);
887 Lisp_Object codesys_obj;
888 XSETCODING_SYSTEM (codesys_obj, codesys);
889 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
894 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
895 Copy OLD-CODING-SYSTEM to NEW-NAME.
896 If NEW-NAME does not name an existing coding system, a new one will
899 (old_coding_system, new_name))
901 Lisp_Object new_coding_system;
902 old_coding_system = Fget_coding_system (old_coding_system);
903 new_coding_system = Ffind_coding_system (new_name);
904 if (NILP (new_coding_system))
906 XSETCODING_SYSTEM (new_coding_system,
907 allocate_coding_system
908 (XCODING_SYSTEM_TYPE (old_coding_system),
910 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
914 struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
915 struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
916 memcpy (((char *) to ) + sizeof (to->header),
917 ((char *) from) + sizeof (from->header),
918 sizeof (*from) - sizeof (from->header));
921 return new_coding_system;
925 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
927 struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
928 Lisp_Object new_coding_system;
930 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
931 return coding_system;
935 case EOL_AUTODETECT: return coding_system;
936 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
937 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
938 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
942 return NILP (new_coding_system) ? coding_system : new_coding_system;
945 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
946 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
948 (coding_system, eol_type))
950 coding_system = Fget_coding_system (coding_system);
952 return subsidiary_coding_system (coding_system,
953 symbol_to_eol_type (eol_type));
957 /************************************************************************/
958 /* Coding system accessors */
959 /************************************************************************/
961 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
962 Return the doc string for CODING-SYSTEM.
966 coding_system = Fget_coding_system (coding_system);
967 return XCODING_SYSTEM_DOC_STRING (coding_system);
970 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
971 Return the type of CODING-SYSTEM.
975 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
978 case CODESYS_AUTODETECT: return Qundecided;
980 case CODESYS_SHIFT_JIS: return Qshift_jis;
981 case CODESYS_ISO2022: return Qiso2022;
982 case CODESYS_BIG5: return Qbig5;
983 case CODESYS_CCL: return Qccl;
985 case CODESYS_NO_CONVERSION: return Qno_conversion;
987 case CODESYS_INTERNAL: return Qinternal;
994 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
997 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
999 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1002 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1003 Return initial charset of CODING-SYSTEM designated to GNUM.
1006 (coding_system, gnum))
1008 coding_system = Fget_coding_system (coding_system);
1011 return coding_system_charset (coding_system, XINT (gnum));
1015 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1016 Return the PROP property of CODING-SYSTEM.
1018 (coding_system, prop))
1021 enum coding_system_type type;
1023 coding_system = Fget_coding_system (coding_system);
1024 CHECK_SYMBOL (prop);
1025 type = XCODING_SYSTEM_TYPE (coding_system);
1027 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1028 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1031 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1033 case CODESYS_PROP_ALL_OK:
1036 case CODESYS_PROP_ISO2022:
1037 if (type != CODESYS_ISO2022)
1039 ("Property only valid in ISO2022 coding systems",
1043 case CODESYS_PROP_CCL:
1044 if (type != CODESYS_CCL)
1046 ("Property only valid in CCL coding systems",
1056 signal_simple_error ("Unrecognized property", prop);
1058 if (EQ (prop, Qname))
1059 return XCODING_SYSTEM_NAME (coding_system);
1060 else if (EQ (prop, Qtype))
1061 return Fcoding_system_type (coding_system);
1062 else if (EQ (prop, Qdoc_string))
1063 return XCODING_SYSTEM_DOC_STRING (coding_system);
1064 else if (EQ (prop, Qmnemonic))
1065 return XCODING_SYSTEM_MNEMONIC (coding_system);
1066 else if (EQ (prop, Qeol_type))
1067 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1068 else if (EQ (prop, Qeol_lf))
1069 return XCODING_SYSTEM_EOL_LF (coding_system);
1070 else if (EQ (prop, Qeol_crlf))
1071 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1072 else if (EQ (prop, Qeol_cr))
1073 return XCODING_SYSTEM_EOL_CR (coding_system);
1074 else if (EQ (prop, Qpost_read_conversion))
1075 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1076 else if (EQ (prop, Qpre_write_conversion))
1077 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1079 else if (type == CODESYS_ISO2022)
1081 if (EQ (prop, Qcharset_g0))
1082 return coding_system_charset (coding_system, 0);
1083 else if (EQ (prop, Qcharset_g1))
1084 return coding_system_charset (coding_system, 1);
1085 else if (EQ (prop, Qcharset_g2))
1086 return coding_system_charset (coding_system, 2);
1087 else if (EQ (prop, Qcharset_g3))
1088 return coding_system_charset (coding_system, 3);
1090 #define FORCE_CHARSET(charset_num) \
1091 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1092 (coding_system, charset_num) ? Qt : Qnil)
1094 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1095 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1096 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1097 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1099 #define LISP_BOOLEAN(prop) \
1100 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1102 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1103 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1104 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1105 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1106 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1107 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1108 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1110 else if (EQ (prop, Qinput_charset_conversion))
1112 unparse_charset_conversion_specs
1113 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1114 else if (EQ (prop, Qoutput_charset_conversion))
1116 unparse_charset_conversion_specs
1117 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1121 else if (type == CODESYS_CCL)
1123 if (EQ (prop, Qdecode))
1124 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1125 else if (EQ (prop, Qencode))
1126 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1134 return Qnil; /* not reached */
1138 /************************************************************************/
1139 /* Coding category functions */
1140 /************************************************************************/
1143 decode_coding_category (Lisp_Object symbol)
1147 CHECK_SYMBOL (symbol);
1148 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1149 if (EQ (coding_category_symbol[i], symbol))
1152 signal_simple_error ("Unrecognized coding category", symbol);
1153 return 0; /* not reached */
1156 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1157 Return a list of all recognized coding categories.
1162 Lisp_Object list = Qnil;
1164 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1165 list = Fcons (coding_category_symbol[i], list);
1169 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1170 Change the priority order of the coding categories.
1171 LIST should be list of coding categories, in descending order of
1172 priority. Unspecified coding categories will be lower in priority
1173 than all specified ones, in the same relative order they were in
1178 int category_to_priority[CODING_CATEGORY_LAST + 1];
1182 /* First generate a list that maps coding categories to priorities. */
1184 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1185 category_to_priority[i] = -1;
1187 /* Highest priority comes from the specified list. */
1189 EXTERNAL_LIST_LOOP (rest, list)
1191 int cat = decode_coding_category (XCAR (rest));
1193 if (category_to_priority[cat] >= 0)
1194 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1195 category_to_priority[cat] = i++;
1198 /* Now go through the existing categories by priority to retrieve
1199 the categories not yet specified and preserve their priority
1201 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1203 int cat = coding_category_by_priority[j];
1204 if (category_to_priority[cat] < 0)
1205 category_to_priority[cat] = i++;
1208 /* Now we need to construct the inverse of the mapping we just
1211 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1212 coding_category_by_priority[category_to_priority[i]] = i;
1214 /* Phew! That was confusing. */
1218 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1219 Return a list of coding categories in descending order of priority.
1224 Lisp_Object list = Qnil;
1226 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1227 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1232 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1233 Change the coding system associated with a coding category.
1235 (coding_category, coding_system))
1237 int cat = decode_coding_category (coding_category);
1239 coding_system = Fget_coding_system (coding_system);
1240 coding_category_system[cat] = coding_system;
1244 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1245 Return the coding system associated with a coding category.
1249 int cat = decode_coding_category (coding_category);
1250 Lisp_Object sys = coding_category_system[cat];
1253 return XCODING_SYSTEM_NAME (sys);
1258 /************************************************************************/
1259 /* Detecting the encoding of data */
1260 /************************************************************************/
1262 struct detection_state
1264 enum eol_type eol_type;
1286 struct iso2022_decoder iso;
1288 int high_byte_count;
1289 unsigned int saw_single_shift:1;
1302 acceptable_control_char_p (int c)
1306 /* Allow and ignore control characters that you might
1307 reasonably see in a text file */
1312 case 8: /* backspace */
1313 case 11: /* vertical tab */
1314 case 12: /* form feed */
1315 case 26: /* MS-DOS C-z junk */
1316 case 31: /* '^_' -- for info */
1324 mask_has_at_most_one_bit_p (int mask)
1326 /* Perhaps the only thing useful you learn from intensive Microsoft
1327 technical interviews */
1328 return (mask & (mask - 1)) == 0;
1331 static enum eol_type
1332 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1341 st->eol.just_saw_cr = 1;
1346 if (st->eol.just_saw_cr)
1348 else if (st->eol.seen_anything)
1351 else if (st->eol.just_saw_cr)
1353 st->eol.just_saw_cr = 0;
1355 st->eol.seen_anything = 1;
1358 return EOL_AUTODETECT;
1361 /* Attempt to determine the encoding and EOL type of the given text.
1362 Before calling this function for the first type, you must initialize
1363 st->eol_type as appropriate and initialize st->mask to ~0.
1365 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1368 st->mask holds the determined coding category mask, or ~0 if only
1369 ASCII has been seen so far.
1373 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1374 is present in st->mask
1375 1 == definitive answers are here for both st->eol_type and st->mask
1379 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1380 unsigned int n, int just_do_eol)
1384 if (st->eol_type == EOL_AUTODETECT)
1385 st->eol_type = detect_eol_type (st, src, n);
1388 return st->eol_type != EOL_AUTODETECT;
1390 if (!st->seen_non_ascii)
1392 for (; n; n--, src++)
1395 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1397 st->seen_non_ascii = 1;
1399 st->shift_jis.mask = ~0;
1401 st->iso2022.mask = ~0;
1411 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1412 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1413 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1414 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1415 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1416 st->big5.mask = detect_coding_big5 (st, src, n);
1418 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask;
1421 int retval = mask_has_at_most_one_bit_p (st->mask);
1422 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1423 return retval && st->eol_type != EOL_AUTODETECT;
1428 coding_system_from_mask (int mask)
1432 /* If the file was entirely or basically ASCII, use the
1433 default value of `buffer-file-coding-system'. */
1434 Lisp_Object retval =
1435 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1438 retval = Ffind_coding_system (retval);
1442 (Qbad_variable, Qwarning,
1443 "Invalid `default-buffer-file-coding-system', set to nil");
1444 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1448 retval = Fget_coding_system (Qno_conversion);
1456 mask = postprocess_iso2022_mask (mask);
1458 /* Look through the coding categories by priority and find
1459 the first one that is allowed. */
1460 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1462 cat = coding_category_by_priority[i];
1463 if ((mask & (1 << cat)) &&
1464 !NILP (coding_category_system[cat]))
1468 return coding_category_system[cat];
1470 return Fget_coding_system (Qno_conversion);
1474 /* Given a seekable read stream and potential coding system and EOL type
1475 as specified, do any autodetection that is called for. If the
1476 coding system and/or EOL type are not autodetect, they will be left
1477 alone; but this function will never return an autodetect coding system
1480 This function does not automatically fetch subsidiary coding systems;
1481 that should be unnecessary with the explicit eol-type argument. */
1484 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1485 enum eol_type *eol_type_in_out)
1487 struct detection_state decst;
1489 if (*eol_type_in_out == EOL_AUTODETECT)
1490 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1493 decst.eol_type = *eol_type_in_out;
1496 /* If autodetection is called for, do it now. */
1497 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1498 *eol_type_in_out == EOL_AUTODETECT)
1503 unsigned char random_buffer[4096];
1506 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1509 if (detect_coding_type (&decst, random_buffer, nread,
1510 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1511 CODESYS_AUTODETECT))
1515 *eol_type_in_out = decst.eol_type;
1516 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1517 *codesys_in_out = coding_system_from_mask (decst.mask);
1520 /* If we absolutely can't determine the EOL type, just assume LF. */
1521 if (*eol_type_in_out == EOL_AUTODETECT)
1522 *eol_type_in_out = EOL_LF;
1524 Lstream_rewind (stream);
1527 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1528 Detect coding system of the text in the region between START and END.
1529 Returned a list of possible coding systems ordered by priority.
1530 If only ASCII characters are found, it returns 'undecided or one of
1531 its subsidiary coding systems according to a detected end-of-line
1532 type. Optional arg BUFFER defaults to the current buffer.
1534 (start, end, buffer))
1536 Lisp_Object val = Qnil;
1537 struct buffer *buf = decode_buffer (buffer, 0);
1539 Lisp_Object instream, lb_instream;
1540 Lstream *istr, *lb_istr;
1541 struct detection_state decst;
1542 struct gcpro gcpro1, gcpro2;
1544 get_buffer_range_char (buf, start, end, &b, &e, 0);
1545 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1546 lb_istr = XLSTREAM (lb_instream);
1547 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1548 istr = XLSTREAM (instream);
1549 GCPRO2 (instream, lb_instream);
1551 decst.eol_type = EOL_AUTODETECT;
1555 unsigned char random_buffer[4096];
1556 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1560 if (detect_coding_type (&decst, random_buffer, nread, 0))
1564 if (decst.mask == ~0)
1565 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1573 decst.mask = postprocess_iso2022_mask (decst.mask);
1575 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1577 int sys = coding_category_by_priority[i];
1578 if (decst.mask & (1 << sys))
1580 Lisp_Object codesys = coding_category_system[sys];
1581 if (!NILP (codesys))
1582 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1583 val = Fcons (codesys, val);
1587 Lstream_close (istr);
1589 Lstream_delete (istr);
1590 Lstream_delete (lb_istr);
1595 /************************************************************************/
1596 /* Converting to internal Mule format ("decoding") */
1597 /************************************************************************/
1599 /* A decoding stream is a stream used for decoding text (i.e.
1600 converting from some external format to internal format).
1601 The decoding-stream object keeps track of the actual coding
1602 stream, the stream that is at the other end, and data that
1603 needs to be persistent across the lifetime of the stream. */
1605 /* Handle the EOL stuff related to just-read-in character C.
1606 EOL_TYPE is the EOL type of the coding stream.
1607 FLAGS is the current value of FLAGS in the coding stream, and may
1608 be modified by this macro. (The macro only looks at the
1609 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1610 bytes are to be written. You need to also define a local goto
1611 label "label_continue_loop" that is at the end of the main
1612 character-reading loop.
1614 If C is a CR character, then this macro handles it entirely and
1615 jumps to label_continue_loop. Otherwise, this macro does not add
1616 anything to DST, and continues normally. You should continue
1617 processing C normally after this macro. */
1619 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1623 if (eol_type == EOL_CR) \
1624 Dynarr_add (dst, '\n'); \
1625 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1626 Dynarr_add (dst, c); \
1628 flags |= CODING_STATE_CR; \
1629 goto label_continue_loop; \
1631 else if (flags & CODING_STATE_CR) \
1632 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1634 Dynarr_add (dst, '\r'); \
1635 flags &= ~CODING_STATE_CR; \
1639 /* C should be a binary character in the range 0 - 255; convert
1640 to internal format and add to Dynarr DST. */
1642 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1644 if (BYTE_ASCII_P (c)) \
1645 Dynarr_add (dst, c); \
1646 else if (BYTE_C1_P (c)) \
1648 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1649 Dynarr_add (dst, c + 0x20); \
1653 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1654 Dynarr_add (dst, c); \
1658 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1662 DECODE_ADD_BINARY_CHAR (ch, dst); \
1667 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1669 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1670 if ((flags & CODING_STATE_END) && \
1671 (flags & CODING_STATE_CR)) \
1672 Dynarr_add (dst, '\r'); \
1675 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1677 struct decoding_stream
1679 /* Coding system that governs the conversion. */
1680 struct Lisp_Coding_System *codesys;
1682 /* Stream that we read the encoded data from or
1683 write the decoded data to. */
1686 /* If we are reading, then we can return only a fixed amount of
1687 data, so if the conversion resulted in too much data, we store it
1688 here for retrieval the next time around. */
1689 unsigned_char_dynarr *runoff;
1691 /* FLAGS holds flags indicating the current state of the decoding.
1692 Some of these flags are dependent on the coding system. */
1695 /* CH holds a partially built-up character. Since we only deal
1696 with one- and two-byte characters at the moment, we only use
1697 this to store the first byte of a two-byte character. */
1700 /* EOL_TYPE specifies the type of end-of-line conversion that
1701 currently applies. We need to keep this separate from the
1702 EOL type stored in CODESYS because the latter might indicate
1703 automatic EOL-type detection while the former will always
1704 indicate a particular EOL type. */
1705 enum eol_type eol_type;
1707 /* Additional ISO2022 information. We define the structure above
1708 because it's also needed by the detection routines. */
1709 struct iso2022_decoder iso2022;
1711 /* Additional information (the state of the running CCL program)
1712 used by the CCL decoder. */
1713 struct ccl_program ccl;
1715 struct detection_state decst;
1718 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1719 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1720 static int decoding_rewinder (Lstream *stream);
1721 static int decoding_seekable_p (Lstream *stream);
1722 static int decoding_flusher (Lstream *stream);
1723 static int decoding_closer (Lstream *stream);
1725 static Lisp_Object decoding_marker (Lisp_Object stream,
1726 void (*markobj) (Lisp_Object));
1728 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1729 sizeof (struct decoding_stream));
1732 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1734 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1735 Lisp_Object str_obj;
1737 /* We do not need to mark the coding systems or charsets stored
1738 within the stream because they are stored in a global list
1739 and automatically marked. */
1741 XSETLSTREAM (str_obj, str);
1743 if (str->imp->marker)
1744 return (str->imp->marker) (str_obj, markobj);
1749 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1750 so we read data from the other end, decode it, and store it into DATA. */
1753 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1755 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1756 unsigned char *orig_data = data;
1758 int error_occurred = 0;
1760 /* We need to interface to mule_decode(), which expects to take some
1761 amount of data and store the result into a Dynarr. We have
1762 mule_decode() store into str->runoff, and take data from there
1765 /* We loop until we have enough data, reading chunks from the other
1766 end and decoding it. */
1769 /* Take data from the runoff if we can. Make sure to take at
1770 most SIZE bytes, and delete the data from the runoff. */
1771 if (Dynarr_length (str->runoff) > 0)
1773 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1774 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1775 Dynarr_delete_many (str->runoff, 0, chunk);
1781 break; /* No more room for data */
1783 if (str->flags & CODING_STATE_END)
1784 /* This means that on the previous iteration, we hit the EOF on
1785 the other end. We loop once more so that mule_decode() can
1786 output any final stuff it may be holding, or any "go back
1787 to a sane state" escape sequences. (This latter makes sense
1788 during encoding.) */
1791 /* Exhausted the runoff, so get some more. DATA has at least
1792 SIZE bytes left of storage in it, so it's OK to read directly
1793 into it. (We'll be overwriting above, after we've decoded it
1794 into the runoff.) */
1795 read_size = Lstream_read (str->other_end, data, size);
1802 /* There might be some more end data produced in the translation.
1803 See the comment above. */
1804 str->flags |= CODING_STATE_END;
1805 mule_decode (stream, data, str->runoff, read_size);
1808 if (data - orig_data == 0)
1809 return error_occurred ? -1 : 0;
1811 return data - orig_data;
1815 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1817 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1820 /* Decode all our data into the runoff, and then attempt to write
1821 it all out to the other end. Remove whatever chunk we succeeded
1823 mule_decode (stream, data, str->runoff, size);
1824 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1825 Dynarr_length (str->runoff));
1827 Dynarr_delete_many (str->runoff, 0, retval);
1828 /* Do NOT return retval. The return value indicates how much
1829 of the incoming data was written, not how many bytes were
1835 reset_decoding_stream (struct decoding_stream *str)
1838 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1840 Lisp_Object coding_system;
1841 XSETCODING_SYSTEM (coding_system, str->codesys);
1842 reset_iso2022 (coding_system, &str->iso2022);
1844 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1846 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1849 str->flags = str->ch = 0;
1853 decoding_rewinder (Lstream *stream)
1855 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1856 reset_decoding_stream (str);
1857 Dynarr_reset (str->runoff);
1858 return Lstream_rewind (str->other_end);
1862 decoding_seekable_p (Lstream *stream)
1864 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1865 return Lstream_seekable_p (str->other_end);
1869 decoding_flusher (Lstream *stream)
1871 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1872 return Lstream_flush (str->other_end);
1876 decoding_closer (Lstream *stream)
1878 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1879 if (stream->flags & LSTREAM_FL_WRITE)
1881 str->flags |= CODING_STATE_END;
1882 decoding_writer (stream, 0, 0);
1884 Dynarr_free (str->runoff);
1886 if (str->iso2022.composite_chars)
1887 Dynarr_free (str->iso2022.composite_chars);
1889 return Lstream_close (str->other_end);
1893 decoding_stream_coding_system (Lstream *stream)
1895 Lisp_Object coding_system;
1896 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1898 XSETCODING_SYSTEM (coding_system, str->codesys);
1899 return subsidiary_coding_system (coding_system, str->eol_type);
1903 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1905 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1906 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1908 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1909 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1910 reset_decoding_stream (str);
1913 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
1914 stream for writing, no automatic code detection will be performed.
1915 The reason for this is that automatic code detection requires a
1916 seekable input. Things will also fail if you open a decoding
1917 stream for reading using a non-fully-specified coding system and
1918 a non-seekable input stream. */
1921 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1924 Lstream *lstr = Lstream_new (lstream_decoding, mode);
1925 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1929 str->other_end = stream;
1930 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1931 str->eol_type = EOL_AUTODETECT;
1932 if (!strcmp (mode, "r")
1933 && Lstream_seekable_p (stream))
1934 /* We can determine the coding system now. */
1935 determine_real_coding_system (stream, &codesys, &str->eol_type);
1936 set_decoding_stream_coding_system (lstr, codesys);
1937 str->decst.eol_type = str->eol_type;
1938 str->decst.mask = ~0;
1939 XSETLSTREAM (obj, lstr);
1944 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
1946 return make_decoding_stream_1 (stream, codesys, "r");
1950 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
1952 return make_decoding_stream_1 (stream, codesys, "w");
1955 /* Note: the decode_coding_* functions all take the same
1956 arguments as mule_decode(), which is to say some SRC data of
1957 size N, which is to be stored into dynamic array DST.
1958 DECODING is the stream within which the decoding is
1959 taking place, but no data is actually read from or
1960 written to that stream; that is handled in decoding_reader()
1961 or decoding_writer(). This allows the same functions to
1962 be used for both reading and writing. */
1965 mule_decode (Lstream *decoding, CONST unsigned char *src,
1966 unsigned_char_dynarr *dst, unsigned int n)
1968 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
1970 /* If necessary, do encoding-detection now. We do this when
1971 we're a writing stream or a non-seekable reading stream,
1972 meaning that we can't just process the whole input,
1973 rewind, and start over. */
1975 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
1976 str->eol_type == EOL_AUTODETECT)
1978 Lisp_Object codesys;
1980 XSETCODING_SYSTEM (codesys, str->codesys);
1981 detect_coding_type (&str->decst, src, n,
1982 CODING_SYSTEM_TYPE (str->codesys) !=
1983 CODESYS_AUTODETECT);
1984 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
1985 str->decst.mask != ~0)
1986 /* #### This is cheesy. What we really ought to do is
1987 buffer up a certain amount of data so as to get a
1988 less random result. */
1989 codesys = coding_system_from_mask (str->decst.mask);
1990 str->eol_type = str->decst.eol_type;
1991 if (XCODING_SYSTEM (codesys) != str->codesys)
1993 /* Preserve the CODING_STATE_END flag in case it was set.
1994 If we erase it, bad things might happen. */
1995 int was_end = str->flags & CODING_STATE_END;
1996 set_decoding_stream_coding_system (decoding, codesys);
1998 str->flags |= CODING_STATE_END;
2002 switch (CODING_SYSTEM_TYPE (str->codesys))
2005 case CODESYS_INTERNAL:
2006 Dynarr_add_many (dst, src, n);
2009 case CODESYS_AUTODETECT:
2010 /* If we got this far and still haven't decided on the coding
2011 system, then do no conversion. */
2012 case CODESYS_NO_CONVERSION:
2013 decode_coding_no_conversion (decoding, src, dst, n);
2016 case CODESYS_SHIFT_JIS:
2017 decode_coding_sjis (decoding, src, dst, n);
2020 decode_coding_big5 (decoding, src, dst, n);
2023 ccl_driver (&str->ccl, src, dst, n, 0);
2025 case CODESYS_ISO2022:
2026 decode_coding_iso2022 (decoding, src, dst, n);
2034 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2035 Decode the text between START and END which is encoded in CODING-SYSTEM.
2036 This is useful if you've read in encoded text from a file without decoding
2037 it (e.g. you read in a JIS-formatted file but used the `binary' or
2038 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2039 Return length of decoded text.
2040 BUFFER defaults to the current buffer if unspecified.
2042 (start, end, coding_system, buffer))
2045 struct buffer *buf = decode_buffer (buffer, 0);
2046 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2047 Lstream *istr, *ostr;
2048 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2050 get_buffer_range_char (buf, start, end, &b, &e, 0);
2052 barf_if_buffer_read_only (buf, b, e);
2054 coding_system = Fget_coding_system (coding_system);
2055 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2056 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2057 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2059 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2060 Fget_coding_system (Qbinary));
2061 istr = XLSTREAM (instream);
2062 ostr = XLSTREAM (outstream);
2063 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2065 /* The chain of streams looks like this:
2067 [BUFFER] <----- send through
2068 ------> [ENCODE AS BINARY]
2069 ------> [DECODE AS SPECIFIED]
2075 char tempbuf[1024]; /* some random amount */
2076 Bufpos newpos, even_newer_pos;
2077 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2078 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2082 newpos = lisp_buffer_stream_startpos (istr);
2083 Lstream_write (ostr, tempbuf, size_in_bytes);
2084 even_newer_pos = lisp_buffer_stream_startpos (istr);
2085 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2088 Lstream_close (istr);
2089 Lstream_close (ostr);
2091 Lstream_delete (istr);
2092 Lstream_delete (ostr);
2093 Lstream_delete (XLSTREAM (de_outstream));
2094 Lstream_delete (XLSTREAM (lb_outstream));
2099 /************************************************************************/
2100 /* Converting to an external encoding ("encoding") */
2101 /************************************************************************/
2103 /* An encoding stream is an output stream. When you create the
2104 stream, you specify the coding system that governs the encoding
2105 and another stream that the resulting encoded data is to be
2106 sent to, and then start sending data to it. */
2108 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2110 struct encoding_stream
2112 /* Coding system that governs the conversion. */
2113 struct Lisp_Coding_System *codesys;
2115 /* Stream that we read the encoded data from or
2116 write the decoded data to. */
2119 /* If we are reading, then we can return only a fixed amount of
2120 data, so if the conversion resulted in too much data, we store it
2121 here for retrieval the next time around. */
2122 unsigned_char_dynarr *runoff;
2124 /* FLAGS holds flags indicating the current state of the encoding.
2125 Some of these flags are dependent on the coding system. */
2128 /* CH holds a partially built-up character. Since we only deal
2129 with one- and two-byte characters at the moment, we only use
2130 this to store the first byte of a two-byte character. */
2133 /* Additional information used by the ISO2022 encoder. */
2136 /* CHARSET holds the character sets currently assigned to the G0
2137 through G3 registers. It is initialized from the array
2138 INITIAL_CHARSET in CODESYS. */
2139 Lisp_Object charset[4];
2141 /* Which registers are currently invoked into the left (GL) and
2142 right (GR) halves of the 8-bit encoding space? */
2143 int register_left, register_right;
2145 /* Whether we need to explicitly designate the charset in the
2146 G? register before using it. It is initialized from the
2147 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2148 unsigned char force_charset_on_output[4];
2150 /* Other state variables that need to be preserved across
2152 Lisp_Object current_charset;
2154 int current_char_boundary;
2157 /* Additional information (the state of the running CCL program)
2158 used by the CCL encoder. */
2159 struct ccl_program ccl;
2163 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2164 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2166 static int encoding_rewinder (Lstream *stream);
2167 static int encoding_seekable_p (Lstream *stream);
2168 static int encoding_flusher (Lstream *stream);
2169 static int encoding_closer (Lstream *stream);
2171 static Lisp_Object encoding_marker (Lisp_Object stream,
2172 void (*markobj) (Lisp_Object));
2174 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2175 sizeof (struct encoding_stream));
2178 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2180 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2181 Lisp_Object str_obj;
2183 /* We do not need to mark the coding systems or charsets stored
2184 within the stream because they are stored in a global list
2185 and automatically marked. */
2187 XSETLSTREAM (str_obj, str);
2189 if (str->imp->marker)
2190 return (str->imp->marker) (str_obj, markobj);
2195 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2196 so we read data from the other end, encode it, and store it into DATA. */
2199 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2201 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2202 unsigned char *orig_data = data;
2204 int error_occurred = 0;
2206 /* We need to interface to mule_encode(), which expects to take some
2207 amount of data and store the result into a Dynarr. We have
2208 mule_encode() store into str->runoff, and take data from there
2211 /* We loop until we have enough data, reading chunks from the other
2212 end and encoding it. */
2215 /* Take data from the runoff if we can. Make sure to take at
2216 most SIZE bytes, and delete the data from the runoff. */
2217 if (Dynarr_length (str->runoff) > 0)
2219 int chunk = min ((int) size, Dynarr_length (str->runoff));
2220 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2221 Dynarr_delete_many (str->runoff, 0, chunk);
2227 break; /* No more room for data */
2229 if (str->flags & CODING_STATE_END)
2230 /* This means that on the previous iteration, we hit the EOF on
2231 the other end. We loop once more so that mule_encode() can
2232 output any final stuff it may be holding, or any "go back
2233 to a sane state" escape sequences. (This latter makes sense
2234 during encoding.) */
2237 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2238 left of storage in it, so it's OK to read directly into it.
2239 (We'll be overwriting above, after we've encoded it into the
2241 read_size = Lstream_read (str->other_end, data, size);
2248 /* There might be some more end data produced in the translation.
2249 See the comment above. */
2250 str->flags |= CODING_STATE_END;
2251 mule_encode (stream, data, str->runoff, read_size);
2254 if (data == orig_data)
2255 return error_occurred ? -1 : 0;
2257 return data - orig_data;
2261 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2263 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2266 /* Encode all our data into the runoff, and then attempt to write
2267 it all out to the other end. Remove whatever chunk we succeeded
2269 mule_encode (stream, data, str->runoff, size);
2270 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2271 Dynarr_length (str->runoff));
2273 Dynarr_delete_many (str->runoff, 0, retval);
2274 /* Do NOT return retval. The return value indicates how much
2275 of the incoming data was written, not how many bytes were
2281 reset_encoding_stream (struct encoding_stream *str)
2284 switch (CODING_SYSTEM_TYPE (str->codesys))
2286 case CODESYS_ISO2022:
2290 for (i = 0; i < 4; i++)
2292 str->iso2022.charset[i] =
2293 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2294 str->iso2022.force_charset_on_output[i] =
2295 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2297 str->iso2022.register_left = 0;
2298 str->iso2022.register_right = 1;
2299 str->iso2022.current_charset = Qnil;
2300 str->iso2022.current_half = 0;
2301 str->iso2022.current_char_boundary = 1;
2305 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2312 str->flags = str->ch = 0;
2316 encoding_rewinder (Lstream *stream)
2318 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2319 reset_encoding_stream (str);
2320 Dynarr_reset (str->runoff);
2321 return Lstream_rewind (str->other_end);
2325 encoding_seekable_p (Lstream *stream)
2327 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2328 return Lstream_seekable_p (str->other_end);
2332 encoding_flusher (Lstream *stream)
2334 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2335 return Lstream_flush (str->other_end);
2339 encoding_closer (Lstream *stream)
2341 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2342 if (stream->flags & LSTREAM_FL_WRITE)
2344 str->flags |= CODING_STATE_END;
2345 encoding_writer (stream, 0, 0);
2347 Dynarr_free (str->runoff);
2348 return Lstream_close (str->other_end);
2352 encoding_stream_coding_system (Lstream *stream)
2354 Lisp_Object coding_system;
2355 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2357 XSETCODING_SYSTEM (coding_system, str->codesys);
2358 return coding_system;
2362 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2364 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2365 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2367 reset_encoding_stream (str);
2371 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2374 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2375 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2379 str->runoff = Dynarr_new (unsigned_char);
2380 str->other_end = stream;
2381 set_encoding_stream_coding_system (lstr, codesys);
2382 XSETLSTREAM (obj, lstr);
2387 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2389 return make_encoding_stream_1 (stream, codesys, "r");
2393 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2395 return make_encoding_stream_1 (stream, codesys, "w");
2398 /* Convert N bytes of internally-formatted data stored in SRC to an
2399 external format, according to the encoding stream ENCODING.
2400 Store the encoded data into DST. */
2403 mule_encode (Lstream *encoding, CONST unsigned char *src,
2404 unsigned_char_dynarr *dst, unsigned int n)
2406 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2408 switch (CODING_SYSTEM_TYPE (str->codesys))
2411 case CODESYS_INTERNAL:
2412 Dynarr_add_many (dst, src, n);
2415 case CODESYS_AUTODETECT:
2416 /* If we got this far and still haven't decided on the coding
2417 system, then do no conversion. */
2418 case CODESYS_NO_CONVERSION:
2419 encode_coding_no_conversion (encoding, src, dst, n);
2422 case CODESYS_SHIFT_JIS:
2423 encode_coding_sjis (encoding, src, dst, n);
2426 encode_coding_big5 (encoding, src, dst, n);
2429 ccl_driver (&str->ccl, src, dst, n, 0);
2431 case CODESYS_ISO2022:
2432 encode_coding_iso2022 (encoding, src, dst, n);
2440 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2441 Encode the text between START and END using CODING-SYSTEM.
2442 This will, for example, convert Japanese characters into stuff such as
2443 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2444 text. BUFFER defaults to the current buffer if unspecified.
2446 (start, end, coding_system, buffer))
2449 struct buffer *buf = decode_buffer (buffer, 0);
2450 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2451 Lstream *istr, *ostr;
2452 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2454 get_buffer_range_char (buf, start, end, &b, &e, 0);
2456 barf_if_buffer_read_only (buf, b, e);
2458 coding_system = Fget_coding_system (coding_system);
2459 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2460 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2461 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2462 Fget_coding_system (Qbinary));
2463 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2465 istr = XLSTREAM (instream);
2466 ostr = XLSTREAM (outstream);
2467 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2468 /* The chain of streams looks like this:
2470 [BUFFER] <----- send through
2471 ------> [ENCODE AS SPECIFIED]
2472 ------> [DECODE AS BINARY]
2477 char tempbuf[1024]; /* some random amount */
2478 Bufpos newpos, even_newer_pos;
2479 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2480 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2484 newpos = lisp_buffer_stream_startpos (istr);
2485 Lstream_write (ostr, tempbuf, size_in_bytes);
2486 even_newer_pos = lisp_buffer_stream_startpos (istr);
2487 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2493 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2494 Lstream_close (istr);
2495 Lstream_close (ostr);
2497 Lstream_delete (istr);
2498 Lstream_delete (ostr);
2499 Lstream_delete (XLSTREAM (de_outstream));
2500 Lstream_delete (XLSTREAM (lb_outstream));
2501 return make_int (retlen);
2507 /************************************************************************/
2508 /* Shift-JIS methods */
2509 /************************************************************************/
2511 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2512 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2513 as is. A character of JISX0201-Kana (TYPE94 character set) is
2514 encoded by "position-code + 0x80". A character of JISX0208
2515 (TYPE94x94 character set) is encoded in 2-byte but two
2516 position-codes are divided and shifted so that it fit in the range
2519 --- CODE RANGE of Shift-JIS ---
2520 (character set) (range)
2522 JISX0201-Kana 0xA0 .. 0xDF
2523 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2524 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2525 -------------------------------
2529 /* Is this the first byte of a Shift-JIS two-byte char? */
2531 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2532 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2534 /* Is this the second byte of a Shift-JIS two-byte char? */
2536 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2537 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2539 #define BYTE_SJIS_KATAKANA_P(c) \
2540 ((c) >= 0xA1 && (c) <= 0xDF)
2543 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2551 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2553 if (st->shift_jis.in_second_byte)
2555 st->shift_jis.in_second_byte = 0;
2559 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2560 st->shift_jis.in_second_byte = 1;
2562 return CODING_CATEGORY_SHIFT_JIS_MASK;
2565 /* Convert Shift-JIS data to internal format. */
2568 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2569 unsigned_char_dynarr *dst, unsigned int n)
2572 unsigned int flags, ch;
2573 enum eol_type eol_type;
2574 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2576 CODING_STREAM_DECOMPOSE (str, flags, ch);
2577 eol_type = str->eol_type;
2585 /* Previous character was first byte of Shift-JIS Kanji char. */
2586 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2588 unsigned char e1, e2;
2590 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2591 DECODE_SJIS (ch, c, e1, e2);
2592 Dynarr_add (dst, e1);
2593 Dynarr_add (dst, e2);
2597 DECODE_ADD_BINARY_CHAR (ch, dst);
2598 DECODE_ADD_BINARY_CHAR (c, dst);
2604 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2605 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2607 else if (BYTE_SJIS_KATAKANA_P (c))
2609 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2610 Dynarr_add (dst, c);
2613 DECODE_ADD_BINARY_CHAR (c, dst);
2615 label_continue_loop:;
2618 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2620 CODING_STREAM_COMPOSE (str, flags, ch);
2623 /* Convert internally-formatted data to Shift-JIS. */
2626 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2627 unsigned_char_dynarr *dst, unsigned int n)
2630 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2631 unsigned int flags, ch;
2632 enum eol_type eol_type;
2634 CODING_STREAM_DECOMPOSE (str, flags, ch);
2635 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2642 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2643 Dynarr_add (dst, '\r');
2644 if (eol_type != EOL_CR)
2645 Dynarr_add (dst, '\n');
2648 else if (BYTE_ASCII_P (c))
2650 Dynarr_add (dst, c);
2653 else if (BUFBYTE_LEADING_BYTE_P (c))
2654 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2655 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2656 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2659 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2661 Dynarr_add (dst, c);
2664 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2665 ch == LEADING_BYTE_JAPANESE_JISX0208)
2669 unsigned char j1, j2;
2670 ENCODE_SJIS (ch, c, j1, j2);
2671 Dynarr_add (dst, j1);
2672 Dynarr_add (dst, j2);
2678 CODING_STREAM_COMPOSE (str, flags, ch);
2681 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2682 Decode a JISX0208 character of Shift-JIS coding-system.
2683 CODE is the character code in Shift-JIS as a cons of type bytes.
2684 Return the corresponding character.
2688 unsigned char c1, c2, s1, s2;
2691 CHECK_INT (XCAR (code));
2692 CHECK_INT (XCDR (code));
2693 s1 = XINT (XCAR (code));
2694 s2 = XINT (XCDR (code));
2695 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2696 BYTE_SJIS_TWO_BYTE_2_P (s2))
2698 DECODE_SJIS (s1, s2, c1, c2);
2699 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2700 c1 & 0x7F, c2 & 0x7F));
2706 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2707 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2708 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2712 Lisp_Object charset;
2715 CHECK_CHAR_COERCE_INT (ch);
2716 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2717 if (EQ (charset, Vcharset_japanese_jisx0208))
2719 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2720 return Fcons (make_int (s1), make_int (s2));
2727 /************************************************************************/
2729 /************************************************************************/
2731 /* BIG5 is a coding system encoding two character sets: ASCII and
2732 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2733 character set and is encoded in two-byte.
2735 --- CODE RANGE of BIG5 ---
2736 (character set) (range)
2738 Big5 (1st byte) 0xA1 .. 0xFE
2739 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2740 --------------------------
2742 Since the number of characters in Big5 is larger than maximum
2743 characters in Emacs' charset (96x96), it can't be handled as one
2744 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2745 and `charset-big5-2'. Both <type>s are TYPE94x94. The former
2746 contains frequently used characters and the latter contains less
2747 frequently used characters. */
2749 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2750 ((c) >= 0xA1 && (c) <= 0xFE)
2752 /* Is this the second byte of a Shift-JIS two-byte char? */
2754 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2755 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2757 /* Number of Big5 characters which have the same code in 1st byte. */
2759 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2761 /* Code conversion macros. These are macros because they are used in
2762 inner loops during code conversion.
2764 Note that temporary variables in macros introduce the classic
2765 dynamic-scoping problems with variable names. We use capital-
2766 lettered variables in the assumption that XEmacs does not use
2767 capital letters in variables except in a very formalized way
2770 /* Convert Big5 code (b1, b2) into its internal string representation
2773 /* There is a much simpler way to split the Big5 charset into two.
2774 For the moment I'm going to leave the algorithm as-is because it
2775 claims to separate out the most-used characters into a single
2776 charset, which perhaps will lead to optimizations in various
2779 The way the algorithm works is something like this:
2781 Big5 can be viewed as a 94x157 charset, where the row is
2782 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2783 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2784 the split between low and high column numbers is apparently
2785 meaningless; ascending rows produce less and less frequent chars.
2786 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2787 the first charset, and the upper half (0xC9 .. 0xFE) to the
2788 second. To do the conversion, we convert the character into
2789 a single number where 0 .. 156 is the first row, 157 .. 313
2790 is the second, etc. That way, the characters are ordered by
2791 decreasing frequency. Then we just chop the space in two
2792 and coerce the result into a 94x94 space.
2795 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2797 int B1 = b1, B2 = b2; \
2799 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2803 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2807 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2808 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2810 c1 = I / (0xFF - 0xA1) + 0xA1; \
2811 c2 = I % (0xFF - 0xA1) + 0xA1; \
2814 /* Convert the internal string representation of a Big5 character
2815 (lb, c1, c2) into Big5 code (b1, b2). */
2817 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2819 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2821 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2823 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2825 b1 = I / BIG5_SAME_ROW + 0xA1; \
2826 b2 = I % BIG5_SAME_ROW; \
2827 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2831 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2839 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2840 (c >= 0x80 && c <= 0xA0))
2842 if (st->big5.in_second_byte)
2844 st->big5.in_second_byte = 0;
2845 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2849 st->big5.in_second_byte = 1;
2851 return CODING_CATEGORY_BIG5_MASK;
2854 /* Convert Big5 data to internal format. */
2857 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2858 unsigned_char_dynarr *dst, unsigned int n)
2861 unsigned int flags, ch;
2862 enum eol_type eol_type;
2863 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2865 CODING_STREAM_DECOMPOSE (str, flags, ch);
2866 eol_type = str->eol_type;
2873 /* Previous character was first byte of Big5 char. */
2874 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2876 unsigned char b1, b2, b3;
2877 DECODE_BIG5 (ch, c, b1, b2, b3);
2878 Dynarr_add (dst, b1);
2879 Dynarr_add (dst, b2);
2880 Dynarr_add (dst, b3);
2884 DECODE_ADD_BINARY_CHAR (ch, dst);
2885 DECODE_ADD_BINARY_CHAR (c, dst);
2891 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2892 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2895 DECODE_ADD_BINARY_CHAR (c, dst);
2897 label_continue_loop:;
2900 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2902 CODING_STREAM_COMPOSE (str, flags, ch);
2905 /* Convert internally-formatted data to Big5. */
2908 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2909 unsigned_char_dynarr *dst, unsigned int n)
2912 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2913 unsigned int flags, ch;
2914 enum eol_type eol_type;
2916 CODING_STREAM_DECOMPOSE (str, flags, ch);
2917 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2924 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2925 Dynarr_add (dst, '\r');
2926 if (eol_type != EOL_CR)
2927 Dynarr_add (dst, '\n');
2929 else if (BYTE_ASCII_P (c))
2932 Dynarr_add (dst, c);
2934 else if (BUFBYTE_LEADING_BYTE_P (c))
2936 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
2937 c == LEADING_BYTE_CHINESE_BIG5_2)
2939 /* A recognized leading byte. */
2941 continue; /* not done with this character. */
2943 /* otherwise just ignore this character. */
2945 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
2946 ch == LEADING_BYTE_CHINESE_BIG5_2)
2948 /* Previous char was a recognized leading byte. */
2950 continue; /* not done with this character. */
2954 /* Encountering second byte of a Big5 character. */
2955 unsigned char b1, b2;
2957 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
2958 Dynarr_add (dst, b1);
2959 Dynarr_add (dst, b2);
2965 CODING_STREAM_COMPOSE (str, flags, ch);
2969 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
2970 Decode a Big5 character CODE of BIG5 coding-system.
2971 CODE is the character code in BIG5, a cons of two integers.
2972 Return the corresponding character.
2976 unsigned char c1, c2, b1, b2;
2979 CHECK_INT (XCAR (code));
2980 CHECK_INT (XCDR (code));
2981 b1 = XINT (XCAR (code));
2982 b2 = XINT (XCDR (code));
2983 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
2984 BYTE_BIG5_TWO_BYTE_2_P (b2))
2987 Lisp_Object charset;
2988 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
2989 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
2990 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
2996 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
2997 Encode the Big5 character CH to BIG5 coding-system.
2998 Return the corresponding character code in Big5.
3002 Lisp_Object charset;
3005 CHECK_CHAR_COERCE_INT (ch);
3006 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3007 if (EQ (charset, Vcharset_chinese_big5_1) ||
3008 EQ (charset, Vcharset_chinese_big5_2))
3010 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3012 return Fcons (make_int (b1), make_int (b2));
3019 /************************************************************************/
3020 /* ISO2022 methods */
3021 /************************************************************************/
3023 /* The following note describes the coding system ISO2022 briefly.
3024 Since the intention of this note is to help understanding of the
3025 programs in this file, some parts are NOT ACCURATE or OVERLY
3026 SIMPLIFIED. For thorough understanding, please refer to the
3027 original document of ISO2022.
3029 ISO2022 provides many mechanisms to encode several character sets
3030 in 7-bit and 8-bit environments. If one chooses 7-bit environment,
3031 all text is encoded by codes of less than 128. This may make the
3032 encoded text a little bit longer, but the text get more stability
3033 to pass through several gateways (some of them strip off MSB).
3035 There are two kind of character sets: control character set and
3036 graphic character set. The former contains control characters such
3037 as `newline' and `escape' to provide control functions (control
3038 functions are provided also by escape sequence). The latter
3039 contains graphic characters such as 'A' and '-'. Emacs recognizes
3040 two control character sets and many graphic character sets.
3042 Graphic character sets are classified into one of four types,
3043 according to the dimension and number of characters in the set:
3044 TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each
3045 character set is assigned an identification byte, unique for each
3046 type, called "final character" (denoted as <F> hereafter). The <F>
3047 of each character set is decided by ECMA(*) when it is registered
3048 in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for
3051 Note (*): ECMA = European Computer Manufacturers Association
3053 Here are examples of graphic character set [NAME(<F>)]:
3054 o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3055 o TYPE96 -- right-half-of-ISO8859-1('A'), ...
3056 o TYPE94x94 -- GB2312('A'), JISX0208('B'), ...
3057 o TYPE96x96 -- none for the moment
3059 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
3060 C0 [0x00..0x1F] -- control character plane 0
3061 GL [0x20..0x7F] -- graphic character plane 0
3062 C1 [0x80..0x9F] -- control character plane 1
3063 GR [0xA0..0xFF] -- graphic character plane 1
3065 A control character set is directly designated and invoked to C0 or
3066 C1 by an escape sequence. The most common case is that:
3067 - ISO646's control character set is designated/invoked to C0, and
3068 - ISO6429's control character set is designated/invoked to C1,
3069 and usually these designations/invocations are omitted in encoded
3070 text. In a 7-bit environment, only C0 can be used, and a control
3071 character for C1 is encoded by an appropriate escape sequence to
3072 fit into the environment. All control characters for C1 are
3073 defined to have corresponding escape sequences.
3075 A graphic character set is at first designated to one of four
3076 graphic registers (G0 through G3), then these graphic registers are
3077 invoked to GL or GR. These designations and invocations can be
3078 done independently. The most common case is that G0 is invoked to
3079 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3080 these invocations and designations are omitted in encoded text.
3081 In a 7-bit environment, only GL can be used.
3083 When a graphic character set of TYPE94 or TYPE94x94 is invoked to
3084 GL, codes 0x20 and 0x7F of the GL area work as control characters
3085 SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area
3088 There are two ways of invocation: locking-shift and single-shift.
3089 With locking-shift, the invocation lasts until the next different
3090 invocation, whereas with single-shift, the invocation works only
3091 for the following character and doesn't affect locking-shift.
3092 Invocations are done by the following control characters or escape
3095 ----------------------------------------------------------------------
3096 abbrev function cntrl escape seq description
3097 ----------------------------------------------------------------------
3098 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3099 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3100 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR
3101 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3102 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR
3103 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3104 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR
3105 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3106 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3107 ----------------------------------------------------------------------
3108 The first four are for locking-shift. Control characters for these
3109 functions are defined by macros ISO_CODE_XXX in `coding.h'.
3111 Designations are done by the following escape sequences.
3112 ----------------------------------------------------------------------
3113 escape sequence description
3114 ----------------------------------------------------------------------
3115 ESC '(' <F> designate TYPE94<F> to G0
3116 ESC ')' <F> designate TYPE94<F> to G1
3117 ESC '*' <F> designate TYPE94<F> to G2
3118 ESC '+' <F> designate TYPE94<F> to G3
3119 ESC ',' <F> designate TYPE96<F> to G0 (*)
3120 ESC '-' <F> designate TYPE96<F> to G1
3121 ESC '.' <F> designate TYPE96<F> to G2
3122 ESC '/' <F> designate TYPE96<F> to G3
3123 ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**)
3124 ESC '$' ')' <F> designate TYPE94x94<F> to G1
3125 ESC '$' '*' <F> designate TYPE94x94<F> to G2
3126 ESC '$' '+' <F> designate TYPE94x94<F> to G3
3127 ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*)
3128 ESC '$' '-' <F> designate TYPE96x96<F> to G1
3129 ESC '$' '.' <F> designate TYPE96x96<F> to G2
3130 ESC '$' '/' <F> designate TYPE96x96<F> to G3
3131 ----------------------------------------------------------------------
3132 In this list, "TYPE94<F>" means a graphic character set of type TYPE94
3133 and final character <F>, and etc.
3135 Note (*): Although these designations are not allowed in ISO2022,
3136 Emacs accepts them on decoding, and produces them on encoding
3137 TYPE96 or TYPE96x96 character set in a coding system which is
3138 characterized as 7-bit environment, non-locking-shift, and
3141 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3142 '(' can be omitted. We call this as "short-form" here after.
3144 Now you may notice that there are a lot of ways for encoding the
3145 same multilingual text in ISO2022. Actually, there exist many
3146 coding systems such as Compound Text (used in X's inter client
3147 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3148 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3149 localized platforms), and all of these are variants of ISO2022.
3151 In addition to the above, Emacs handles two more kinds of escape
3152 sequences: ISO6429's direction specification and Emacs' private
3153 sequence for specifying character composition.
3155 ISO6429's direction specification takes the following format:
3156 o CSI ']' -- end of the current direction
3157 o CSI '0' ']' -- end of the current direction
3158 o CSI '1' ']' -- start of left-to-right text
3159 o CSI '2' ']' -- start of right-to-left text
3160 The control character CSI (0x9B: control sequence introducer) is
3161 abbreviated to the escape sequence ESC '[' in 7-bit environment.
3163 Character composition specification takes the following format:
3164 o ESC '0' -- start character composition
3165 o ESC '1' -- end character composition
3166 Since these are not standard escape sequences of any ISO, the use
3167 of them for these meanings is restricted to Emacs only. */
3170 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3174 for (i = 0; i < 4; i++)
3176 if (!NILP (coding_system))
3178 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3180 iso->charset[i] = Qt;
3181 iso->invalid_designated[i] = 0;
3183 iso->esc = ISO_ESC_NOTHING;
3184 iso->esc_bytes_index = 0;
3185 iso->register_left = 0;
3186 iso->register_right = 1;
3187 iso->switched_dir_and_no_valid_charset_yet = 0;
3188 iso->invalid_switch_dir = 0;
3189 iso->output_direction_sequence = 0;
3190 iso->output_literally = 0;
3191 if (iso->composite_chars)
3192 Dynarr_reset (iso->composite_chars);
3196 fit_to_be_escape_quoted (unsigned char c)
3213 /* Parse one byte of an ISO2022 escape sequence.
3214 If the result is an invalid escape sequence, return 0 and
3215 do not change anything in STR. Otherwise, if the result is
3216 an incomplete escape sequence, update ISO2022.ESC and
3217 ISO2022.ESC_BYTES and return -1. Otherwise, update
3218 all the state variables (but not ISO2022.ESC_BYTES) and
3221 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3222 or invocation of an invalid character set and treat that as
3223 an unrecognized escape sequence. */
3226 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3227 unsigned char c, unsigned int *flags,
3228 int check_invalid_charsets)
3230 /* (1) If we're at the end of a designation sequence, CS is the
3231 charset being designated and REG is the register to designate
3234 (2) If we're at the end of a locking-shift sequence, REG is
3235 the register to invoke and HALF (0 == left, 1 == right) is
3236 the half to invoke it into.
3238 (3) If we're at the end of a single-shift sequence, REG is
3239 the register to invoke. */
3240 Lisp_Object cs = Qnil;
3243 /* NOTE: This code does goto's all over the fucking place.
3244 The reason for this is that we're basically implementing
3245 a state machine here, and hierarchical languages like C
3246 don't really provide a clean way of doing this. */
3248 if (! (*flags & CODING_STATE_ESCAPE))
3249 /* At beginning of escape sequence; we need to reset our
3250 escape-state variables. */
3251 iso->esc = ISO_ESC_NOTHING;
3253 iso->output_literally = 0;
3254 iso->output_direction_sequence = 0;
3258 case ISO_ESC_NOTHING:
3259 iso->esc_bytes_index = 0;
3262 case ISO_CODE_ESC: /* Start escape sequence */
3263 *flags |= CODING_STATE_ESCAPE;
3267 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3268 *flags |= CODING_STATE_ESCAPE;
3269 iso->esc = ISO_ESC_5_11;
3272 case ISO_CODE_SO: /* locking shift 1 */
3275 case ISO_CODE_SI: /* locking shift 0 */
3279 case ISO_CODE_SS2: /* single shift */
3282 case ISO_CODE_SS3: /* single shift */
3286 default: /* Other control characters */
3293 /**** single shift ****/
3295 case 'N': /* single shift 2 */
3298 case 'O': /* single shift 3 */
3302 /**** locking shift ****/
3304 case '~': /* locking shift 1 right */
3307 case 'n': /* locking shift 2 */
3310 case '}': /* locking shift 2 right */
3313 case 'o': /* locking shift 3 */
3316 case '|': /* locking shift 3 right */
3320 /**** composite ****/
3323 iso->esc = ISO_ESC_START_COMPOSITE;
3324 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3325 CODING_STATE_COMPOSITE;
3329 iso->esc = ISO_ESC_END_COMPOSITE;
3330 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3331 ~CODING_STATE_COMPOSITE;
3334 /**** directionality ****/
3337 iso->esc = ISO_ESC_5_11;
3340 /**** designation ****/
3342 case '$': /* multibyte charset prefix */
3343 iso->esc = ISO_ESC_2_4;
3347 if (0x28 <= c && c <= 0x2F)
3349 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3353 /* This function is called with CODESYS equal to nil when
3354 doing coding-system detection. */
3356 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3357 && fit_to_be_escape_quoted (c))
3359 iso->esc = ISO_ESC_LITERAL;
3360 *flags &= CODING_STATE_ISO2022_LOCK;
3370 /**** directionality ****/
3372 case ISO_ESC_5_11: /* ISO6429 direction control */
3375 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3376 goto directionality;
3378 if (c == '0') iso->esc = ISO_ESC_5_11_0;
3379 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3380 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3384 case ISO_ESC_5_11_0:
3387 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3388 goto directionality;
3392 case ISO_ESC_5_11_1:
3395 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3396 goto directionality;
3400 case ISO_ESC_5_11_2:
3403 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
3404 goto directionality;
3409 iso->esc = ISO_ESC_DIRECTIONALITY;
3410 /* Various junk here to attempt to preserve the direction sequences
3411 literally in the text if they would otherwise be swallowed due
3412 to invalid designations that don't show up as actual charset
3413 changes in the text. */
3414 if (iso->invalid_switch_dir)
3416 /* We already inserted a direction switch literally into the
3417 text. We assume (#### this may not be right) that the
3418 next direction switch is the one going the other way,
3419 and we need to output that literally as well. */
3420 iso->output_literally = 1;
3421 iso->invalid_switch_dir = 0;
3427 /* If we are in the thrall of an invalid designation,
3428 then stick the directionality sequence literally into the
3429 output stream so it ends up in the original text again. */
3430 for (jj = 0; jj < 4; jj++)
3431 if (iso->invalid_designated[jj])
3435 iso->output_literally = 1;
3436 iso->invalid_switch_dir = 1;
3439 /* Indicate that we haven't yet seen a valid designation,
3440 so that if a switch-dir is directly followed by an
3441 invalid designation, both get inserted literally. */
3442 iso->switched_dir_and_no_valid_charset_yet = 1;
3447 /**** designation ****/
3450 if (0x28 <= c && c <= 0x2F)
3452 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
3455 if (0x40 <= c && c <= 0x42)
3457 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
3458 *flags & CODING_STATE_R2L ?
3459 CHARSET_RIGHT_TO_LEFT :
3460 CHARSET_LEFT_TO_RIGHT);
3470 if (c < '0' || c > '~')
3471 return 0; /* bad final byte */
3473 if (iso->esc >= ISO_ESC_2_8 &&
3474 iso->esc <= ISO_ESC_2_15)
3476 type = ((iso->esc >= ISO_ESC_2_12) ?
3477 CHARSET_TYPE_96 : CHARSET_TYPE_94);
3478 reg = (iso->esc - ISO_ESC_2_8) & 3;
3480 else if (iso->esc >= ISO_ESC_2_4_8 &&
3481 iso->esc <= ISO_ESC_2_4_15)
3483 type = ((iso->esc >= ISO_ESC_2_4_12) ?
3484 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
3485 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
3489 /* Can this ever be reached? -slb */
3493 cs = CHARSET_BY_ATTRIBUTES (type, c,
3494 *flags & CODING_STATE_R2L ?
3495 CHARSET_RIGHT_TO_LEFT :
3496 CHARSET_LEFT_TO_RIGHT);
3502 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
3506 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
3507 /* can't invoke something that ain't there. */
3509 iso->esc = ISO_ESC_SINGLE_SHIFT;
3510 *flags &= CODING_STATE_ISO2022_LOCK;
3512 *flags |= CODING_STATE_SS2;
3514 *flags |= CODING_STATE_SS3;
3518 if (check_invalid_charsets &&
3519 !CHARSETP (iso->charset[reg]))
3520 /* can't invoke something that ain't there. */
3523 iso->register_right = reg;
3525 iso->register_left = reg;
3526 *flags &= CODING_STATE_ISO2022_LOCK;
3527 iso->esc = ISO_ESC_LOCKING_SHIFT;
3531 if (NILP (cs) && check_invalid_charsets)
3533 iso->invalid_designated[reg] = 1;
3534 iso->charset[reg] = Vcharset_ascii;
3535 iso->esc = ISO_ESC_DESIGNATE;
3536 *flags &= CODING_STATE_ISO2022_LOCK;
3537 iso->output_literally = 1;
3538 if (iso->switched_dir_and_no_valid_charset_yet)
3540 /* We encountered a switch-direction followed by an
3541 invalid designation. Ensure that the switch-direction
3542 gets outputted; otherwise it will probably get eaten
3543 when the text is written out again. */
3544 iso->switched_dir_and_no_valid_charset_yet = 0;
3545 iso->output_direction_sequence = 1;
3546 /* And make sure that the switch-dir going the other
3547 way gets outputted, as well. */
3548 iso->invalid_switch_dir = 1;
3552 /* This function is called with CODESYS equal to nil when
3553 doing coding-system detection. */
3554 if (!NILP (codesys))
3556 charset_conversion_spec_dynarr *dyn =
3557 XCODING_SYSTEM (codesys)->iso2022.input_conv;
3563 for (i = 0; i < Dynarr_length (dyn); i++)
3565 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
3566 if (EQ (cs, spec->from_charset))
3567 cs = spec->to_charset;
3572 iso->charset[reg] = cs;
3573 iso->esc = ISO_ESC_DESIGNATE;
3574 *flags &= CODING_STATE_ISO2022_LOCK;
3575 if (iso->invalid_designated[reg])
3577 iso->invalid_designated[reg] = 0;
3578 iso->output_literally = 1;
3580 if (iso->switched_dir_and_no_valid_charset_yet)
3581 iso->switched_dir_and_no_valid_charset_yet = 0;
3586 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
3591 /* #### There are serious deficiencies in the recognition mechanism
3592 here. This needs to be much smarter if it's going to cut it.
3593 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
3594 it should be detected as Latin-1.
3595 All the ISO2022 stuff in this file should be synced up with the
3596 code from FSF Emacs-20.4, in which Mule should be more or less stable.
3597 Perhaps we should wait till R2L works in FSF Emacs? */
3599 if (!st->iso2022.initted)
3601 reset_iso2022 (Qnil, &st->iso2022.iso);
3602 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
3603 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3604 CODING_CATEGORY_ISO_8_1_MASK |
3605 CODING_CATEGORY_ISO_8_2_MASK |
3606 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
3607 st->iso2022.flags = 0;
3608 st->iso2022.high_byte_count = 0;
3609 st->iso2022.saw_single_shift = 0;
3610 st->iso2022.initted = 1;
3613 mask = st->iso2022.mask;
3620 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3621 st->iso2022.high_byte_count++;
3625 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
3627 if (st->iso2022.high_byte_count & 1)
3628 /* odd number of high bytes; assume not iso-8-2 */
3629 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3631 st->iso2022.high_byte_count = 0;
3632 st->iso2022.saw_single_shift = 0;
3634 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3636 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
3637 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
3638 { /* control chars */
3641 /* Allow and ignore control characters that you might
3642 reasonably see in a text file */
3647 case 8: /* backspace */
3648 case 11: /* vertical tab */
3649 case 12: /* form feed */
3650 case 26: /* MS-DOS C-z junk */
3651 case 31: /* '^_' -- for info */
3652 goto label_continue_loop;
3659 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
3662 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
3663 &st->iso2022.flags, 0))
3665 switch (st->iso2022.iso.esc)
3667 case ISO_ESC_DESIGNATE:
3668 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
3669 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3671 case ISO_ESC_LOCKING_SHIFT:
3672 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
3673 goto ran_out_of_chars;
3674 case ISO_ESC_SINGLE_SHIFT:
3675 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
3676 st->iso2022.saw_single_shift = 1;
3685 goto ran_out_of_chars;
3688 label_continue_loop:;
3697 postprocess_iso2022_mask (int mask)
3699 /* #### kind of cheesy */
3700 /* If seven-bit ISO is allowed, then assume that the encoding is
3701 entirely seven-bit and turn off the eight-bit ones. */
3702 if (mask & CODING_CATEGORY_ISO_7_MASK)
3703 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3704 CODING_CATEGORY_ISO_8_1_MASK |
3705 CODING_CATEGORY_ISO_8_2_MASK);
3709 /* If FLAGS is a null pointer or specifies right-to-left motion,
3710 output a switch-dir-to-left-to-right sequence to DST.
3711 Also update FLAGS if it is not a null pointer.
3712 If INTERNAL_P is set, we are outputting in internal format and
3713 need to handle the CSI differently. */
3716 restore_left_to_right_direction (struct Lisp_Coding_System *codesys,
3717 unsigned_char_dynarr *dst,
3718 unsigned int *flags,
3721 if (!flags || (*flags & CODING_STATE_R2L))
3723 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3725 Dynarr_add (dst, ISO_CODE_ESC);
3726 Dynarr_add (dst, '[');
3728 else if (internal_p)
3729 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3731 Dynarr_add (dst, ISO_CODE_CSI);
3732 Dynarr_add (dst, '0');
3733 Dynarr_add (dst, ']');
3735 *flags &= ~CODING_STATE_R2L;
3739 /* If FLAGS is a null pointer or specifies a direction different from
3740 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
3741 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
3742 sequence to DST. Also update FLAGS if it is not a null pointer.
3743 If INTERNAL_P is set, we are outputting in internal format and
3744 need to handle the CSI differently. */
3747 ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys,
3748 unsigned_char_dynarr *dst, unsigned int *flags,
3751 if ((!flags || (*flags & CODING_STATE_R2L)) &&
3752 direction == CHARSET_LEFT_TO_RIGHT)
3753 restore_left_to_right_direction (codesys, dst, flags, internal_p);
3754 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
3755 && (!flags || !(*flags & CODING_STATE_R2L)) &&
3756 direction == CHARSET_RIGHT_TO_LEFT)
3758 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3760 Dynarr_add (dst, ISO_CODE_ESC);
3761 Dynarr_add (dst, '[');
3763 else if (internal_p)
3764 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3766 Dynarr_add (dst, ISO_CODE_CSI);
3767 Dynarr_add (dst, '2');
3768 Dynarr_add (dst, ']');
3770 *flags |= CODING_STATE_R2L;
3774 /* Convert ISO2022-format data to internal format. */
3777 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
3778 unsigned_char_dynarr *dst, unsigned int n)
3780 unsigned int flags, ch;
3781 enum eol_type eol_type;
3782 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3783 Lisp_Object coding_system;
3784 unsigned_char_dynarr *real_dst = dst;
3786 CODING_STREAM_DECOMPOSE (str, flags, ch);
3787 eol_type = str->eol_type;
3788 XSETCODING_SYSTEM (coding_system, str->codesys);
3790 if (flags & CODING_STATE_COMPOSITE)
3791 dst = str->iso2022.composite_chars;
3795 unsigned char c = *src++;
3796 if (flags & CODING_STATE_ESCAPE)
3797 { /* Within ESC sequence */
3798 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
3803 switch (str->iso2022.esc)
3805 case ISO_ESC_START_COMPOSITE:
3806 if (str->iso2022.composite_chars)
3807 Dynarr_reset (str->iso2022.composite_chars);
3809 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
3810 dst = str->iso2022.composite_chars;
3812 case ISO_ESC_END_COMPOSITE:
3814 Bufbyte comstr[MAX_EMCHAR_LEN];
3816 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
3817 Dynarr_length (dst));
3819 len = set_charptr_emchar (comstr, emch);
3820 Dynarr_add_many (dst, comstr, len);
3824 case ISO_ESC_LITERAL:
3825 DECODE_ADD_BINARY_CHAR (c, dst);
3829 /* Everything else handled already */
3834 /* Attempted error recovery. */
3835 if (str->iso2022.output_direction_sequence)
3836 ensure_correct_direction (flags & CODING_STATE_R2L ?
3837 CHARSET_RIGHT_TO_LEFT :
3838 CHARSET_LEFT_TO_RIGHT,
3839 str->codesys, dst, 0, 1);
3840 /* More error recovery. */
3841 if (!retval || str->iso2022.output_literally)
3843 /* Output the (possibly invalid) sequence */
3845 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
3846 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
3847 flags &= CODING_STATE_ISO2022_LOCK;
3849 n++, src--;/* Repeat the loop with the same character. */
3852 /* No sense in reprocessing the final byte of the
3853 escape sequence; it could mess things up anyway.
3855 DECODE_ADD_BINARY_CHAR (c, dst);
3860 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
3861 { /* Control characters */
3863 /***** Error-handling *****/
3865 /* If we were in the middle of a character, dump out the
3866 partial character. */
3867 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3869 /* If we just saw a single-shift character, dump it out.
3870 This may dump out the wrong sort of single-shift character,
3871 but least it will give an indication that something went
3873 if (flags & CODING_STATE_SS2)
3875 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
3876 flags &= ~CODING_STATE_SS2;
3878 if (flags & CODING_STATE_SS3)
3880 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
3881 flags &= ~CODING_STATE_SS3;
3884 /***** Now handle the control characters. *****/
3887 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3889 flags &= CODING_STATE_ISO2022_LOCK;
3891 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
3892 DECODE_ADD_BINARY_CHAR (c, dst);
3895 { /* Graphic characters */
3896 Lisp_Object charset;
3900 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3902 /* Now determine the charset. */
3903 reg = ((flags & CODING_STATE_SS2) ? 2
3904 : (flags & CODING_STATE_SS3) ? 3
3905 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
3906 : str->iso2022.register_left);
3907 charset = str->iso2022.charset[reg];
3909 /* Error checking: */
3910 if (! CHARSETP (charset)
3911 || str->iso2022.invalid_designated[reg]
3912 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
3913 && XCHARSET_CHARS (charset) == 94))
3914 /* Mrmph. We are trying to invoke a register that has no
3915 or an invalid charset in it, or trying to add a character
3916 outside the range of the charset. Insert that char literally
3917 to preserve it for the output. */
3919 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3920 DECODE_ADD_BINARY_CHAR (c, dst);
3925 /* Things are probably hunky-dorey. */
3927 /* Fetch reverse charset, maybe. */
3928 if (((flags & CODING_STATE_R2L) &&
3929 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
3931 (!(flags & CODING_STATE_R2L) &&
3932 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
3934 Lisp_Object new_charset =
3935 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
3936 if (!NILP (new_charset))
3937 charset = new_charset;
3940 lb = XCHARSET_LEADING_BYTE (charset);
3941 switch (XCHARSET_REP_BYTES (charset))
3944 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3945 Dynarr_add (dst, c & 0x7F);
3948 case 2: /* one-byte official */
3949 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3950 Dynarr_add (dst, lb);
3951 Dynarr_add (dst, c | 0x80);
3954 case 3: /* one-byte private or two-byte official */
3955 if (XCHARSET_PRIVATE_P (charset))
3957 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3958 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
3959 Dynarr_add (dst, lb);
3960 Dynarr_add (dst, c | 0x80);
3966 Dynarr_add (dst, lb);
3967 Dynarr_add (dst, ch | 0x80);
3968 Dynarr_add (dst, c | 0x80);
3976 default: /* two-byte private */
3979 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
3980 Dynarr_add (dst, lb);
3981 Dynarr_add (dst, ch | 0x80);
3982 Dynarr_add (dst, c | 0x80);
3991 flags &= CODING_STATE_ISO2022_LOCK;
3994 label_continue_loop:;
3997 if (flags & CODING_STATE_END)
3998 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4000 CODING_STREAM_COMPOSE (str, flags, ch);
4004 /***** ISO2022 encoder *****/
4006 /* Designate CHARSET into register REG. */
4009 iso2022_designate (Lisp_Object charset, unsigned char reg,
4010 struct encoding_stream *str, unsigned_char_dynarr *dst)
4012 CONST char *inter94 = "()*+", *inter96= ",-./";
4014 unsigned char final;
4015 Lisp_Object old_charset = str->iso2022.charset[reg];
4017 str->iso2022.charset[reg] = charset;
4018 if (!CHARSETP (charset))
4019 /* charset might be an initial nil or t. */
4021 type = XCHARSET_TYPE (charset);
4022 final = XCHARSET_FINAL (charset);
4023 if (!str->iso2022.force_charset_on_output[reg] &&
4024 CHARSETP (old_charset) &&
4025 XCHARSET_TYPE (old_charset) == type &&
4026 XCHARSET_FINAL (old_charset) == final)
4029 str->iso2022.force_charset_on_output[reg] = 0;
4032 charset_conversion_spec_dynarr *dyn =
4033 str->codesys->iso2022.output_conv;
4039 for (i = 0; i < Dynarr_length (dyn); i++)
4041 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4042 if (EQ (charset, spec->from_charset))
4043 charset = spec->to_charset;
4048 Dynarr_add (dst, ISO_CODE_ESC);
4051 case CHARSET_TYPE_94:
4052 Dynarr_add (dst, inter94[reg]);
4054 case CHARSET_TYPE_96:
4055 Dynarr_add (dst, inter96[reg]);
4057 case CHARSET_TYPE_94X94:
4058 Dynarr_add (dst, '$');
4060 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4063 Dynarr_add (dst, inter94[reg]);
4065 case CHARSET_TYPE_96X96:
4066 Dynarr_add (dst, '$');
4067 Dynarr_add (dst, inter96[reg]);
4070 Dynarr_add (dst, final);
4074 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4076 if (str->iso2022.register_left != 0)
4078 Dynarr_add (dst, ISO_CODE_SI);
4079 str->iso2022.register_left = 0;
4084 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4086 if (str->iso2022.register_left != 1)
4088 Dynarr_add (dst, ISO_CODE_SO);
4089 str->iso2022.register_left = 1;
4093 /* Convert internally-formatted data to ISO2022 format. */
4096 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4097 unsigned_char_dynarr *dst, unsigned int n)
4099 unsigned char charmask, c;
4100 unsigned int flags, ch;
4101 enum eol_type eol_type;
4102 unsigned char char_boundary;
4103 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4104 struct Lisp_Coding_System *codesys = str->codesys;
4106 Lisp_Object charset;
4109 /* flags for handling composite chars. We do a little switcharoo
4110 on the source while we're outputting the composite char. */
4111 unsigned int saved_n = 0;
4112 CONST unsigned char *saved_src = NULL;
4113 int in_composite = 0;
4115 CODING_STREAM_DECOMPOSE (str, flags, ch);
4116 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4117 char_boundary = str->iso2022.current_char_boundary;
4118 charset = str->iso2022.current_charset;
4119 half = str->iso2022.current_half;
4126 if (BYTE_ASCII_P (c))
4127 { /* Processing ASCII character */
4130 restore_left_to_right_direction (codesys, dst, &flags, 0);
4132 /* Make sure G0 contains ASCII */
4133 if ((c > ' ' && c < ISO_CODE_DEL) ||
4134 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4136 ensure_normal_shift (str, dst);
4137 iso2022_designate (Vcharset_ascii, 0, str, dst);
4140 /* If necessary, restore everything to the default state
4143 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4145 restore_left_to_right_direction (codesys, dst, &flags, 0);
4147 ensure_normal_shift (str, dst);
4149 for (i = 0; i < 4; i++)
4151 Lisp_Object initial_charset =
4152 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4153 iso2022_designate (initial_charset, i, str, dst);
4158 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4159 Dynarr_add (dst, '\r');
4160 if (eol_type != EOL_CR)
4161 Dynarr_add (dst, c);
4165 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4166 && fit_to_be_escape_quoted (c))
4167 Dynarr_add (dst, ISO_CODE_ESC);
4168 Dynarr_add (dst, c);
4173 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4174 { /* Processing Leading Byte */
4176 charset = CHARSET_BY_LEADING_BYTE (c);
4177 if (LEADING_BYTE_PREFIX_P(c))
4179 else if (!EQ (charset, Vcharset_control_1)
4180 && !EQ (charset, Vcharset_composite))
4184 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4185 codesys, dst, &flags, 0);
4187 /* Now determine which register to use. */
4189 for (i = 0; i < 4; i++)
4191 if (EQ (charset, str->iso2022.charset[i]) ||
4193 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4202 if (XCHARSET_GRAPHIC (charset) != 0)
4204 if (!NILP (str->iso2022.charset[1]) &&
4205 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4206 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4208 else if (!NILP (str->iso2022.charset[2]))
4210 else if (!NILP (str->iso2022.charset[3]))
4219 iso2022_designate (charset, reg, str, dst);
4221 /* Now invoke that register. */
4225 ensure_normal_shift (str, dst);
4230 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4232 ensure_shift_out (str, dst);
4240 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4242 Dynarr_add (dst, ISO_CODE_ESC);
4243 Dynarr_add (dst, 'N');
4248 Dynarr_add (dst, ISO_CODE_SS2);
4254 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4256 Dynarr_add (dst, ISO_CODE_ESC);
4257 Dynarr_add (dst, 'O');
4262 Dynarr_add (dst, ISO_CODE_SS3);
4274 { /* Processing Non-ASCII character */
4275 charmask = (half == 0 ? 0x7F : 0xFF);
4277 if (EQ (charset, Vcharset_control_1))
4279 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4280 && fit_to_be_escape_quoted (c))
4281 Dynarr_add (dst, ISO_CODE_ESC);
4282 /* you asked for it ... */
4283 Dynarr_add (dst, c - 0x20);
4287 switch (XCHARSET_REP_BYTES (charset))
4290 Dynarr_add (dst, c & charmask);
4293 if (XCHARSET_PRIVATE_P (charset))
4295 Dynarr_add (dst, c & charmask);
4300 if (EQ (charset, Vcharset_composite))
4304 /* #### Bother! We don't know how to
4306 Dynarr_add (dst, '~');
4310 Emchar emch = MAKE_CHAR (Vcharset_composite,
4311 ch & 0x7F, c & 0x7F);
4312 Lisp_Object lstr = composite_char_string (emch);
4316 src = XSTRING_DATA (lstr);
4317 n = XSTRING_LENGTH (lstr);
4318 Dynarr_add (dst, ISO_CODE_ESC);
4319 Dynarr_add (dst, '0'); /* start composing */
4324 Dynarr_add (dst, ch & charmask);
4325 Dynarr_add (dst, c & charmask);
4338 Dynarr_add (dst, ch & charmask);
4339 Dynarr_add (dst, c & charmask);
4360 Dynarr_add (dst, ISO_CODE_ESC);
4361 Dynarr_add (dst, '1'); /* end composing */
4362 goto back_to_square_n; /* Wheeeeeeeee ..... */
4365 if (char_boundary && flags & CODING_STATE_END)
4367 restore_left_to_right_direction (codesys, dst, &flags, 0);
4368 ensure_normal_shift (str, dst);
4369 for (i = 0; i < 4; i++)
4371 Lisp_Object initial_charset =
4372 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4373 iso2022_designate (initial_charset, i, str, dst);
4377 CODING_STREAM_COMPOSE (str, flags, ch);
4378 str->iso2022.current_char_boundary = char_boundary;
4379 str->iso2022.current_charset = charset;
4380 str->iso2022.current_half = half;
4382 /* Verbum caro factum est! */
4386 /************************************************************************/
4387 /* No-conversion methods */
4388 /************************************************************************/
4390 /* This is used when reading in "binary" files -- i.e. files that may
4391 contain all 256 possible byte values and that are not to be
4392 interpreted as being in any particular decoding. */
4394 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4395 unsigned_char_dynarr *dst, unsigned int n)
4398 unsigned int flags, ch;
4399 enum eol_type eol_type;
4400 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4402 CODING_STREAM_DECOMPOSE (str, flags, ch);
4403 eol_type = str->eol_type;
4409 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4410 DECODE_ADD_BINARY_CHAR (c, dst);
4411 label_continue_loop:;
4414 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4416 CODING_STREAM_COMPOSE (str, flags, ch);
4420 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4421 unsigned_char_dynarr *dst, unsigned int n)
4424 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4425 unsigned int flags, ch;
4426 enum eol_type eol_type;
4428 CODING_STREAM_DECOMPOSE (str, flags, ch);
4429 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4436 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4437 Dynarr_add (dst, '\r');
4438 if (eol_type != EOL_CR)
4439 Dynarr_add (dst, '\n');
4442 else if (BYTE_ASCII_P (c))
4445 Dynarr_add (dst, c);
4447 else if (BUFBYTE_LEADING_BYTE_P (c))
4450 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
4451 c == LEADING_BYTE_CONTROL_1)
4454 Dynarr_add (dst, '~'); /* untranslatable character */
4458 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
4459 Dynarr_add (dst, c);
4460 else if (ch == LEADING_BYTE_CONTROL_1)
4463 Dynarr_add (dst, c - 0x20);
4465 /* else it should be the second or third byte of an
4466 untranslatable character, so ignore it */
4471 CODING_STREAM_COMPOSE (str, flags, ch);
4475 /************************************************************************/
4476 /* Simple internal/external functions */
4477 /************************************************************************/
4479 static Extbyte_dynarr *conversion_out_dynarr;
4480 static Bufbyte_dynarr *conversion_in_dynarr;
4482 /* Determine coding system from coding format */
4484 /* #### not correct for all values of `fmt'! */
4486 external_data_format_to_coding_system (enum external_data_format fmt)
4490 case FORMAT_FILENAME:
4491 case FORMAT_TERMINAL:
4492 if (EQ (Vfile_name_coding_system, Qnil) ||
4493 EQ (Vfile_name_coding_system, Qbinary))
4496 return Fget_coding_system (Vfile_name_coding_system);
4499 return Fget_coding_system (Qctext);
4507 convert_to_external_format (CONST Bufbyte *ptr,
4510 enum external_data_format fmt)
4512 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
4514 if (!conversion_out_dynarr)
4515 conversion_out_dynarr = Dynarr_new (Extbyte);
4517 Dynarr_reset (conversion_out_dynarr);
4519 if (NILP (coding_system))
4521 CONST Bufbyte *end = ptr + len;
4526 (BYTE_ASCII_P (*ptr)) ? *ptr :
4527 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
4528 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
4531 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
4535 #ifdef ERROR_CHECK_BUFPOS
4536 assert (ptr == end);
4541 Lisp_Object instream, outstream, da_outstream;
4542 Lstream *istr, *ostr;
4543 struct gcpro gcpro1, gcpro2, gcpro3;
4544 char tempbuf[1024]; /* some random amount */
4546 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4547 da_outstream = make_dynarr_output_stream
4548 ((unsigned_char_dynarr *) conversion_out_dynarr);
4550 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
4551 istr = XLSTREAM (instream);
4552 ostr = XLSTREAM (outstream);
4553 GCPRO3 (instream, outstream, da_outstream);
4556 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4559 Lstream_write (ostr, tempbuf, size_in_bytes);
4561 Lstream_close (istr);
4562 Lstream_close (ostr);
4564 Lstream_delete (istr);
4565 Lstream_delete (ostr);
4566 Lstream_delete (XLSTREAM (da_outstream));
4569 *len_out = Dynarr_length (conversion_out_dynarr);
4570 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
4571 return Dynarr_atp (conversion_out_dynarr, 0);
4575 convert_from_external_format (CONST Extbyte *ptr,
4578 enum external_data_format fmt)
4580 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
4582 if (!conversion_in_dynarr)
4583 conversion_in_dynarr = Dynarr_new (Bufbyte);
4585 Dynarr_reset (conversion_in_dynarr);
4587 if (NILP (coding_system))
4589 CONST Extbyte *end = ptr + len;
4590 for (; ptr < end; ptr++)
4593 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
4598 Lisp_Object instream, outstream, da_outstream;
4599 Lstream *istr, *ostr;
4600 struct gcpro gcpro1, gcpro2, gcpro3;
4601 char tempbuf[1024]; /* some random amount */
4603 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4604 da_outstream = make_dynarr_output_stream
4605 ((unsigned_char_dynarr *) conversion_in_dynarr);
4607 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
4608 istr = XLSTREAM (instream);
4609 ostr = XLSTREAM (outstream);
4610 GCPRO3 (instream, outstream, da_outstream);
4613 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4616 Lstream_write (ostr, tempbuf, size_in_bytes);
4618 Lstream_close (istr);
4619 Lstream_close (ostr);
4621 Lstream_delete (istr);
4622 Lstream_delete (ostr);
4623 Lstream_delete (XLSTREAM (da_outstream));
4626 *len_out = Dynarr_length (conversion_in_dynarr);
4627 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
4628 return Dynarr_atp (conversion_in_dynarr, 0);
4632 /************************************************************************/
4633 /* Initialization */
4634 /************************************************************************/
4637 syms_of_mule_coding (void)
4639 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
4640 deferror (&Qcoding_system_error, "coding-system-error",
4641 "Coding-system error", Qio_error);
4643 DEFSUBR (Fcoding_system_p);
4644 DEFSUBR (Ffind_coding_system);
4645 DEFSUBR (Fget_coding_system);
4646 DEFSUBR (Fcoding_system_list);
4647 DEFSUBR (Fcoding_system_name);
4648 DEFSUBR (Fmake_coding_system);
4649 DEFSUBR (Fcopy_coding_system);
4650 DEFSUBR (Fsubsidiary_coding_system);
4652 DEFSUBR (Fcoding_system_type);
4653 DEFSUBR (Fcoding_system_doc_string);
4655 DEFSUBR (Fcoding_system_charset);
4657 DEFSUBR (Fcoding_system_property);
4659 DEFSUBR (Fcoding_category_list);
4660 DEFSUBR (Fset_coding_priority_list);
4661 DEFSUBR (Fcoding_priority_list);
4662 DEFSUBR (Fset_coding_category_system);
4663 DEFSUBR (Fcoding_category_system);
4665 DEFSUBR (Fdetect_coding_region);
4666 DEFSUBR (Fdecode_coding_region);
4667 DEFSUBR (Fencode_coding_region);
4669 DEFSUBR (Fdecode_shift_jis_char);
4670 DEFSUBR (Fencode_shift_jis_char);
4671 DEFSUBR (Fdecode_big5_char);
4672 DEFSUBR (Fencode_big5_char);
4674 defsymbol (&Qcoding_system_p, "coding-system-p");
4675 defsymbol (&Qno_conversion, "no-conversion");
4677 defsymbol (&Qbig5, "big5");
4678 defsymbol (&Qshift_jis, "shift-jis");
4679 defsymbol (&Qccl, "ccl");
4680 defsymbol (&Qiso2022, "iso2022");
4682 defsymbol (&Qmnemonic, "mnemonic");
4683 defsymbol (&Qeol_type, "eol-type");
4684 defsymbol (&Qpost_read_conversion, "post-read-conversion");
4685 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
4687 defsymbol (&Qcr, "cr");
4688 defsymbol (&Qlf, "lf");
4689 defsymbol (&Qcrlf, "crlf");
4690 defsymbol (&Qeol_cr, "eol-cr");
4691 defsymbol (&Qeol_lf, "eol-lf");
4692 defsymbol (&Qeol_crlf, "eol-crlf");
4694 defsymbol (&Qcharset_g0, "charset-g0");
4695 defsymbol (&Qcharset_g1, "charset-g1");
4696 defsymbol (&Qcharset_g2, "charset-g2");
4697 defsymbol (&Qcharset_g3, "charset-g3");
4698 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
4699 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
4700 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
4701 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
4702 defsymbol (&Qno_iso6429, "no-iso6429");
4703 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
4704 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
4706 defsymbol (&Qshort, "short");
4707 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
4708 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
4709 defsymbol (&Qseven, "seven");
4710 defsymbol (&Qlock_shift, "lock-shift");
4711 defsymbol (&Qescape_quoted, "escape-quoted");
4713 defsymbol (&Qencode, "encode");
4714 defsymbol (&Qdecode, "decode");
4717 defsymbol (&Qctext, "ctext");
4718 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
4720 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
4722 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
4724 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
4726 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
4728 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
4730 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
4733 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
4738 lstream_type_create_mule_coding (void)
4740 LSTREAM_HAS_METHOD (decoding, reader);
4741 LSTREAM_HAS_METHOD (decoding, writer);
4742 LSTREAM_HAS_METHOD (decoding, rewinder);
4743 LSTREAM_HAS_METHOD (decoding, seekable_p);
4744 LSTREAM_HAS_METHOD (decoding, flusher);
4745 LSTREAM_HAS_METHOD (decoding, closer);
4746 LSTREAM_HAS_METHOD (decoding, marker);
4748 LSTREAM_HAS_METHOD (encoding, reader);
4749 LSTREAM_HAS_METHOD (encoding, writer);
4750 LSTREAM_HAS_METHOD (encoding, rewinder);
4751 LSTREAM_HAS_METHOD (encoding, seekable_p);
4752 LSTREAM_HAS_METHOD (encoding, flusher);
4753 LSTREAM_HAS_METHOD (encoding, closer);
4754 LSTREAM_HAS_METHOD (encoding, marker);
4758 vars_of_mule_coding (void)
4762 /* Initialize to something reasonable ... */
4763 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
4765 coding_category_system[i] = Qnil;
4766 coding_category_by_priority[i] = i;
4769 Fprovide (intern ("file-coding"));
4771 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
4772 Coding system used for TTY keyboard input.
4773 Not used under a windowing system.
4775 Vkeyboard_coding_system = Qnil;
4777 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
4778 Coding system used for TTY display output.
4779 Not used under a windowing system.
4781 Vterminal_coding_system = Qnil;
4783 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
4784 Overriding coding system used when writing a file or process.
4785 You should *bind* this, not set it. If this is non-nil, it specifies
4786 the coding system that will be used when a file or process is read
4787 in, and overrides `buffer-file-coding-system-for-read',
4788 `insert-file-contents-pre-hook', etc. Use those variables instead of
4789 this one for permanent changes to the environment.
4791 Vcoding_system_for_read = Qnil;
4793 DEFVAR_LISP ("coding-system-for-write",
4794 &Vcoding_system_for_write /*
4795 Overriding coding system used when writing a file or process.
4796 You should *bind* this, not set it. If this is non-nil, it specifies
4797 the coding system that will be used when a file or process is wrote
4798 in, and overrides `buffer-file-coding-system',
4799 `write-region-pre-hook', etc. Use those variables instead of this one
4800 for permanent changes to the environment.
4802 Vcoding_system_for_write = Qnil;
4804 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
4805 Coding system used to convert pathnames when accessing files.
4807 Vfile_name_coding_system = Qnil;
4809 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
4810 Non-nil means the buffer contents are regarded as multi-byte form
4811 of characters, not a binary code. This affects the display, file I/O,
4812 and behaviors of various editing commands.
4814 Setting this to nil does not do anything.
4816 enable_multibyte_characters = 1;
4820 complex_vars_of_mule_coding (void)
4822 staticpro (&Vcoding_system_hash_table);
4823 Vcoding_system_hash_table =
4824 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
4826 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
4828 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
4830 struct codesys_prop csp; \
4832 csp.prop_type = (Prop_Type); \
4833 Dynarr_add (the_codesys_prop_dynarr, csp); \
4836 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
4837 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
4838 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
4839 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
4840 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
4841 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
4842 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
4844 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
4845 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
4846 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
4847 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
4848 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
4849 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
4850 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
4851 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
4852 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
4853 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
4854 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
4855 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
4856 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
4857 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
4858 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
4859 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
4860 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
4862 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
4863 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
4865 /* Need to create this here or we're really screwed. */
4866 Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
4867 list2 (Qmnemonic, build_string ("Noconv")));
4869 Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
4872 /* Need this for bootstrapping */
4873 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
4874 Fget_coding_system (Qno_conversion);