1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 #if 0 /* while file-coding not split up */
36 #include "mule-coding.h"
38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
40 Lisp_Object Vkeyboard_coding_system;
41 Lisp_Object Vterminal_coding_system;
42 Lisp_Object Vcoding_system_for_read;
43 Lisp_Object Vcoding_system_for_write;
44 Lisp_Object Vfile_name_coding_system;
46 /* Table of symbols identifying each coding category. */
47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
49 /* Coding system currently associated with each coding category. */
50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
52 /* Table of all coding categories in decreasing order of priority.
53 This describes a permutation of the possible coding categories. */
54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
56 Lisp_Object Qcoding_system_p;
58 Lisp_Object Qbig5, Qshift_jis, Qno_conversion, Qccl, Qiso2022;
59 /* Qinternal in general.c */
61 Lisp_Object Qmnemonic, Qeol_type;
62 Lisp_Object Qcr, Qcrlf, Qlf;
63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
64 Lisp_Object Qpost_read_conversion;
65 Lisp_Object Qpre_write_conversion;
67 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
68 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
69 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
70 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
71 Lisp_Object Qno_iso6429, Qescape_quoted;
72 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
74 Lisp_Object Qencode, Qdecode;
78 Lisp_Object Vcoding_system_hash_table;
80 int enable_multibyte_characters;
82 /* Additional information used by the ISO2022 decoder and detector. */
83 struct iso2022_decoder
85 /* CHARSET holds the character sets currently assigned to the G0
86 through G3 variables. It is initialized from the array
87 INITIAL_CHARSET in CODESYS. */
88 Lisp_Object charset[4];
90 /* Which registers are currently invoked into the left (GL) and
91 right (GR) halves of the 8-bit encoding space? */
92 int register_left, register_right;
94 /* ISO_ESC holds a value indicating part of an escape sequence
95 that has already been seen. */
96 enum iso_esc_flag esc;
98 /* This records the bytes we've seen so far in an escape sequence,
99 in case the sequence is invalid (we spit out the bytes unchanged). */
100 unsigned char esc_bytes[8];
102 /* Index for next byte to store in ISO escape sequence. */
105 /* Stuff seen so far when composing a string. */
106 unsigned_char_dynarr *composite_chars;
108 /* If we saw an invalid designation sequence for a particular
109 register, we flag it here and switch to ASCII. The next time we
110 see a valid designation for this register, we turn off the flag
111 and do the designation normally, but pretend the sequence was
112 invalid. The effect of all this is that (most of the time) the
113 escape sequences for both the switch to the unknown charset, and
114 the switch back to the known charset, get inserted literally into
115 the buffer and saved out as such. The hope is that we can
116 preserve the escape sequences so that the resulting written out
117 file makes sense. If we don't do any of this, the designation
118 to the invalid charset will be preserved but that switch back
119 to the known charset will probably get eaten because it was
120 the same charset that was already present in the register. */
121 unsigned char invalid_designated[4];
123 /* We try to do similar things as above for direction-switching
124 sequences. If we encountered a direction switch while an
125 invalid designation was present, or an invalid designation
126 just after a direction switch (i.e. no valid designation
127 encountered yet), we insert the direction-switch escape
128 sequence literally into the output stream, and later on
129 insert the corresponding direction-restoring escape sequence
131 unsigned int switched_dir_and_no_valid_charset_yet :1;
132 unsigned int invalid_switch_dir :1;
134 /* Tells the decoder to output the escape sequence literally
135 even though it was valid. Used in the games we play to
136 avoid lossage when we encounter invalid designations. */
137 unsigned int output_literally :1;
138 /* We encountered a direction switch followed by an invalid
139 designation. We didn't output the direction switch
140 literally because we didn't know about the invalid designation;
141 but we have to do so now. */
142 unsigned int output_direction_sequence :1;
145 EXFUN (Fcopy_coding_system, 2);
146 struct detection_state;
147 static int detect_coding_sjis (struct detection_state *st,
148 CONST unsigned char *src,
150 static void decode_coding_sjis (Lstream *decoding,
151 CONST unsigned char *src,
152 unsigned_char_dynarr *dst,
154 static void encode_coding_sjis (Lstream *encoding,
155 CONST unsigned char *src,
156 unsigned_char_dynarr *dst,
158 static int detect_coding_big5 (struct detection_state *st,
159 CONST unsigned char *src,
161 static void decode_coding_big5 (Lstream *decoding,
162 CONST unsigned char *src,
163 unsigned_char_dynarr *dst, unsigned int n);
164 static void encode_coding_big5 (Lstream *encoding,
165 CONST unsigned char *src,
166 unsigned_char_dynarr *dst, unsigned int n);
167 static int postprocess_iso2022_mask (int mask);
168 static void reset_iso2022 (Lisp_Object coding_system,
169 struct iso2022_decoder *iso);
170 static int detect_coding_iso2022 (struct detection_state *st,
171 CONST unsigned char *src,
173 static void decode_coding_iso2022 (Lstream *decoding,
174 CONST unsigned char *src,
175 unsigned_char_dynarr *dst, unsigned int n);
176 static void encode_coding_iso2022 (Lstream *encoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static void decode_coding_no_conversion (Lstream *decoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst,
183 static void encode_coding_no_conversion (Lstream *encoding,
184 CONST unsigned char *src,
185 unsigned_char_dynarr *dst,
187 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
188 unsigned_char_dynarr *dst, unsigned int n);
189 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
190 unsigned_char_dynarr *dst, unsigned int n);
192 typedef struct codesys_prop codesys_prop;
201 Dynarr_declare (codesys_prop);
202 } codesys_prop_dynarr;
204 codesys_prop_dynarr *the_codesys_prop_dynarr;
206 enum codesys_prop_enum
209 CODESYS_PROP_ISO2022,
214 /************************************************************************/
215 /* Coding system functions */
216 /************************************************************************/
219 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
221 struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
223 markobj (CODING_SYSTEM_NAME (codesys));
224 markobj (CODING_SYSTEM_DOC_STRING (codesys));
225 markobj (CODING_SYSTEM_MNEMONIC (codesys));
226 markobj (CODING_SYSTEM_EOL_LF (codesys));
227 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
228 markobj (CODING_SYSTEM_EOL_CR (codesys));
230 switch (CODING_SYSTEM_TYPE (codesys))
233 case CODESYS_ISO2022:
234 for (i = 0; i < 4; i++)
235 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
236 if (codesys->iso2022.input_conv)
238 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
240 struct charset_conversion_spec *ccs =
241 Dynarr_atp (codesys->iso2022.input_conv, i);
242 markobj (ccs->from_charset);
243 markobj (ccs->to_charset);
246 if (codesys->iso2022.output_conv)
248 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
250 struct charset_conversion_spec *ccs =
251 Dynarr_atp (codesys->iso2022.output_conv, i);
252 markobj (ccs->from_charset);
253 markobj (ccs->to_charset);
259 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
260 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
266 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
267 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
271 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
274 struct Lisp_Coding_System *c = XCODING_SYSTEM (obj);
276 error ("printing unreadable object #<coding_system 0x%x>",
279 write_c_string ("#<coding_system ", printcharfun);
280 print_internal (c->name, printcharfun, 1);
281 write_c_string (">", printcharfun);
285 finalize_coding_system (void *header, int for_disksave)
287 struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header;
288 /* Since coding systems never go away, this function is not
289 necessary. But it would be necessary if we changed things
290 so that coding systems could go away. */
291 if (!for_disksave) /* see comment in lstream.c */
293 switch (CODING_SYSTEM_TYPE (c))
295 case CODESYS_ISO2022:
296 if (c->iso2022.input_conv)
298 Dynarr_free (c->iso2022.input_conv);
299 c->iso2022.input_conv = 0;
301 if (c->iso2022.output_conv)
303 Dynarr_free (c->iso2022.output_conv);
304 c->iso2022.output_conv = 0;
314 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
315 mark_coding_system, print_coding_system,
316 finalize_coding_system,
317 0, 0, struct Lisp_Coding_System);
320 symbol_to_eol_type (Lisp_Object symbol)
322 CHECK_SYMBOL (symbol);
323 if (NILP (symbol)) return EOL_AUTODETECT;
324 if (EQ (symbol, Qlf)) return EOL_LF;
325 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
326 if (EQ (symbol, Qcr)) return EOL_CR;
328 signal_simple_error ("Unrecognized eol type", symbol);
329 return EOL_AUTODETECT; /* not reached */
333 eol_type_to_symbol (enum eol_type type)
338 case EOL_LF: return Qlf;
339 case EOL_CRLF: return Qcrlf;
340 case EOL_CR: return Qcr;
341 case EOL_AUTODETECT: return Qnil;
346 setup_eol_coding_systems (struct Lisp_Coding_System *codesys)
348 Lisp_Object codesys_obj;
349 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
350 char *codesys_name = (char *) alloca (len + 7);
351 Lisp_Object codesys_name_sym, sub_codesys_obj;
355 XSETCODING_SYSTEM (codesys_obj, codesys);
357 memcpy (codesys_name,
358 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
360 #define DEFINE_SUB_CODESYS(op_sys, Type) do { \
361 strcpy (codesys_name + len, "-" op_sys); \
362 codesys_name_sym = intern (codesys_name); \
363 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
364 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
365 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
368 DEFINE_SUB_CODESYS("unix", EOL_LF);
369 DEFINE_SUB_CODESYS("dos", EOL_CRLF);
370 DEFINE_SUB_CODESYS("mac", EOL_CR);
373 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
374 Return t if OBJECT is a coding system.
375 A coding system is an object that defines how text containing multiple
376 character sets is encoded into a stream of (typically 8-bit) bytes.
377 The coding system is used to decode the stream into a series of
378 characters (which may be from multiple charsets) when the text is read
379 from a file or process, and is used to encode the text back into the
380 same format when it is written out to a file or process.
382 For example, many ISO2022-compliant coding systems (such as Compound
383 Text, which is used for inter-client data under the X Window System)
384 use escape sequences to switch between different charsets -- Japanese
385 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
386 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
387 `make-coding-system' for more information.
389 Coding systems are normally identified using a symbol, and the
390 symbol is accepted in place of the actual coding system object whenever
391 a coding system is called for. (This is similar to how faces work.)
395 return CODING_SYSTEMP (object) ? Qt : Qnil;
398 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
399 Retrieve the coding system of the given name.
401 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
402 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
403 If there is no such coding system, nil is returned. Otherwise the
404 associated coding system object is returned.
406 (coding_system_or_name))
408 if (NILP (coding_system_or_name))
409 coding_system_or_name = Qbinary;
410 if (CODING_SYSTEMP (coding_system_or_name))
411 return coding_system_or_name;
412 CHECK_SYMBOL (coding_system_or_name);
414 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
417 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
418 Retrieve the coding system of the given name.
419 Same as `find-coding-system' except that if there is no such
420 coding system, an error is signaled instead of returning nil.
424 Lisp_Object coding_system = Ffind_coding_system (name);
426 if (NILP (coding_system))
427 signal_simple_error ("No such coding system", name);
428 return coding_system;
431 /* We store the coding systems in hash tables with the names as the key and the
432 actual coding system object as the value. Occasionally we need to use them
433 in a list format. These routines provide us with that. */
434 struct coding_system_list_closure
436 Lisp_Object *coding_system_list;
440 add_coding_system_to_list_mapper (CONST void *hash_key, void *hash_contents,
441 void *coding_system_list_closure)
443 /* This function can GC */
444 Lisp_Object key, contents;
445 Lisp_Object *coding_system_list;
446 struct coding_system_list_closure *cscl =
447 (struct coding_system_list_closure *) coding_system_list_closure;
448 CVOID_TO_LISP (key, hash_key);
449 VOID_TO_LISP (contents, hash_contents);
450 coding_system_list = cscl->coding_system_list;
452 *coding_system_list = Fcons (XCODING_SYSTEM (contents)->name,
453 *coding_system_list);
457 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
458 Return a list of the names of all defined coding systems.
462 Lisp_Object coding_system_list = Qnil;
464 struct coding_system_list_closure coding_system_list_closure;
466 GCPRO1 (coding_system_list);
467 coding_system_list_closure.coding_system_list = &coding_system_list;
468 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
469 &coding_system_list_closure);
472 return coding_system_list;
475 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
476 Return the name of the given coding system.
480 coding_system = Fget_coding_system (coding_system);
481 return XCODING_SYSTEM_NAME (coding_system);
484 static struct Lisp_Coding_System *
485 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
487 struct Lisp_Coding_System *codesys =
488 alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system);
490 zero_lcrecord (codesys);
491 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
492 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
493 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
494 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
495 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
496 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
497 CODING_SYSTEM_TYPE (codesys) = type;
499 if (type == CODESYS_ISO2022)
502 for (i = 0; i < 4; i++)
503 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
505 else if (type == CODESYS_CCL)
507 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
508 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
511 CODING_SYSTEM_NAME (codesys) = name;
516 /* Given a list of charset conversion specs as specified in a Lisp
517 program, parse it into STORE_HERE. */
520 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
521 Lisp_Object spec_list)
525 EXTERNAL_LIST_LOOP (rest, spec_list)
527 Lisp_Object car = XCAR (rest);
528 Lisp_Object from, to;
529 struct charset_conversion_spec spec;
531 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
532 signal_simple_error ("Invalid charset conversion spec", car);
533 from = Fget_charset (XCAR (car));
534 to = Fget_charset (XCAR (XCDR (car)));
535 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
536 signal_simple_error_2
537 ("Attempted conversion between different charset types",
539 spec.from_charset = from;
540 spec.to_charset = to;
542 Dynarr_add (store_here, spec);
546 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
547 specs, return the equivalent as the Lisp programmer would see it.
549 If LOAD_HERE is 0, return Qnil. */
552 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
555 Lisp_Object result = Qnil;
559 for (i = 0; i < Dynarr_length (load_here); i++)
561 struct charset_conversion_spec *ccs =
562 Dynarr_atp (load_here, i);
563 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
566 return Fnreverse (result);
569 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
570 Register symbol NAME as a coding system.
572 TYPE describes the conversion method used and should be one of
575 Automatic conversion. XEmacs attempts to detect the coding system
578 No conversion. Use this for binary files and such. On output,
579 graphic characters that are not in ASCII or Latin-1 will be
580 replaced by a ?. (For a no-conversion-encoded buffer, these
581 characters will only be present if you explicitly insert them.)
583 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
585 Any ISO2022-compliant encoding. Among other things, this includes
586 JIS (the Japanese encoding commonly used for e-mail), EUC (the
587 standard Unix encoding for Japanese and other languages), and
588 Compound Text (the encoding used in X11). You can specify more
589 specific information about the conversion with the FLAGS argument.
591 Big5 (the encoding commonly used for Taiwanese).
593 The conversion is performed using a user-written pseudo-code
594 program. CCL (Code Conversion Language) is the name of this
597 Write out or read in the raw contents of the memory representing
598 the buffer's text. This is primarily useful for debugging
599 purposes, and is only enabled when XEmacs has been compiled with
600 DEBUG_XEMACS defined (via the --debug configure option).
601 WARNING: Reading in a file using 'internal conversion can result
602 in an internal inconsistency in the memory representing a
603 buffer's text, which will produce unpredictable results and may
604 cause XEmacs to crash. Under normal circumstances you should
605 never use 'internal conversion.
607 DOC-STRING is a string describing the coding system.
609 PROPS is a property list, describing the specific nature of the
610 character set. Recognized properties are:
613 String to be displayed in the modeline when this coding system is
617 End-of-line conversion to be used. It should be one of
620 Automatically detect the end-of-line type (LF, CRLF,
621 or CR). Also generate subsidiary coding systems named
622 `NAME-unix', `NAME-dos', and `NAME-mac', that are
623 identical to this coding system but have an EOL-TYPE
624 value of 'lf, 'crlf, and 'cr, respectively.
626 The end of a line is marked externally using ASCII LF.
627 Since this is also the way that XEmacs represents an
628 end-of-line internally, specifying this option results
629 in no end-of-line conversion. This is the standard
630 format for Unix text files.
632 The end of a line is marked externally using ASCII
633 CRLF. This is the standard format for MS-DOS text
636 The end of a line is marked externally using ASCII CR.
637 This is the standard format for Macintosh text files.
639 Automatically detect the end-of-line type but do not
640 generate subsidiary coding systems. (This value is
641 converted to nil when stored internally, and
642 `coding-system-property' will return nil.)
644 'post-read-conversion
645 Function called after a file has been read in, to perform the
646 decoding. Called with two arguments, BEG and END, denoting
647 a region of the current buffer to be decoded.
649 'pre-write-conversion
650 Function called before a file is written out, to perform the
651 encoding. Called with two arguments, BEG and END, denoting
652 a region of the current buffer to be encoded.
655 The following additional properties are recognized if TYPE is 'iso2022:
661 The character set initially designated to the G0 - G3 registers.
662 The value should be one of
664 -- A charset object (designate that character set)
665 -- nil (do not ever use this register)
666 -- t (no character set is initially designated to
667 the register, but may be later on; this automatically
668 sets the corresponding `force-g*-on-output' property)
674 If non-nil, send an explicit designation sequence on output before
675 using the specified register.
678 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
679 "ESC $ B" on output in place of the full designation sequences
680 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
683 If non-nil, don't designate ASCII to G0 at each end of line on output.
684 Setting this to non-nil also suppresses other state-resetting that
685 normally happens at the end of a line.
688 If non-nil, don't designate ASCII to G0 before control chars on output.
691 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
695 If non-nil, use locking-shift (SO/SI) instead of single-shift
696 or designation by escape sequence.
699 If non-nil, don't use ISO6429's direction specification.
702 If non-nil, literal control characters that are the same as
703 the beginning of a recognized ISO2022 or ISO6429 escape sequence
704 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
705 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
706 so that they can be properly distinguished from an escape sequence.
707 (Note that doing this results in a non-portable encoding.) This
708 encoding flag is used for byte-compiled files. Note that ESC
709 is a good choice for a quoting character because there are no
710 escape sequences whose second byte is a character from the Control-0
711 or Control-1 character sets; this is explicitly disallowed by the
714 'input-charset-conversion
715 A list of conversion specifications, specifying conversion of
716 characters in one charset to another when decoding is performed.
717 Each specification is a list of two elements: the source charset,
718 and the destination charset.
720 'output-charset-conversion
721 A list of conversion specifications, specifying conversion of
722 characters in one charset to another when encoding is performed.
723 The form of each specification is the same as for
724 'input-charset-conversion.
727 The following additional properties are recognized (and required)
731 CCL program used for decoding (converting to internal format).
734 CCL program used for encoding (converting to external format).
736 (name, type, doc_string, props))
738 struct Lisp_Coding_System *codesys;
739 Lisp_Object rest, key, value;
740 enum coding_system_type ty;
741 int need_to_setup_eol_systems = 1;
743 /* Convert type to constant */
744 if (NILP (type) || EQ (type, Qundecided))
745 { ty = CODESYS_AUTODETECT; }
746 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
747 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
748 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
749 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
750 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
752 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
755 signal_simple_error ("Invalid coding system type", type);
759 codesys = allocate_coding_system (ty, name);
761 if (NILP (doc_string))
762 doc_string = build_string ("");
764 CHECK_STRING (doc_string);
765 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
767 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
769 if (EQ (key, Qmnemonic))
772 CHECK_STRING (value);
773 CODING_SYSTEM_MNEMONIC (codesys) = value;
776 else if (EQ (key, Qeol_type))
778 need_to_setup_eol_systems = NILP (value);
781 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
784 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
785 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
786 else if (ty == CODESYS_ISO2022)
788 #define FROB_INITIAL_CHARSET(charset_num) \
789 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
790 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
792 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
793 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
794 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
795 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
797 #define FROB_FORCE_CHARSET(charset_num) \
798 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
800 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
801 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
802 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
803 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
805 #define FROB_BOOLEAN_PROPERTY(prop) \
806 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
808 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
809 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
810 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
811 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
812 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
813 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
814 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
816 else if (EQ (key, Qinput_charset_conversion))
818 codesys->iso2022.input_conv =
819 Dynarr_new (charset_conversion_spec);
820 parse_charset_conversion_specs (codesys->iso2022.input_conv,
823 else if (EQ (key, Qoutput_charset_conversion))
825 codesys->iso2022.output_conv =
826 Dynarr_new (charset_conversion_spec);
827 parse_charset_conversion_specs (codesys->iso2022.output_conv,
831 signal_simple_error ("Unrecognized property", key);
833 else if (EQ (type, Qccl))
835 if (EQ (key, Qdecode))
837 CHECK_VECTOR (value);
838 CODING_SYSTEM_CCL_DECODE (codesys) = value;
840 else if (EQ (key, Qencode))
842 CHECK_VECTOR (value);
843 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
846 signal_simple_error ("Unrecognized property", key);
849 signal_simple_error ("Unrecognized property", key);
852 if (need_to_setup_eol_systems)
853 setup_eol_coding_systems (codesys);
856 Lisp_Object codesys_obj;
857 XSETCODING_SYSTEM (codesys_obj, codesys);
858 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
863 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
864 Copy OLD-CODING-SYSTEM to NEW-NAME.
865 If NEW-NAME does not name an existing coding system, a new one will
868 (old_coding_system, new_name))
870 Lisp_Object new_coding_system;
871 old_coding_system = Fget_coding_system (old_coding_system);
872 new_coding_system = Ffind_coding_system (new_name);
873 if (NILP (new_coding_system))
875 XSETCODING_SYSTEM (new_coding_system,
876 allocate_coding_system
877 (XCODING_SYSTEM_TYPE (old_coding_system),
879 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
883 struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
884 struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
885 memcpy (((char *) to ) + sizeof (to->header),
886 ((char *) from) + sizeof (from->header),
887 sizeof (*from) - sizeof (from->header));
890 return new_coding_system;
894 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
896 struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
897 Lisp_Object new_coding_system;
899 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
900 return coding_system;
904 case EOL_AUTODETECT: return coding_system;
905 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
906 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
907 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
911 return NILP (new_coding_system) ? coding_system : new_coding_system;
914 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
915 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
917 (coding_system, eol_type))
919 coding_system = Fget_coding_system (coding_system);
921 return subsidiary_coding_system (coding_system,
922 symbol_to_eol_type (eol_type));
926 /************************************************************************/
927 /* Coding system accessors */
928 /************************************************************************/
930 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
931 Return the doc string for CODING-SYSTEM.
935 coding_system = Fget_coding_system (coding_system);
936 return XCODING_SYSTEM_DOC_STRING (coding_system);
939 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
940 Return the type of CODING-SYSTEM.
944 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
946 case CODESYS_AUTODETECT: return Qundecided;
947 case CODESYS_SHIFT_JIS: return Qshift_jis;
948 case CODESYS_ISO2022: return Qiso2022;
949 case CODESYS_BIG5: return Qbig5;
950 case CODESYS_CCL: return Qccl;
951 case CODESYS_NO_CONVERSION: return Qno_conversion;
953 case CODESYS_INTERNAL: return Qinternal;
959 return Qnil; /* not reached */
963 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
966 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
968 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
971 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
972 Return initial charset of CODING-SYSTEM designated to GNUM.
975 (coding_system, gnum))
977 coding_system = Fget_coding_system (coding_system);
980 return coding_system_charset (coding_system, XINT (gnum));
983 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
984 Return the PROP property of CODING-SYSTEM.
986 (coding_system, prop))
989 enum coding_system_type type;
991 coding_system = Fget_coding_system (coding_system);
993 type = XCODING_SYSTEM_TYPE (coding_system);
995 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
996 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
999 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1001 case CODESYS_PROP_ALL_OK:
1004 case CODESYS_PROP_ISO2022:
1005 if (type != CODESYS_ISO2022)
1007 ("Property only valid in ISO2022 coding systems",
1011 case CODESYS_PROP_CCL:
1012 if (type != CODESYS_CCL)
1014 ("Property only valid in CCL coding systems",
1024 signal_simple_error ("Unrecognized property", prop);
1026 if (EQ (prop, Qname))
1027 return XCODING_SYSTEM_NAME (coding_system);
1028 else if (EQ (prop, Qtype))
1029 return Fcoding_system_type (coding_system);
1030 else if (EQ (prop, Qdoc_string))
1031 return XCODING_SYSTEM_DOC_STRING (coding_system);
1032 else if (EQ (prop, Qmnemonic))
1033 return XCODING_SYSTEM_MNEMONIC (coding_system);
1034 else if (EQ (prop, Qeol_type))
1035 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1036 else if (EQ (prop, Qeol_lf))
1037 return XCODING_SYSTEM_EOL_LF (coding_system);
1038 else if (EQ (prop, Qeol_crlf))
1039 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1040 else if (EQ (prop, Qeol_cr))
1041 return XCODING_SYSTEM_EOL_CR (coding_system);
1042 else if (EQ (prop, Qpost_read_conversion))
1043 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1044 else if (EQ (prop, Qpre_write_conversion))
1045 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1046 else if (type == CODESYS_ISO2022)
1048 if (EQ (prop, Qcharset_g0))
1049 return coding_system_charset (coding_system, 0);
1050 else if (EQ (prop, Qcharset_g1))
1051 return coding_system_charset (coding_system, 1);
1052 else if (EQ (prop, Qcharset_g2))
1053 return coding_system_charset (coding_system, 2);
1054 else if (EQ (prop, Qcharset_g3))
1055 return coding_system_charset (coding_system, 3);
1057 #define FORCE_CHARSET(charset_num) \
1058 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1059 (coding_system, charset_num) ? Qt : Qnil)
1061 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1062 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1063 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1064 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1066 #define LISP_BOOLEAN(prop) \
1067 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1069 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1070 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1071 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1072 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1073 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1074 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1075 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1077 else if (EQ (prop, Qinput_charset_conversion))
1079 unparse_charset_conversion_specs
1080 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1081 else if (EQ (prop, Qoutput_charset_conversion))
1083 unparse_charset_conversion_specs
1084 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1088 else if (type == CODESYS_CCL)
1090 if (EQ (prop, Qdecode))
1091 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1092 else if (EQ (prop, Qencode))
1093 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1100 return Qnil; /* not reached */
1104 /************************************************************************/
1105 /* Coding category functions */
1106 /************************************************************************/
1109 decode_coding_category (Lisp_Object symbol)
1113 CHECK_SYMBOL (symbol);
1114 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1115 if (EQ (coding_category_symbol[i], symbol))
1118 signal_simple_error ("Unrecognized coding category", symbol);
1119 return 0; /* not reached */
1122 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1123 Return a list of all recognized coding categories.
1128 Lisp_Object list = Qnil;
1130 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1131 list = Fcons (coding_category_symbol[i], list);
1135 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1136 Change the priority order of the coding categories.
1137 LIST should be list of coding categories, in descending order of
1138 priority. Unspecified coding categories will be lower in priority
1139 than all specified ones, in the same relative order they were in
1144 int category_to_priority[CODING_CATEGORY_LAST + 1];
1148 /* First generate a list that maps coding categories to priorities. */
1150 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1151 category_to_priority[i] = -1;
1153 /* Highest priority comes from the specified list. */
1155 EXTERNAL_LIST_LOOP (rest, list)
1157 int cat = decode_coding_category (XCAR (rest));
1159 if (category_to_priority[cat] >= 0)
1160 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1161 category_to_priority[cat] = i++;
1164 /* Now go through the existing categories by priority to retrieve
1165 the categories not yet specified and preserve their priority
1167 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1169 int cat = coding_category_by_priority[j];
1170 if (category_to_priority[cat] < 0)
1171 category_to_priority[cat] = i++;
1174 /* Now we need to construct the inverse of the mapping we just
1177 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1178 coding_category_by_priority[category_to_priority[i]] = i;
1180 /* Phew! That was confusing. */
1184 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1185 Return a list of coding categories in descending order of priority.
1190 Lisp_Object list = Qnil;
1192 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1193 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1198 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1199 Change the coding system associated with a coding category.
1201 (coding_category, coding_system))
1203 int cat = decode_coding_category (coding_category);
1205 coding_system = Fget_coding_system (coding_system);
1206 coding_category_system[cat] = coding_system;
1210 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1211 Return the coding system associated with a coding category.
1215 int cat = decode_coding_category (coding_category);
1216 Lisp_Object sys = coding_category_system[cat];
1219 return XCODING_SYSTEM_NAME (sys);
1224 /************************************************************************/
1225 /* Detecting the encoding of data */
1226 /************************************************************************/
1228 struct detection_state
1230 enum eol_type eol_type;
1252 struct iso2022_decoder iso;
1254 int high_byte_count;
1255 unsigned int saw_single_shift:1;
1268 acceptable_control_char_p (int c)
1272 /* Allow and ignore control characters that you might
1273 reasonably see in a text file */
1278 case 8: /* backspace */
1279 case 11: /* vertical tab */
1280 case 12: /* form feed */
1281 case 26: /* MS-DOS C-z junk */
1282 case 31: /* '^_' -- for info */
1290 mask_has_at_most_one_bit_p (int mask)
1292 /* Perhaps the only thing useful you learn from intensive Microsoft
1293 technical interviews */
1294 return (mask & (mask - 1)) == 0;
1297 static enum eol_type
1298 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1307 st->eol.just_saw_cr = 1;
1312 if (st->eol.just_saw_cr)
1314 else if (st->eol.seen_anything)
1317 else if (st->eol.just_saw_cr)
1319 st->eol.just_saw_cr = 0;
1321 st->eol.seen_anything = 1;
1324 return EOL_AUTODETECT;
1327 /* Attempt to determine the encoding and EOL type of the given text.
1328 Before calling this function for the first type, you must initialize
1329 st->eol_type as appropriate and initialize st->mask to ~0.
1331 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1334 st->mask holds the determined coding category mask, or ~0 if only
1335 ASCII has been seen so far.
1339 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1340 is present in st->mask
1341 1 == definitive answers are here for both st->eol_type and st->mask
1345 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1346 unsigned int n, int just_do_eol)
1350 if (st->eol_type == EOL_AUTODETECT)
1351 st->eol_type = detect_eol_type (st, src, n);
1354 return st->eol_type != EOL_AUTODETECT;
1356 if (!st->seen_non_ascii)
1358 for (; n; n--, src++)
1361 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1363 st->seen_non_ascii = 1;
1364 st->shift_jis.mask = ~0;
1366 st->iso2022.mask = ~0;
1375 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1376 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1377 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1378 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1379 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1380 st->big5.mask = detect_coding_big5 (st, src, n);
1382 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask;
1385 int retval = mask_has_at_most_one_bit_p (st->mask);
1386 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1387 return retval && st->eol_type != EOL_AUTODETECT;
1392 coding_system_from_mask (int mask)
1396 /* If the file was entirely or basically ASCII, use the
1397 default value of `buffer-file-coding-system'. */
1398 Lisp_Object retval =
1399 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1402 retval = Ffind_coding_system (retval);
1406 (Qbad_variable, Qwarning,
1407 "Invalid `default-buffer-file-coding-system', set to nil");
1408 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1412 retval = Fget_coding_system (Qno_conversion);
1420 mask = postprocess_iso2022_mask (mask);
1422 /* Look through the coding categories by priority and find
1423 the first one that is allowed. */
1424 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1426 cat = coding_category_by_priority[i];
1427 if ((mask & (1 << cat)) &&
1428 !NILP (coding_category_system[cat]))
1432 return coding_category_system[cat];
1434 return Fget_coding_system (Qno_conversion);
1438 /* Given a seekable read stream and potential coding system and EOL type
1439 as specified, do any autodetection that is called for. If the
1440 coding system and/or EOL type are not autodetect, they will be left
1441 alone; but this function will never return an autodetect coding system
1444 This function does not automatically fetch subsidiary coding systems;
1445 that should be unnecessary with the explicit eol-type argument. */
1448 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1449 enum eol_type *eol_type_in_out)
1451 struct detection_state decst;
1453 if (*eol_type_in_out == EOL_AUTODETECT)
1454 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1457 decst.eol_type = *eol_type_in_out;
1460 /* If autodetection is called for, do it now. */
1461 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1462 *eol_type_in_out == EOL_AUTODETECT)
1467 unsigned char random_buffer[4096];
1470 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1473 if (detect_coding_type (&decst, random_buffer, nread,
1474 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1475 CODESYS_AUTODETECT))
1479 *eol_type_in_out = decst.eol_type;
1480 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1481 *codesys_in_out = coding_system_from_mask (decst.mask);
1484 /* If we absolutely can't determine the EOL type, just assume LF. */
1485 if (*eol_type_in_out == EOL_AUTODETECT)
1486 *eol_type_in_out = EOL_LF;
1488 Lstream_rewind (stream);
1491 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1492 Detect coding system of the text in the region between START and END.
1493 Returned a list of possible coding systems ordered by priority.
1494 If only ASCII characters are found, it returns 'undecided or one of
1495 its subsidiary coding systems according to a detected end-of-line
1496 type. Optional arg BUFFER defaults to the current buffer.
1498 (start, end, buffer))
1500 Lisp_Object val = Qnil;
1501 struct buffer *buf = decode_buffer (buffer, 0);
1503 Lisp_Object instream, lb_instream;
1504 Lstream *istr, *lb_istr;
1505 struct detection_state decst;
1506 struct gcpro gcpro1, gcpro2;
1508 get_buffer_range_char (buf, start, end, &b, &e, 0);
1509 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1510 lb_istr = XLSTREAM (lb_instream);
1511 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1512 istr = XLSTREAM (instream);
1513 GCPRO2 (instream, lb_instream);
1515 decst.eol_type = EOL_AUTODETECT;
1519 unsigned char random_buffer[4096];
1520 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1524 if (detect_coding_type (&decst, random_buffer, nread, 0))
1528 if (decst.mask == ~0)
1529 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1537 decst.mask = postprocess_iso2022_mask (decst.mask);
1539 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1541 int sys = coding_category_by_priority[i];
1542 if (decst.mask & (1 << sys))
1544 Lisp_Object codesys = coding_category_system[sys];
1545 if (!NILP (codesys))
1546 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1547 val = Fcons (codesys, val);
1551 Lstream_close (istr);
1553 Lstream_delete (istr);
1554 Lstream_delete (lb_istr);
1559 /************************************************************************/
1560 /* Converting to internal Mule format ("decoding") */
1561 /************************************************************************/
1563 /* A decoding stream is a stream used for decoding text (i.e.
1564 converting from some external format to internal format).
1565 The decoding-stream object keeps track of the actual coding
1566 stream, the stream that is at the other end, and data that
1567 needs to be persistent across the lifetime of the stream. */
1569 /* Handle the EOL stuff related to just-read-in character C.
1570 EOL_TYPE is the EOL type of the coding stream.
1571 FLAGS is the current value of FLAGS in the coding stream, and may
1572 be modified by this macro. (The macro only looks at the
1573 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1574 bytes are to be written. You need to also define a local goto
1575 label "label_continue_loop" that is at the end of the main
1576 character-reading loop.
1578 If C is a CR character, then this macro handles it entirely and
1579 jumps to label_continue_loop. Otherwise, this macro does not add
1580 anything to DST, and continues normally. You should continue
1581 processing C normally after this macro. */
1583 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1587 if (eol_type == EOL_CR) \
1588 Dynarr_add (dst, '\n'); \
1589 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1590 Dynarr_add (dst, c); \
1592 flags |= CODING_STATE_CR; \
1593 goto label_continue_loop; \
1595 else if (flags & CODING_STATE_CR) \
1596 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1598 Dynarr_add (dst, '\r'); \
1599 flags &= ~CODING_STATE_CR; \
1603 /* C should be a binary character in the range 0 - 255; convert
1604 to internal format and add to Dynarr DST. */
1606 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1608 if (BYTE_ASCII_P (c)) \
1609 Dynarr_add (dst, c); \
1610 else if (BYTE_C1_P (c)) \
1612 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1613 Dynarr_add (dst, c + 0x20); \
1617 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1618 Dynarr_add (dst, c); \
1622 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1626 DECODE_ADD_BINARY_CHAR (ch, dst); \
1631 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1633 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1634 if ((flags & CODING_STATE_END) && \
1635 (flags & CODING_STATE_CR)) \
1636 Dynarr_add (dst, '\r'); \
1639 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1641 struct decoding_stream
1643 /* Coding system that governs the conversion. */
1644 struct Lisp_Coding_System *codesys;
1646 /* Stream that we read the encoded data from or
1647 write the decoded data to. */
1650 /* If we are reading, then we can return only a fixed amount of
1651 data, so if the conversion resulted in too much data, we store it
1652 here for retrieval the next time around. */
1653 unsigned_char_dynarr *runoff;
1655 /* FLAGS holds flags indicating the current state of the decoding.
1656 Some of these flags are dependent on the coding system. */
1659 /* CH holds a partially built-up character. Since we only deal
1660 with one- and two-byte characters at the moment, we only use
1661 this to store the first byte of a two-byte character. */
1664 /* EOL_TYPE specifies the type of end-of-line conversion that
1665 currently applies. We need to keep this separate from the
1666 EOL type stored in CODESYS because the latter might indicate
1667 automatic EOL-type detection while the former will always
1668 indicate a particular EOL type. */
1669 enum eol_type eol_type;
1671 /* Additional ISO2022 information. We define the structure above
1672 because it's also needed by the detection routines. */
1673 struct iso2022_decoder iso2022;
1675 /* Additional information (the state of the running CCL program)
1676 used by the CCL decoder. */
1677 struct ccl_program ccl;
1679 struct detection_state decst;
1682 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1683 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1684 static int decoding_rewinder (Lstream *stream);
1685 static int decoding_seekable_p (Lstream *stream);
1686 static int decoding_flusher (Lstream *stream);
1687 static int decoding_closer (Lstream *stream);
1688 static Lisp_Object decoding_marker (Lisp_Object stream,
1689 void (*markobj) (Lisp_Object));
1691 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1692 sizeof (struct decoding_stream));
1695 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1697 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1698 Lisp_Object str_obj;
1700 /* We do not need to mark the coding systems or charsets stored
1701 within the stream because they are stored in a global list
1702 and automatically marked. */
1704 XSETLSTREAM (str_obj, str);
1706 if (str->imp->marker)
1707 return (str->imp->marker) (str_obj, markobj);
1712 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1713 so we read data from the other end, decode it, and store it into DATA. */
1716 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1718 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1719 unsigned char *orig_data = data;
1721 int error_occurred = 0;
1723 /* We need to interface to mule_decode(), which expects to take some
1724 amount of data and store the result into a Dynarr. We have
1725 mule_decode() store into str->runoff, and take data from there
1728 /* We loop until we have enough data, reading chunks from the other
1729 end and decoding it. */
1732 /* Take data from the runoff if we can. Make sure to take at
1733 most SIZE bytes, and delete the data from the runoff. */
1734 if (Dynarr_length (str->runoff) > 0)
1736 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1737 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1738 Dynarr_delete_many (str->runoff, 0, chunk);
1744 break; /* No more room for data */
1746 if (str->flags & CODING_STATE_END)
1747 /* This means that on the previous iteration, we hit the EOF on
1748 the other end. We loop once more so that mule_decode() can
1749 output any final stuff it may be holding, or any "go back
1750 to a sane state" escape sequences. (This latter makes sense
1751 during encoding.) */
1754 /* Exhausted the runoff, so get some more. DATA has at least
1755 SIZE bytes left of storage in it, so it's OK to read directly
1756 into it. (We'll be overwriting above, after we've decoded it
1757 into the runoff.) */
1758 read_size = Lstream_read (str->other_end, data, size);
1765 /* There might be some more end data produced in the translation.
1766 See the comment above. */
1767 str->flags |= CODING_STATE_END;
1768 mule_decode (stream, data, str->runoff, read_size);
1771 if (data - orig_data == 0)
1772 return error_occurred ? -1 : 0;
1774 return data - orig_data;
1778 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1780 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1783 /* Decode all our data into the runoff, and then attempt to write
1784 it all out to the other end. Remove whatever chunk we succeeded
1786 mule_decode (stream, data, str->runoff, size);
1787 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1788 Dynarr_length (str->runoff));
1790 Dynarr_delete_many (str->runoff, 0, retval);
1791 /* Do NOT return retval. The return value indicates how much
1792 of the incoming data was written, not how many bytes were
1798 reset_decoding_stream (struct decoding_stream *str)
1800 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1802 Lisp_Object coding_system;
1803 XSETCODING_SYSTEM (coding_system, str->codesys);
1804 reset_iso2022 (coding_system, &str->iso2022);
1806 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1808 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1811 str->flags = str->ch = 0;
1815 decoding_rewinder (Lstream *stream)
1817 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1818 reset_decoding_stream (str);
1819 Dynarr_reset (str->runoff);
1820 return Lstream_rewind (str->other_end);
1824 decoding_seekable_p (Lstream *stream)
1826 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1827 return Lstream_seekable_p (str->other_end);
1831 decoding_flusher (Lstream *stream)
1833 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1834 return Lstream_flush (str->other_end);
1838 decoding_closer (Lstream *stream)
1840 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1841 if (stream->flags & LSTREAM_FL_WRITE)
1843 str->flags |= CODING_STATE_END;
1844 decoding_writer (stream, 0, 0);
1846 Dynarr_free (str->runoff);
1847 if (str->iso2022.composite_chars)
1848 Dynarr_free (str->iso2022.composite_chars);
1849 return Lstream_close (str->other_end);
1853 decoding_stream_coding_system (Lstream *stream)
1855 Lisp_Object coding_system;
1856 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1858 XSETCODING_SYSTEM (coding_system, str->codesys);
1859 return subsidiary_coding_system (coding_system, str->eol_type);
1863 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1865 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1866 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1868 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1869 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1870 reset_decoding_stream (str);
1873 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
1874 stream for writing, no automatic code detection will be performed.
1875 The reason for this is that automatic code detection requires a
1876 seekable input. Things will also fail if you open a decoding
1877 stream for reading using a non-fully-specified coding system and
1878 a non-seekable input stream. */
1881 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1884 Lstream *lstr = Lstream_new (lstream_decoding, mode);
1885 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1889 str->other_end = stream;
1890 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1891 str->eol_type = EOL_AUTODETECT;
1892 if (!strcmp (mode, "r")
1893 && Lstream_seekable_p (stream))
1894 /* We can determine the coding system now. */
1895 determine_real_coding_system (stream, &codesys, &str->eol_type);
1896 set_decoding_stream_coding_system (lstr, codesys);
1897 str->decst.eol_type = str->eol_type;
1898 str->decst.mask = ~0;
1899 XSETLSTREAM (obj, lstr);
1904 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
1906 return make_decoding_stream_1 (stream, codesys, "r");
1910 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
1912 return make_decoding_stream_1 (stream, codesys, "w");
1915 /* Note: the decode_coding_* functions all take the same
1916 arguments as mule_decode(), which is to say some SRC data of
1917 size N, which is to be stored into dynamic array DST.
1918 DECODING is the stream within which the decoding is
1919 taking place, but no data is actually read from or
1920 written to that stream; that is handled in decoding_reader()
1921 or decoding_writer(). This allows the same functions to
1922 be used for both reading and writing. */
1925 mule_decode (Lstream *decoding, CONST unsigned char *src,
1926 unsigned_char_dynarr *dst, unsigned int n)
1928 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
1930 /* If necessary, do encoding-detection now. We do this when
1931 we're a writing stream or a non-seekable reading stream,
1932 meaning that we can't just process the whole input,
1933 rewind, and start over. */
1935 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
1936 str->eol_type == EOL_AUTODETECT)
1938 Lisp_Object codesys;
1940 XSETCODING_SYSTEM (codesys, str->codesys);
1941 detect_coding_type (&str->decst, src, n,
1942 CODING_SYSTEM_TYPE (str->codesys) !=
1943 CODESYS_AUTODETECT);
1944 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
1945 str->decst.mask != ~0)
1946 /* #### This is cheesy. What we really ought to do is
1947 buffer up a certain amount of data so as to get a
1948 less random result. */
1949 codesys = coding_system_from_mask (str->decst.mask);
1950 str->eol_type = str->decst.eol_type;
1951 if (XCODING_SYSTEM (codesys) != str->codesys)
1953 /* Preserve the CODING_STATE_END flag in case it was set.
1954 If we erase it, bad things might happen. */
1955 int was_end = str->flags & CODING_STATE_END;
1956 set_decoding_stream_coding_system (decoding, codesys);
1958 str->flags |= CODING_STATE_END;
1962 switch (CODING_SYSTEM_TYPE (str->codesys))
1965 case CODESYS_INTERNAL:
1966 Dynarr_add_many (dst, src, n);
1969 case CODESYS_AUTODETECT:
1970 /* If we got this far and still haven't decided on the coding
1971 system, then do no conversion. */
1972 case CODESYS_NO_CONVERSION:
1973 decode_coding_no_conversion (decoding, src, dst, n);
1975 case CODESYS_SHIFT_JIS:
1976 decode_coding_sjis (decoding, src, dst, n);
1979 decode_coding_big5 (decoding, src, dst, n);
1982 ccl_driver (&str->ccl, src, dst, n, 0);
1984 case CODESYS_ISO2022:
1985 decode_coding_iso2022 (decoding, src, dst, n);
1992 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
1993 Decode the text between START and END which is encoded in CODING-SYSTEM.
1994 This is useful if you've read in encoded text from a file without decoding
1995 it (e.g. you read in a JIS-formatted file but used the `binary' or
1996 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
1997 Return length of decoded text.
1998 BUFFER defaults to the current buffer if unspecified.
2000 (start, end, coding_system, buffer))
2003 struct buffer *buf = decode_buffer (buffer, 0);
2004 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2005 Lstream *istr, *ostr;
2006 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2008 get_buffer_range_char (buf, start, end, &b, &e, 0);
2010 barf_if_buffer_read_only (buf, b, e);
2012 coding_system = Fget_coding_system (coding_system);
2013 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2014 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2015 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2017 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2018 Fget_coding_system (Qbinary));
2019 istr = XLSTREAM (instream);
2020 ostr = XLSTREAM (outstream);
2021 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2023 /* The chain of streams looks like this:
2025 [BUFFER] <----- send through
2026 ------> [ENCODE AS BINARY]
2027 ------> [DECODE AS SPECIFIED]
2033 char tempbuf[1024]; /* some random amount */
2034 Bufpos newpos, even_newer_pos;
2035 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2036 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2040 newpos = lisp_buffer_stream_startpos (istr);
2041 Lstream_write (ostr, tempbuf, size_in_bytes);
2042 even_newer_pos = lisp_buffer_stream_startpos (istr);
2043 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2046 Lstream_close (istr);
2047 Lstream_close (ostr);
2049 Lstream_delete (istr);
2050 Lstream_delete (ostr);
2051 Lstream_delete (XLSTREAM (de_outstream));
2052 Lstream_delete (XLSTREAM (lb_outstream));
2057 /************************************************************************/
2058 /* Converting to an external encoding ("encoding") */
2059 /************************************************************************/
2061 /* An encoding stream is an output stream. When you create the
2062 stream, you specify the coding system that governs the encoding
2063 and another stream that the resulting encoded data is to be
2064 sent to, and then start sending data to it. */
2066 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2068 struct encoding_stream
2070 /* Coding system that governs the conversion. */
2071 struct Lisp_Coding_System *codesys;
2073 /* Stream that we read the encoded data from or
2074 write the decoded data to. */
2077 /* If we are reading, then we can return only a fixed amount of
2078 data, so if the conversion resulted in too much data, we store it
2079 here for retrieval the next time around. */
2080 unsigned_char_dynarr *runoff;
2082 /* FLAGS holds flags indicating the current state of the encoding.
2083 Some of these flags are dependent on the coding system. */
2086 /* CH holds a partially built-up character. Since we only deal
2087 with one- and two-byte characters at the moment, we only use
2088 this to store the first byte of a two-byte character. */
2091 /* Additional information used by the ISO2022 encoder. */
2094 /* CHARSET holds the character sets currently assigned to the G0
2095 through G3 registers. It is initialized from the array
2096 INITIAL_CHARSET in CODESYS. */
2097 Lisp_Object charset[4];
2099 /* Which registers are currently invoked into the left (GL) and
2100 right (GR) halves of the 8-bit encoding space? */
2101 int register_left, register_right;
2103 /* Whether we need to explicitly designate the charset in the
2104 G? register before using it. It is initialized from the
2105 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2106 unsigned char force_charset_on_output[4];
2108 /* Other state variables that need to be preserved across
2110 Lisp_Object current_charset;
2112 int current_char_boundary;
2115 /* Additional information (the state of the running CCL program)
2116 used by the CCL encoder. */
2117 struct ccl_program ccl;
2120 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2121 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2123 static int encoding_rewinder (Lstream *stream);
2124 static int encoding_seekable_p (Lstream *stream);
2125 static int encoding_flusher (Lstream *stream);
2126 static int encoding_closer (Lstream *stream);
2127 static Lisp_Object encoding_marker (Lisp_Object stream,
2128 void (*markobj) (Lisp_Object));
2130 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2131 sizeof (struct encoding_stream));
2134 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2136 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2137 Lisp_Object str_obj;
2139 /* We do not need to mark the coding systems or charsets stored
2140 within the stream because they are stored in a global list
2141 and automatically marked. */
2143 XSETLSTREAM (str_obj, str);
2145 if (str->imp->marker)
2146 return (str->imp->marker) (str_obj, markobj);
2151 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2152 so we read data from the other end, encode it, and store it into DATA. */
2155 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2157 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2158 unsigned char *orig_data = data;
2160 int error_occurred = 0;
2162 /* We need to interface to mule_encode(), which expects to take some
2163 amount of data and store the result into a Dynarr. We have
2164 mule_encode() store into str->runoff, and take data from there
2167 /* We loop until we have enough data, reading chunks from the other
2168 end and encoding it. */
2171 /* Take data from the runoff if we can. Make sure to take at
2172 most SIZE bytes, and delete the data from the runoff. */
2173 if (Dynarr_length (str->runoff) > 0)
2175 int chunk = min ((int) size, Dynarr_length (str->runoff));
2176 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2177 Dynarr_delete_many (str->runoff, 0, chunk);
2183 break; /* No more room for data */
2185 if (str->flags & CODING_STATE_END)
2186 /* This means that on the previous iteration, we hit the EOF on
2187 the other end. We loop once more so that mule_encode() can
2188 output any final stuff it may be holding, or any "go back
2189 to a sane state" escape sequences. (This latter makes sense
2190 during encoding.) */
2193 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2194 left of storage in it, so it's OK to read directly into it.
2195 (We'll be overwriting above, after we've encoded it into the
2197 read_size = Lstream_read (str->other_end, data, size);
2204 /* There might be some more end data produced in the translation.
2205 See the comment above. */
2206 str->flags |= CODING_STATE_END;
2207 mule_encode (stream, data, str->runoff, read_size);
2210 if (data == orig_data)
2211 return error_occurred ? -1 : 0;
2213 return data - orig_data;
2217 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2219 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2222 /* Encode all our data into the runoff, and then attempt to write
2223 it all out to the other end. Remove whatever chunk we succeeded
2225 mule_encode (stream, data, str->runoff, size);
2226 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2227 Dynarr_length (str->runoff));
2229 Dynarr_delete_many (str->runoff, 0, retval);
2230 /* Do NOT return retval. The return value indicates how much
2231 of the incoming data was written, not how many bytes were
2237 reset_encoding_stream (struct encoding_stream *str)
2239 switch (CODING_SYSTEM_TYPE (str->codesys))
2241 case CODESYS_ISO2022:
2245 for (i = 0; i < 4; i++)
2247 str->iso2022.charset[i] =
2248 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2249 str->iso2022.force_charset_on_output[i] =
2250 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2252 str->iso2022.register_left = 0;
2253 str->iso2022.register_right = 1;
2254 str->iso2022.current_charset = Qnil;
2255 str->iso2022.current_half = 0;
2256 str->iso2022.current_char_boundary = 1;
2260 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2266 str->flags = str->ch = 0;
2270 encoding_rewinder (Lstream *stream)
2272 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2273 reset_encoding_stream (str);
2274 Dynarr_reset (str->runoff);
2275 return Lstream_rewind (str->other_end);
2279 encoding_seekable_p (Lstream *stream)
2281 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2282 return Lstream_seekable_p (str->other_end);
2286 encoding_flusher (Lstream *stream)
2288 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2289 return Lstream_flush (str->other_end);
2293 encoding_closer (Lstream *stream)
2295 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2296 if (stream->flags & LSTREAM_FL_WRITE)
2298 str->flags |= CODING_STATE_END;
2299 encoding_writer (stream, 0, 0);
2301 Dynarr_free (str->runoff);
2302 return Lstream_close (str->other_end);
2306 encoding_stream_coding_system (Lstream *stream)
2308 Lisp_Object coding_system;
2309 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2311 XSETCODING_SYSTEM (coding_system, str->codesys);
2312 return coding_system;
2316 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2318 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2319 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2321 reset_encoding_stream (str);
2325 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2328 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2329 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2333 str->runoff = Dynarr_new (unsigned_char);
2334 str->other_end = stream;
2335 set_encoding_stream_coding_system (lstr, codesys);
2336 XSETLSTREAM (obj, lstr);
2341 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2343 return make_encoding_stream_1 (stream, codesys, "r");
2347 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2349 return make_encoding_stream_1 (stream, codesys, "w");
2352 /* Convert N bytes of internally-formatted data stored in SRC to an
2353 external format, according to the encoding stream ENCODING.
2354 Store the encoded data into DST. */
2357 mule_encode (Lstream *encoding, CONST unsigned char *src,
2358 unsigned_char_dynarr *dst, unsigned int n)
2360 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2362 switch (CODING_SYSTEM_TYPE (str->codesys))
2365 case CODESYS_INTERNAL:
2366 Dynarr_add_many (dst, src, n);
2369 case CODESYS_AUTODETECT:
2370 /* If we got this far and still haven't decided on the coding
2371 system, then do no conversion. */
2372 case CODESYS_NO_CONVERSION:
2373 encode_coding_no_conversion (encoding, src, dst, n);
2375 case CODESYS_SHIFT_JIS:
2376 encode_coding_sjis (encoding, src, dst, n);
2379 encode_coding_big5 (encoding, src, dst, n);
2382 ccl_driver (&str->ccl, src, dst, n, 0);
2384 case CODESYS_ISO2022:
2385 encode_coding_iso2022 (encoding, src, dst, n);
2392 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2393 Encode the text between START and END using CODING-SYSTEM.
2394 This will, for example, convert Japanese characters into stuff such as
2395 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2396 text. BUFFER defaults to the current buffer if unspecified.
2398 (start, end, coding_system, buffer))
2401 struct buffer *buf = decode_buffer (buffer, 0);
2402 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2403 Lstream *istr, *ostr;
2404 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2406 get_buffer_range_char (buf, start, end, &b, &e, 0);
2408 barf_if_buffer_read_only (buf, b, e);
2410 coding_system = Fget_coding_system (coding_system);
2411 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2412 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2413 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2414 Fget_coding_system (Qbinary));
2415 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2417 istr = XLSTREAM (instream);
2418 ostr = XLSTREAM (outstream);
2419 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2420 /* The chain of streams looks like this:
2422 [BUFFER] <----- send through
2423 ------> [ENCODE AS SPECIFIED]
2424 ------> [DECODE AS BINARY]
2429 char tempbuf[1024]; /* some random amount */
2430 Bufpos newpos, even_newer_pos;
2431 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2432 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2436 newpos = lisp_buffer_stream_startpos (istr);
2437 Lstream_write (ostr, tempbuf, size_in_bytes);
2438 even_newer_pos = lisp_buffer_stream_startpos (istr);
2439 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2445 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2446 Lstream_close (istr);
2447 Lstream_close (ostr);
2449 Lstream_delete (istr);
2450 Lstream_delete (ostr);
2451 Lstream_delete (XLSTREAM (de_outstream));
2452 Lstream_delete (XLSTREAM (lb_outstream));
2453 return make_int (retlen);
2458 /************************************************************************/
2459 /* Shift-JIS methods */
2460 /************************************************************************/
2462 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2463 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2464 as is. A character of JISX0201-Kana (TYPE94 character set) is
2465 encoded by "position-code + 0x80". A character of JISX0208
2466 (TYPE94x94 character set) is encoded in 2-byte but two
2467 position-codes are divided and shifted so that it fit in the range
2470 --- CODE RANGE of Shift-JIS ---
2471 (character set) (range)
2473 JISX0201-Kana 0xA0 .. 0xDF
2474 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2475 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2476 -------------------------------
2480 /* Is this the first byte of a Shift-JIS two-byte char? */
2482 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2483 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2485 /* Is this the second byte of a Shift-JIS two-byte char? */
2487 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2488 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2490 #define BYTE_SJIS_KATAKANA_P(c) \
2491 ((c) >= 0xA1 && (c) <= 0xDF)
2494 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2502 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2504 if (st->shift_jis.in_second_byte)
2506 st->shift_jis.in_second_byte = 0;
2510 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2511 st->shift_jis.in_second_byte = 1;
2513 return CODING_CATEGORY_SHIFT_JIS_MASK;
2516 /* Convert Shift-JIS data to internal format. */
2519 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2520 unsigned_char_dynarr *dst, unsigned int n)
2523 unsigned int flags, ch;
2524 enum eol_type eol_type;
2525 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2527 CODING_STREAM_DECOMPOSE (str, flags, ch);
2528 eol_type = str->eol_type;
2536 /* Previous character was first byte of Shift-JIS Kanji char. */
2537 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2539 unsigned char e1, e2;
2541 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2542 DECODE_SJIS (ch, c, e1, e2);
2543 Dynarr_add (dst, e1);
2544 Dynarr_add (dst, e2);
2548 DECODE_ADD_BINARY_CHAR (ch, dst);
2549 DECODE_ADD_BINARY_CHAR (c, dst);
2555 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2556 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2558 else if (BYTE_SJIS_KATAKANA_P (c))
2560 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2561 Dynarr_add (dst, c);
2564 DECODE_ADD_BINARY_CHAR (c, dst);
2566 label_continue_loop:;
2569 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2571 CODING_STREAM_COMPOSE (str, flags, ch);
2574 /* Convert internally-formatted data to Shift-JIS. */
2577 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2578 unsigned_char_dynarr *dst, unsigned int n)
2581 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2582 unsigned int flags, ch;
2583 enum eol_type eol_type;
2585 CODING_STREAM_DECOMPOSE (str, flags, ch);
2586 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2593 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2594 Dynarr_add (dst, '\r');
2595 if (eol_type != EOL_CR)
2596 Dynarr_add (dst, '\n');
2599 else if (BYTE_ASCII_P (c))
2601 Dynarr_add (dst, c);
2604 else if (BUFBYTE_LEADING_BYTE_P (c))
2605 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2606 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2607 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2610 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2612 Dynarr_add (dst, c);
2615 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2616 ch == LEADING_BYTE_JAPANESE_JISX0208)
2620 unsigned char j1, j2;
2621 ENCODE_SJIS (ch, c, j1, j2);
2622 Dynarr_add (dst, j1);
2623 Dynarr_add (dst, j2);
2629 CODING_STREAM_COMPOSE (str, flags, ch);
2632 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2633 Decode a JISX0208 character of Shift-JIS coding-system.
2634 CODE is the character code in Shift-JIS as a cons of type bytes.
2635 Return the corresponding character.
2639 unsigned char c1, c2, s1, s2;
2642 CHECK_INT (XCAR (code));
2643 CHECK_INT (XCDR (code));
2644 s1 = XINT (XCAR (code));
2645 s2 = XINT (XCDR (code));
2646 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2647 BYTE_SJIS_TWO_BYTE_2_P (s2))
2649 DECODE_SJIS (s1, s2, c1, c2);
2650 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2651 c1 & 0x7F, c2 & 0x7F));
2657 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2658 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2659 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2663 Lisp_Object charset;
2666 CHECK_CHAR_COERCE_INT (ch);
2667 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2668 if (EQ (charset, Vcharset_japanese_jisx0208))
2670 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2671 return Fcons (make_int (s1), make_int (s2));
2678 /************************************************************************/
2680 /************************************************************************/
2682 /* BIG5 is a coding system encoding two character sets: ASCII and
2683 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2684 character set and is encoded in two-byte.
2686 --- CODE RANGE of BIG5 ---
2687 (character set) (range)
2689 Big5 (1st byte) 0xA1 .. 0xFE
2690 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2691 --------------------------
2693 Since the number of characters in Big5 is larger than maximum
2694 characters in Emacs' charset (96x96), it can't be handled as one
2695 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2696 and `charset-big5-2'. Both <type>s are TYPE94x94. The former
2697 contains frequently used characters and the latter contains less
2698 frequently used characters. */
2700 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2701 ((c) >= 0xA1 && (c) <= 0xFE)
2703 /* Is this the second byte of a Shift-JIS two-byte char? */
2705 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2706 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2708 /* Number of Big5 characters which have the same code in 1st byte. */
2710 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2712 /* Code conversion macros. These are macros because they are used in
2713 inner loops during code conversion.
2715 Note that temporary variables in macros introduce the classic
2716 dynamic-scoping problems with variable names. We use capital-
2717 lettered variables in the assumption that XEmacs does not use
2718 capital letters in variables except in a very formalized way
2721 /* Convert Big5 code (b1, b2) into its internal string representation
2724 /* There is a much simpler way to split the Big5 charset into two.
2725 For the moment I'm going to leave the algorithm as-is because it
2726 claims to separate out the most-used characters into a single
2727 charset, which perhaps will lead to optimizations in various
2730 The way the algorithm works is something like this:
2732 Big5 can be viewed as a 94x157 charset, where the row is
2733 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2734 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2735 the split between low and high column numbers is apparently
2736 meaningless; ascending rows produce less and less frequent chars.
2737 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2738 the first charset, and the upper half (0xC9 .. 0xFE) to the
2739 second. To do the conversion, we convert the character into
2740 a single number where 0 .. 156 is the first row, 157 .. 313
2741 is the second, etc. That way, the characters are ordered by
2742 decreasing frequency. Then we just chop the space in two
2743 and coerce the result into a 94x94 space.
2746 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2748 int B1 = b1, B2 = b2; \
2750 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2754 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2758 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2759 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2761 c1 = I / (0xFF - 0xA1) + 0xA1; \
2762 c2 = I % (0xFF - 0xA1) + 0xA1; \
2765 /* Convert the internal string representation of a Big5 character
2766 (lb, c1, c2) into Big5 code (b1, b2). */
2768 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2770 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2772 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2774 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2776 b1 = I / BIG5_SAME_ROW + 0xA1; \
2777 b2 = I % BIG5_SAME_ROW; \
2778 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2782 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2790 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2791 (c >= 0x80 && c <= 0xA0))
2793 if (st->big5.in_second_byte)
2795 st->big5.in_second_byte = 0;
2796 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2800 st->big5.in_second_byte = 1;
2802 return CODING_CATEGORY_BIG5_MASK;
2805 /* Convert Big5 data to internal format. */
2808 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2809 unsigned_char_dynarr *dst, unsigned int n)
2812 unsigned int flags, ch;
2813 enum eol_type eol_type;
2814 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2816 CODING_STREAM_DECOMPOSE (str, flags, ch);
2817 eol_type = str->eol_type;
2824 /* Previous character was first byte of Big5 char. */
2825 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2827 unsigned char b1, b2, b3;
2828 DECODE_BIG5 (ch, c, b1, b2, b3);
2829 Dynarr_add (dst, b1);
2830 Dynarr_add (dst, b2);
2831 Dynarr_add (dst, b3);
2835 DECODE_ADD_BINARY_CHAR (ch, dst);
2836 DECODE_ADD_BINARY_CHAR (c, dst);
2842 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2843 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2846 DECODE_ADD_BINARY_CHAR (c, dst);
2848 label_continue_loop:;
2851 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2853 CODING_STREAM_COMPOSE (str, flags, ch);
2856 /* Convert internally-formatted data to Big5. */
2859 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2860 unsigned_char_dynarr *dst, unsigned int n)
2863 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2864 unsigned int flags, ch;
2865 enum eol_type eol_type;
2867 CODING_STREAM_DECOMPOSE (str, flags, ch);
2868 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2875 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2876 Dynarr_add (dst, '\r');
2877 if (eol_type != EOL_CR)
2878 Dynarr_add (dst, '\n');
2880 else if (BYTE_ASCII_P (c))
2883 Dynarr_add (dst, c);
2885 else if (BUFBYTE_LEADING_BYTE_P (c))
2887 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
2888 c == LEADING_BYTE_CHINESE_BIG5_2)
2890 /* A recognized leading byte. */
2892 continue; /* not done with this character. */
2894 /* otherwise just ignore this character. */
2896 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
2897 ch == LEADING_BYTE_CHINESE_BIG5_2)
2899 /* Previous char was a recognized leading byte. */
2901 continue; /* not done with this character. */
2905 /* Encountering second byte of a Big5 character. */
2906 unsigned char b1, b2;
2908 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
2909 Dynarr_add (dst, b1);
2910 Dynarr_add (dst, b2);
2916 CODING_STREAM_COMPOSE (str, flags, ch);
2920 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
2921 Decode a Big5 character CODE of BIG5 coding-system.
2922 CODE is the character code in BIG5, a cons of two integers.
2923 Return the corresponding character.
2927 unsigned char c1, c2, b1, b2;
2930 CHECK_INT (XCAR (code));
2931 CHECK_INT (XCDR (code));
2932 b1 = XINT (XCAR (code));
2933 b2 = XINT (XCDR (code));
2934 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
2935 BYTE_BIG5_TWO_BYTE_2_P (b2))
2938 Lisp_Object charset;
2939 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
2940 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
2941 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
2947 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
2948 Encode the Big5 character CH to BIG5 coding-system.
2949 Return the corresponding character code in Big5.
2953 Lisp_Object charset;
2956 CHECK_CHAR_COERCE_INT (ch);
2957 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2958 if (EQ (charset, Vcharset_chinese_big5_1) ||
2959 EQ (charset, Vcharset_chinese_big5_2))
2961 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
2963 return Fcons (make_int (b1), make_int (b2));
2970 /************************************************************************/
2971 /* ISO2022 methods */
2972 /************************************************************************/
2974 /* The following note describes the coding system ISO2022 briefly.
2975 Since the intention of this note is to help understanding of the
2976 programs in this file, some parts are NOT ACCURATE or OVERLY
2977 SIMPLIFIED. For thorough understanding, please refer to the
2978 original document of ISO2022.
2980 ISO2022 provides many mechanisms to encode several character sets
2981 in 7-bit and 8-bit environments. If one chooses 7-bit environment,
2982 all text is encoded by codes of less than 128. This may make the
2983 encoded text a little bit longer, but the text get more stability
2984 to pass through several gateways (some of them strip off MSB).
2986 There are two kind of character sets: control character set and
2987 graphic character set. The former contains control characters such
2988 as `newline' and `escape' to provide control functions (control
2989 functions are provided also by escape sequence). The latter
2990 contains graphic characters such as 'A' and '-'. Emacs recognizes
2991 two control character sets and many graphic character sets.
2993 Graphic character sets are classified into one of four types,
2994 according to the dimension and number of characters in the set:
2995 TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each
2996 character set is assigned an identification byte, unique for each
2997 type, called "final character" (denoted as <F> hereafter). The <F>
2998 of each character set is decided by ECMA(*) when it is registered
2999 in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for
3002 Note (*): ECMA = European Computer Manufacturers Association
3004 Here are examples of graphic character set [NAME(<F>)]:
3005 o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3006 o TYPE96 -- right-half-of-ISO8859-1('A'), ...
3007 o TYPE94x94 -- GB2312('A'), JISX0208('B'), ...
3008 o TYPE96x96 -- none for the moment
3010 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
3011 C0 [0x00..0x1F] -- control character plane 0
3012 GL [0x20..0x7F] -- graphic character plane 0
3013 C1 [0x80..0x9F] -- control character plane 1
3014 GR [0xA0..0xFF] -- graphic character plane 1
3016 A control character set is directly designated and invoked to C0 or
3017 C1 by an escape sequence. The most common case is that:
3018 - ISO646's control character set is designated/invoked to C0, and
3019 - ISO6429's control character set is designated/invoked to C1,
3020 and usually these designations/invocations are omitted in encoded
3021 text. In a 7-bit environment, only C0 can be used, and a control
3022 character for C1 is encoded by an appropriate escape sequence to
3023 fit into the environment. All control characters for C1 are
3024 defined to have corresponding escape sequences.
3026 A graphic character set is at first designated to one of four
3027 graphic registers (G0 through G3), then these graphic registers are
3028 invoked to GL or GR. These designations and invocations can be
3029 done independently. The most common case is that G0 is invoked to
3030 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3031 these invocations and designations are omitted in encoded text.
3032 In a 7-bit environment, only GL can be used.
3034 When a graphic character set of TYPE94 or TYPE94x94 is invoked to
3035 GL, codes 0x20 and 0x7F of the GL area work as control characters
3036 SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area
3039 There are two ways of invocation: locking-shift and single-shift.
3040 With locking-shift, the invocation lasts until the next different
3041 invocation, whereas with single-shift, the invocation works only
3042 for the following character and doesn't affect locking-shift.
3043 Invocations are done by the following control characters or escape
3046 ----------------------------------------------------------------------
3047 abbrev function cntrl escape seq description
3048 ----------------------------------------------------------------------
3049 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3050 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3051 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR
3052 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3053 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR
3054 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3055 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR
3056 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3057 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3058 ----------------------------------------------------------------------
3059 The first four are for locking-shift. Control characters for these
3060 functions are defined by macros ISO_CODE_XXX in `coding.h'.
3062 Designations are done by the following escape sequences.
3063 ----------------------------------------------------------------------
3064 escape sequence description
3065 ----------------------------------------------------------------------
3066 ESC '(' <F> designate TYPE94<F> to G0
3067 ESC ')' <F> designate TYPE94<F> to G1
3068 ESC '*' <F> designate TYPE94<F> to G2
3069 ESC '+' <F> designate TYPE94<F> to G3
3070 ESC ',' <F> designate TYPE96<F> to G0 (*)
3071 ESC '-' <F> designate TYPE96<F> to G1
3072 ESC '.' <F> designate TYPE96<F> to G2
3073 ESC '/' <F> designate TYPE96<F> to G3
3074 ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**)
3075 ESC '$' ')' <F> designate TYPE94x94<F> to G1
3076 ESC '$' '*' <F> designate TYPE94x94<F> to G2
3077 ESC '$' '+' <F> designate TYPE94x94<F> to G3
3078 ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*)
3079 ESC '$' '-' <F> designate TYPE96x96<F> to G1
3080 ESC '$' '.' <F> designate TYPE96x96<F> to G2
3081 ESC '$' '/' <F> designate TYPE96x96<F> to G3
3082 ----------------------------------------------------------------------
3083 In this list, "TYPE94<F>" means a graphic character set of type TYPE94
3084 and final character <F>, and etc.
3086 Note (*): Although these designations are not allowed in ISO2022,
3087 Emacs accepts them on decoding, and produces them on encoding
3088 TYPE96 or TYPE96x96 character set in a coding system which is
3089 characterized as 7-bit environment, non-locking-shift, and
3092 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3093 '(' can be omitted. We call this as "short-form" here after.
3095 Now you may notice that there are a lot of ways for encoding the
3096 same multilingual text in ISO2022. Actually, there exist many
3097 coding systems such as Compound Text (used in X's inter client
3098 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3099 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3100 localized platforms), and all of these are variants of ISO2022.
3102 In addition to the above, Emacs handles two more kinds of escape
3103 sequences: ISO6429's direction specification and Emacs' private
3104 sequence for specifying character composition.
3106 ISO6429's direction specification takes the following format:
3107 o CSI ']' -- end of the current direction
3108 o CSI '0' ']' -- end of the current direction
3109 o CSI '1' ']' -- start of left-to-right text
3110 o CSI '2' ']' -- start of right-to-left text
3111 The control character CSI (0x9B: control sequence introducer) is
3112 abbreviated to the escape sequence ESC '[' in 7-bit environment.
3114 Character composition specification takes the following format:
3115 o ESC '0' -- start character composition
3116 o ESC '1' -- end character composition
3117 Since these are not standard escape sequences of any ISO, the use
3118 of them for these meanings is restricted to Emacs only. */
3121 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3125 for (i = 0; i < 4; i++)
3127 if (!NILP (coding_system))
3129 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3131 iso->charset[i] = Qt;
3132 iso->invalid_designated[i] = 0;
3134 iso->esc = ISO_ESC_NOTHING;
3135 iso->esc_bytes_index = 0;
3136 iso->register_left = 0;
3137 iso->register_right = 1;
3138 iso->switched_dir_and_no_valid_charset_yet = 0;
3139 iso->invalid_switch_dir = 0;
3140 iso->output_direction_sequence = 0;
3141 iso->output_literally = 0;
3142 if (iso->composite_chars)
3143 Dynarr_reset (iso->composite_chars);
3147 fit_to_be_escape_quoted (unsigned char c)
3164 /* Parse one byte of an ISO2022 escape sequence.
3165 If the result is an invalid escape sequence, return 0 and
3166 do not change anything in STR. Otherwise, if the result is
3167 an incomplete escape sequence, update ISO2022.ESC and
3168 ISO2022.ESC_BYTES and return -1. Otherwise, update
3169 all the state variables (but not ISO2022.ESC_BYTES) and
3172 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3173 or invocation of an invalid character set and treat that as
3174 an unrecognized escape sequence. */
3177 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3178 unsigned char c, unsigned int *flags,
3179 int check_invalid_charsets)
3181 /* (1) If we're at the end of a designation sequence, CS is the
3182 charset being designated and REG is the register to designate
3185 (2) If we're at the end of a locking-shift sequence, REG is
3186 the register to invoke and HALF (0 == left, 1 == right) is
3187 the half to invoke it into.
3189 (3) If we're at the end of a single-shift sequence, REG is
3190 the register to invoke. */
3191 Lisp_Object cs = Qnil;
3194 /* NOTE: This code does goto's all over the fucking place.
3195 The reason for this is that we're basically implementing
3196 a state machine here, and hierarchical languages like C
3197 don't really provide a clean way of doing this. */
3199 if (! (*flags & CODING_STATE_ESCAPE))
3200 /* At beginning of escape sequence; we need to reset our
3201 escape-state variables. */
3202 iso->esc = ISO_ESC_NOTHING;
3204 iso->output_literally = 0;
3205 iso->output_direction_sequence = 0;
3209 case ISO_ESC_NOTHING:
3210 iso->esc_bytes_index = 0;
3213 case ISO_CODE_ESC: /* Start escape sequence */
3214 *flags |= CODING_STATE_ESCAPE;
3218 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3219 *flags |= CODING_STATE_ESCAPE;
3220 iso->esc = ISO_ESC_5_11;
3223 case ISO_CODE_SO: /* locking shift 1 */
3226 case ISO_CODE_SI: /* locking shift 0 */
3230 case ISO_CODE_SS2: /* single shift */
3233 case ISO_CODE_SS3: /* single shift */
3237 default: /* Other control characters */
3244 /**** single shift ****/
3246 case 'N': /* single shift 2 */
3249 case 'O': /* single shift 3 */
3253 /**** locking shift ****/
3255 case '~': /* locking shift 1 right */
3258 case 'n': /* locking shift 2 */
3261 case '}': /* locking shift 2 right */
3264 case 'o': /* locking shift 3 */
3267 case '|': /* locking shift 3 right */
3271 /**** composite ****/
3274 iso->esc = ISO_ESC_START_COMPOSITE;
3275 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3276 CODING_STATE_COMPOSITE;
3280 iso->esc = ISO_ESC_END_COMPOSITE;
3281 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3282 ~CODING_STATE_COMPOSITE;
3285 /**** directionality ****/
3288 iso->esc = ISO_ESC_5_11;
3291 /**** designation ****/
3293 case '$': /* multibyte charset prefix */
3294 iso->esc = ISO_ESC_2_4;
3298 if (0x28 <= c && c <= 0x2F)
3300 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3304 /* This function is called with CODESYS equal to nil when
3305 doing coding-system detection. */
3307 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3308 && fit_to_be_escape_quoted (c))
3310 iso->esc = ISO_ESC_LITERAL;
3311 *flags &= CODING_STATE_ISO2022_LOCK;
3321 /**** directionality ****/
3323 case ISO_ESC_5_11: /* ISO6429 direction control */
3326 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3327 goto directionality;
3329 if (c == '0') iso->esc = ISO_ESC_5_11_0;
3330 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3331 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3335 case ISO_ESC_5_11_0:
3338 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3339 goto directionality;
3343 case ISO_ESC_5_11_1:
3346 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3347 goto directionality;
3351 case ISO_ESC_5_11_2:
3354 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
3355 goto directionality;
3360 iso->esc = ISO_ESC_DIRECTIONALITY;
3361 /* Various junk here to attempt to preserve the direction sequences
3362 literally in the text if they would otherwise be swallowed due
3363 to invalid designations that don't show up as actual charset
3364 changes in the text. */
3365 if (iso->invalid_switch_dir)
3367 /* We already inserted a direction switch literally into the
3368 text. We assume (#### this may not be right) that the
3369 next direction switch is the one going the other way,
3370 and we need to output that literally as well. */
3371 iso->output_literally = 1;
3372 iso->invalid_switch_dir = 0;
3378 /* If we are in the thrall of an invalid designation,
3379 then stick the directionality sequence literally into the
3380 output stream so it ends up in the original text again. */
3381 for (jj = 0; jj < 4; jj++)
3382 if (iso->invalid_designated[jj])
3386 iso->output_literally = 1;
3387 iso->invalid_switch_dir = 1;
3390 /* Indicate that we haven't yet seen a valid designation,
3391 so that if a switch-dir is directly followed by an
3392 invalid designation, both get inserted literally. */
3393 iso->switched_dir_and_no_valid_charset_yet = 1;
3398 /**** designation ****/
3401 if (0x28 <= c && c <= 0x2F)
3403 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
3406 if (0x40 <= c && c <= 0x42)
3408 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
3409 *flags & CODING_STATE_R2L ?
3410 CHARSET_RIGHT_TO_LEFT :
3411 CHARSET_LEFT_TO_RIGHT);
3421 if (c < '0' || c > '~')
3422 return 0; /* bad final byte */
3424 if (iso->esc >= ISO_ESC_2_8 &&
3425 iso->esc <= ISO_ESC_2_15)
3427 type = ((iso->esc >= ISO_ESC_2_12) ?
3428 CHARSET_TYPE_96 : CHARSET_TYPE_94);
3429 reg = (iso->esc - ISO_ESC_2_8) & 3;
3431 else if (iso->esc >= ISO_ESC_2_4_8 &&
3432 iso->esc <= ISO_ESC_2_4_15)
3434 type = ((iso->esc >= ISO_ESC_2_4_12) ?
3435 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
3436 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
3440 /* Can this ever be reached? -slb */
3444 cs = CHARSET_BY_ATTRIBUTES (type, c,
3445 *flags & CODING_STATE_R2L ?
3446 CHARSET_RIGHT_TO_LEFT :
3447 CHARSET_LEFT_TO_RIGHT);
3453 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
3457 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
3458 /* can't invoke something that ain't there. */
3460 iso->esc = ISO_ESC_SINGLE_SHIFT;
3461 *flags &= CODING_STATE_ISO2022_LOCK;
3463 *flags |= CODING_STATE_SS2;
3465 *flags |= CODING_STATE_SS3;
3469 if (check_invalid_charsets &&
3470 !CHARSETP (iso->charset[reg]))
3471 /* can't invoke something that ain't there. */
3474 iso->register_right = reg;
3476 iso->register_left = reg;
3477 *flags &= CODING_STATE_ISO2022_LOCK;
3478 iso->esc = ISO_ESC_LOCKING_SHIFT;
3482 if (NILP (cs) && check_invalid_charsets)
3484 iso->invalid_designated[reg] = 1;
3485 iso->charset[reg] = Vcharset_ascii;
3486 iso->esc = ISO_ESC_DESIGNATE;
3487 *flags &= CODING_STATE_ISO2022_LOCK;
3488 iso->output_literally = 1;
3489 if (iso->switched_dir_and_no_valid_charset_yet)
3491 /* We encountered a switch-direction followed by an
3492 invalid designation. Ensure that the switch-direction
3493 gets outputted; otherwise it will probably get eaten
3494 when the text is written out again. */
3495 iso->switched_dir_and_no_valid_charset_yet = 0;
3496 iso->output_direction_sequence = 1;
3497 /* And make sure that the switch-dir going the other
3498 way gets outputted, as well. */
3499 iso->invalid_switch_dir = 1;
3503 /* This function is called with CODESYS equal to nil when
3504 doing coding-system detection. */
3505 if (!NILP (codesys))
3507 charset_conversion_spec_dynarr *dyn =
3508 XCODING_SYSTEM (codesys)->iso2022.input_conv;
3514 for (i = 0; i < Dynarr_length (dyn); i++)
3516 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
3517 if (EQ (cs, spec->from_charset))
3518 cs = spec->to_charset;
3523 iso->charset[reg] = cs;
3524 iso->esc = ISO_ESC_DESIGNATE;
3525 *flags &= CODING_STATE_ISO2022_LOCK;
3526 if (iso->invalid_designated[reg])
3528 iso->invalid_designated[reg] = 0;
3529 iso->output_literally = 1;
3531 if (iso->switched_dir_and_no_valid_charset_yet)
3532 iso->switched_dir_and_no_valid_charset_yet = 0;
3537 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
3543 /* #### There are serious deficiencies in the recognition mechanism
3544 here. This needs to be much smarter if it's going to cut it. */
3546 if (!st->iso2022.initted)
3548 reset_iso2022 (Qnil, &st->iso2022.iso);
3549 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
3550 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3551 CODING_CATEGORY_ISO_8_1_MASK |
3552 CODING_CATEGORY_ISO_8_2_MASK |
3553 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
3554 st->iso2022.flags = 0;
3555 st->iso2022.high_byte_count = 0;
3556 st->iso2022.saw_single_shift = 0;
3557 st->iso2022.initted = 1;
3560 mask = st->iso2022.mask;
3567 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3568 st->iso2022.high_byte_count++;
3572 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
3574 if (st->iso2022.high_byte_count & 1)
3575 /* odd number of high bytes; assume not iso-8-2 */
3576 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3578 st->iso2022.high_byte_count = 0;
3579 st->iso2022.saw_single_shift = 0;
3581 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3583 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
3584 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
3585 { /* control chars */
3588 /* Allow and ignore control characters that you might
3589 reasonably see in a text file */
3594 case 8: /* backspace */
3595 case 11: /* vertical tab */
3596 case 12: /* form feed */
3597 case 26: /* MS-DOS C-z junk */
3598 case 31: /* '^_' -- for info */
3599 goto label_continue_loop;
3606 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
3609 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
3610 &st->iso2022.flags, 0))
3612 switch (st->iso2022.iso.esc)
3614 case ISO_ESC_DESIGNATE:
3615 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
3616 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3618 case ISO_ESC_LOCKING_SHIFT:
3619 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
3620 goto ran_out_of_chars;
3621 case ISO_ESC_SINGLE_SHIFT:
3622 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
3623 st->iso2022.saw_single_shift = 1;
3632 goto ran_out_of_chars;
3635 label_continue_loop:;
3644 postprocess_iso2022_mask (int mask)
3646 /* #### kind of cheesy */
3647 /* If seven-bit ISO is allowed, then assume that the encoding is
3648 entirely seven-bit and turn off the eight-bit ones. */
3649 if (mask & CODING_CATEGORY_ISO_7_MASK)
3650 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3651 CODING_CATEGORY_ISO_8_1_MASK |
3652 CODING_CATEGORY_ISO_8_2_MASK);
3656 /* If FLAGS is a null pointer or specifies right-to-left motion,
3657 output a switch-dir-to-left-to-right sequence to DST.
3658 Also update FLAGS if it is not a null pointer.
3659 If INTERNAL_P is set, we are outputting in internal format and
3660 need to handle the CSI differently. */
3663 restore_left_to_right_direction (struct Lisp_Coding_System *codesys,
3664 unsigned_char_dynarr *dst,
3665 unsigned int *flags,
3668 if (!flags || (*flags & CODING_STATE_R2L))
3670 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3672 Dynarr_add (dst, ISO_CODE_ESC);
3673 Dynarr_add (dst, '[');
3675 else if (internal_p)
3676 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3678 Dynarr_add (dst, ISO_CODE_CSI);
3679 Dynarr_add (dst, '0');
3680 Dynarr_add (dst, ']');
3682 *flags &= ~CODING_STATE_R2L;
3686 /* If FLAGS is a null pointer or specifies a direction different from
3687 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
3688 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
3689 sequence to DST. Also update FLAGS if it is not a null pointer.
3690 If INTERNAL_P is set, we are outputting in internal format and
3691 need to handle the CSI differently. */
3694 ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys,
3695 unsigned_char_dynarr *dst, unsigned int *flags,
3698 if ((!flags || (*flags & CODING_STATE_R2L)) &&
3699 direction == CHARSET_LEFT_TO_RIGHT)
3700 restore_left_to_right_direction (codesys, dst, flags, internal_p);
3701 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
3702 && (!flags || !(*flags & CODING_STATE_R2L)) &&
3703 direction == CHARSET_RIGHT_TO_LEFT)
3705 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3707 Dynarr_add (dst, ISO_CODE_ESC);
3708 Dynarr_add (dst, '[');
3710 else if (internal_p)
3711 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3713 Dynarr_add (dst, ISO_CODE_CSI);
3714 Dynarr_add (dst, '2');
3715 Dynarr_add (dst, ']');
3717 *flags |= CODING_STATE_R2L;
3721 /* Convert ISO2022-format data to internal format. */
3724 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
3725 unsigned_char_dynarr *dst, unsigned int n)
3728 unsigned int flags, ch;
3729 enum eol_type eol_type;
3730 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3731 Lisp_Object coding_system;
3732 unsigned_char_dynarr *real_dst = dst;
3734 CODING_STREAM_DECOMPOSE (str, flags, ch);
3735 eol_type = str->eol_type;
3736 XSETCODING_SYSTEM (coding_system, str->codesys);
3738 if (flags & CODING_STATE_COMPOSITE)
3739 dst = str->iso2022.composite_chars;
3744 if (flags & CODING_STATE_ESCAPE)
3745 { /* Within ESC sequence */
3746 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
3751 switch (str->iso2022.esc)
3753 case ISO_ESC_START_COMPOSITE:
3754 if (str->iso2022.composite_chars)
3755 Dynarr_reset (str->iso2022.composite_chars);
3757 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
3758 dst = str->iso2022.composite_chars;
3760 case ISO_ESC_END_COMPOSITE:
3762 Bufbyte comstr[MAX_EMCHAR_LEN];
3764 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
3765 Dynarr_length (dst));
3767 len = set_charptr_emchar (comstr, emch);
3768 Dynarr_add_many (dst, comstr, len);
3772 case ISO_ESC_LITERAL:
3773 DECODE_ADD_BINARY_CHAR (c, dst);
3777 /* Everything else handled already */
3782 /* Attempted error recovery. */
3783 if (str->iso2022.output_direction_sequence)
3784 ensure_correct_direction (flags & CODING_STATE_R2L ?
3785 CHARSET_RIGHT_TO_LEFT :
3786 CHARSET_LEFT_TO_RIGHT,
3787 str->codesys, dst, 0, 1);
3788 /* More error recovery. */
3789 if (!retval || str->iso2022.output_literally)
3791 /* Output the (possibly invalid) sequence */
3793 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
3794 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
3795 flags &= CODING_STATE_ISO2022_LOCK;
3797 n++, src--;/* Repeat the loop with the same character. */
3800 /* No sense in reprocessing the final byte of the
3801 escape sequence; it could mess things up anyway.
3803 DECODE_ADD_BINARY_CHAR (c, dst);
3808 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
3809 { /* Control characters */
3811 /***** Error-handling *****/
3813 /* If we were in the middle of a character, dump out the
3814 partial character. */
3815 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3817 /* If we just saw a single-shift character, dump it out.
3818 This may dump out the wrong sort of single-shift character,
3819 but least it will give an indication that something went
3821 if (flags & CODING_STATE_SS2)
3823 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
3824 flags &= ~CODING_STATE_SS2;
3826 if (flags & CODING_STATE_SS3)
3828 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
3829 flags &= ~CODING_STATE_SS3;
3832 /***** Now handle the control characters. *****/
3835 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3837 flags &= CODING_STATE_ISO2022_LOCK;
3839 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
3840 DECODE_ADD_BINARY_CHAR (c, dst);
3843 { /* Graphic characters */
3844 Lisp_Object charset;
3848 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3850 /* Now determine the charset. */
3851 reg = ((flags & CODING_STATE_SS2) ? 2
3852 : (flags & CODING_STATE_SS3) ? 3
3853 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
3854 : str->iso2022.register_left);
3855 charset = str->iso2022.charset[reg];
3857 /* Error checking: */
3858 if (NILP (charset) || str->iso2022.invalid_designated[reg]
3859 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
3860 && XCHARSET_CHARS (charset) == 94))
3861 /* Mrmph. We are trying to invoke a register that has no
3862 or an invalid charset in it, or trying to add a character
3863 outside the range of the charset. Insert that char literally
3864 to preserve it for the output. */
3866 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3867 DECODE_ADD_BINARY_CHAR (c, dst);
3872 /* Things are probably hunky-dorey. */
3874 /* Fetch reverse charset, maybe. */
3875 if (((flags & CODING_STATE_R2L) &&
3876 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
3878 (!(flags & CODING_STATE_R2L) &&
3879 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
3881 Lisp_Object new_charset =
3882 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
3883 if (!NILP (new_charset))
3884 charset = new_charset;
3887 lb = XCHARSET_LEADING_BYTE (charset);
3888 switch (XCHARSET_REP_BYTES (charset))
3891 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3892 Dynarr_add (dst, c & 0x7F);
3895 case 2: /* one-byte official */
3896 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3897 Dynarr_add (dst, lb);
3898 Dynarr_add (dst, c | 0x80);
3901 case 3: /* one-byte private or two-byte official */
3902 if (XCHARSET_PRIVATE_P (charset))
3904 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3905 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
3906 Dynarr_add (dst, lb);
3907 Dynarr_add (dst, c | 0x80);
3913 Dynarr_add (dst, lb);
3914 Dynarr_add (dst, ch | 0x80);
3915 Dynarr_add (dst, c | 0x80);
3923 default: /* two-byte private */
3926 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
3927 Dynarr_add (dst, lb);
3928 Dynarr_add (dst, ch | 0x80);
3929 Dynarr_add (dst, c | 0x80);
3938 flags &= CODING_STATE_ISO2022_LOCK;
3941 label_continue_loop:;
3944 if (flags & CODING_STATE_END)
3945 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3947 CODING_STREAM_COMPOSE (str, flags, ch);
3951 /***** ISO2022 encoder *****/
3953 /* Designate CHARSET into register REG. */
3956 iso2022_designate (Lisp_Object charset, unsigned char reg,
3957 struct encoding_stream *str, unsigned_char_dynarr *dst)
3959 CONST char *inter94 = "()*+", *inter96= ",-./";
3961 unsigned char final;
3962 Lisp_Object old_charset = str->iso2022.charset[reg];
3964 str->iso2022.charset[reg] = charset;
3965 if (!CHARSETP (charset))
3966 /* charset might be an initial nil or t. */
3968 type = XCHARSET_TYPE (charset);
3969 final = XCHARSET_FINAL (charset);
3970 if (!str->iso2022.force_charset_on_output[reg] &&
3971 CHARSETP (old_charset) &&
3972 XCHARSET_TYPE (old_charset) == type &&
3973 XCHARSET_FINAL (old_charset) == final)
3976 str->iso2022.force_charset_on_output[reg] = 0;
3979 charset_conversion_spec_dynarr *dyn =
3980 str->codesys->iso2022.output_conv;
3986 for (i = 0; i < Dynarr_length (dyn); i++)
3988 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
3989 if (EQ (charset, spec->from_charset))
3990 charset = spec->to_charset;
3995 Dynarr_add (dst, ISO_CODE_ESC);
3998 case CHARSET_TYPE_94:
3999 Dynarr_add (dst, inter94[reg]);
4001 case CHARSET_TYPE_96:
4002 Dynarr_add (dst, inter96[reg]);
4004 case CHARSET_TYPE_94X94:
4005 Dynarr_add (dst, '$');
4007 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4010 Dynarr_add (dst, inter94[reg]);
4012 case CHARSET_TYPE_96X96:
4013 Dynarr_add (dst, '$');
4014 Dynarr_add (dst, inter96[reg]);
4017 Dynarr_add (dst, final);
4021 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4023 if (str->iso2022.register_left != 0)
4025 Dynarr_add (dst, ISO_CODE_SI);
4026 str->iso2022.register_left = 0;
4031 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4033 if (str->iso2022.register_left != 1)
4035 Dynarr_add (dst, ISO_CODE_SO);
4036 str->iso2022.register_left = 1;
4040 /* Convert internally-formatted data to ISO2022 format. */
4043 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4044 unsigned_char_dynarr *dst, unsigned int n)
4046 unsigned char charmask, c;
4047 unsigned int flags, ch;
4048 enum eol_type eol_type;
4049 unsigned char char_boundary;
4050 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4051 struct Lisp_Coding_System *codesys = str->codesys;
4053 Lisp_Object charset;
4056 /* flags for handling composite chars. We do a little switcharoo
4057 on the source while we're outputting the composite char. */
4058 unsigned int saved_n = 0;
4059 CONST unsigned char *saved_src = NULL;
4060 int in_composite = 0;
4062 CODING_STREAM_DECOMPOSE (str, flags, ch);
4063 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4064 char_boundary = str->iso2022.current_char_boundary;
4065 charset = str->iso2022.current_charset;
4066 half = str->iso2022.current_half;
4073 if (BYTE_ASCII_P (c))
4074 { /* Processing ASCII character */
4077 restore_left_to_right_direction (codesys, dst, &flags, 0);
4079 /* Make sure G0 contains ASCII */
4080 if ((c > ' ' && c < ISO_CODE_DEL) ||
4081 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4083 ensure_normal_shift (str, dst);
4084 iso2022_designate (Vcharset_ascii, 0, str, dst);
4087 /* If necessary, restore everything to the default state
4090 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4092 restore_left_to_right_direction (codesys, dst, &flags, 0);
4094 ensure_normal_shift (str, dst);
4096 for (i = 0; i < 4; i++)
4098 Lisp_Object initial_charset =
4099 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4100 iso2022_designate (initial_charset, i, str, dst);
4105 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4106 Dynarr_add (dst, '\r');
4107 if (eol_type != EOL_CR)
4108 Dynarr_add (dst, c);
4112 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4113 && fit_to_be_escape_quoted (c))
4114 Dynarr_add (dst, ISO_CODE_ESC);
4115 Dynarr_add (dst, c);
4120 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4121 { /* Processing Leading Byte */
4123 charset = CHARSET_BY_LEADING_BYTE (c);
4124 if (LEADING_BYTE_PREFIX_P(c))
4126 else if (!EQ (charset, Vcharset_control_1)
4127 && !EQ (charset, Vcharset_composite))
4131 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4132 codesys, dst, &flags, 0);
4134 /* Now determine which register to use. */
4136 for (i = 0; i < 4; i++)
4138 if (EQ (charset, str->iso2022.charset[i]) ||
4140 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4149 if (XCHARSET_GRAPHIC (charset) != 0)
4151 if (!NILP (str->iso2022.charset[1]) &&
4152 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4153 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4155 else if (!NILP (str->iso2022.charset[2]))
4157 else if (!NILP (str->iso2022.charset[3]))
4166 iso2022_designate (charset, reg, str, dst);
4168 /* Now invoke that register. */
4172 ensure_normal_shift (str, dst);
4177 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4179 ensure_shift_out (str, dst);
4187 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4189 Dynarr_add (dst, ISO_CODE_ESC);
4190 Dynarr_add (dst, 'N');
4195 Dynarr_add (dst, ISO_CODE_SS2);
4201 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4203 Dynarr_add (dst, ISO_CODE_ESC);
4204 Dynarr_add (dst, 'O');
4209 Dynarr_add (dst, ISO_CODE_SS3);
4221 { /* Processing Non-ASCII character */
4222 charmask = (half == 0 ? 0x7F : 0xFF);
4224 if (EQ (charset, Vcharset_control_1))
4226 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4227 && fit_to_be_escape_quoted (c))
4228 Dynarr_add (dst, ISO_CODE_ESC);
4229 /* you asked for it ... */
4230 Dynarr_add (dst, c - 0x20);
4234 switch (XCHARSET_REP_BYTES (charset))
4237 Dynarr_add (dst, c & charmask);
4240 if (XCHARSET_PRIVATE_P (charset))
4242 Dynarr_add (dst, c & charmask);
4247 if (EQ (charset, Vcharset_composite))
4251 /* #### Bother! We don't know how to
4253 Dynarr_add (dst, '~');
4257 Emchar emch = MAKE_CHAR (Vcharset_composite,
4258 ch & 0x7F, c & 0x7F);
4259 Lisp_Object lstr = composite_char_string (emch);
4263 src = XSTRING_DATA (lstr);
4264 n = XSTRING_LENGTH (lstr);
4265 Dynarr_add (dst, ISO_CODE_ESC);
4266 Dynarr_add (dst, '0'); /* start composing */
4271 Dynarr_add (dst, ch & charmask);
4272 Dynarr_add (dst, c & charmask);
4285 Dynarr_add (dst, ch & charmask);
4286 Dynarr_add (dst, c & charmask);
4307 Dynarr_add (dst, ISO_CODE_ESC);
4308 Dynarr_add (dst, '1'); /* end composing */
4309 goto back_to_square_n; /* Wheeeeeeeee ..... */
4312 if (char_boundary && flags & CODING_STATE_END)
4314 restore_left_to_right_direction (codesys, dst, &flags, 0);
4315 ensure_normal_shift (str, dst);
4316 for (i = 0; i < 4; i++)
4318 Lisp_Object initial_charset =
4319 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4320 iso2022_designate (initial_charset, i, str, dst);
4324 CODING_STREAM_COMPOSE (str, flags, ch);
4325 str->iso2022.current_char_boundary = char_boundary;
4326 str->iso2022.current_charset = charset;
4327 str->iso2022.current_half = half;
4329 /* Verbum caro factum est! */
4333 /************************************************************************/
4334 /* No-conversion methods */
4335 /************************************************************************/
4337 /* This is used when reading in "binary" files -- i.e. files that may
4338 contain all 256 possible byte values and that are not to be
4339 interpreted as being in any particular decoding. */
4341 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4342 unsigned_char_dynarr *dst, unsigned int n)
4345 unsigned int flags, ch;
4346 enum eol_type eol_type;
4347 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4349 CODING_STREAM_DECOMPOSE (str, flags, ch);
4350 eol_type = str->eol_type;
4356 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4357 DECODE_ADD_BINARY_CHAR (c, dst);
4358 label_continue_loop:;
4361 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4363 CODING_STREAM_COMPOSE (str, flags, ch);
4367 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4368 unsigned_char_dynarr *dst, unsigned int n)
4371 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4372 unsigned int flags, ch;
4373 enum eol_type eol_type;
4375 CODING_STREAM_DECOMPOSE (str, flags, ch);
4376 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4383 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4384 Dynarr_add (dst, '\r');
4385 if (eol_type != EOL_CR)
4386 Dynarr_add (dst, '\n');
4389 else if (BYTE_ASCII_P (c))
4392 Dynarr_add (dst, c);
4394 else if (BUFBYTE_LEADING_BYTE_P (c))
4397 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
4398 c == LEADING_BYTE_CONTROL_1)
4401 Dynarr_add (dst, '~'); /* untranslatable character */
4405 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
4406 Dynarr_add (dst, c);
4407 else if (ch == LEADING_BYTE_CONTROL_1)
4410 Dynarr_add (dst, c - 0x20);
4412 /* else it should be the second or third byte of an
4413 untranslatable character, so ignore it */
4418 CODING_STREAM_COMPOSE (str, flags, ch);
4422 /************************************************************************/
4423 /* Simple internal/external functions */
4424 /************************************************************************/
4426 static Extbyte_dynarr *conversion_out_dynarr;
4427 static Bufbyte_dynarr *conversion_in_dynarr;
4429 /* Determine coding system from coding format */
4431 /* #### not correct for all values of `fmt'! */
4433 external_data_format_to_coding_system (enum external_data_format fmt)
4437 case FORMAT_FILENAME:
4438 case FORMAT_TERMINAL:
4439 if (EQ (Vfile_name_coding_system, Qnil) ||
4440 EQ (Vfile_name_coding_system, Qbinary))
4443 return Fget_coding_system (Vfile_name_coding_system);
4445 return Fget_coding_system (Qctext);
4452 convert_to_external_format (CONST Bufbyte *ptr,
4455 enum external_data_format fmt)
4457 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
4459 if (!conversion_out_dynarr)
4460 conversion_out_dynarr = Dynarr_new (Extbyte);
4462 Dynarr_reset (conversion_out_dynarr);
4464 if (NILP (coding_system))
4466 CONST Bufbyte *end = ptr + len;
4471 (BYTE_ASCII_P (*ptr)) ? *ptr :
4472 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
4473 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
4476 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
4480 #ifdef ERROR_CHECK_BUFPOS
4481 assert (ptr == end);
4486 Lisp_Object instream, outstream, da_outstream;
4487 Lstream *istr, *ostr;
4488 struct gcpro gcpro1, gcpro2, gcpro3;
4489 char tempbuf[1024]; /* some random amount */
4491 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4492 da_outstream = make_dynarr_output_stream
4493 ((unsigned_char_dynarr *) conversion_out_dynarr);
4495 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
4496 istr = XLSTREAM (instream);
4497 ostr = XLSTREAM (outstream);
4498 GCPRO3 (instream, outstream, da_outstream);
4501 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4504 Lstream_write (ostr, tempbuf, size_in_bytes);
4506 Lstream_close (istr);
4507 Lstream_close (ostr);
4509 Lstream_delete (istr);
4510 Lstream_delete (ostr);
4511 Lstream_delete (XLSTREAM (da_outstream));
4514 *len_out = Dynarr_length (conversion_out_dynarr);
4515 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
4516 return Dynarr_atp (conversion_out_dynarr, 0);
4520 convert_from_external_format (CONST Extbyte *ptr,
4523 enum external_data_format fmt)
4525 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
4527 if (!conversion_in_dynarr)
4528 conversion_in_dynarr = Dynarr_new (Bufbyte);
4530 Dynarr_reset (conversion_in_dynarr);
4532 if (NILP (coding_system))
4534 CONST Extbyte *end = ptr + len;
4535 for (; ptr < end; ptr++)
4538 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
4543 Lisp_Object instream, outstream, da_outstream;
4544 Lstream *istr, *ostr;
4545 struct gcpro gcpro1, gcpro2, gcpro3;
4546 char tempbuf[1024]; /* some random amount */
4548 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4549 da_outstream = make_dynarr_output_stream
4550 ((unsigned_char_dynarr *) conversion_in_dynarr);
4552 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
4553 istr = XLSTREAM (instream);
4554 ostr = XLSTREAM (outstream);
4555 GCPRO3 (instream, outstream, da_outstream);
4558 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4561 Lstream_write (ostr, tempbuf, size_in_bytes);
4563 Lstream_close (istr);
4564 Lstream_close (ostr);
4566 Lstream_delete (istr);
4567 Lstream_delete (ostr);
4568 Lstream_delete (XLSTREAM (da_outstream));
4571 *len_out = Dynarr_length (conversion_in_dynarr);
4572 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
4573 return Dynarr_atp (conversion_in_dynarr, 0);
4577 /************************************************************************/
4578 /* Initialization */
4579 /************************************************************************/
4582 syms_of_mule_coding (void)
4584 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
4585 deferror (&Qcoding_system_error, "coding-system-error",
4586 "Coding-system error", Qio_error);
4588 DEFSUBR (Fcoding_system_p);
4589 DEFSUBR (Ffind_coding_system);
4590 DEFSUBR (Fget_coding_system);
4591 DEFSUBR (Fcoding_system_list);
4592 DEFSUBR (Fcoding_system_name);
4593 DEFSUBR (Fmake_coding_system);
4594 DEFSUBR (Fcopy_coding_system);
4595 DEFSUBR (Fsubsidiary_coding_system);
4597 DEFSUBR (Fcoding_system_type);
4598 DEFSUBR (Fcoding_system_doc_string);
4599 DEFSUBR (Fcoding_system_charset);
4600 DEFSUBR (Fcoding_system_property);
4602 DEFSUBR (Fcoding_category_list);
4603 DEFSUBR (Fset_coding_priority_list);
4604 DEFSUBR (Fcoding_priority_list);
4605 DEFSUBR (Fset_coding_category_system);
4606 DEFSUBR (Fcoding_category_system);
4608 DEFSUBR (Fdetect_coding_region);
4609 DEFSUBR (Fdecode_coding_region);
4610 DEFSUBR (Fencode_coding_region);
4611 DEFSUBR (Fdecode_shift_jis_char);
4612 DEFSUBR (Fencode_shift_jis_char);
4613 DEFSUBR (Fdecode_big5_char);
4614 DEFSUBR (Fencode_big5_char);
4616 defsymbol (&Qcoding_system_p, "coding-system-p");
4618 defsymbol (&Qbig5, "big5");
4619 defsymbol (&Qshift_jis, "shift-jis");
4620 defsymbol (&Qno_conversion, "no-conversion");
4621 defsymbol (&Qccl, "ccl");
4622 defsymbol (&Qiso2022, "iso2022");
4624 defsymbol (&Qmnemonic, "mnemonic");
4625 defsymbol (&Qeol_type, "eol-type");
4626 defsymbol (&Qpost_read_conversion, "post-read-conversion");
4627 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
4629 defsymbol (&Qcr, "cr");
4630 defsymbol (&Qlf, "lf");
4631 defsymbol (&Qcrlf, "crlf");
4632 defsymbol (&Qeol_cr, "eol-cr");
4633 defsymbol (&Qeol_lf, "eol-lf");
4634 defsymbol (&Qeol_crlf, "eol-crlf");
4636 defsymbol (&Qcharset_g0, "charset-g0");
4637 defsymbol (&Qcharset_g1, "charset-g1");
4638 defsymbol (&Qcharset_g2, "charset-g2");
4639 defsymbol (&Qcharset_g3, "charset-g3");
4640 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
4641 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
4642 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
4643 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
4644 defsymbol (&Qshort, "short");
4645 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
4646 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
4647 defsymbol (&Qseven, "seven");
4648 defsymbol (&Qlock_shift, "lock-shift");
4649 defsymbol (&Qno_iso6429, "no-iso6429");
4650 defsymbol (&Qescape_quoted, "escape-quoted");
4651 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
4652 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
4654 defsymbol (&Qencode, "encode");
4655 defsymbol (&Qdecode, "decode");
4657 defsymbol (&Qctext, "ctext");
4659 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
4661 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
4663 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
4665 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
4667 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
4669 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
4671 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
4673 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
4678 lstream_type_create_mule_coding (void)
4680 LSTREAM_HAS_METHOD (decoding, reader);
4681 LSTREAM_HAS_METHOD (decoding, writer);
4682 LSTREAM_HAS_METHOD (decoding, rewinder);
4683 LSTREAM_HAS_METHOD (decoding, seekable_p);
4684 LSTREAM_HAS_METHOD (decoding, flusher);
4685 LSTREAM_HAS_METHOD (decoding, closer);
4686 LSTREAM_HAS_METHOD (decoding, marker);
4688 LSTREAM_HAS_METHOD (encoding, reader);
4689 LSTREAM_HAS_METHOD (encoding, writer);
4690 LSTREAM_HAS_METHOD (encoding, rewinder);
4691 LSTREAM_HAS_METHOD (encoding, seekable_p);
4692 LSTREAM_HAS_METHOD (encoding, flusher);
4693 LSTREAM_HAS_METHOD (encoding, closer);
4694 LSTREAM_HAS_METHOD (encoding, marker);
4698 vars_of_mule_coding (void)
4702 /* Initialize to something reasonable ... */
4703 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
4705 coding_category_system[i] = Qnil;
4706 coding_category_by_priority[i] = i;
4709 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
4710 Coding system used for TTY keyboard input.
4711 Not used under a windowing system.
4713 Vkeyboard_coding_system = Qnil;
4715 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
4716 Coding system used for TTY display output.
4717 Not used under a windowing system.
4719 Vterminal_coding_system = Qnil;
4721 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
4722 Overriding coding system used when writing a file or process.
4723 You should *bind* this, not set it. If this is non-nil, it specifies
4724 the coding system that will be used when a file or process is read
4725 in, and overrides `buffer-file-coding-system-for-read',
4726 `insert-file-contents-pre-hook', etc. Use those variables instead of
4727 this one for permanent changes to the environment.
4729 Vcoding_system_for_read = Qnil;
4731 DEFVAR_LISP ("coding-system-for-write",
4732 &Vcoding_system_for_write /*
4733 Overriding coding system used when writing a file or process.
4734 You should *bind* this, not set it. If this is non-nil, it specifies
4735 the coding system that will be used when a file or process is wrote
4736 in, and overrides `buffer-file-coding-system',
4737 `write-region-pre-hook', etc. Use those variables instead of this one
4738 for permanent changes to the environment.
4740 Vcoding_system_for_write = Qnil;
4742 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
4743 Coding system used to convert pathnames when accessing files.
4745 Vfile_name_coding_system = Qnil;
4747 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
4748 Non-nil means the buffer contents are regarded as multi-byte form
4749 of characters, not a binary code. This affects the display, file I/O,
4750 and behaviors of various editing commands.
4752 Setting this to nil does not do anything.
4754 enable_multibyte_characters = 1;
4758 complex_vars_of_mule_coding (void)
4760 staticpro (&Vcoding_system_hash_table);
4761 Vcoding_system_hash_table =
4762 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
4764 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
4766 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
4768 struct codesys_prop csp; \
4770 csp.prop_type = (Prop_Type); \
4771 Dynarr_add (the_codesys_prop_dynarr, csp); \
4774 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
4775 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
4776 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
4777 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
4778 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
4779 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
4780 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
4782 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
4783 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
4784 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
4785 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
4786 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
4787 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
4788 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
4789 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
4790 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
4791 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
4792 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
4793 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
4794 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
4795 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
4796 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
4797 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
4798 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
4800 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
4801 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
4803 /* Need to create this here or we're really screwed. */
4804 Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
4805 list2 (Qmnemonic, build_string ("Noconv")));
4807 Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
4810 /* Need this for bootstrapping */
4811 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
4812 Fget_coding_system (Qno_conversion);