1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
36 #include "file-coding.h"
38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
40 Lisp_Object Vkeyboard_coding_system;
41 Lisp_Object Vterminal_coding_system;
42 Lisp_Object Vcoding_system_for_read;
43 Lisp_Object Vcoding_system_for_write;
44 Lisp_Object Vfile_name_coding_system;
46 /* Table of symbols identifying each coding category. */
47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
49 /* Coding system currently associated with each coding category. */
50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
52 /* Table of all coding categories in decreasing order of priority.
53 This describes a permutation of the possible coding categories. */
54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
56 Lisp_Object Qcoding_system_p;
58 Lisp_Object Qno_conversion, Qccl, Qiso2022;
59 /* Qinternal in general.c */
61 Lisp_Object Qmnemonic, Qeol_type;
62 Lisp_Object Qcr, Qcrlf, Qlf;
63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
64 Lisp_Object Qpost_read_conversion;
65 Lisp_Object Qpre_write_conversion;
68 Lisp_Object Qucs4, Qutf8;
69 Lisp_Object Qbig5, Qshift_jis;
70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
73 Lisp_Object Qno_iso6429;
74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
75 Lisp_Object Qctext, Qescape_quoted;
76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
78 Lisp_Object Qencode, Qdecode;
80 Lisp_Object Vcoding_system_hash_table;
82 int enable_multibyte_characters;
85 /* Additional information used by the ISO2022 decoder and detector. */
86 struct iso2022_decoder
88 /* CHARSET holds the character sets currently assigned to the G0
89 through G3 variables. It is initialized from the array
90 INITIAL_CHARSET in CODESYS. */
91 Lisp_Object charset[4];
93 /* Which registers are currently invoked into the left (GL) and
94 right (GR) halves of the 8-bit encoding space? */
95 int register_left, register_right;
97 /* ISO_ESC holds a value indicating part of an escape sequence
98 that has already been seen. */
99 enum iso_esc_flag esc;
101 /* This records the bytes we've seen so far in an escape sequence,
102 in case the sequence is invalid (we spit out the bytes unchanged). */
103 unsigned char esc_bytes[8];
105 /* Index for next byte to store in ISO escape sequence. */
108 #ifdef ENABLE_COMPOSITE_CHARS
109 /* Stuff seen so far when composing a string. */
110 unsigned_char_dynarr *composite_chars;
113 /* If we saw an invalid designation sequence for a particular
114 register, we flag it here and switch to ASCII. The next time we
115 see a valid designation for this register, we turn off the flag
116 and do the designation normally, but pretend the sequence was
117 invalid. The effect of all this is that (most of the time) the
118 escape sequences for both the switch to the unknown charset, and
119 the switch back to the known charset, get inserted literally into
120 the buffer and saved out as such. The hope is that we can
121 preserve the escape sequences so that the resulting written out
122 file makes sense. If we don't do any of this, the designation
123 to the invalid charset will be preserved but that switch back
124 to the known charset will probably get eaten because it was
125 the same charset that was already present in the register. */
126 unsigned char invalid_designated[4];
128 /* We try to do similar things as above for direction-switching
129 sequences. If we encountered a direction switch while an
130 invalid designation was present, or an invalid designation
131 just after a direction switch (i.e. no valid designation
132 encountered yet), we insert the direction-switch escape
133 sequence literally into the output stream, and later on
134 insert the corresponding direction-restoring escape sequence
136 unsigned int switched_dir_and_no_valid_charset_yet :1;
137 unsigned int invalid_switch_dir :1;
139 /* Tells the decoder to output the escape sequence literally
140 even though it was valid. Used in the games we play to
141 avoid lossage when we encounter invalid designations. */
142 unsigned int output_literally :1;
143 /* We encountered a direction switch followed by an invalid
144 designation. We didn't output the direction switch
145 literally because we didn't know about the invalid designation;
146 but we have to do so now. */
147 unsigned int output_direction_sequence :1;
150 EXFUN (Fcopy_coding_system, 2);
152 struct detection_state;
153 static int detect_coding_sjis (struct detection_state *st,
154 CONST unsigned char *src,
156 static void decode_coding_sjis (Lstream *decoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static void encode_coding_sjis (Lstream *encoding,
161 CONST unsigned char *src,
162 unsigned_char_dynarr *dst,
164 static int detect_coding_big5 (struct detection_state *st,
165 CONST unsigned char *src,
167 static void decode_coding_big5 (Lstream *decoding,
168 CONST unsigned char *src,
169 unsigned_char_dynarr *dst, unsigned int n);
170 static void encode_coding_big5 (Lstream *encoding,
171 CONST unsigned char *src,
172 unsigned_char_dynarr *dst, unsigned int n);
173 static int detect_coding_ucs4 (struct detection_state *st,
174 CONST unsigned char *src,
176 static void decode_coding_ucs4 (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static void encode_coding_ucs4 (Lstream *encoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
182 static int detect_coding_utf8 (struct detection_state *st,
183 CONST unsigned char *src,
185 static void decode_coding_utf8 (Lstream *decoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst, unsigned int n);
188 static void encode_coding_utf8 (Lstream *encoding,
189 CONST unsigned char *src,
190 unsigned_char_dynarr *dst, unsigned int n);
191 static int postprocess_iso2022_mask (int mask);
192 static void reset_iso2022 (Lisp_Object coding_system,
193 struct iso2022_decoder *iso);
194 static int detect_coding_iso2022 (struct detection_state *st,
195 CONST unsigned char *src,
197 static void decode_coding_iso2022 (Lstream *decoding,
198 CONST unsigned char *src,
199 unsigned_char_dynarr *dst, unsigned int n);
200 static void encode_coding_iso2022 (Lstream *encoding,
201 CONST unsigned char *src,
202 unsigned_char_dynarr *dst, unsigned int n);
204 static void decode_coding_no_conversion (Lstream *decoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst,
208 static void encode_coding_no_conversion (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst,
212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
213 unsigned_char_dynarr *dst, unsigned int n);
214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
217 typedef struct codesys_prop codesys_prop;
226 Dynarr_declare (codesys_prop);
227 } codesys_prop_dynarr;
229 codesys_prop_dynarr *the_codesys_prop_dynarr;
231 enum codesys_prop_enum
234 CODESYS_PROP_ISO2022,
239 /************************************************************************/
240 /* Coding system functions */
241 /************************************************************************/
243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
245 static void finalize_coding_system (void *header, int for_disksave);
247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
248 mark_coding_system, print_coding_system,
249 finalize_coding_system,
250 0, 0, struct Lisp_Coding_System);
253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
255 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
257 markobj (CODING_SYSTEM_NAME (codesys));
258 markobj (CODING_SYSTEM_DOC_STRING (codesys));
259 markobj (CODING_SYSTEM_MNEMONIC (codesys));
260 markobj (CODING_SYSTEM_EOL_LF (codesys));
261 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
262 markobj (CODING_SYSTEM_EOL_CR (codesys));
264 switch (CODING_SYSTEM_TYPE (codesys))
268 case CODESYS_ISO2022:
269 for (i = 0; i < 4; i++)
270 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
271 if (codesys->iso2022.input_conv)
273 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
275 struct charset_conversion_spec *ccs =
276 Dynarr_atp (codesys->iso2022.input_conv, i);
277 markobj (ccs->from_charset);
278 markobj (ccs->to_charset);
281 if (codesys->iso2022.output_conv)
283 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
285 struct charset_conversion_spec *ccs =
286 Dynarr_atp (codesys->iso2022.output_conv, i);
287 markobj (ccs->from_charset);
288 markobj (ccs->to_charset);
294 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
295 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
302 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
303 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
310 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
312 error ("printing unreadable object #<coding_system 0x%x>",
315 write_c_string ("#<coding_system ", printcharfun);
316 print_internal (c->name, printcharfun, 1);
317 write_c_string (">", printcharfun);
321 finalize_coding_system (void *header, int for_disksave)
323 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
324 /* Since coding systems never go away, this function is not
325 necessary. But it would be necessary if we changed things
326 so that coding systems could go away. */
327 if (!for_disksave) /* see comment in lstream.c */
329 switch (CODING_SYSTEM_TYPE (c))
332 case CODESYS_ISO2022:
333 if (c->iso2022.input_conv)
335 Dynarr_free (c->iso2022.input_conv);
336 c->iso2022.input_conv = 0;
338 if (c->iso2022.output_conv)
340 Dynarr_free (c->iso2022.output_conv);
341 c->iso2022.output_conv = 0;
352 symbol_to_eol_type (Lisp_Object symbol)
354 CHECK_SYMBOL (symbol);
355 if (NILP (symbol)) return EOL_AUTODETECT;
356 if (EQ (symbol, Qlf)) return EOL_LF;
357 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
358 if (EQ (symbol, Qcr)) return EOL_CR;
360 signal_simple_error ("Unrecognized eol type", symbol);
361 return EOL_AUTODETECT; /* not reached */
365 eol_type_to_symbol (enum eol_type type)
370 case EOL_LF: return Qlf;
371 case EOL_CRLF: return Qcrlf;
372 case EOL_CR: return Qcr;
373 case EOL_AUTODETECT: return Qnil;
378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
380 Lisp_Object codesys_obj;
381 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
382 char *codesys_name = (char *) alloca (len + 7);
384 char *codesys_mnemonic=0;
386 Lisp_Object codesys_name_sym, sub_codesys_obj;
390 XSETCODING_SYSTEM (codesys_obj, codesys);
392 memcpy (codesys_name,
393 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
395 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
397 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
398 codesys_mnemonic = (char *) alloca (mlen + 7);
399 memcpy (codesys_mnemonic,
400 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
404 strcpy (codesys_name + len, "-" op_sys); \
406 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
407 codesys_name_sym = intern (codesys_name); \
408 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
409 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
411 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
412 build_string (codesys_mnemonic); \
413 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
416 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
417 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
418 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
422 Return t if OBJECT is a coding system.
423 A coding system is an object that defines how text containing multiple
424 character sets is encoded into a stream of (typically 8-bit) bytes.
425 The coding system is used to decode the stream into a series of
426 characters (which may be from multiple charsets) when the text is read
427 from a file or process, and is used to encode the text back into the
428 same format when it is written out to a file or process.
430 For example, many ISO2022-compliant coding systems (such as Compound
431 Text, which is used for inter-client data under the X Window System)
432 use escape sequences to switch between different charsets -- Japanese
433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
435 `make-coding-system' for more information.
437 Coding systems are normally identified using a symbol, and the
438 symbol is accepted in place of the actual coding system object whenever
439 a coding system is called for. (This is similar to how faces work.)
443 return CODING_SYSTEMP (object) ? Qt : Qnil;
446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
447 Retrieve the coding system of the given name.
449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
450 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
451 If there is no such coding system, nil is returned. Otherwise the
452 associated coding system object is returned.
454 (coding_system_or_name))
456 if (CODING_SYSTEMP (coding_system_or_name))
457 return coding_system_or_name;
459 if (NILP (coding_system_or_name))
460 coding_system_or_name = Qbinary;
462 CHECK_SYMBOL (coding_system_or_name);
464 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
468 Retrieve the coding system of the given name.
469 Same as `find-coding-system' except that if there is no such
470 coding system, an error is signaled instead of returning nil.
474 Lisp_Object coding_system = Ffind_coding_system (name);
476 if (NILP (coding_system))
477 signal_simple_error ("No such coding system", name);
478 return coding_system;
481 /* We store the coding systems in hash tables with the names as the key and the
482 actual coding system object as the value. Occasionally we need to use them
483 in a list format. These routines provide us with that. */
484 struct coding_system_list_closure
486 Lisp_Object *coding_system_list;
490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
491 void *coding_system_list_closure)
493 /* This function can GC */
494 struct coding_system_list_closure *cscl =
495 (struct coding_system_list_closure *) coding_system_list_closure;
496 Lisp_Object *coding_system_list = cscl->coding_system_list;
498 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
499 *coding_system_list);
503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
504 Return a list of the names of all defined coding systems.
508 Lisp_Object coding_system_list = Qnil;
510 struct coding_system_list_closure coding_system_list_closure;
512 GCPRO1 (coding_system_list);
513 coding_system_list_closure.coding_system_list = &coding_system_list;
514 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
515 &coding_system_list_closure);
518 return coding_system_list;
521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
522 Return the name of the given coding system.
526 coding_system = Fget_coding_system (coding_system);
527 return XCODING_SYSTEM_NAME (coding_system);
530 static Lisp_Coding_System *
531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
533 Lisp_Coding_System *codesys =
534 alloc_lcrecord_type (Lisp_Coding_System, lrecord_coding_system);
536 zero_lcrecord (codesys);
537 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
538 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
539 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
540 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
541 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
542 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
543 CODING_SYSTEM_TYPE (codesys) = type;
544 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
546 if (type == CODESYS_ISO2022)
549 for (i = 0; i < 4; i++)
550 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
552 else if (type == CODESYS_CCL)
554 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
555 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
558 CODING_SYSTEM_NAME (codesys) = name;
564 /* Given a list of charset conversion specs as specified in a Lisp
565 program, parse it into STORE_HERE. */
568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
569 Lisp_Object spec_list)
573 EXTERNAL_LIST_LOOP (rest, spec_list)
575 Lisp_Object car = XCAR (rest);
576 Lisp_Object from, to;
577 struct charset_conversion_spec spec;
579 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
580 signal_simple_error ("Invalid charset conversion spec", car);
581 from = Fget_charset (XCAR (car));
582 to = Fget_charset (XCAR (XCDR (car)));
583 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
584 signal_simple_error_2
585 ("Attempted conversion between different charset types",
587 spec.from_charset = from;
588 spec.to_charset = to;
590 Dynarr_add (store_here, spec);
594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
595 specs, return the equivalent as the Lisp programmer would see it.
597 If LOAD_HERE is 0, return Qnil. */
600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
607 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
609 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
610 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
613 return Fnreverse (result);
618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
619 Register symbol NAME as a coding system.
621 TYPE describes the conversion method used and should be one of
624 Automatic conversion. XEmacs attempts to detect the coding system
627 No conversion. Use this for binary files and such. On output,
628 graphic characters that are not in ASCII or Latin-1 will be
629 replaced by a ?. (For a no-conversion-encoded buffer, these
630 characters will only be present if you explicitly insert them.)
632 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
634 ISO 10646 UCS-4 encoding.
636 ISO 10646 UTF-8 encoding.
638 Any ISO2022-compliant encoding. Among other things, this includes
639 JIS (the Japanese encoding commonly used for e-mail), EUC (the
640 standard Unix encoding for Japanese and other languages), and
641 Compound Text (the encoding used in X11). You can specify more
642 specific information about the conversion with the FLAGS argument.
644 Big5 (the encoding commonly used for Taiwanese).
646 The conversion is performed using a user-written pseudo-code
647 program. CCL (Code Conversion Language) is the name of this
650 Write out or read in the raw contents of the memory representing
651 the buffer's text. This is primarily useful for debugging
652 purposes, and is only enabled when XEmacs has been compiled with
653 DEBUG_XEMACS defined (via the --debug configure option).
654 WARNING: Reading in a file using 'internal conversion can result
655 in an internal inconsistency in the memory representing a
656 buffer's text, which will produce unpredictable results and may
657 cause XEmacs to crash. Under normal circumstances you should
658 never use 'internal conversion.
660 DOC-STRING is a string describing the coding system.
662 PROPS is a property list, describing the specific nature of the
663 character set. Recognized properties are:
666 String to be displayed in the modeline when this coding system is
670 End-of-line conversion to be used. It should be one of
673 Automatically detect the end-of-line type (LF, CRLF,
674 or CR). Also generate subsidiary coding systems named
675 `NAME-unix', `NAME-dos', and `NAME-mac', that are
676 identical to this coding system but have an EOL-TYPE
677 value of 'lf, 'crlf, and 'cr, respectively.
679 The end of a line is marked externally using ASCII LF.
680 Since this is also the way that XEmacs represents an
681 end-of-line internally, specifying this option results
682 in no end-of-line conversion. This is the standard
683 format for Unix text files.
685 The end of a line is marked externally using ASCII
686 CRLF. This is the standard format for MS-DOS text
689 The end of a line is marked externally using ASCII CR.
690 This is the standard format for Macintosh text files.
692 Automatically detect the end-of-line type but do not
693 generate subsidiary coding systems. (This value is
694 converted to nil when stored internally, and
695 `coding-system-property' will return nil.)
697 'post-read-conversion
698 Function called after a file has been read in, to perform the
699 decoding. Called with two arguments, BEG and END, denoting
700 a region of the current buffer to be decoded.
702 'pre-write-conversion
703 Function called before a file is written out, to perform the
704 encoding. Called with two arguments, BEG and END, denoting
705 a region of the current buffer to be encoded.
708 The following additional properties are recognized if TYPE is 'iso2022:
714 The character set initially designated to the G0 - G3 registers.
715 The value should be one of
717 -- A charset object (designate that character set)
718 -- nil (do not ever use this register)
719 -- t (no character set is initially designated to
720 the register, but may be later on; this automatically
721 sets the corresponding `force-g*-on-output' property)
727 If non-nil, send an explicit designation sequence on output before
728 using the specified register.
731 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
732 "ESC $ B" on output in place of the full designation sequences
733 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
736 If non-nil, don't designate ASCII to G0 at each end of line on output.
737 Setting this to non-nil also suppresses other state-resetting that
738 normally happens at the end of a line.
741 If non-nil, don't designate ASCII to G0 before control chars on output.
744 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
748 If non-nil, use locking-shift (SO/SI) instead of single-shift
749 or designation by escape sequence.
752 If non-nil, don't use ISO6429's direction specification.
755 If non-nil, literal control characters that are the same as
756 the beginning of a recognized ISO2022 or ISO6429 escape sequence
757 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
758 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
759 so that they can be properly distinguished from an escape sequence.
760 (Note that doing this results in a non-portable encoding.) This
761 encoding flag is used for byte-compiled files. Note that ESC
762 is a good choice for a quoting character because there are no
763 escape sequences whose second byte is a character from the Control-0
764 or Control-1 character sets; this is explicitly disallowed by the
767 'input-charset-conversion
768 A list of conversion specifications, specifying conversion of
769 characters in one charset to another when decoding is performed.
770 Each specification is a list of two elements: the source charset,
771 and the destination charset.
773 'output-charset-conversion
774 A list of conversion specifications, specifying conversion of
775 characters in one charset to another when encoding is performed.
776 The form of each specification is the same as for
777 'input-charset-conversion.
780 The following additional properties are recognized (and required)
784 CCL program used for decoding (converting to internal format).
787 CCL program used for encoding (converting to external format).
789 (name, type, doc_string, props))
791 Lisp_Coding_System *codesys;
792 Lisp_Object rest, key, value;
793 enum coding_system_type ty;
794 int need_to_setup_eol_systems = 1;
796 /* Convert type to constant */
797 if (NILP (type) || EQ (type, Qundecided))
798 { ty = CODESYS_AUTODETECT; }
800 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
801 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
802 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
803 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
804 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
805 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
807 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
809 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
812 signal_simple_error ("Invalid coding system type", type);
816 codesys = allocate_coding_system (ty, name);
818 if (NILP (doc_string))
819 doc_string = build_string ("");
821 CHECK_STRING (doc_string);
822 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
824 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
826 if (EQ (key, Qmnemonic))
829 CHECK_STRING (value);
830 CODING_SYSTEM_MNEMONIC (codesys) = value;
833 else if (EQ (key, Qeol_type))
835 need_to_setup_eol_systems = NILP (value);
838 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
841 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
842 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
844 else if (ty == CODESYS_ISO2022)
846 #define FROB_INITIAL_CHARSET(charset_num) \
847 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
848 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
850 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
851 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
852 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
853 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
855 #define FROB_FORCE_CHARSET(charset_num) \
856 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
858 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
859 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
860 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
861 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
863 #define FROB_BOOLEAN_PROPERTY(prop) \
864 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
866 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
867 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
868 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
869 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
870 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
871 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
872 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
874 else if (EQ (key, Qinput_charset_conversion))
876 codesys->iso2022.input_conv =
877 Dynarr_new (charset_conversion_spec);
878 parse_charset_conversion_specs (codesys->iso2022.input_conv,
881 else if (EQ (key, Qoutput_charset_conversion))
883 codesys->iso2022.output_conv =
884 Dynarr_new (charset_conversion_spec);
885 parse_charset_conversion_specs (codesys->iso2022.output_conv,
889 signal_simple_error ("Unrecognized property", key);
891 else if (EQ (type, Qccl))
893 if (EQ (key, Qdecode))
895 CHECK_VECTOR (value);
896 CODING_SYSTEM_CCL_DECODE (codesys) = value;
898 else if (EQ (key, Qencode))
900 CHECK_VECTOR (value);
901 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
904 signal_simple_error ("Unrecognized property", key);
908 signal_simple_error ("Unrecognized property", key);
911 if (need_to_setup_eol_systems)
912 setup_eol_coding_systems (codesys);
915 Lisp_Object codesys_obj;
916 XSETCODING_SYSTEM (codesys_obj, codesys);
917 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
923 Copy OLD-CODING-SYSTEM to NEW-NAME.
924 If NEW-NAME does not name an existing coding system, a new one will
927 (old_coding_system, new_name))
929 Lisp_Object new_coding_system;
930 old_coding_system = Fget_coding_system (old_coding_system);
931 new_coding_system = Ffind_coding_system (new_name);
932 if (NILP (new_coding_system))
934 XSETCODING_SYSTEM (new_coding_system,
935 allocate_coding_system
936 (XCODING_SYSTEM_TYPE (old_coding_system),
938 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
942 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
943 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
944 memcpy (((char *) to ) + sizeof (to->header),
945 ((char *) from) + sizeof (from->header),
946 sizeof (*from) - sizeof (from->header));
949 return new_coding_system;
953 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
955 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
956 Lisp_Object new_coding_system;
958 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
959 return coding_system;
963 case EOL_AUTODETECT: return coding_system;
964 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
965 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
966 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
970 return NILP (new_coding_system) ? coding_system : new_coding_system;
973 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
974 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
976 (coding_system, eol_type))
978 coding_system = Fget_coding_system (coding_system);
980 return subsidiary_coding_system (coding_system,
981 symbol_to_eol_type (eol_type));
985 /************************************************************************/
986 /* Coding system accessors */
987 /************************************************************************/
989 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
990 Return the doc string for CODING-SYSTEM.
994 coding_system = Fget_coding_system (coding_system);
995 return XCODING_SYSTEM_DOC_STRING (coding_system);
998 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
999 Return the type of CODING-SYSTEM.
1003 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1006 case CODESYS_AUTODETECT: return Qundecided;
1008 case CODESYS_SHIFT_JIS: return Qshift_jis;
1009 case CODESYS_ISO2022: return Qiso2022;
1010 case CODESYS_BIG5: return Qbig5;
1011 case CODESYS_UCS4: return Qucs4;
1012 case CODESYS_UTF8: return Qutf8;
1013 case CODESYS_CCL: return Qccl;
1015 case CODESYS_NO_CONVERSION: return Qno_conversion;
1017 case CODESYS_INTERNAL: return Qinternal;
1024 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1027 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1029 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1032 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1033 Return initial charset of CODING-SYSTEM designated to GNUM.
1036 (coding_system, gnum))
1038 coding_system = Fget_coding_system (coding_system);
1041 return coding_system_charset (coding_system, XINT (gnum));
1045 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1046 Return the PROP property of CODING-SYSTEM.
1048 (coding_system, prop))
1051 enum coding_system_type type;
1053 coding_system = Fget_coding_system (coding_system);
1054 CHECK_SYMBOL (prop);
1055 type = XCODING_SYSTEM_TYPE (coding_system);
1057 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1058 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1061 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1063 case CODESYS_PROP_ALL_OK:
1066 case CODESYS_PROP_ISO2022:
1067 if (type != CODESYS_ISO2022)
1069 ("Property only valid in ISO2022 coding systems",
1073 case CODESYS_PROP_CCL:
1074 if (type != CODESYS_CCL)
1076 ("Property only valid in CCL coding systems",
1086 signal_simple_error ("Unrecognized property", prop);
1088 if (EQ (prop, Qname))
1089 return XCODING_SYSTEM_NAME (coding_system);
1090 else if (EQ (prop, Qtype))
1091 return Fcoding_system_type (coding_system);
1092 else if (EQ (prop, Qdoc_string))
1093 return XCODING_SYSTEM_DOC_STRING (coding_system);
1094 else if (EQ (prop, Qmnemonic))
1095 return XCODING_SYSTEM_MNEMONIC (coding_system);
1096 else if (EQ (prop, Qeol_type))
1097 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1098 else if (EQ (prop, Qeol_lf))
1099 return XCODING_SYSTEM_EOL_LF (coding_system);
1100 else if (EQ (prop, Qeol_crlf))
1101 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1102 else if (EQ (prop, Qeol_cr))
1103 return XCODING_SYSTEM_EOL_CR (coding_system);
1104 else if (EQ (prop, Qpost_read_conversion))
1105 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1106 else if (EQ (prop, Qpre_write_conversion))
1107 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1109 else if (type == CODESYS_ISO2022)
1111 if (EQ (prop, Qcharset_g0))
1112 return coding_system_charset (coding_system, 0);
1113 else if (EQ (prop, Qcharset_g1))
1114 return coding_system_charset (coding_system, 1);
1115 else if (EQ (prop, Qcharset_g2))
1116 return coding_system_charset (coding_system, 2);
1117 else if (EQ (prop, Qcharset_g3))
1118 return coding_system_charset (coding_system, 3);
1120 #define FORCE_CHARSET(charset_num) \
1121 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1122 (coding_system, charset_num) ? Qt : Qnil)
1124 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1125 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1126 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1127 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1129 #define LISP_BOOLEAN(prop) \
1130 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1132 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1133 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1134 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1135 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1136 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1137 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1138 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1140 else if (EQ (prop, Qinput_charset_conversion))
1142 unparse_charset_conversion_specs
1143 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1144 else if (EQ (prop, Qoutput_charset_conversion))
1146 unparse_charset_conversion_specs
1147 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1151 else if (type == CODESYS_CCL)
1153 if (EQ (prop, Qdecode))
1154 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1155 else if (EQ (prop, Qencode))
1156 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1164 return Qnil; /* not reached */
1168 /************************************************************************/
1169 /* Coding category functions */
1170 /************************************************************************/
1173 decode_coding_category (Lisp_Object symbol)
1177 CHECK_SYMBOL (symbol);
1178 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1179 if (EQ (coding_category_symbol[i], symbol))
1182 signal_simple_error ("Unrecognized coding category", symbol);
1183 return 0; /* not reached */
1186 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1187 Return a list of all recognized coding categories.
1192 Lisp_Object list = Qnil;
1194 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1195 list = Fcons (coding_category_symbol[i], list);
1199 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1200 Change the priority order of the coding categories.
1201 LIST should be a list of coding categories, in descending order of
1202 priority. Unspecified coding categories will be lower in priority
1203 than all specified ones, in the same relative order they were in
1208 int category_to_priority[CODING_CATEGORY_LAST + 1];
1212 /* First generate a list that maps coding categories to priorities. */
1214 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1215 category_to_priority[i] = -1;
1217 /* Highest priority comes from the specified list. */
1219 EXTERNAL_LIST_LOOP (rest, list)
1221 int cat = decode_coding_category (XCAR (rest));
1223 if (category_to_priority[cat] >= 0)
1224 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1225 category_to_priority[cat] = i++;
1228 /* Now go through the existing categories by priority to retrieve
1229 the categories not yet specified and preserve their priority
1231 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1233 int cat = coding_category_by_priority[j];
1234 if (category_to_priority[cat] < 0)
1235 category_to_priority[cat] = i++;
1238 /* Now we need to construct the inverse of the mapping we just
1241 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1242 coding_category_by_priority[category_to_priority[i]] = i;
1244 /* Phew! That was confusing. */
1248 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1249 Return a list of coding categories in descending order of priority.
1254 Lisp_Object list = Qnil;
1256 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1257 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1262 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1263 Change the coding system associated with a coding category.
1265 (coding_category, coding_system))
1267 int cat = decode_coding_category (coding_category);
1269 coding_system = Fget_coding_system (coding_system);
1270 coding_category_system[cat] = coding_system;
1274 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1275 Return the coding system associated with a coding category.
1279 int cat = decode_coding_category (coding_category);
1280 Lisp_Object sys = coding_category_system[cat];
1283 return XCODING_SYSTEM_NAME (sys);
1288 /************************************************************************/
1289 /* Detecting the encoding of data */
1290 /************************************************************************/
1292 struct detection_state
1294 enum eol_type eol_type;
1330 struct iso2022_decoder iso;
1332 int high_byte_count;
1333 unsigned int saw_single_shift:1;
1346 acceptable_control_char_p (int c)
1350 /* Allow and ignore control characters that you might
1351 reasonably see in a text file */
1356 case 8: /* backspace */
1357 case 11: /* vertical tab */
1358 case 12: /* form feed */
1359 case 26: /* MS-DOS C-z junk */
1360 case 31: /* '^_' -- for info */
/* Return nonzero if MASK has at most one bit set (i.e. MASK is zero or
   a power of two), and zero otherwise.

   Perhaps the only thing useful you learn from intensive Microsoft
   technical interviews.  */
static int
mask_has_at_most_one_bit_p (int mask)
{
  /* `x & (x - 1)' clears the lowest set bit, so the result is zero
     exactly when no more than one bit was set.  The arithmetic is done
     on an unsigned copy so that a mask consisting only of the sign bit
     does not cause signed overflow in the subtraction.  */
  unsigned int bits = (unsigned int) mask;

  return (bits & (bits - 1u)) == 0;
}
1375 static enum eol_type
1376 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1385 st->eol.just_saw_cr = 1;
1390 if (st->eol.just_saw_cr)
1392 else if (st->eol.seen_anything)
1395 else if (st->eol.just_saw_cr)
1397 st->eol.just_saw_cr = 0;
1399 st->eol.seen_anything = 1;
1402 return EOL_AUTODETECT;
1405 /* Attempt to determine the encoding and EOL type of the given text.
1406 Before calling this function for the first time, you must initialize
1407 st->eol_type as appropriate and initialize st->mask to ~0.
1409 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1412 st->mask holds the determined coding category mask, or ~0 if only
1413 ASCII has been seen so far.
1417 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1418 is present in st->mask
1419 1 == definitive answers are here for both st->eol_type and st->mask
1423 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1424 unsigned int n, int just_do_eol)
1428 if (st->eol_type == EOL_AUTODETECT)
1429 st->eol_type = detect_eol_type (st, src, n);
1432 return st->eol_type != EOL_AUTODETECT;
1434 if (!st->seen_non_ascii)
1436 for (; n; n--, src++)
1439 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1441 st->seen_non_ascii = 1;
1443 st->shift_jis.mask = ~0;
1447 st->iso2022.mask = ~0;
1457 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1458 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1459 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1460 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1461 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1462 st->big5.mask = detect_coding_big5 (st, src, n);
1463 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1464 st->utf8.mask = detect_coding_utf8 (st, src, n);
1465 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1466 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1469 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1470 | st->utf8.mask | st->ucs4.mask;
1473 int retval = mask_has_at_most_one_bit_p (st->mask);
1474 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1475 return retval && st->eol_type != EOL_AUTODETECT;
1480 coding_system_from_mask (int mask)
1484 /* If the file was entirely or basically ASCII, use the
1485 default value of `buffer-file-coding-system'. */
1486 Lisp_Object retval =
1487 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1490 retval = Ffind_coding_system (retval);
1494 (Qbad_variable, Qwarning,
1495 "Invalid `default-buffer-file-coding-system', set to nil");
1496 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1500 retval = Fget_coding_system (Qno_conversion);
1508 mask = postprocess_iso2022_mask (mask);
1510 /* Look through the coding categories by priority and find
1511 the first one that is allowed. */
1512 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1514 cat = coding_category_by_priority[i];
1515 if ((mask & (1 << cat)) &&
1516 !NILP (coding_category_system[cat]))
1520 return coding_category_system[cat];
1522 return Fget_coding_system (Qno_conversion);
1526 /* Given a seekable read stream and potential coding system and EOL type
1527 as specified, do any autodetection that is called for. If the
1528 coding system and/or EOL type are not autodetect, they will be left
1529 alone; but this function will never return an autodetect coding system
1532 This function does not automatically fetch subsidiary coding systems;
1533 that should be unnecessary with the explicit eol-type argument. */
1536 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1537 enum eol_type *eol_type_in_out)
1539 struct detection_state decst;
1541 if (*eol_type_in_out == EOL_AUTODETECT)
1542 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1545 decst.eol_type = *eol_type_in_out;
1548 /* If autodetection is called for, do it now. */
1549 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1550 *eol_type_in_out == EOL_AUTODETECT)
1555 unsigned char random_buffer[4096];
1558 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1561 if (detect_coding_type (&decst, random_buffer, nread,
1562 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1563 CODESYS_AUTODETECT))
1567 *eol_type_in_out = decst.eol_type;
1568 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1569 *codesys_in_out = coding_system_from_mask (decst.mask);
1572 /* If we absolutely can't determine the EOL type, just assume LF. */
1573 if (*eol_type_in_out == EOL_AUTODETECT)
1574 *eol_type_in_out = EOL_LF;
1576 Lstream_rewind (stream);
1579 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1580 Detect coding system of the text in the region between START and END.
1581 Return a list of possible coding systems ordered by priority.
1582 If only ASCII characters are found, it returns 'undecided or one of
1583 its subsidiary coding systems according to a detected end-of-line
1584 type. Optional arg BUFFER defaults to the current buffer.
1586 (start, end, buffer))
1588 Lisp_Object val = Qnil;
1589 struct buffer *buf = decode_buffer (buffer, 0);
1591 Lisp_Object instream, lb_instream;
1592 Lstream *istr, *lb_istr;
1593 struct detection_state decst;
1594 struct gcpro gcpro1, gcpro2;
1596 get_buffer_range_char (buf, start, end, &b, &e, 0);
1597 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1598 lb_istr = XLSTREAM (lb_instream);
1599 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1600 istr = XLSTREAM (instream);
1601 GCPRO2 (instream, lb_instream);
1603 decst.eol_type = EOL_AUTODETECT;
1607 unsigned char random_buffer[4096];
1608 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1612 if (detect_coding_type (&decst, random_buffer, nread, 0))
1616 if (decst.mask == ~0)
1617 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1625 decst.mask = postprocess_iso2022_mask (decst.mask);
1627 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1629 int sys = coding_category_by_priority[i];
1630 if (decst.mask & (1 << sys))
1632 Lisp_Object codesys = coding_category_system[sys];
1633 if (!NILP (codesys))
1634 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1635 val = Fcons (codesys, val);
1639 Lstream_close (istr);
1641 Lstream_delete (istr);
1642 Lstream_delete (lb_istr);
1647 /************************************************************************/
1648 /* Converting to internal Mule format ("decoding") */
1649 /************************************************************************/
1651 /* A decoding stream is a stream used for decoding text (i.e.
1652 converting from some external format to internal format).
1653 The decoding-stream object keeps track of the actual coding
1654 stream, the stream that is at the other end, and data that
1655 needs to be persistent across the lifetime of the stream. */
1657 /* Handle the EOL stuff related to just-read-in character C.
1658 EOL_TYPE is the EOL type of the coding stream.
1659 FLAGS is the current value of FLAGS in the coding stream, and may
1660 be modified by this macro. (The macro only looks at the
1661 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1662 bytes are to be written. You need to also define a local goto
1663 label "label_continue_loop" that is at the end of the main
1664 character-reading loop.
1666 If C is a CR character, then this macro handles it entirely and
1667 jumps to label_continue_loop. Otherwise, this macro does not add
1668 anything to DST, and continues normally. You should continue
1669 processing C normally after this macro. */
1671 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1675 if (eol_type == EOL_CR) \
1676 Dynarr_add (dst, '\n'); \
1677 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1678 Dynarr_add (dst, c); \
1680 flags |= CODING_STATE_CR; \
1681 goto label_continue_loop; \
1683 else if (flags & CODING_STATE_CR) \
1684 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1686 Dynarr_add (dst, '\r'); \
1687 flags &= ~CODING_STATE_CR; \
1691 /* C should be a binary character in the range 0 - 255; convert
1692 to internal format and add to Dynarr DST. */
1694 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1696 if (BYTE_ASCII_P (c)) \
1697 Dynarr_add (dst, c); \
1698 else if (BYTE_C1_P (c)) \
1700 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1701 Dynarr_add (dst, c + 0x20); \
1705 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1706 Dynarr_add (dst, c); \
1710 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1714 DECODE_ADD_BINARY_CHAR (ch, dst); \
1719 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1721 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1722 if ((flags & CODING_STATE_END) && \
1723 (flags & CODING_STATE_CR)) \
1724 Dynarr_add (dst, '\r'); \
1727 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1729 struct decoding_stream
1731 /* Coding system that governs the conversion. */
1732 Lisp_Coding_System *codesys;
1734 /* Stream that we read the encoded data from or
1735 write the decoded data to. */
1738 /* If we are reading, then we can return only a fixed amount of
1739 data, so if the conversion resulted in too much data, we store it
1740 here for retrieval the next time around. */
1741 unsigned_char_dynarr *runoff;
1743 /* FLAGS holds flags indicating the current state of the decoding.
1744 Some of these flags are dependent on the coding system. */
1747 /* CH holds a partially built-up character. Since we only deal
1748 with one- and two-byte characters at the moment, we only use
1749 this to store the first byte of a two-byte character. */
1752 /* EOL_TYPE specifies the type of end-of-line conversion that
1753 currently applies. We need to keep this separate from the
1754 EOL type stored in CODESYS because the latter might indicate
1755 automatic EOL-type detection while the former will always
1756 indicate a particular EOL type. */
1757 enum eol_type eol_type;
1759 /* Additional ISO2022 information. We define the structure above
1760 because it's also needed by the detection routines. */
1761 struct iso2022_decoder iso2022;
1763 /* Additional information (the state of the running CCL program)
1764 used by the CCL decoder. */
1765 struct ccl_program ccl;
1767 struct detection_state decst;
1770 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1771 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1772 static int decoding_rewinder (Lstream *stream);
1773 static int decoding_seekable_p (Lstream *stream);
1774 static int decoding_flusher (Lstream *stream);
1775 static int decoding_closer (Lstream *stream);
1777 static Lisp_Object decoding_marker (Lisp_Object stream,
1778 void (*markobj) (Lisp_Object));
1780 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1781 sizeof (struct decoding_stream));
1784 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1786 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1787 Lisp_Object str_obj;
1789 /* We do not need to mark the coding systems or charsets stored
1790 within the stream because they are stored in a global list
1791 and automatically marked. */
1793 XSETLSTREAM (str_obj, str);
1795 if (str->imp->marker)
1796 return (str->imp->marker) (str_obj, markobj);
1801 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1802 so we read data from the other end, decode it, and store it into DATA. */
1805 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1807 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1808 unsigned char *orig_data = data;
1810 int error_occurred = 0;
1812 /* We need to interface to mule_decode(), which expects to take some
1813 amount of data and store the result into a Dynarr. We have
1814 mule_decode() store into str->runoff, and take data from there
1817 /* We loop until we have enough data, reading chunks from the other
1818 end and decoding it. */
1821 /* Take data from the runoff if we can. Make sure to take at
1822 most SIZE bytes, and delete the data from the runoff. */
1823 if (Dynarr_length (str->runoff) > 0)
1825 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1826 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1827 Dynarr_delete_many (str->runoff, 0, chunk);
1833 break; /* No more room for data */
1835 if (str->flags & CODING_STATE_END)
1836 /* This means that on the previous iteration, we hit the EOF on
1837 the other end. We loop once more so that mule_decode() can
1838 output any final stuff it may be holding, or any "go back
1839 to a sane state" escape sequences. (This latter makes sense
1840 during encoding.) */
1843 /* Exhausted the runoff, so get some more. DATA has at least
1844 SIZE bytes left of storage in it, so it's OK to read directly
1845 into it. (We'll be overwriting above, after we've decoded it
1846 into the runoff.) */
1847 read_size = Lstream_read (str->other_end, data, size);
1854 /* There might be some more end data produced in the translation.
1855 See the comment above. */
1856 str->flags |= CODING_STATE_END;
1857 mule_decode (stream, data, str->runoff, read_size);
1860 if (data - orig_data == 0)
1861 return error_occurred ? -1 : 0;
1863 return data - orig_data;
1867 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1869 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1872 /* Decode all our data into the runoff, and then attempt to write
1873 it all out to the other end. Remove whatever chunk we succeeded
1875 mule_decode (stream, data, str->runoff, size);
1876 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1877 Dynarr_length (str->runoff));
1879 Dynarr_delete_many (str->runoff, 0, retval);
1880 /* Do NOT return retval. The return value indicates how much
1881 of the incoming data was written, not how many bytes were
1887 reset_decoding_stream (struct decoding_stream *str)
1890 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1892 Lisp_Object coding_system;
1893 XSETCODING_SYSTEM (coding_system, str->codesys);
1894 reset_iso2022 (coding_system, &str->iso2022);
1896 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1898 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1901 str->flags = str->ch = 0;
1905 decoding_rewinder (Lstream *stream)
1907 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1908 reset_decoding_stream (str);
1909 Dynarr_reset (str->runoff);
1910 return Lstream_rewind (str->other_end);
1914 decoding_seekable_p (Lstream *stream)
1916 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1917 return Lstream_seekable_p (str->other_end);
1921 decoding_flusher (Lstream *stream)
1923 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1924 return Lstream_flush (str->other_end);
1928 decoding_closer (Lstream *stream)
1930 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1931 if (stream->flags & LSTREAM_FL_WRITE)
1933 str->flags |= CODING_STATE_END;
1934 decoding_writer (stream, 0, 0);
1936 Dynarr_free (str->runoff);
1938 #ifdef ENABLE_COMPOSITE_CHARS
1939 if (str->iso2022.composite_chars)
1940 Dynarr_free (str->iso2022.composite_chars);
1943 return Lstream_close (str->other_end);
1947 decoding_stream_coding_system (Lstream *stream)
1949 Lisp_Object coding_system;
1950 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1952 XSETCODING_SYSTEM (coding_system, str->codesys);
1953 return subsidiary_coding_system (coding_system, str->eol_type);
1957 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1959 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1960 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1962 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1963 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1964 reset_decoding_stream (str);
1967 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
1968 stream for writing, no automatic code detection will be performed.
1969 The reason for this is that automatic code detection requires a
1970 seekable input. Things will also fail if you open a decoding
1971 stream for reading using a non-fully-specified coding system and
1972 a non-seekable input stream. */
1975 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1978 Lstream *lstr = Lstream_new (lstream_decoding, mode);
1979 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1983 str->other_end = stream;
1984 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1985 str->eol_type = EOL_AUTODETECT;
1986 if (!strcmp (mode, "r")
1987 && Lstream_seekable_p (stream))
1988 /* We can determine the coding system now. */
1989 determine_real_coding_system (stream, &codesys, &str->eol_type);
1990 set_decoding_stream_coding_system (lstr, codesys);
1991 str->decst.eol_type = str->eol_type;
1992 str->decst.mask = ~0;
1993 XSETLSTREAM (obj, lstr);
1998 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2000 return make_decoding_stream_1 (stream, codesys, "r");
2004 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2006 return make_decoding_stream_1 (stream, codesys, "w");
2009 /* Note: the decode_coding_* functions all take the same
2010 arguments as mule_decode(), which is to say some SRC data of
2011 size N, which is to be stored into dynamic array DST.
2012 DECODING is the stream within which the decoding is
2013 taking place, but no data is actually read from or
2014 written to that stream; that is handled in decoding_reader()
2015 or decoding_writer(). This allows the same functions to
2016 be used for both reading and writing. */
2019 mule_decode (Lstream *decoding, CONST unsigned char *src,
2020 unsigned_char_dynarr *dst, unsigned int n)
2022 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2024 /* If necessary, do encoding-detection now. We do this when
2025 we're a writing stream or a non-seekable reading stream,
2026 meaning that we can't just process the whole input,
2027 rewind, and start over. */
2029 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2030 str->eol_type == EOL_AUTODETECT)
2032 Lisp_Object codesys;
2034 XSETCODING_SYSTEM (codesys, str->codesys);
2035 detect_coding_type (&str->decst, src, n,
2036 CODING_SYSTEM_TYPE (str->codesys) !=
2037 CODESYS_AUTODETECT);
2038 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2039 str->decst.mask != ~0)
2040 /* #### This is cheesy. What we really ought to do is
2041 buffer up a certain amount of data so as to get a
2042 less random result. */
2043 codesys = coding_system_from_mask (str->decst.mask);
2044 str->eol_type = str->decst.eol_type;
2045 if (XCODING_SYSTEM (codesys) != str->codesys)
2047 /* Preserve the CODING_STATE_END flag in case it was set.
2048 If we erase it, bad things might happen. */
2049 int was_end = str->flags & CODING_STATE_END;
2050 set_decoding_stream_coding_system (decoding, codesys);
2052 str->flags |= CODING_STATE_END;
2056 switch (CODING_SYSTEM_TYPE (str->codesys))
2059 case CODESYS_INTERNAL:
2060 Dynarr_add_many (dst, src, n);
2063 case CODESYS_AUTODETECT:
2064 /* If we got this far and still haven't decided on the coding
2065 system, then do no conversion. */
2066 case CODESYS_NO_CONVERSION:
2067 decode_coding_no_conversion (decoding, src, dst, n);
2070 case CODESYS_SHIFT_JIS:
2071 decode_coding_sjis (decoding, src, dst, n);
2074 decode_coding_big5 (decoding, src, dst, n);
2077 decode_coding_ucs4 (decoding, src, dst, n);
2080 decode_coding_utf8 (decoding, src, dst, n);
2083 ccl_driver (&str->ccl, src, dst, n, 0);
2085 case CODESYS_ISO2022:
2086 decode_coding_iso2022 (decoding, src, dst, n);
2094 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2095 Decode the text between START and END which is encoded in CODING-SYSTEM.
2096 This is useful if you've read in encoded text from a file without decoding
2097 it (e.g. you read in a JIS-formatted file but used the `binary' or
2098 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2099 Return length of decoded text.
2100 BUFFER defaults to the current buffer if unspecified.
2102 (start, end, coding_system, buffer))
2105 struct buffer *buf = decode_buffer (buffer, 0);
2106 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2107 Lstream *istr, *ostr;
2108 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2110 get_buffer_range_char (buf, start, end, &b, &e, 0);
2112 barf_if_buffer_read_only (buf, b, e);
2114 coding_system = Fget_coding_system (coding_system);
2115 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2116 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2117 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2119 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2120 Fget_coding_system (Qbinary));
2121 istr = XLSTREAM (instream);
2122 ostr = XLSTREAM (outstream);
2123 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2125 /* The chain of streams looks like this:
2127 [BUFFER] <----- send through
2128 ------> [ENCODE AS BINARY]
2129 ------> [DECODE AS SPECIFIED]
2135 char tempbuf[1024]; /* some random amount */
2136 Bufpos newpos, even_newer_pos;
2137 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2138 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2142 newpos = lisp_buffer_stream_startpos (istr);
2143 Lstream_write (ostr, tempbuf, size_in_bytes);
2144 even_newer_pos = lisp_buffer_stream_startpos (istr);
2145 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2148 Lstream_close (istr);
2149 Lstream_close (ostr);
2151 Lstream_delete (istr);
2152 Lstream_delete (ostr);
2153 Lstream_delete (XLSTREAM (de_outstream));
2154 Lstream_delete (XLSTREAM (lb_outstream));
2159 /************************************************************************/
2160 /* Converting to an external encoding ("encoding") */
2161 /************************************************************************/
2163 /* An encoding stream is an output stream. When you create the
2164 stream, you specify the coding system that governs the encoding
2165 and another stream that the resulting encoded data is to be
2166 sent to, and then start sending data to it. */
2168 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2170 struct encoding_stream
2172 /* Coding system that governs the conversion. */
2173 Lisp_Coding_System *codesys;
2175 /* Stream that we read the unencoded data from or
2176 write the encoded data to. */
2179 /* If we are reading, then we can return only a fixed amount of
2180 data, so if the conversion resulted in too much data, we store it
2181 here for retrieval the next time around. */
2182 unsigned_char_dynarr *runoff;
2184 /* FLAGS holds flags indicating the current state of the encoding.
2185 Some of these flags are dependent on the coding system. */
2188 /* CH holds a partially built-up character. Since we only deal
2189 with one- and two-byte characters at the moment, we only use
2190 this to store the first byte of a two-byte character. */
2193 /* Additional information used by the ISO2022 encoder. */
2196 /* CHARSET holds the character sets currently assigned to the G0
2197 through G3 registers. It is initialized from the array
2198 INITIAL_CHARSET in CODESYS. */
2199 Lisp_Object charset[4];
2201 /* Which registers are currently invoked into the left (GL) and
2202 right (GR) halves of the 8-bit encoding space? */
2203 int register_left, register_right;
2205 /* Whether we need to explicitly designate the charset in the
2206 G? register before using it. It is initialized from the
2207 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2208 unsigned char force_charset_on_output[4];
2210 /* Other state variables that need to be preserved across
2212 Lisp_Object current_charset;
2214 int current_char_boundary;
2217 /* Additional information (the state of the running CCL program)
2218 used by the CCL encoder. */
2219 struct ccl_program ccl;
/* Forward declarations of the lstream methods implemented below,
   followed by the definition of the "encoding" lstream implementation,
   whose per-stream data has the size of struct encoding_stream. */
2223 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2224 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2226 static int encoding_rewinder (Lstream *stream);
2227 static int encoding_seekable_p (Lstream *stream);
2228 static int encoding_flusher (Lstream *stream);
2229 static int encoding_closer (Lstream *stream);
2231 static Lisp_Object encoding_marker (Lisp_Object stream,
2232 void (*markobj) (Lisp_Object));
2234 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2235 sizeof (struct encoding_stream));
/* Marker method for encoding streams.  Wraps the stream at the other
   end in a Lisp object and, if that stream's implementation provides a
   marker method, calls it with MARKOBJ and returns its result. */
2238 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2240 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2241 Lisp_Object str_obj;
2243 /* We do not need to mark the coding systems or charsets stored
2244 within the stream because they are stored in a global list
2245 and automatically marked. */
2247 XSETLSTREAM (str_obj, str);
2249 if (str->imp->marker)
2250 return (str->imp->marker) (str_obj, markobj);
2255 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2256 so we read data from the other end, encode it, and store it into DATA. */
/* Reader method.  Bytes waiting in str->runoff are copied out first
   (never more than SIZE at a time); when more input is needed it is
   read from the other end directly into DATA and handed to
   mule_encode(), which stores its output in str->runoff.  The result
   is the distance DATA has advanced from its starting value; if it has
   not advanced, the result is -1 when error_occurred is set and 0
   otherwise. */
2259 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2261 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2262 unsigned char *orig_data = data;
2264 int error_occurred = 0;
2266 /* We need to interface to mule_encode(), which expects to take some
2267 amount of data and store the result into a Dynarr. We have
2268 mule_encode() store into str->runoff, and take data from there
2271 /* We loop until we have enough data, reading chunks from the other
2272 end and encoding it. */
2275 /* Take data from the runoff if we can. Make sure to take at
2276 most SIZE bytes, and delete the data from the runoff. */
2277 if (Dynarr_length (str->runoff) > 0)
2279 int chunk = min ((int) size, Dynarr_length (str->runoff));
2280 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2281 Dynarr_delete_many (str->runoff, 0, chunk);
2287 break; /* No more room for data */
2289 if (str->flags & CODING_STATE_END)
2290 /* This means that on the previous iteration, we hit the EOF on
2291 the other end. We loop once more so that mule_encode() can
2292 output any final stuff it may be holding, or any "go back
2293 to a sane state" escape sequences. (This latter makes sense
2294 during encoding.) */
2297 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2298 left of storage in it, so it's OK to read directly into it.
2299 (We'll be overwriting above, after we've encoded it into the
2301 read_size = Lstream_read (str->other_end, data, size);
2308 /* There might be some more end data produced in the translation.
2309 See the comment above. */
2310 str->flags |= CODING_STATE_END;
2311 mule_encode (stream, data, str->runoff, read_size);
2314 if (data == orig_data)
2315 return error_occurred ? -1 : 0;
2317 return data - orig_data;
/* Writer method.  Encodes SIZE bytes of DATA into str->runoff with
   mule_encode(), passes the whole runoff to Lstream_write() on the
   other end, and removes from the runoff the part reported as
   written. */
2321 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2323 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2326 /* Encode all our data into the runoff, and then attempt to write
2327 it all out to the other end. Remove whatever chunk we succeeded
2329 mule_encode (stream, data, str->runoff, size);
2330 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2331 Dynarr_length (str->runoff));
2333 Dynarr_delete_many (str->runoff, 0, retval);
2334 /* Do NOT return retval. The return value indicates how much
2335 of the incoming data was written, not how many bytes were
/* Put STR back into its initial encoding state, dispatching on the
   type of its coding system.  For ISO2022 the G0-G3 charsets and the
   force-charset-on-output flags are reloaded from the coding system,
   G0 is invoked into the left half and G1 into the right half, and the
   current-charset tracking fields are reset.  The CCL program state is
   set up from the coding system's CCL encode program.  FLAGS and CH
   are cleared at the end. */
2341 reset_encoding_stream (struct encoding_stream *str)
2344 switch (CODING_SYSTEM_TYPE (str->codesys))
2346 case CODESYS_ISO2022:
2350 for (i = 0; i < 4; i++)
2352 str->iso2022.charset[i] =
2353 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2354 str->iso2022.force_charset_on_output[i] =
2355 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2357 str->iso2022.register_left = 0;
2358 str->iso2022.register_right = 1;
2359 str->iso2022.current_charset = Qnil;
2360 str->iso2022.current_half = 0;
2361 str->iso2022.current_char_boundary = 1;
2365 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2372 str->flags = str->ch = 0;
/* Rewinder method: reset the encoding state, discard anything buffered
   in the runoff, and rewind the stream at the other end, returning the
   result of that rewind. */
2376 encoding_rewinder (Lstream *stream)
2378 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2379 reset_encoding_stream (str);
2380 Dynarr_reset (str->runoff);
2381 return Lstream_rewind (str->other_end);
/* Seekable-p method: an encoding stream reports whatever
   Lstream_seekable_p() says about the stream at the other end. */
2385 encoding_seekable_p (Lstream *stream)
2387 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2388 return Lstream_seekable_p (str->other_end);
/* Flusher method: flush the stream at the other end and return the
   result of that flush. */
2392 encoding_flusher (Lstream *stream)
2394 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2395 return Lstream_flush (str->other_end);
/* Closer method.  For a stream opened for writing, CODING_STATE_END is
   set and encoding_writer() is called with no data (presumably so the
   encoder can emit any final output).  The runoff is then freed and
   the stream at the other end is closed; the result of that close is
   returned. */
2399 encoding_closer (Lstream *stream)
2401 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2402 if (stream->flags & LSTREAM_FL_WRITE)
2404 str->flags |= CODING_STATE_END;
2405 encoding_writer (stream, 0, 0);
2407 Dynarr_free (str->runoff);
2408 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system stored in the encoding
   stream STREAM. */
2412 encoding_stream_coding_system (Lstream *stream)
2414 Lisp_Object coding_system;
2415 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2417 XSETCODING_SYSTEM (coding_system, str->codesys);
2418 return coding_system;
/* Extract the Lisp_Coding_System from CODESYS for the encoding stream
   LSTR and reset the stream's encoding state. */
2422 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2424 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2425 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2427 reset_encoding_stream (str);
/* Create a new encoding lstream opened with MODE whose other end is
   STREAM: allocate its runoff buffer, set its coding system to
   CODESYS, and wrap the new lstream in a Lisp object. */
2431 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2434 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2435 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2439 str->runoff = Dynarr_new (unsigned_char);
2440 str->other_end = stream;
2441 set_encoding_stream_coding_system (lstr, codesys);
2442 XSETLSTREAM (obj, lstr);
/* Create an encoding stream in mode "r" on top of STREAM, using the
   coding system CODESYS. */
2447 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2449 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream in mode "w" on top of STREAM, using the
   coding system CODESYS. */
2453 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2455 return make_encoding_stream_1 (stream, codesys, "w");
2458 /* Convert N bytes of internally-formatted data stored in SRC to an
2459 external format, according to the encoding stream ENCODING.
2460 Store the encoded data into DST. */
/* Dispatches on CODING_SYSTEM_TYPE (str->codesys) to the matching
   encode_coding_* routine, or to ccl_driver() with the stream's CCL
   state.  CODESYS_INTERNAL copies the N bytes to DST unchanged, and
   CODESYS_AUTODETECT shares the no-conversion encoder. */
2463 mule_encode (Lstream *encoding, CONST unsigned char *src,
2464 unsigned_char_dynarr *dst, unsigned int n)
2466 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2468 switch (CODING_SYSTEM_TYPE (str->codesys))
2471 case CODESYS_INTERNAL:
2472 Dynarr_add_many (dst, src, n);
2475 case CODESYS_AUTODETECT:
2476 /* If we got this far and still haven't decided on the coding
2477 system, then do no conversion. */
2478 case CODESYS_NO_CONVERSION:
2479 encode_coding_no_conversion (encoding, src, dst, n);
2482 case CODESYS_SHIFT_JIS:
2483 encode_coding_sjis (encoding, src, dst, n);
2486 encode_coding_big5 (encoding, src, dst, n);
2489 encode_coding_ucs4 (encoding, src, dst, n);
2492 encode_coding_utf8 (encoding, src, dst, n);
2495 ccl_driver (&str->ccl, src, dst, n, 0);
2497 case CODESYS_ISO2022:
2498 encode_coding_iso2022 (encoding, src, dst, n);
/* Implementation notes: the region is read through a buffer input
   stream in chunks of at most sizeof (tempbuf) bytes.  Each chunk is
   written to an encoding output stream that feeds a binary decoding
   stream, which in turn feeds a buffer output stream positioned at the
   start of the region.  The input and outermost output streams are
   closed, all four streams are deleted, and the length is returned as
   a Lisp integer. */
2506 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2507 Encode the text between START and END using CODING-SYSTEM.
2508 This will, for example, convert Japanese characters into stuff such as
2509 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2510 text. BUFFER defaults to the current buffer if unspecified.
2512 (start, end, coding_system, buffer))
2515 struct buffer *buf = decode_buffer (buffer, 0);
2516 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2517 Lstream *istr, *ostr;
2518 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2520 get_buffer_range_char (buf, start, end, &b, &e, 0);
2522 barf_if_buffer_read_only (buf, b, e);
2524 coding_system = Fget_coding_system (coding_system);
2525 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2526 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2527 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2528 Fget_coding_system (Qbinary));
2529 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2531 istr = XLSTREAM (instream);
2532 ostr = XLSTREAM (outstream);
2533 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2534 /* The chain of streams looks like this:
2536 [BUFFER] <----- send through
2537 ------> [ENCODE AS SPECIFIED]
2538 ------> [DECODE AS BINARY]
2543 char tempbuf[1024]; /* some random amount */
2544 Bufpos newpos, even_newer_pos;
2545 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2546 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2550 newpos = lisp_buffer_stream_startpos (istr);
2551 Lstream_write (ostr, tempbuf, size_in_bytes);
2552 even_newer_pos = lisp_buffer_stream_startpos (istr);
2553 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2559 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2560 Lstream_close (istr);
2561 Lstream_close (ostr);
2563 Lstream_delete (istr);
2564 Lstream_delete (ostr);
2565 Lstream_delete (XLSTREAM (de_outstream));
2566 Lstream_delete (XLSTREAM (lb_outstream));
2567 return make_int (retlen);
2573 /************************************************************************/
2574 /* Shift-JIS methods */
2575 /************************************************************************/
2577 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2578 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2579 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2580 encoded by "position-code + 0x80". A character of JISX0208
2581 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2582 position-codes are divided and shifted so that it fit in the range
2585 --- CODE RANGE of Shift-JIS ---
2586 (character set) (range)
2588 JISX0201-Kana 0xA1 .. 0xDF
2589 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2590 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2591 -------------------------------
2595 /* Is this the first byte of a Shift-JIS two-byte char? */
2597 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2598 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2600 /* Is this the second byte of a Shift-JIS two-byte char? */
2602 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2603 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a single byte that the decoder maps to the JISX0201
   katakana charset? */
2605 #define BYTE_SJIS_KATAKANA_P(c) \
2606 ((c) >= 0xA1 && (c) <= 0xDF)
/* Detector for Shift-JIS.  ESC, SI and SO are tested for explicitly.
   st->shift_jis.in_second_byte is cleared when it was set for the
   current byte, and set when the byte is in 0x80 .. 0x9F or is 0xE0
   or above.  Ends by returning CODING_CATEGORY_SHIFT_JIS_MASK. */
2609 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2617 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2619 if (st->shift_jis.in_second_byte)
2621 st->shift_jis.in_second_byte = 0;
2625 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2626 st->shift_jis.in_second_byte = 1;
2628 return CODING_CATEGORY_SHIFT_JIS_MASK;
2631 /* Convert Shift-JIS data to internal format. */
/* The decoder state (flags, pending first byte CH, EOL type) is loaded
   from the decoding stream.  A pending first byte followed by a valid
   second byte is emitted as LEADING_BYTE_JAPANESE_JISX0208 plus the
   two bytes produced by DECODE_SJIS; with an invalid second byte both
   bytes are emitted through DECODE_ADD_BINARY_CHAR.  A katakana byte
   is emitted after LEADING_BYTE_KATAKANA_JISX0201.
   DECODE_HANDLE_END_OF_CONVERSION is invoked at the end. */
2634 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2635 unsigned_char_dynarr *dst, unsigned int n)
2638 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2639 unsigned int flags = str->flags;
2640 unsigned int ch = str->ch;
2641 eol_type_t eol_type = str->eol_type;
2649 /* Previous character was first byte of Shift-JIS Kanji char. */
2650 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2652 unsigned char e1, e2;
2654 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2655 DECODE_SJIS (ch, c, e1, e2);
2656 Dynarr_add (dst, e1);
2657 Dynarr_add (dst, e2);
2661 DECODE_ADD_BINARY_CHAR (ch, dst);
2662 DECODE_ADD_BINARY_CHAR (c, dst);
2668 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2669 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2671 else if (BYTE_SJIS_KATAKANA_P (c))
2673 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2674 Dynarr_add (dst, c);
2677 DECODE_ADD_BINARY_CHAR (c, dst);
2679 label_continue_loop:;
2682 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2688 /* Convert internally-formatted data to Shift-JIS. */
/* End-of-line output follows the coding system's EOL type: a CR is
   written unless the type is EOL_LF or EOL_AUTODETECT, and an LF is
   written unless the type is EOL_CR.  ASCII bytes are copied through.
   A leading byte is remembered in CH only for the JISX0201 katakana
   charset and the two JISX0208 charsets; any other leading byte sets
   CH to 0.  Katakana bytes are then copied as they are, and JISX0208
   byte pairs are converted with ENCODE_SJIS. */
2691 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2692 unsigned_char_dynarr *dst, unsigned int n)
2695 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2696 unsigned int flags = str->flags;
2697 unsigned int ch = str->ch;
2698 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2705 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2706 Dynarr_add (dst, '\r');
2707 if (eol_type != EOL_CR)
2708 Dynarr_add (dst, '\n');
2711 else if (BYTE_ASCII_P (c))
2713 Dynarr_add (dst, c);
2716 else if (BUFBYTE_LEADING_BYTE_P (c))
2717 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2718 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2719 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2722 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2724 Dynarr_add (dst, c);
2727 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2728 ch == LEADING_BYTE_JAPANESE_JISX0208)
2732 unsigned char j1, j2;
2733 ENCODE_SJIS (ch, c, j1, j2);
2734 Dynarr_add (dst, j1);
2735 Dynarr_add (dst, j2);
/* Implementation notes: both halves of CODE must be integers.  They
   are narrowed to unsigned char, checked with the Shift-JIS first- and
   second-byte predicates, and a valid pair is converted with
   DECODE_SJIS into a character of Vcharset_japanese_jisx0208. */
2745 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2746 Decode a JISX0208 character of Shift-JIS coding-system.
2747 CODE is the character code in Shift-JIS as a cons of two bytes.
2748 Return the corresponding character.
2752 unsigned char c1, c2, s1, s2;
2755 CHECK_INT (XCAR (code));
2756 CHECK_INT (XCDR (code));
2757 s1 = XINT (XCAR (code));
2758 s2 = XINT (XCDR (code));
2759 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2760 BYTE_SJIS_TWO_BYTE_2_P (s2))
2762 DECODE_SJIS (s1, s2, c1, c2);
2763 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2764 c1 & 0x7F, c2 & 0x7F));
/* Implementation notes: CH is broken up into its charset and position
   codes.  For a character of Vcharset_japanese_jisx0208 the position
   codes get their high bit set, are converted with ENCODE_SJIS, and
   the two resulting bytes are returned as a cons of integers. */
2770 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2771 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2772 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2776 Lisp_Object charset;
2779 CHECK_CHAR_COERCE_INT (ch);
2780 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2781 if (EQ (charset, Vcharset_japanese_jisx0208))
2783 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2784 return Fcons (make_int (s1), make_int (s2));
2791 /************************************************************************/
2793 /************************************************************************/
2795 /* BIG5 is a coding system encoding two character sets: ASCII and
2796 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2797 character set and is encoded in two-byte.
2799 --- CODE RANGE of BIG5 ---
2800 (character set) (range)
2802 Big5 (1st byte) 0xA1 .. 0xFE
2803 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2804 --------------------------
2806 Since the number of characters in Big5 is larger than maximum
2807 characters in Emacs' charset (96x96), it can't be handled as one
2808 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2809 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2810 contains frequently used characters and the latter contains less
2811 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char? */
2813 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2814 ((c) >= 0xA1 && (c) <= 0xFE)
2816 /* Is this the second byte of a Big5 two-byte char? */
2818 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2819 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2821 /* Number of Big5 characters which have the same code in 1st byte. */
2823 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2825 /* Code conversion macros. These are macros because they are used in
2826 inner loops during code conversion.
2828 Note that temporary variables in macros introduce the classic
2829 dynamic-scoping problems with variable names. We use capital-
2830 lettered variables in the assumption that XEmacs does not use
2831 capital letters in variables except in a very formalized way
2834 /* Convert Big5 code (b1, b2) into its internal string representation
2837 /* There is a much simpler way to split the Big5 charset into two.
2838 For the moment I'm going to leave the algorithm as-is because it
2839 claims to separate out the most-used characters into a single
2840 charset, which perhaps will lead to optimizations in various
2843 The way the algorithm works is something like this:
2845 Big5 can be viewed as a 94x157 charset, where the row is
2846 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2847 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2848 the split between low and high column numbers is apparently
2849 meaningless; ascending rows produce less and less frequent chars.
2850 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2851 the first charset, and the upper half (0xC9 .. 0xFE) to the
2852 second. To do the conversion, we convert the character into
2853 a single number where 0 .. 156 is the first row, 157 .. 313
2854 is the second, etc. That way, the characters are ordered by
2855 decreasing frequency. Then we just chop the space in two
2856 and coerce the result into a 94x94 space.
/* B1 and B2 are the two Big5 bytes; each is evaluated once into a
   local.  LB receives LEADING_BYTE_CHINESE_BIG5_1 or
   LEADING_BYTE_CHINESE_BIG5_2, and C1 and C2 receive the two position
   codes, each offset by 0xA1. */
2859 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2861 int B1 = b1, B2 = b2; \
2863 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2867 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2871 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2872 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2874 c1 = I / (0xFF - 0xA1) + 0xA1; \
2875 c2 = I % (0xFF - 0xA1) + 0xA1; \
2878 /* Convert the internal string representation of a Big5 character
2879 (lb, c1, c2) into Big5 code (b1, b2). */
/* C1 and C2 are combined into a linear index.  When LB is
   LEADING_BYTE_CHINESE_BIG5_2 the index is moved past the rows
   0xA1 .. 0xC8 that belong to the first charset.  B1 and B2 receive
   the resulting Big5 bytes.  Note that LB, B1 and B2 appear
   unparenthesized in the expansion. */
2881 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2883 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2885 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2887 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2889 b1 = I / BIG5_SAME_ROW + 0xA1; \
2890 b2 = I % BIG5_SAME_ROW; \
2891 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Detector for Big5.  ESC, SI, SO and bytes in 0x80 .. 0xA0 are tested
   for explicitly.  When st->big5.in_second_byte is set it is cleared
   and the byte is tested against the ranges below 0x40 and
   0x80 .. 0xA0; st->big5.in_second_byte is set again on another path.
   Ends by returning CODING_CATEGORY_BIG5_MASK. */
2895 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2903 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2904 (c >= 0x80 && c <= 0xA0))
2906 if (st->big5.in_second_byte)
2908 st->big5.in_second_byte = 0;
2909 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2913 st->big5.in_second_byte = 1;
2915 return CODING_CATEGORY_BIG5_MASK;
2918 /* Convert Big5 data to internal format. */
/* The decoder state (flags, pending first byte CH, EOL type) is loaded
   from the decoding stream.  A pending first byte followed by a valid
   second byte is converted with DECODE_BIG5 and the three resulting
   bytes are appended to DST; with an invalid second byte both bytes
   are emitted through DECODE_ADD_BINARY_CHAR.
   DECODE_HANDLE_END_OF_CONVERSION is invoked at the end. */
2921 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2922 unsigned_char_dynarr *dst, unsigned int n)
2925 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2926 unsigned int flags = str->flags;
2927 unsigned int ch = str->ch;
2928 eol_type_t eol_type = str->eol_type;
2935 /* Previous character was first byte of Big5 char. */
2936 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2938 unsigned char b1, b2, b3;
2939 DECODE_BIG5 (ch, c, b1, b2, b3);
2940 Dynarr_add (dst, b1);
2941 Dynarr_add (dst, b2);
2942 Dynarr_add (dst, b3);
2946 DECODE_ADD_BINARY_CHAR (ch, dst);
2947 DECODE_ADD_BINARY_CHAR (c, dst);
2953 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2954 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2957 DECODE_ADD_BINARY_CHAR (c, dst);
2959 label_continue_loop:;
2962 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2968 /* Convert internally-formatted data to Big5. */
/* End-of-line output follows the coding system's EOL type: a CR is
   written unless the type is EOL_LF or EOL_AUTODETECT, and an LF is
   written unless the type is EOL_CR.  ASCII bytes are copied through.
   Only the two Big5 leading bytes are recognized; other leading bytes
   are ignored.  At the point of conversion CH is unpacked as
   (leading byte << 8) | first position byte and passed with the
   current byte to ENCODE_BIG5. */
2971 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2972 unsigned_char_dynarr *dst, unsigned int n)
2975 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2976 unsigned int flags = str->flags;
2977 unsigned int ch = str->ch;
2978 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2985 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2986 Dynarr_add (dst, '\r');
2987 if (eol_type != EOL_CR)
2988 Dynarr_add (dst, '\n');
2990 else if (BYTE_ASCII_P (c))
2993 Dynarr_add (dst, c);
2995 else if (BUFBYTE_LEADING_BYTE_P (c))
2997 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
2998 c == LEADING_BYTE_CHINESE_BIG5_2)
3000 /* A recognized leading byte. */
3002 continue; /* not done with this character. */
3004 /* otherwise just ignore this character. */
3006 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3007 ch == LEADING_BYTE_CHINESE_BIG5_2)
3009 /* Previous char was a recognized leading byte. */
3011 continue; /* not done with this character. */
3015 /* Encountering second byte of a Big5 character. */
3016 unsigned char b1, b2;
3018 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3019 Dynarr_add (dst, b1);
3020 Dynarr_add (dst, b2);
/* Implementation notes: both halves of CODE must be integers.  They
   are narrowed to unsigned char, checked with the Big5 first- and
   second-byte predicates, and a valid pair is converted with
   DECODE_BIG5.  The charset is then looked up from the resulting
   leading byte and the character is built from the low seven bits of
   each position code. */
3031 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3032 Decode a Big5 character CODE of BIG5 coding-system.
3033 CODE is the character code in BIG5, a cons of two integers.
3034 Return the corresponding character.
3038 unsigned char c1, c2, b1, b2;
3041 CHECK_INT (XCAR (code));
3042 CHECK_INT (XCDR (code));
3043 b1 = XINT (XCAR (code));
3044 b2 = XINT (XCDR (code));
3045 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3046 BYTE_BIG5_TWO_BYTE_2_P (b2))
3049 Lisp_Object charset;
3050 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3051 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3052 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Implementation notes: CH is broken up into its charset and position
   codes.  For a character of either Big5 charset the position codes
   get their high bit set and are converted with ENCODE_BIG5, using the
   charset's leading byte; the two resulting bytes are returned as a
   cons of integers. */
3058 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3059 Encode the Big5 character CH to BIG5 coding-system.
3060 Return the corresponding character code in Big5.
3064 Lisp_Object charset;
3067 CHECK_CHAR_COERCE_INT (ch);
3068 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3069 if (EQ (charset, Vcharset_chinese_big5_1) ||
3070 EQ (charset, Vcharset_chinese_big5_2))
3072 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3074 return Fcons (make_int (b1), make_int (b2));
3081 /************************************************************************/
3084 /* UCS-4 character codes are implemented as nonnegative integers. */
3086 /************************************************************************/
/* Lookup tables between UCS codes and Mule characters: a 65536-entry
   array indexed by UCS code, and a table accessed with
   Fput_char_table/Fget_char_table and indexed by Mule character. */
3088 Lisp_Object ucs_to_mule_table[65536];
3089 Lisp_Object mule_to_ucs_table;
/* Implementation notes: CHARACTER must be a character.  The store into
   ucs_to_mule_table is guarded by the number of entries in the table;
   sizeof on the array alone is its size in bytes and would let
   out-of-range codes write past the end of the array. */
3091 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3092 Map UCS-4 code CODE to Mule character CHARACTER.
3094 Return T on success, NIL on failure.
3100 CHECK_CHAR (character);
3104 if (c < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
3106 ucs_to_mule_table[c] = character;
/* Map UCS code CODE to a Mule character.  Codes that index
   ucs_to_mule_table are looked up there; the bound is the number of
   entries in the table, not its size in bytes, so the read cannot run
   past the end of the array.  Codes from 0xe00000 up to
   0xe00000 + 94 * 94 * 14 are mapped onto 94x94 charsets selected by
   final byte, with position codes starting at 33. */
3114 ucs_to_char (unsigned long code)
3116 if (code < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
3118 return ucs_to_mule_table[code];
3120 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3125 c = code % (94 * 94);
3127 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3128 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3129 CHARSET_LEFT_TO_RIGHT),
3130 c / 94 + 33, c % 94 + 33));
/* Implementation notes: CODE is checked with CHECK_NATNUM and then
   looked up with ucs_to_char(). */
3136 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3137 Return Mule character corresponding to UCS code CODE (a positive integer).
3141 CHECK_NATNUM (code);
3142 return ucs_to_char (XINT (code));
/* Implementation notes: after checking both arguments, the mapping is
   stored in mule_to_ucs_table with Fput_char_table(), whose result is
   returned. */
3145 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3146 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3150 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3151 Fset_char_ucs is more restrictive on index arg, but should
3152 check code arg in a char_table method. */
3153 CHECK_CHAR (character);
3154 CHECK_NATNUM (code);
3155 return Fput_char_table (character, code, mule_to_ucs_table);
/* Implementation notes: a direct lookup of CHARACTER in
   mule_to_ucs_table with Fget_char_table(). */
3158 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3159 Return the UCS code (a positive integer) corresponding to CHARACTER.
3163 return Fget_char_table (character, mule_to_ucs_table);
3166 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3167 JIS X 0208 double-width `=' instead.
3168 #### do something more appropriate (use blob?)
3169 Danger, Will Robinson! Data loss. Should we signal user? */
/* CH is looked up with ucs_to_char().  The character's internal byte
   sequence is built in WORK (with simple_set_charptr_emchar or
   non_ascii_set_charptr_emchar) and appended to DST.  The fallback
   output is the three bytes LEADING_BYTE_JAPANESE_JISX0208, 34 + 128
   and 46 + 128. */
3171 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3173 Lisp_Object chr = ucs_to_char (ch);
3177 Bufbyte work[MAX_EMCHAR_LEN];
3182 simple_set_charptr_emchar (work, ch) :
3183 non_ascii_set_charptr_emchar (work, ch);
3184 Dynarr_add_many (dst, work, len);
3188 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3189 Dynarr_add (dst, 34 + 128);
3190 Dynarr_add (dst, 46 + 128);
/* Compute the UCS-4 code for the character of CHARSET whose position
   bytes are H and L (only their low seven bits are used).  The
   character is first looked up with Fget_char_table().  For a
   two-dimensional charset of 94 characters per dimension whose final
   byte lies in '@' .. 0x7e, the code is computed as 0xe00000 plus an
   offset derived from the final byte and the two position codes. */
3194 static unsigned long
3195 mule_char_to_ucs4 (Lisp_Object charset,
3196 unsigned char h, unsigned char l)
3199 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3206 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3207 (XCHARSET_CHARS (charset) == 94) )
3209 unsigned char final = XCHARSET_FINAL (charset);
3211 if ( ('@' <= final) && (final < 0x7f) )
3213 return 0xe00000 + (final - '@') * 94 * 94
3214 + ((h & 127) - 33) * 94 + (l & 127) - 33;
/* Append to DST the UCS-4 code of the character given by CHARSET, H
   and L, as four bytes with the most significant byte first. */
3228 encode_ucs4 (Lisp_Object charset,
3229 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3231 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3232 Dynarr_add (dst, code >> 24);
3233 Dynarr_add (dst, (code >> 16) & 255);
3234 Dynarr_add (dst, (code >> 8) & 255);
3235 Dynarr_add (dst, code & 255);
/* Detector for UCS-4.  Switches on st->ucs4.in_byte (apparently the
   byte position within the current four-byte unit -- TODO confirm),
   resets it to 0 on one path, and ends by returning
   CODING_CATEGORY_UCS4_MASK. */
3239 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3245 switch (st->ucs4.in_byte)
3254 st->ucs4.in_byte = 0;
3260 return CODING_CATEGORY_UCS4_MASK;
/* Input bytes are accumulated into CH eight bits at a time, and a
   completed code is handed to decode_ucs4().  If CODING_STATE_END is
   set, DECODE_OUTPUT_PARTIAL_CHAR is invoked on whatever is left in
   CH. */
3264 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3265 unsigned_char_dynarr *dst, unsigned int n)
3267 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3268 unsigned int flags = str->flags;
3269 unsigned int ch = str->ch;
3273 unsigned char c = *src++;
3281 decode_ucs4 ( ( ch << 8 ) | c, dst);
3286 ch = ( ch << 8 ) | c;
3290 if (flags & CODING_STATE_END)
3291 DECODE_OUTPUT_PARTIAL_CHAR (ch);
/* Walks the internal-format bytes in SRC.  ASCII bytes are encoded
   directly with Vcharset_ascii.  A leading byte selects CHARSET via
   CHARSET_BY_LEADING_BYTE.  Position bytes are passed together with
   CHARSET to encode_ucs4(); how many bytes make up a character is
   decided by XCHARSET_REP_BYTES.  With ENABLE_COMPOSITE_CHARS, a
   composite character is expanded by temporarily switching SRC and N
   to the string returned by composite_char_string().  The
   character-boundary flag and current charset are kept in the
   stream's iso2022 fields between calls. */
3298 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3299 unsigned_char_dynarr *dst, unsigned int n)
3301 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3302 unsigned int flags = str->flags;
3303 unsigned int ch = str->ch;
3304 unsigned char char_boundary = str->iso2022.current_char_boundary;
3305 Lisp_Object charset = str->iso2022.current_charset;
3307 #ifdef ENABLE_COMPOSITE_CHARS
3308 /* flags for handling composite chars. We do a little switcharoo
3309 on the source while we're outputting the composite char. */
3310 unsigned int saved_n = 0;
3311 CONST unsigned char *saved_src = NULL;
3312 int in_composite = 0;
3319 unsigned char c = *src++;
3321 if (BYTE_ASCII_P (c))
3322 { /* Processing ASCII character */
3324 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3327 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3328 { /* Processing Leading Byte */
3330 charset = CHARSET_BY_LEADING_BYTE (c);
3331 if (LEADING_BYTE_PREFIX_P(c))
3336 { /* Processing Non-ASCII character */
3338 if (EQ (charset, Vcharset_control_1))
3340 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3344 switch (XCHARSET_REP_BYTES (charset))
3347 encode_ucs4 (charset, c, 0, dst);
3350 if (XCHARSET_PRIVATE_P (charset))
3352 encode_ucs4 (charset, c, 0, dst);
3357 #ifdef ENABLE_COMPOSITE_CHARS
3358 if (EQ (charset, Vcharset_composite))
3362 /* #### Bother! We don't know how to
3364 Dynarr_add (dst, 0);
3365 Dynarr_add (dst, 0);
3366 Dynarr_add (dst, 0);
3367 Dynarr_add (dst, '~');
3371 Emchar emch = MAKE_CHAR (Vcharset_composite,
3372 ch & 0x7F, c & 0x7F);
3373 Lisp_Object lstr = composite_char_string (emch);
3377 src = XSTRING_DATA (lstr);
3378 n = XSTRING_LENGTH (lstr);
3382 #endif /* ENABLE_COMPOSITE_CHARS */
3384 encode_ucs4(charset, ch, c, dst);
3397 encode_ucs4 (charset, ch, c, dst);
3413 #ifdef ENABLE_COMPOSITE_CHARS
3419 goto back_to_square_n; /* Wheeeeeeeee ..... */
3421 #endif /* ENABLE_COMPOSITE_CHARS */
3425 str->iso2022.current_char_boundary = char_boundary;
3426 str->iso2022.current_charset = charset;
3428 /* Verbum caro factum est! */
3432 /************************************************************************/
3434 /************************************************************************/
/* Detector for UTF-8.  st->utf8.in_byte selects the handling of each
   byte.  At the start of a character ESC, SI and SO are tested for
   explicitly and in_byte is set to a value from 5 down to 1.  Bytes
   handled on the other path are tested against the continuation
   pattern (c & 0xc0) == 0x80.  Ends by returning
   CODING_CATEGORY_UTF8_MASK. */
3437 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3442 unsigned char c = *src++;
3443 switch (st->utf8.in_byte)
3446 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3449 st->utf8.in_byte = 5;
3451 st->utf8.in_byte = 4;
3453 st->utf8.in_byte = 3;
3455 st->utf8.in_byte = 2;
3457 st->utf8.in_byte = 1;
3462 if ((c & 0xc0) != 0x80)
3468 return CODING_CATEGORY_UTF8_MASK;
/* The decoder state (flags, partial code CH, EOL type) is loaded from
   the decoding stream.  Lead bytes are classified by magnitude
   (>= 0xf8, >= 0xf0, >= 0xe0, >= 0xc0); a byte that is a whole
   character by itself goes through EOL handling and decode_ucs4().
   Continuation bytes contribute their low six bits to CH, and a
   completed code is passed to decode_ucs4().  If CODING_STATE_END is
   set, DECODE_OUTPUT_PARTIAL_CHAR is invoked on whatever is left in
   CH. */
3472 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3473 unsigned_char_dynarr *dst, unsigned int n)
3475 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3476 unsigned int flags = str->flags;
3477 unsigned int ch = str->ch;
3478 eol_type_t eol_type = str->eol_type;
3482 unsigned char c = *src++;
3491 else if ( c >= 0xf8 )
3496 else if ( c >= 0xf0 )
3501 else if ( c >= 0xe0 )
3506 else if ( c >= 0xc0 )
3513 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3514 decode_ucs4 (c, dst);
3518 ch = ( ch << 6 ) | ( c & 0x3f );
3519 decode_ucs4 (ch, dst);
3524 ch = ( ch << 6 ) | ( c & 0x3f );
3527 label_continue_loop:;
3530 if (flags & CODING_STATE_END)
3531 DECODE_OUTPUT_PARTIAL_CHAR (ch);
/* Append to DST the UTF-8 encoding of the UCS-4 code of the character
   given by CHARSET, H and L.  The sequence is one to six bytes long
   depending on the size of the code: the thresholds are 0x7ff,
   0xffff, 0x1fffff and 0x3ffffff for the two- to five-byte forms, and
   anything larger uses six bytes. */
3538 encode_utf8 (Lisp_Object charset,
3539 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3541 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3544 Dynarr_add (dst, code);
3546 else if ( code <= 0x7ff )
3548 Dynarr_add (dst, (code >> 6) | 0xc0);
3549 Dynarr_add (dst, (code & 0x3f) | 0x80);
3551 else if ( code <= 0xffff )
3553 Dynarr_add (dst, (code >> 12) | 0xe0);
3554 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3555 Dynarr_add (dst, (code & 0x3f) | 0x80);
3557 else if ( code <= 0x1fffff )
3559 Dynarr_add (dst, (code >> 18) | 0xf0);
3560 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3561 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3562 Dynarr_add (dst, (code & 0x3f) | 0x80);
3564 else if ( code <= 0x3ffffff )
3566 Dynarr_add (dst, (code >> 24) | 0xf8);
3567 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3568 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3569 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3570 Dynarr_add (dst, (code & 0x3f) | 0x80);
3574 Dynarr_add (dst, (code >> 30) | 0xfc);
3575 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3576 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3577 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3578 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3579 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Walks the internal-format bytes in SRC.  For end-of-line output a CR
   is written unless the EOL type is EOL_LF or EOL_AUTODETECT, and the
   byte itself is written unless the type is EOL_CR; other ASCII bytes
   are encoded with Vcharset_ascii.  A leading byte selects CHARSET
   via CHARSET_BY_LEADING_BYTE.  Position bytes are passed together
   with CHARSET to encode_utf8(); how many bytes make up a character
   is decided by XCHARSET_REP_BYTES.  With ENABLE_COMPOSITE_CHARS, a
   composite character is expanded by temporarily switching SRC and N
   to the string returned by composite_char_string().  The
   character-boundary flag and current charset are kept in the
   stream's iso2022 fields between calls. */
3584 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3585 unsigned_char_dynarr *dst, unsigned int n)
3587 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3588 unsigned int flags = str->flags;
3589 unsigned int ch = str->ch;
3590 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3591 unsigned char char_boundary = str->iso2022.current_char_boundary;
3592 Lisp_Object charset = str->iso2022.current_charset;
3594 #ifdef ENABLE_COMPOSITE_CHARS
3595 /* flags for handling composite chars. We do a little switcharoo
3596 on the source while we're outputting the composite char. */
3597 unsigned int saved_n = 0;
3598 CONST unsigned char *saved_src = NULL;
3599 int in_composite = 0;
3602 #endif /* ENABLE_COMPOSITE_CHARS */
3606 unsigned char c = *src++;
3608 if (BYTE_ASCII_P (c))
3609 { /* Processing ASCII character */
3613 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3614 Dynarr_add (dst, '\r');
3615 if (eol_type != EOL_CR)
3616 Dynarr_add (dst, c);
3619 encode_utf8 (Vcharset_ascii, c, 0, dst);
3622 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3623 { /* Processing Leading Byte */
3625 charset = CHARSET_BY_LEADING_BYTE (c);
3626 if (LEADING_BYTE_PREFIX_P(c))
3631 { /* Processing Non-ASCII character */
3633 if (EQ (charset, Vcharset_control_1))
3635 encode_utf8 (Vcharset_control_1, c, 0, dst);
3639 switch (XCHARSET_REP_BYTES (charset))
3642 encode_utf8 (charset, c, 0, dst);
3645 if (XCHARSET_PRIVATE_P (charset))
3647 encode_utf8 (charset, c, 0, dst);
3652 #ifdef ENABLE_COMPOSITE_CHARS
3653 if (EQ (charset, Vcharset_composite))
3657 /* #### Bother! We don't know how to
3659 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3663 Emchar emch = MAKE_CHAR (Vcharset_composite,
3664 ch & 0x7F, c & 0x7F);
3665 Lisp_Object lstr = composite_char_string (emch);
3669 src = XSTRING_DATA (lstr);
3670 n = XSTRING_LENGTH (lstr);
3674 #endif /* ENABLE_COMPOSITE_CHARS */
3676 encode_utf8 (charset, ch, c, dst);
3689 encode_utf8 (charset, ch, c, dst);
3705 #ifdef ENABLE_COMPOSITE_CHARS
3711 goto back_to_square_n; /* Wheeeeeeeee ..... */
3717 str->iso2022.current_char_boundary = char_boundary;
3718 str->iso2022.current_charset = charset;
3720 /* Verbum caro factum est! */
3724 /************************************************************************/
3725 /* ISO2022 methods */
3726 /************************************************************************/
3728 /* The following note describes the coding system ISO2022 briefly.
3729 Since the intention of this note is to help understand the
3730 functions in this file, some parts are NOT ACCURATE or OVERLY
3731 SIMPLIFIED. For thorough understanding, please refer to the
3732 original document of ISO2022.
3734 ISO2022 provides many mechanisms to encode several character sets
3735 in 7-bit and 8-bit environments. For 7-bit environments, all text
3736 is encoded using bytes less than 128. This may make the encoded
3737 text a little bit longer, but the text passes more easily through
3738 several gateways, some of which strip off the MSB (Most Significant Bit).
3740 There are two kinds of character sets: control character set and
3741 graphic character set. The former contains control characters such
3742 as `newline' and `escape' to provide control functions (control
3743 functions are also provided by escape sequences). The latter
3744 contains graphic characters such as 'A' and '-'. Emacs recognizes
3745 two control character sets and many graphic character sets.
3747 Graphic character sets are classified into one of the following
3748 four classes, according to the number of bytes (DIMENSION) and
3749 number of characters in one dimension (CHARS) of the set:
3750 - DIMENSION1_CHARS94
3751 - DIMENSION1_CHARS96
3752 - DIMENSION2_CHARS94
3753 - DIMENSION2_CHARS96
3755 In addition, each character set is assigned an identification tag,
3756 unique for each set, called "final character" (denoted as <F>
3757 hereafter). The <F> of each character set is decided by ECMA(*)
3758 when it is registered in ISO. The code range of <F> is 0x30..0x7E
3759 (0x30..0x3F are for private use only).
3761 Note (*): ECMA = European Computer Manufacturers Association
3763 Here are examples of graphic character set [NAME(<F>)]:
3764 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3765 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3766 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3767 o DIMENSION2_CHARS96 -- none for the moment
3769 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3770 C0 [0x00..0x1F] -- control character plane 0
3771 GL [0x20..0x7F] -- graphic character plane 0
3772 C1 [0x80..0x9F] -- control character plane 1
3773 GR [0xA0..0xFF] -- graphic character plane 1
3775 A control character set is directly designated and invoked to C0 or
3776 C1 by an escape sequence. The most common case is that:
3777 - ISO646's control character set is designated/invoked to C0, and
3778 - ISO6429's control character set is designated/invoked to C1,
3779 and usually these designations/invocations are omitted in encoded
3780 text. In a 7-bit environment, only C0 can be used, and a control
3781 character for C1 is encoded by an appropriate escape sequence to
3782 fit into the environment. All control characters for C1 are
3783 defined to have corresponding escape sequences.
3785 A graphic character set is at first designated to one of four
3786 graphic registers (G0 through G3), then these graphic registers are
3787 invoked to GL or GR. These designations and invocations can be
3788 done independently. The most common case is that G0 is invoked to
3789 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3790 these invocations and designations are omitted in encoded text.
3791 In a 7-bit environment, only GL can be used.
3793 When a graphic character set of CHARS94 is invoked to GL, codes
3794 0x20 and 0x7F of the GL area work as control characters SPACE and
3795 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3798 There are two ways of invocation: locking-shift and single-shift.
3799 With locking-shift, the invocation lasts until the next different
3800 invocation, whereas with single-shift, the invocation affects the
3801 following character only and doesn't affect the locking-shift
3802 state. Invocations are done by the following control characters or
3805 ----------------------------------------------------------------------
3806 abbrev function cntrl escape seq description
3807 ----------------------------------------------------------------------
3808 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3809 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3810 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3811 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3812 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3813 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3814 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3815 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3816 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3817 ----------------------------------------------------------------------
3818 (*) These are not used by any known coding system.
3820 Control characters for these functions are defined by macros
3821 ISO_CODE_XXX in `coding.h'.
3823 Designations are done by the following escape sequences:
3824 ----------------------------------------------------------------------
3825 escape sequence description
3826 ----------------------------------------------------------------------
3827 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3828 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3829 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3830 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3831 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3832 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3833 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3834 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3835 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3836 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3837 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3838 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3839 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3840 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3841 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3842 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3843 ----------------------------------------------------------------------
3845 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3846 of dimension 1, chars 94, and final character <F>, etc...
3848 Note (*): Although these designations are not allowed in ISO2022,
3849 Emacs accepts them on decoding, and produces them on encoding
3850 CHARS96 character sets in a coding system which is characterized as
3851 7-bit environment, non-locking-shift, and non-single-shift.
3853 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3854 '(' can be omitted. We refer to this as "short-form" hereafter.
3856 Now you may notice that there are a lot of ways for encoding the
3857 same multilingual text in ISO2022. Actually, there exist many
3858 coding systems such as Compound Text (used in X11's inter-client
3859 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3860 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3861 localized platforms), and all of these are variants of ISO2022.
3863 In addition to the above, Emacs handles two more kinds of escape
3864 sequences: ISO6429's direction specification and Emacs' private
3865 sequence for specifying character composition.
3867 ISO6429's direction specification takes the following form:
3868 o CSI ']' -- end of the current direction
3869 o CSI '0' ']' -- end of the current direction
3870 o CSI '1' ']' -- start of left-to-right text
3871 o CSI '2' ']' -- start of right-to-left text
3872 The control character CSI (0x9B: control sequence introducer) is
3873 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3875 Character composition specification takes the following form:
3876 o ESC '0' -- start character composition
3877 o ESC '1' -- end character composition
3878 Since these are not standard escape sequences of any ISO standard,
3879 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its starting condition.

   For each of the four graphic registers the charset is taken from
   CODING_SYSTEM's initial-charset table when CODING_SYSTEM is non-nil;
   Qt is stored as a placeholder (apparently in the case where no
   coding system is supplied -- detect_coding_iso2022 passes Qnil).
   All escape-parsing state and the error-recovery flags are cleared,
   register 0 is invoked into the left half and register 1 into the
   right half. */
3882 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3886 for (i = 0; i < 4; i++)
3888 if (!NILP (coding_system))
3890 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3892 iso->charset[i] = Qt;
3893 iso->invalid_designated[i] = 0;
/* No escape sequence is in progress and no escape bytes are buffered. */
3895 iso->esc = ISO_ESC_NOTHING;
3896 iso->esc_bytes_index = 0;
/* Default invocation: register 0 on the left, register 1 on the right. */
3897 iso->register_left = 0;
3898 iso->register_right = 1;
/* Clear the flags used to echo invalid sequences literally. */
3899 iso->switched_dir_and_no_valid_charset_yet = 0;
3900 iso->invalid_switch_dir = 0;
3901 iso->output_direction_sequence = 0;
3902 iso->output_literally = 0;
3903 #ifdef ENABLE_COMPOSITE_CHARS
/* Empty the composite-character buffer only if one has been created. */
3904 if (iso->composite_chars)
3905 Dynarr_reset (iso->composite_chars);
/* Predicate on a single byte C, consulted together with the coding
   system's ESCAPE_QUOTED property both when parsing escape sequences
   and when encoding, to decide whether C is output preceded by ESC.
   NOTE(review): the body is not visible in this listing; confirm the
   exact set of bytes accepted before relying on this description. */
3910 fit_to_be_escape_quoted (unsigned char c)
3927 /* Parse one byte of an ISO2022 escape sequence.
3928 If the result is an invalid escape sequence, return 0 and
3929 do not change anything in STR. Otherwise, if the result is
3930 an incomplete escape sequence, update ISO2022.ESC and
3931 ISO2022.ESC_BYTES and return -1. Otherwise, update
3932 all the state variables (but not ISO2022.ESC_BYTES) and
3935 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3936 or invocation of an invalid character set and treat that as
3937 an unrecognized escape sequence. */
/* CODESYS may be nil (this happens during coding-system detection);
   ISO is the decoder state being updated; C is the byte to parse;
   FLAGS points at the caller's CODING_STATE_* bits, which are read
   and updated in place. */
3940 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3941 unsigned char c, unsigned int *flags,
3942 int check_invalid_charsets)
3944 /* (1) If we're at the end of a designation sequence, CS is the
3945 charset being designated and REG is the register to designate
3948 (2) If we're at the end of a locking-shift sequence, REG is
3949 the register to invoke and HALF (0 == left, 1 == right) is
3950 the half to invoke it into.
3952 (3) If we're at the end of a single-shift sequence, REG is
3953 the register to invoke. */
3954 Lisp_Object cs = Qnil;
3957 /* NOTE: This code uses gotos all over the place.
3958 The reason for this is that we're basically implementing
3959 a state machine here, and hierarchical languages like C
3960 don't really provide a clean way of doing this. */
3962 if (! (*flags & CODING_STATE_ESCAPE))
3963 /* At beginning of escape sequence; we need to reset our
3964 escape-state variables. */
3965 iso->esc = ISO_ESC_NOTHING;
3967 iso->output_literally = 0;
3968 iso->output_direction_sequence = 0;
3972 case ISO_ESC_NOTHING:
3973 iso->esc_bytes_index = 0;
3976 case ISO_CODE_ESC: /* Start escape sequence */
3977 *flags |= CODING_STATE_ESCAPE;
3981 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3982 *flags |= CODING_STATE_ESCAPE;
3983 iso->esc = ISO_ESC_5_11;
3986 case ISO_CODE_SO: /* locking shift 1 */
3989 case ISO_CODE_SI: /* locking shift 0 */
3993 case ISO_CODE_SS2: /* single shift */
3996 case ISO_CODE_SS3: /* single shift */
4000 default: /* Other control characters */
4007 /**** single shift ****/
4009 case 'N': /* single shift 2 */
4012 case 'O': /* single shift 3 */
4016 /**** locking shift ****/
4018 case '~': /* locking shift 1 right */
4021 case 'n': /* locking shift 2 */
4024 case '}': /* locking shift 2 right */
4027 case 'o': /* locking shift 3 */
4030 case '|': /* locking shift 3 right */
4034 #ifdef ENABLE_COMPOSITE_CHARS
4035 /**** composite ****/
4038 iso->esc = ISO_ESC_START_COMPOSITE;
4039 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4040 CODING_STATE_COMPOSITE;
4044 iso->esc = ISO_ESC_END_COMPOSITE;
4045 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4046 ~CODING_STATE_COMPOSITE;
4048 #endif /* ENABLE_COMPOSITE_CHARS */
4050 /**** directionality ****/
4053 iso->esc = ISO_ESC_5_11;
4056 /**** designation ****/
4058 case '$': /* multibyte charset prefix */
4059 iso->esc = ISO_ESC_2_4;
4063 if (0x28 <= c && c <= 0x2F)
4065 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4069 /* This function is called with CODESYS equal to nil when
4070 doing coding-system detection. */
4072 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4073 && fit_to_be_escape_quoted (c))
4075 iso->esc = ISO_ESC_LITERAL;
4076 *flags &= CODING_STATE_ISO2022_LOCK;
4086 /**** directionality ****/
4088 case ISO_ESC_5_11: /* ISO6429 direction control */
4091 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4092 goto directionality;
4094 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4095 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4096 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4100 case ISO_ESC_5_11_0:
4103 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4104 goto directionality;
4108 case ISO_ESC_5_11_1:
/* CSI '1' ']' starts left-to-right text: keep only the locked state
   bits that were already set and drop R2L.  This has to be a mask
   (&=), exactly as in the two cases above; a plain assignment would
   switch ON every lock bit other than R2L regardless of the state
   the stream was in before the sequence. */
4111 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4112 goto directionality;
4116 case ISO_ESC_5_11_2:
4119 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4120 goto directionality;
4125 iso->esc = ISO_ESC_DIRECTIONALITY;
4126 /* Various junk here to attempt to preserve the direction sequences
4127 literally in the text if they would otherwise be swallowed due
4128 to invalid designations that don't show up as actual charset
4129 changes in the text. */
4130 if (iso->invalid_switch_dir)
4132 /* We already inserted a direction switch literally into the
4133 text. We assume (#### this may not be right) that the
4134 next direction switch is the one going the other way,
4135 and we need to output that literally as well. */
4136 iso->output_literally = 1;
4137 iso->invalid_switch_dir = 0;
4143 /* If we are in the thrall of an invalid designation,
4144 then stick the directionality sequence literally into the
4145 output stream so it ends up in the original text again. */
4146 for (jj = 0; jj < 4; jj++)
4147 if (iso->invalid_designated[jj])
4151 iso->output_literally = 1;
4152 iso->invalid_switch_dir = 1;
4155 /* Indicate that we haven't yet seen a valid designation,
4156 so that if a switch-dir is directly followed by an
4157 invalid designation, both get inserted literally. */
4158 iso->switched_dir_and_no_valid_charset_yet = 1;
4163 /**** designation ****/
4166 if (0x28 <= c && c <= 0x2F)
4168 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4171 if (0x40 <= c && c <= 0x42)
4173 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4174 *flags & CODING_STATE_R2L ?
4175 CHARSET_RIGHT_TO_LEFT :
4176 CHARSET_LEFT_TO_RIGHT);
4186 if (c < '0' || c > '~')
4187 return 0; /* bad final byte */
4189 if (iso->esc >= ISO_ESC_2_8 &&
4190 iso->esc <= ISO_ESC_2_15)
4192 type = ((iso->esc >= ISO_ESC_2_12) ?
4193 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4194 reg = (iso->esc - ISO_ESC_2_8) & 3;
4196 else if (iso->esc >= ISO_ESC_2_4_8 &&
4197 iso->esc <= ISO_ESC_2_4_15)
4199 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4200 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4201 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4205 /* Can this ever be reached? -slb */
4209 cs = CHARSET_BY_ATTRIBUTES (type, c,
4210 *flags & CODING_STATE_R2L ?
4211 CHARSET_RIGHT_TO_LEFT :
4212 CHARSET_LEFT_TO_RIGHT);
4218 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4222 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4223 /* can't invoke something that ain't there. */
4225 iso->esc = ISO_ESC_SINGLE_SHIFT;
4226 *flags &= CODING_STATE_ISO2022_LOCK;
4228 *flags |= CODING_STATE_SS2;
4230 *flags |= CODING_STATE_SS3;
4234 if (check_invalid_charsets &&
4235 !CHARSETP (iso->charset[reg]))
4236 /* can't invoke something that ain't there. */
4239 iso->register_right = reg;
4241 iso->register_left = reg;
4242 *flags &= CODING_STATE_ISO2022_LOCK;
4243 iso->esc = ISO_ESC_LOCKING_SHIFT;
4247 if (NILP (cs) && check_invalid_charsets)
4249 iso->invalid_designated[reg] = 1;
4250 iso->charset[reg] = Vcharset_ascii;
4251 iso->esc = ISO_ESC_DESIGNATE;
4252 *flags &= CODING_STATE_ISO2022_LOCK;
4253 iso->output_literally = 1;
4254 if (iso->switched_dir_and_no_valid_charset_yet)
4256 /* We encountered a switch-direction followed by an
4257 invalid designation. Ensure that the switch-direction
4258 gets outputted; otherwise it will probably get eaten
4259 when the text is written out again. */
4260 iso->switched_dir_and_no_valid_charset_yet = 0;
4261 iso->output_direction_sequence = 1;
4262 /* And make sure that the switch-dir going the other
4263 way gets outputted, as well. */
4264 iso->invalid_switch_dir = 1;
4268 /* This function is called with CODESYS equal to nil when
4269 doing coding-system detection. */
4270 if (!NILP (codesys))
4272 charset_conversion_spec_dynarr *dyn =
4273 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4279 for (i = 0; i < Dynarr_length (dyn); i++)
4281 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4282 if (EQ (cs, spec->from_charset))
4283 cs = spec->to_charset;
4288 iso->charset[reg] = cs;
4289 iso->esc = ISO_ESC_DESIGNATE;
4290 *flags &= CODING_STATE_ISO2022_LOCK;
4291 if (iso->invalid_designated[reg])
4293 iso->invalid_designated[reg] = 0;
4294 iso->output_literally = 1;
4296 if (iso->switched_dir_and_no_valid_charset_yet)
4297 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Narrow down which ISO2022 coding categories the bytes at SRC could
   belong to.  The working state lives in ST->iso2022 and persists
   across calls: on first use it is initialised with all five ISO
   category bits set, and bits are then cleared from MASK as
   contradicting evidence is seen.
   NOTE(review): the remainder of the parameter list, the byte loop
   header and the return statement are not visible in this listing. */
4302 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4307 /* #### There are serious deficiencies in the recognition mechanism
4308 here. This needs to be much smarter if it's going to cut it.
4309 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4310 it should be detected as Latin-1.
4311 All the ISO2022 stuff in this file should be synced up with the
4312 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4313 Perhaps we should wait till R2L works in FSF Emacs? */
/* One-time initialisation of the per-stream detection state. */
4315 if (!st->iso2022.initted)
4317 reset_iso2022 (Qnil, &st->iso2022.iso);
4318 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4319 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4320 CODING_CATEGORY_ISO_8_1_MASK |
4321 CODING_CATEGORY_ISO_8_2_MASK |
4322 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4323 st->iso2022.flags = 0;
4324 st->iso2022.high_byte_count = 0;
4325 st->iso2022.saw_single_shift = 0;
4326 st->iso2022.initted = 1;
4329 mask = st->iso2022.mask;
/* Seven-bit ISO is ruled out here and the current run of high bytes
   is counted. */
4336 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4337 st->iso2022.high_byte_count++;
/* A run of high bytes has ended; its length parity is used as a hint
   against the two-byte 8-bit category unless a single shift was seen. */
4341 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4343 if (st->iso2022.high_byte_count & 1)
4344 /* odd number of high bytes; assume not iso-8-2 */
4345 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4347 st->iso2022.high_byte_count = 0;
4348 st->iso2022.saw_single_shift = 0;
4350 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4352 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4353 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4354 { /* control chars */
4357 /* Allow and ignore control characters that you might
4358 reasonably see in a text file */
4363 case 8: /* backspace */
4364 case 11: /* vertical tab */
4365 case 12: /* form feed */
4366 case 26: /* MS-DOS C-z junk */
4367 case 31: /* '^_' -- for info */
4368 goto label_continue_loop;
4375 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
/* Feed the byte to the escape parser in detection mode: CODESYS is
   nil and invalid charsets are not checked. */
4378 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4379 &st->iso2022.flags, 0))
4381 switch (st->iso2022.iso.esc)
4383 case ISO_ESC_DESIGNATE:
4384 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4385 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4387 case ISO_ESC_LOCKING_SHIFT:
/* A locking shift settles the matter: only the lock-shift category
   remains and scanning stops. */
4388 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4389 goto ran_out_of_chars;
4390 case ISO_ESC_SINGLE_SHIFT:
4391 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4392 st->iso2022.saw_single_shift = 1;
4401 goto ran_out_of_chars;
4404 label_continue_loop:;
/* Final adjustment of an ISO2022 detection MASK: when the seven-bit
   category is still possible, the three eight-bit ISO categories are
   removed from MASK.
   NOTE(review): the return statement is not visible in this listing;
   presumably the adjusted mask is returned. */
4413 postprocess_iso2022_mask (int mask)
4415 /* #### kind of cheesy */
4416 /* If seven-bit ISO is allowed, then assume that the encoding is
4417 entirely seven-bit and turn off the eight-bit ones. */
4418 if (mask & CODING_CATEGORY_ISO_7_MASK)
4419 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4420 CODING_CATEGORY_ISO_8_1_MASK |
4421 CODING_CATEGORY_ISO_8_2_MASK);
4425 /* If FLAGS is a null pointer or specifies right-to-left motion,
4426 output a switch-dir-to-left-to-right sequence to DST.
4427 Also update FLAGS if it is not a null pointer.
4428 If INTERNAL_P is set, we are outputting in internal format and
4429 need to handle the CSI differently. */
/* The sequence written is the introducer followed by '0' ']'.  The
   introducer is ESC '[' for seven-bit coding systems, otherwise the
   CSI byte, which goes through DECODE_ADD_BINARY_CHAR when INTERNAL_P
   is set and is added raw when it is not. */
4432 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4433 unsigned_char_dynarr *dst,
4434 unsigned int *flags,
4437 if (!flags || (*flags & CODING_STATE_R2L))
4439 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4441 Dynarr_add (dst, ISO_CODE_ESC);
4442 Dynarr_add (dst, '[');
4444 else if (internal_p)
4445 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4447 Dynarr_add (dst, ISO_CODE_CSI);
4448 Dynarr_add (dst, '0');
4449 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null on this path (see the test above);
   the line guarding this dereference is not visible in this listing
   -- confirm that a null check precedes it. */
4451 *flags &= ~CODING_STATE_R2L;
4455 /* If FLAGS is a null pointer or specifies a direction different from
4456 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4457 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4458 sequence to DST. Also update FLAGS if it is not a null pointer.
4459 If INTERNAL_P is set, we are outputting in internal format and
4460 need to handle the CSI differently. */
/* Switching to left-to-right is delegated to
   restore_left_to_right_direction.  Switching to right-to-left is
   done here, and only when the coding system does not have the
   NO_ISO6429 property; the sequence written is the introducer
   followed by '2' ']'. */
4463 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4464 unsigned_char_dynarr *dst, unsigned int *flags,
4467 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4468 direction == CHARSET_LEFT_TO_RIGHT)
4469 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4470 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4471 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4472 direction == CHARSET_RIGHT_TO_LEFT)
/* Introducer: ESC '[' in a seven-bit coding system, CSI otherwise. */
4474 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4476 Dynarr_add (dst, ISO_CODE_ESC);
4477 Dynarr_add (dst, '[');
4479 else if (internal_p)
4480 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4482 Dynarr_add (dst, ISO_CODE_CSI);
4483 Dynarr_add (dst, '2');
4484 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null on this path (decode_coding_iso2022
   passes 0); the line guarding this dereference is not visible in
   this listing -- confirm that a null check precedes it. */
4486 *flags |= CODING_STATE_R2L;
4490 /* Convert ISO2022-format data to internal format. */
/* DECODING is the decoding stream whose saved state (flags, pending
   byte CH, EOL type, ISO2022 register state) is picked up on entry;
   N bytes are read from SRC and the converted bytes are appended to
   DST.  Each input byte is handled in one of three ways: as part of
   an escape sequence in progress, as a C0/C1 control character, or
   as a graphic character belonging to the charset in the currently
   invoked register. */
4493 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4494 unsigned_char_dynarr *dst, unsigned int n)
4496 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4497 unsigned int flags = str->flags;
4498 unsigned int ch = str->ch;
4499 eol_type_t eol_type = str->eol_type;
4500 #ifdef ENABLE_COMPOSITE_CHARS
4501 unsigned_char_dynarr *real_dst = dst;
4503 Lisp_Object coding_system;
4505 XSETCODING_SYSTEM (coding_system, str->codesys);
4507 #ifdef ENABLE_COMPOSITE_CHARS
/* While a composition is open, output is collected in the stream's
   composite buffer instead of the caller's DST. */
4508 if (flags & CODING_STATE_COMPOSITE)
4509 dst = str->iso2022.composite_chars;
4510 #endif /* ENABLE_COMPOSITE_CHARS */
4514 unsigned char c = *src++;
4515 if (flags & CODING_STATE_ESCAPE)
4516 { /* Within ESC sequence */
4517 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4522 switch (str->iso2022.esc)
4524 #ifdef ENABLE_COMPOSITE_CHARS
4525 case ISO_ESC_START_COMPOSITE:
4526 if (str->iso2022.composite_chars)
4527 Dynarr_reset (str->iso2022.composite_chars);
4529 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4530 dst = str->iso2022.composite_chars;
4532 case ISO_ESC_END_COMPOSITE:
4534 Bufbyte comstr[MAX_EMCHAR_LEN];
4536 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4537 Dynarr_length (dst));
4539 len = set_charptr_emchar (comstr, emch);
4540 Dynarr_add_many (dst, comstr, len);
4543 #endif /* ENABLE_COMPOSITE_CHARS */
4545 case ISO_ESC_LITERAL:
4546 DECODE_ADD_BINARY_CHAR (c, dst);
4550 /* Everything else handled already */
4555 /* Attempted error recovery. */
4556 if (str->iso2022.output_direction_sequence)
4557 ensure_correct_direction (flags & CODING_STATE_R2L ?
4558 CHARSET_RIGHT_TO_LEFT :
4559 CHARSET_LEFT_TO_RIGHT,
4560 str->codesys, dst, 0, 1);
4561 /* More error recovery. */
4562 if (!retval || str->iso2022.output_literally)
4564 /* Output the (possibly invalid) sequence */
4566 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4567 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4568 flags &= CODING_STATE_ISO2022_LOCK;
4570 n++, src--;/* Repeat the loop with the same character. */
4573 /* No sense in reprocessing the final byte of the
4574 escape sequence; it could mess things up anyway.
4576 DECODE_ADD_BINARY_CHAR (c, dst);
4581 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4582 { /* Control characters */
4584 /***** Error-handling *****/
4586 /* If we were in the middle of a character, dump out the
4587 partial character. */
4588 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4590 /* If we just saw a single-shift character, dump it out.
4591 This may dump out the wrong sort of single-shift character,
4592 but at least it will give an indication that something went
4594 if (flags & CODING_STATE_SS2)
4596 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4597 flags &= ~CODING_STATE_SS2;
4599 if (flags & CODING_STATE_SS3)
4601 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4602 flags &= ~CODING_STATE_SS3;
4605 /***** Now handle the control characters. *****/
4608 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4610 flags &= CODING_STATE_ISO2022_LOCK;
/* A control byte the escape parser does not recognise is passed
   through to the output as a binary character. */
4612 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4613 DECODE_ADD_BINARY_CHAR (c, dst);
4616 { /* Graphic characters */
4617 Lisp_Object charset;
4621 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4623 /* Now determine the charset. */
/* A pending single shift takes priority; otherwise a high byte uses
   the register invoked on the right and an ASCII-range byte the
   register invoked on the left. */
4624 reg = ((flags & CODING_STATE_SS2) ? 2
4625 : (flags & CODING_STATE_SS3) ? 3
4626 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4627 : str->iso2022.register_left);
4628 charset = str->iso2022.charset[reg];
4630 /* Error checking: */
4631 if (! CHARSETP (charset)
4632 || str->iso2022.invalid_designated[reg]
4633 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4634 && XCHARSET_CHARS (charset) == 94))
4635 /* Mrmph. We are trying to invoke a register that has no
4636 or an invalid charset in it, or trying to add a character
4637 outside the range of the charset. Insert that char literally
4638 to preserve it for the output. */
4640 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4641 DECODE_ADD_BINARY_CHAR (c, dst);
4646 /* Things are probably hunky-dory. */
4648 /* Fetch reverse charset, maybe. */
4649 if (((flags & CODING_STATE_R2L) &&
4650 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4652 (!(flags & CODING_STATE_R2L) &&
4653 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4655 Lisp_Object new_charset =
4656 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4657 if (!NILP (new_charset))
4658 charset = new_charset;
/* Emit the internal representation; its layout depends on how many
   bytes the charset uses per character. */
4661 lb = XCHARSET_LEADING_BYTE (charset);
4662 switch (XCHARSET_REP_BYTES (charset))
4665 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4666 Dynarr_add (dst, c & 0x7F);
4669 case 2: /* one-byte official */
4670 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4671 Dynarr_add (dst, lb);
4672 Dynarr_add (dst, c | 0x80);
4675 case 3: /* one-byte private or two-byte official */
4676 if (XCHARSET_PRIVATE_P (charset))
4678 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4679 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4680 Dynarr_add (dst, lb);
4681 Dynarr_add (dst, c | 0x80);
4687 Dynarr_add (dst, lb);
4688 Dynarr_add (dst, ch | 0x80);
4689 Dynarr_add (dst, c | 0x80);
4697 default: /* two-byte private */
4700 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4701 Dynarr_add (dst, lb);
4702 Dynarr_add (dst, ch | 0x80);
4703 Dynarr_add (dst, c | 0x80);
4712 flags &= CODING_STATE_ISO2022_LOCK;
4715 label_continue_loop:;
/* At end of input, flush any half-read multibyte character. */
4718 if (flags & CODING_STATE_END)
4719 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4726 /***** ISO2022 encoder *****/
4728 /* Designate CHARSET into register REG. */
/* Records CHARSET as the content of register REG in the encoder state
   STR and, unless the designation is redundant, appends the
   designation escape sequence to DST: ESC, an optional '$' for the
   two-dimensional types, an intermediate byte selected by REG, and
   the charset's final byte. */
4731 iso2022_designate (Lisp_Object charset, unsigned char reg,
4732 struct encoding_stream *str, unsigned_char_dynarr *dst)
/* Intermediate bytes for 94- and 96-character sets, indexed by REG. */
4734 static CONST char inter94[] = "()*+";
4735 static CONST char inter96[] = ",-./";
4737 unsigned char final;
4738 Lisp_Object old_charset = str->iso2022.charset[reg];
4740 str->iso2022.charset[reg] = charset;
4741 if (!CHARSETP (charset))
4742 /* charset might be an initial nil or t. */
4744 type = XCHARSET_TYPE (charset);
4745 final = XCHARSET_FINAL (charset);
/* The register already holds a charset with the same type and final
   byte and output is not being forced, so the designation would be
   redundant. */
4746 if (!str->iso2022.force_charset_on_output[reg] &&
4747 CHARSETP (old_charset) &&
4748 XCHARSET_TYPE (old_charset) == type &&
4749 XCHARSET_FINAL (old_charset) == final)
4752 str->iso2022.force_charset_on_output[reg] = 0;
/* NOTE(review): TYPE and FINAL were read from CHARSET above, before
   this output-conversion loop reassigns CHARSET; in the lines visible
   here they are not recomputed afterwards, so the conversion may have
   no effect on the bytes emitted below -- confirm against the full
   source. */
4755 charset_conversion_spec_dynarr *dyn =
4756 str->codesys->iso2022.output_conv;
4762 for (i = 0; i < Dynarr_length (dyn); i++)
4764 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4765 if (EQ (charset, spec->from_charset))
4766 charset = spec->to_charset;
4771 Dynarr_add (dst, ISO_CODE_ESC);
4774 case CHARSET_TYPE_94:
4775 Dynarr_add (dst, inter94[reg]);
4777 case CHARSET_TYPE_96:
4778 Dynarr_add (dst, inter96[reg]);
4780 case CHARSET_TYPE_94X94:
4781 Dynarr_add (dst, '$');
/* The intermediate byte is conditional here; the coding system's
   SHORT property takes part in the decision (part of the condition is
   not visible in this listing). */
4783 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4786 Dynarr_add (dst, inter94[reg]);
4788 case CHARSET_TYPE_96X96:
4789 Dynarr_add (dst, '$');
4790 Dynarr_add (dst, inter96[reg]);
4793 Dynarr_add (dst, final);
/* Make register 0 the one invoked on the left in the encoder state
   STR.  If another register is currently invoked there, append the SI
   control byte to DST and record the change. */
4797 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4799 if (str->iso2022.register_left != 0)
4801 Dynarr_add (dst, ISO_CODE_SI);
4802 str->iso2022.register_left = 0;
/* Make register 1 the one invoked on the left in the encoder state
   STR.  If it is not already, append the SO control byte to DST and
   record the change. */
4807 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4809 if (str->iso2022.register_left != 1)
4811 Dynarr_add (dst, ISO_CODE_SO);
4812 str->iso2022.register_left = 1;
4816 /* Convert internally-formatted data to ISO2022 format. */
4819 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4820 unsigned_char_dynarr *dst, unsigned int n)
4822 unsigned char charmask, c;
4823 unsigned char char_boundary;
4824 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4825 unsigned int flags = str->flags;
4826 unsigned int ch = str->ch;
4827 Lisp_Coding_System *codesys = str->codesys;
4828 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4830 Lisp_Object charset;
4833 #ifdef ENABLE_COMPOSITE_CHARS
4834 /* flags for handling composite chars. We do a little switcharoo
4835 on the source while we're outputting the composite char. */
4836 unsigned int saved_n = 0;
4837 CONST unsigned char *saved_src = NULL;
4838 int in_composite = 0;
4839 #endif /* ENABLE_COMPOSITE_CHARS */
4841 char_boundary = str->iso2022.current_char_boundary;
4842 charset = str->iso2022.current_charset;
4843 half = str->iso2022.current_half;
4845 #ifdef ENABLE_COMPOSITE_CHARS
4852 if (BYTE_ASCII_P (c))
4853 { /* Processing ASCII character */
4856 restore_left_to_right_direction (codesys, dst, &flags, 0);
4858 /* Make sure G0 contains ASCII */
4859 if ((c > ' ' && c < ISO_CODE_DEL) ||
4860 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4862 ensure_normal_shift (str, dst);
4863 iso2022_designate (Vcharset_ascii, 0, str, dst);
4866 /* If necessary, restore everything to the default state
4869 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4871 restore_left_to_right_direction (codesys, dst, &flags, 0);
4873 ensure_normal_shift (str, dst);
4875 for (i = 0; i < 4; i++)
4877 Lisp_Object initial_charset =
4878 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4879 iso2022_designate (initial_charset, i, str, dst);
4884 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4885 Dynarr_add (dst, '\r');
4886 if (eol_type != EOL_CR)
4887 Dynarr_add (dst, c);
4891 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4892 && fit_to_be_escape_quoted (c))
4893 Dynarr_add (dst, ISO_CODE_ESC);
4894 Dynarr_add (dst, c);
4899 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4900 { /* Processing Leading Byte */
4902 charset = CHARSET_BY_LEADING_BYTE (c);
4903 if (LEADING_BYTE_PREFIX_P(c))
4905 else if (!EQ (charset, Vcharset_control_1)
4906 #ifdef ENABLE_COMPOSITE_CHARS
4907 && !EQ (charset, Vcharset_composite)
4913 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4914 codesys, dst, &flags, 0);
4916 /* Now determine which register to use. */
4918 for (i = 0; i < 4; i++)
4920 if (EQ (charset, str->iso2022.charset[i]) ||
4922 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4931 if (XCHARSET_GRAPHIC (charset) != 0)
4933 if (!NILP (str->iso2022.charset[1]) &&
4934 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4935 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4937 else if (!NILP (str->iso2022.charset[2]))
4939 else if (!NILP (str->iso2022.charset[3]))
4948 iso2022_designate (charset, reg, str, dst);
4950 /* Now invoke that register. */
4954 ensure_normal_shift (str, dst);
4959 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4961 ensure_shift_out (str, dst);
4969 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4971 Dynarr_add (dst, ISO_CODE_ESC);
4972 Dynarr_add (dst, 'N');
4977 Dynarr_add (dst, ISO_CODE_SS2);
4983 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4985 Dynarr_add (dst, ISO_CODE_ESC);
4986 Dynarr_add (dst, 'O');
4991 Dynarr_add (dst, ISO_CODE_SS3);
5003 { /* Processing Non-ASCII character */
5004 charmask = (half == 0 ? 0x7F : 0xFF);
5006 if (EQ (charset, Vcharset_control_1))
5008 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5009 && fit_to_be_escape_quoted (c))
5010 Dynarr_add (dst, ISO_CODE_ESC);
5011 /* you asked for it ... */
5012 Dynarr_add (dst, c - 0x20);
5016 switch (XCHARSET_REP_BYTES (charset))
5019 Dynarr_add (dst, c & charmask);
5022 if (XCHARSET_PRIVATE_P (charset))
5024 Dynarr_add (dst, c & charmask);
5029 #ifdef ENABLE_COMPOSITE_CHARS
5030 if (EQ (charset, Vcharset_composite))
5034 /* #### Bother! We don't know how to
5036 Dynarr_add (dst, '~');
5040 Emchar emch = MAKE_CHAR (Vcharset_composite,
5041 ch & 0x7F, c & 0x7F);
5042 Lisp_Object lstr = composite_char_string (emch);
5046 src = XSTRING_DATA (lstr);
5047 n = XSTRING_LENGTH (lstr);
5048 Dynarr_add (dst, ISO_CODE_ESC);
5049 Dynarr_add (dst, '0'); /* start composing */
5053 #endif /* ENABLE_COMPOSITE_CHARS */
5055 Dynarr_add (dst, ch & charmask);
5056 Dynarr_add (dst, c & charmask);
5069 Dynarr_add (dst, ch & charmask);
5070 Dynarr_add (dst, c & charmask);
5086 #ifdef ENABLE_COMPOSITE_CHARS
5092 Dynarr_add (dst, ISO_CODE_ESC);
5093 Dynarr_add (dst, '1'); /* end composing */
5094 goto back_to_square_n; /* Wheeeeeeeee ..... */
5096 #endif /* ENABLE_COMPOSITE_CHARS */
5098 if (char_boundary && flags & CODING_STATE_END)
5100 restore_left_to_right_direction (codesys, dst, &flags, 0);
5101 ensure_normal_shift (str, dst);
5102 for (i = 0; i < 4; i++)
5104 Lisp_Object initial_charset =
5105 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5106 iso2022_designate (initial_charset, i, str, dst);
5112 str->iso2022.current_char_boundary = char_boundary;
5113 str->iso2022.current_charset = charset;
5114 str->iso2022.current_half = half;
5116 /* Verbum caro factum est! */
5120 /************************************************************************/
5121 /* No-conversion methods */
5122 /************************************************************************/
5124 /* This is used when reading in "binary" files -- i.e. files that may
5125 contain all 256 possible byte values and that are not to be
5126 interpreted as being in any particular decoding. */
/* Decoder for the no-conversion coding system.  Takes input from SRC
   (N is presumably its length in bytes -- confirm against the loop,
   which is not visible in this listing) and appends the decoded
   result to DST.  The conversion state (flags, pending byte and
   end-of-line type) is read from the decoding stream's private data.
   NOTE(review): the return type, braces, the declaration of `c', the
   read loop and any write-back of the state are on lines that are not
   visible here; only the statements shown are described.  */
5128 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5129 unsigned_char_dynarr *dst, unsigned int n)
/* Pick up the state stored on the stream.  */
5132 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5133 unsigned int flags = str->flags;
5134 unsigned int ch = str->ch;
5135 eol_type_t eol_type = str->eol_type;
/* Run the end-of-line macro on the current byte C, then add C to DST
   as a binary character.  label_continue_loop is presumably a jump
   target used by the EOL macro -- confirm against its definition.  */
5141 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5142 DECODE_ADD_BINARY_CHAR (c, dst);
5143 label_continue_loop:;
/* Hand the flags and the pending byte to the end-of-conversion macro.  */
5146 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encoder for the no-conversion coding system.  Takes internal-format
   input from SRC and appends external bytes to DST.  The flags and the
   pending byte CH are read from the encoding stream's private data;
   the end-of-line type is taken from the stream's coding system.
   NOTE(review): the return type, braces, the declaration of `c', the
   loop, the first `if' of the chain and any write-back of the state
   are on lines that are not visible here.  */
5153 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5154 unsigned_char_dynarr *dst, unsigned int n)
5157 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5158 unsigned int flags = str->flags;
5159 unsigned int ch = str->ch;
5160 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* End-of-line output (presumably the branch taken for a newline --
   the test itself is not visible): CR is emitted unless the type is
   LF or autodetect, and LF is emitted unless the type is CR, so any
   remaining type gets both.  */
5167 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5168 Dynarr_add (dst, '\r');
5169 if (eol_type != EOL_CR)
5170 Dynarr_add (dst, '\n');
/* ASCII bytes are copied through unchanged.  */
5173 else if (BYTE_ASCII_P (c))
5176 Dynarr_add (dst, c);
/* A leading byte: the Latin-1 and Control-1 leading bytes are singled
   out (what is done with them is on a line not shown; presumably the
   byte is remembered in CH).  The '~' output is presumably the branch
   for every other leading byte -- the `else' is not visible.  */
5178 else if (BUFBYTE_LEADING_BYTE_P (c))
5181 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5182 c == LEADING_BYTE_CONTROL_1)
5185 Dynarr_add (dst, '~'); /* untranslatable character */
/* A byte that is neither ASCII nor a leading byte: when the pending
   leading byte CH is Latin-1 the byte is emitted as is; when it is
   Control-1 the byte is emitted minus 0x20.  */
5189 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5190 Dynarr_add (dst, c);
5191 else if (ch == LEADING_BYTE_CONTROL_1)
5194 Dynarr_add (dst, c - 0x20);
5196 /* else it should be the second or third byte of an
5197 untranslatable character, so ignore it */
5207 /************************************************************************/
5208 /* Simple internal/external functions */
5209 /************************************************************************/
/* Scratch buffers shared by all calls to convert_to_external_format
   (conversion_out_dynarr) and convert_from_external_format
   (conversion_in_dynarr).  Each is created on first use and reset at
   the start of every call, and the pointer those functions return
   points into it -- so a result is overwritten by the next call of the
   same function.  */
5211 static Extbyte_dynarr *conversion_out_dynarr;
5212 static Bufbyte_dynarr *conversion_in_dynarr;
5214 /* Determine coding system from coding format */
5216 /* #### not correct for all values of `fmt'! */
/* Return the coding system to use for data in external format FMT.
   The callers in this file treat a nil result as "no coding system"
   and fall back to a simple byte-by-byte conversion.
   NOTE(review): the return type, the `switch', the value returned when
   Vfile_name_coding_system is nil or `binary', the case label guarding
   the ctext return and the default case are on lines that are not
   visible here.  */
5218 external_data_format_to_coding_system (enum external_data_format fmt)
/* File names and terminal data both go through
   Vfile_name_coding_system; nil and `binary' are tested for
   separately before it is looked up.  */
5222 case FORMAT_FILENAME:
5223 case FORMAT_TERMINAL:
5224 if (EQ (Vfile_name_coding_system, Qnil) ||
5225 EQ (Vfile_name_coding_system, Qbinary))
5228 return Fget_coding_system (Vfile_name_coding_system);
/* Formats reaching this point use the `ctext' coding system.  */
5231 return Fget_coding_system (Qctext);
/* Convert the LEN bytes of internal-format text at PTR to the external
   format FMT.  The output is collected in the static
   conversion_out_dynarr; *LEN_OUT is set to the number of bytes
   produced (not counting the zero byte appended afterwards) and a
   pointer to the first element of the dynarr is returned.  Because the
   dynarr is reset on every call, the returned data is only good until
   the next call.
   NOTE(review): the return type, the LEN and LEN_OUT parameter lines,
   braces and loop headers are on lines that are not visible here.  */
5239 convert_to_external_format (CONST Bufbyte *ptr,
5242 enum external_data_format fmt)
5244 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the shared output buffer on first use, then empty it.  */
5246 if (!conversion_out_dynarr)
5247 conversion_out_dynarr = Dynarr_new (Extbyte);
5249 Dynarr_reset (conversion_out_dynarr);
/* No coding system: convert by hand.  ASCII bytes are copied; after a
   Control-1 leading byte the following byte minus 0x20 is used; after
   a Latin-1 leading byte the following byte is used as is.  The value
   used for anything else, and the way PTR is advanced, are on lines
   not visible here.  */
5251 if (NILP (coding_system))
5253 CONST Bufbyte *end = ptr + len;
5258 (BYTE_ASCII_P (*ptr)) ? *ptr :
5259 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5260 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5263 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5267 #ifdef ERROR_CHECK_BUFPOS
5268 assert (ptr == end);
/* Otherwise (presumably the `else' branch) run the data through an
   encoding stream: a fixed-buffer input stream over PTR/LEN feeds an
   encoding output stream that writes into a dynarr output stream
   layered on conversion_out_dynarr.  */
5273 Lisp_Object instream, outstream, da_outstream;
5274 Lstream *istr, *ostr;
5275 struct gcpro gcpro1, gcpro2, gcpro3;
5276 char tempbuf[1024]; /* some random amount */
5278 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5279 da_outstream = make_dynarr_output_stream
5280 ((unsigned_char_dynarr *) conversion_out_dynarr);
5282 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5283 istr = XLSTREAM (instream);
5284 ostr = XLSTREAM (outstream);
5285 GCPRO3 (instream, outstream, da_outstream);
/* Copy through tempbuf: whatever is read from the input stream is
   written to the encoding stream.  The loop and its termination test
   are not visible here.  */
5288 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5291 Lstream_write (ostr, tempbuf, size_in_bytes);
/* Close both ends, then delete all three stream objects.  */
5293 Lstream_close (istr);
5294 Lstream_close (ostr);
5296 Lstream_delete (istr);
5297 Lstream_delete (ostr);
5298 Lstream_delete (XLSTREAM (da_outstream));
/* The length is recorded before the terminator is added, so it does
   not include the trailing zero.  */
5301 *len_out = Dynarr_length (conversion_out_dynarr);
5302 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5303 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert the LEN bytes of external data at PTR, in external format
   FMT, to internal format.  The output is collected in the static
   conversion_in_dynarr; *LEN_OUT is set to the number of bytes
   produced (not counting the zero byte appended afterwards) and a
   pointer to the first element of the dynarr is returned.  Because the
   dynarr is reset on every call, the returned data is only good until
   the next call.
   NOTE(review): the return type, the LEN and LEN_OUT parameter lines,
   braces, the declaration of `c' and loop headers are on lines that
   are not visible here.  */
5307 convert_from_external_format (CONST Extbyte *ptr,
5310 enum external_data_format fmt)
5312 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the shared input buffer on first use, then empty it.  */
5314 if (!conversion_in_dynarr)
5315 conversion_in_dynarr = Dynarr_new (Bufbyte);
5317 Dynarr_reset (conversion_in_dynarr);
/* No coding system: add each external byte to the buffer as a binary
   character.  */
5319 if (NILP (coding_system))
5321 CONST Extbyte *end = ptr + len;
5322 for (; ptr < end; ptr++)
5325 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
/* Otherwise (presumably the `else' branch) run the data through a
   decoding stream: a fixed-buffer input stream over PTR/LEN feeds a
   decoding output stream that writes into a dynarr output stream
   layered on conversion_in_dynarr.  */
5330 Lisp_Object instream, outstream, da_outstream;
5331 Lstream *istr, *ostr;
5332 struct gcpro gcpro1, gcpro2, gcpro3;
5333 char tempbuf[1024]; /* some random amount */
5335 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5336 da_outstream = make_dynarr_output_stream
5337 ((unsigned_char_dynarr *) conversion_in_dynarr);
5339 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5340 istr = XLSTREAM (instream);
5341 ostr = XLSTREAM (outstream);
5342 GCPRO3 (instream, outstream, da_outstream);
/* Copy through tempbuf: whatever is read from the input stream is
   written to the decoding stream.  The loop and its termination test
   are not visible here.  */
5345 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5348 Lstream_write (ostr, tempbuf, size_in_bytes);
/* Close both ends, then delete all three stream objects.  */
5350 Lstream_close (istr);
5351 Lstream_close (ostr);
5353 Lstream_delete (istr);
5354 Lstream_delete (ostr);
5355 Lstream_delete (XLSTREAM (da_outstream));
/* The length is recorded before the terminator is added, so it does
   not include the trailing zero.  */
5358 *len_out = Dynarr_length (conversion_in_dynarr);
5359 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5360 return Dynarr_atp (conversion_in_dynarr, 0);
5364 /************************************************************************/
5365 /* Initialization */
5366 /************************************************************************/
/* Define the Lisp symbols, the `coding-system-error' error and the
   Lisp primitives belonging to this file.
   NOTE(review): the return type, braces, any preprocessor conditionals
   and the name strings of the coding-category symbols are on lines
   that are not visible here.  */
5369 syms_of_mule_coding (void)
5371 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
/* `coding-system-error' is defined with Qio_error as its parent.  */
5372 deferror (&Qcoding_system_error, "coding-system-error",
5373 "Coding-system error", Qio_error);
/* Lisp primitives.  */
5375 DEFSUBR (Fcoding_system_p);
5376 DEFSUBR (Ffind_coding_system);
5377 DEFSUBR (Fget_coding_system);
5378 DEFSUBR (Fcoding_system_list);
5379 DEFSUBR (Fcoding_system_name);
5380 DEFSUBR (Fmake_coding_system);
5381 DEFSUBR (Fcopy_coding_system);
5382 DEFSUBR (Fsubsidiary_coding_system);
5384 DEFSUBR (Fcoding_system_type);
5385 DEFSUBR (Fcoding_system_doc_string);
5387 DEFSUBR (Fcoding_system_charset);
5389 DEFSUBR (Fcoding_system_property);
5391 DEFSUBR (Fcoding_category_list);
5392 DEFSUBR (Fset_coding_priority_list);
5393 DEFSUBR (Fcoding_priority_list);
5394 DEFSUBR (Fset_coding_category_system);
5395 DEFSUBR (Fcoding_category_system);
5397 DEFSUBR (Fdetect_coding_region);
5398 DEFSUBR (Fdecode_coding_region);
5399 DEFSUBR (Fencode_coding_region);
5401 DEFSUBR (Fdecode_shift_jis_char);
5402 DEFSUBR (Fencode_shift_jis_char);
5403 DEFSUBR (Fdecode_big5_char);
5404 DEFSUBR (Fencode_big5_char);
5405 DEFSUBR (Fset_ucs_char);
5406 DEFSUBR (Fucs_char);
5407 DEFSUBR (Fset_char_ucs);
5408 DEFSUBR (Fchar_ucs);
/* Symbols: the predicate name and the coding system type names.  */
5410 defsymbol (&Qcoding_system_p, "coding-system-p");
5411 defsymbol (&Qno_conversion, "no-conversion");
5413 defsymbol (&Qbig5, "big5");
5414 defsymbol (&Qshift_jis, "shift-jis");
5415 defsymbol (&Qucs4, "ucs-4");
5416 defsymbol (&Qutf8, "utf-8");
5417 defsymbol (&Qccl, "ccl");
5418 defsymbol (&Qiso2022, "iso2022");
/* Property names that are registered for every coding system type in
   complex_vars_of_mule_coding, and the end-of-line type names.  */
5420 defsymbol (&Qmnemonic, "mnemonic");
5421 defsymbol (&Qeol_type, "eol-type");
5422 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5423 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5425 defsymbol (&Qcr, "cr");
5426 defsymbol (&Qlf, "lf");
5427 defsymbol (&Qcrlf, "crlf");
5428 defsymbol (&Qeol_cr, "eol-cr");
5429 defsymbol (&Qeol_lf, "eol-lf");
5430 defsymbol (&Qeol_crlf, "eol-crlf");
/* Property names registered as ISO 2022 properties in
   complex_vars_of_mule_coding.  */
5432 defsymbol (&Qcharset_g0, "charset-g0");
5433 defsymbol (&Qcharset_g1, "charset-g1");
5434 defsymbol (&Qcharset_g2, "charset-g2");
5435 defsymbol (&Qcharset_g3, "charset-g3");
5436 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5437 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5438 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5439 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5440 defsymbol (&Qno_iso6429, "no-iso6429");
5441 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5442 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5444 defsymbol (&Qshort, "short");
5445 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5446 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5447 defsymbol (&Qseven, "seven");
5448 defsymbol (&Qlock_shift, "lock-shift");
5449 defsymbol (&Qescape_quoted, "escape-quoted");
/* Property names registered as CCL properties in
   complex_vars_of_mule_coding.  */
5451 defsymbol (&Qencode, "encode");
5452 defsymbol (&Qdecode, "decode");
5455 defsymbol (&Qctext, "ctext");
/* One symbol per coding category, stored in coding_category_symbol[].
   The second argument (the symbol name) of each call below is on a
   line that is not visible here.  */
5456 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5458 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5460 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5462 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5464 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5466 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5468 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5470 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5472 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5475 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare the lstream methods provided by the `decoding' and
   `encoding' stream types.  Both types declare the same seven methods:
   reader, writer, rewinder, seekable_p, flusher, closer and marker.
   NOTE(review): the return type and braces are on lines that are not
   visible here.  */
5480 lstream_type_create_mule_coding (void)
/* Decoding streams.  */
5482 LSTREAM_HAS_METHOD (decoding, reader);
5483 LSTREAM_HAS_METHOD (decoding, writer);
5484 LSTREAM_HAS_METHOD (decoding, rewinder);
5485 LSTREAM_HAS_METHOD (decoding, seekable_p);
5486 LSTREAM_HAS_METHOD (decoding, flusher);
5487 LSTREAM_HAS_METHOD (decoding, closer);
5488 LSTREAM_HAS_METHOD (decoding, marker);
/* Encoding streams.  */
5490 LSTREAM_HAS_METHOD (encoding, reader);
5491 LSTREAM_HAS_METHOD (encoding, writer);
5492 LSTREAM_HAS_METHOD (encoding, rewinder);
5493 LSTREAM_HAS_METHOD (encoding, seekable_p);
5494 LSTREAM_HAS_METHOD (encoding, flusher);
5495 LSTREAM_HAS_METHOD (encoding, closer);
5496 LSTREAM_HAS_METHOD (encoding, marker);
/* Set up the simple variables of this file: give every coding category
   a nil coding system and the identity priority order, provide the
   `file-coding' feature, and define the Lisp variables below, each
   with its initial value (nil for the coding-system variables, 1 for
   enable-multibyte-characters).
   NOTE(review): the return type, braces, the declaration of `i' and
   the lines closing each DEFVAR doc comment are not visible here.  */
5500 vars_of_mule_coding (void)
5504 /* Initialize to something reasonable ... */
5505 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5507 coding_category_system[i] = Qnil;
5508 coding_category_by_priority[i] = i;
5511 Fprovide (intern ("file-coding"));
5513 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5514 Coding system used for TTY keyboard input.
5515 Not used under a windowing system.
5517 Vkeyboard_coding_system = Qnil;
5519 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5520 Coding system used for TTY display output.
5521 Not used under a windowing system.
5523 Vterminal_coding_system = Qnil;
5525 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5526 Overriding coding system used when reading a file or process.
5527 You should *bind* this, not set it. If this is non-nil, it specifies
5528 the coding system that will be used when a file or process is read
5529 in, and overrides `buffer-file-coding-system-for-read',
5530 `insert-file-contents-pre-hook', etc. Use those variables instead of
5531 this one for permanent changes to the environment.
5533 Vcoding_system_for_read = Qnil;
5535 DEFVAR_LISP ("coding-system-for-write",
5536 &Vcoding_system_for_write /*
5537 Overriding coding system used when writing a file or process.
5538 You should *bind* this, not set it. If this is non-nil, it specifies
5539 the coding system that will be used when a file or process is written
5540 out, and overrides `buffer-file-coding-system',
5541 `write-region-pre-hook', etc. Use those variables instead of this one
5542 for permanent changes to the environment.
5544 Vcoding_system_for_write = Qnil;
5546 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5547 Coding system used to convert pathnames when accessing files.
5549 Vfile_name_coding_system = Qnil;
5551 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5552 Non-nil means the buffer contents are regarded as multi-byte form
5553 of characters, not a binary code. This affects the display, file I/O,
5554 and behaviors of various editing commands.
5556 Setting this to nil does not do anything.
5558 enable_multibyte_characters = 1;
5562 complex_vars_of_mule_coding (void)
5564 staticpro (&Vcoding_system_hash_table);
5565 Vcoding_system_hash_table =
5566 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5568 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5570 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5572 struct codesys_prop csp; \
5574 csp.prop_type = (Prop_Type); \
5575 Dynarr_add (the_codesys_prop_dynarr, csp); \
5578 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5579 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5580 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5581 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5582 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5583 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5584 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5586 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5587 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5588 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5589 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5590 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5591 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5592 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5593 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5594 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5595 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5596 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5597 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5598 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5599 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5600 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5601 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5602 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5604 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5605 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5607 /* Need to create this here or we're really screwed. */
5608 Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
5609 list2 (Qmnemonic, build_string ("Noconv")));
5611 Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
5614 /* Need this for bootstrapping */
5615 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5616 Fget_coding_system (Qno_conversion);
5622 for (i = 0; i < 65536; i++)
5623 ucs_to_mule_table[i] = Qnil;
5625 staticpro (&mule_to_ucs_table);
5626 mule_to_ucs_table = Fmake_char_table(Qgeneric);