1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
36 #include "file-coding.h"
/* File-scope globals used by the coding-system code below.  Of those
   referenced in the visible part of this file:
     - Vcoding_system_hash_table is the table that Fmake_coding_system
       and Fcopy_coding_system store into with Fputhash and that
       Ffind_coding_system reads with Fgethash, keyed by the
       coding-system name;
     - Qlf, Qcrlf and Qcr are compared with EQ in symbol_to_eol_type
       and returned by eol_type_to_symbol;
     - Qmnemonic, Qeol_type, the Qcharset_g*, Qforce_g*_on_output,
       Qshort ... Qescape_quoted, Q*_charset_conversion, Qdecode and
       Qencode objects are the property keys tested in
       Fmake_coding_system.
   NOTE(review): the Q/V prefixes presumably distinguish symbols from
   Lisp-visible variables -- confirm against the initialization code,
   which is not visible here.  */
38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
40 Lisp_Object Vkeyboard_coding_system;
41 Lisp_Object Vterminal_coding_system;
42 Lisp_Object Vcoding_system_for_read;
43 Lisp_Object Vcoding_system_for_write;
44 Lisp_Object Vfile_name_coding_system;
46 /* Table of symbols identifying each coding category. */
47 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
49 /* Coding system currently associated with each coding category. */
50 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
52 /* Table of all coding categories in decreasing order of priority.
53 This describes a permutation of the possible coding categories. */
54 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
56 Lisp_Object Qcoding_system_p;
58 Lisp_Object Qno_conversion, Qccl, Qiso2022;
59 /* Qinternal in general.c */
61 Lisp_Object Qmnemonic, Qeol_type;
62 Lisp_Object Qcr, Qcrlf, Qlf;
63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
64 Lisp_Object Qpost_read_conversion;
65 Lisp_Object Qpre_write_conversion;
68 Lisp_Object Qucs4, Qutf8;
69 Lisp_Object Qbig5, Qshift_jis;
70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
73 Lisp_Object Qno_iso6429;
74 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
75 Lisp_Object Qctext, Qescape_quoted;
76 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
78 Lisp_Object Qencode, Qdecode;
80 Lisp_Object Vcoding_system_hash_table;
82 int enable_multibyte_characters;
/* NOTE(review): in this copy of the file the definition below is
   incomplete: the braces around the member list are absent, the
   member announced by the "Index for next byte to store" comment is
   missing, the ENABLE_COMPOSITE_CHARS conditional has no visible
   closing directive, and the comment that precedes
   switched_dir_and_no_valid_charset_yet has no terminator of its own.
   Restore these from a pristine copy before compiling.  */
85 /* Additional information used by the ISO2022 decoder and detector. */
86 struct iso2022_decoder
88 /* CHARSET holds the character sets currently assigned to the G0
89 through G3 variables. It is initialized from the array
90 INITIAL_CHARSET in CODESYS. */
91 Lisp_Object charset[4];
93 /* Which registers are currently invoked into the left (GL) and
94 right (GR) halves of the 8-bit encoding space? */
95 int register_left, register_right;
97 /* ISO_ESC holds a value indicating part of an escape sequence
98 that has already been seen. */
99 enum iso_esc_flag esc;
101 /* This records the bytes we've seen so far in an escape sequence,
102 in case the sequence is invalid (we spit out the bytes unchanged). */
103 unsigned char esc_bytes[8];
105 /* Index for next byte to store in ISO escape sequence. */
108 #ifdef ENABLE_COMPOSITE_CHARS
109 /* Stuff seen so far when composing a string. */
110 unsigned_char_dynarr *composite_chars;
113 /* If we saw an invalid designation sequence for a particular
114 register, we flag it here and switch to ASCII. The next time we
115 see a valid designation for this register, we turn off the flag
116 and do the designation normally, but pretend the sequence was
117 invalid. The effect of all this is that (most of the time) the
118 escape sequences for both the switch to the unknown charset, and
119 the switch back to the known charset, get inserted literally into
120 the buffer and saved out as such. The hope is that we can
121 preserve the escape sequences so that the resulting written out
122 file makes sense. If we don't do any of this, the designation
123 to the invalid charset will be preserved but that switch back
124 to the known charset will probably get eaten because it was
125 the same charset that was already present in the register. */
126 unsigned char invalid_designated[4];
128 /* We try to do similar things as above for direction-switching
129 sequences. If we encountered a direction switch while an
130 invalid designation was present, or an invalid designation
131 just after a direction switch (i.e. no valid designation
132 encountered yet), we insert the direction-switch escape
133 sequence literally into the output stream, and later on
134 insert the corresponding direction-restoring escape sequence
136 unsigned int switched_dir_and_no_valid_charset_yet :1;
137 unsigned int invalid_switch_dir :1;
139 /* Tells the decoder to output the escape sequence literally
140 even though it was valid. Used in the games we play to
141 avoid lossage when we encounter invalid designations. */
142 unsigned int output_literally :1;
143 /* We encountered a direction switch followed by an invalid
144 designation. We didn't output the direction switch
145 literally because we didn't know about the invalid designation;
146 but we have to do so now. */
147 unsigned int output_direction_sequence :1;
/* Forward declarations.  Fcopy_coding_system is announced here because
   setup_eol_coding_systems calls it before its DEFUN appears.  The
   remaining prototypes cover the detect/decode/encode routines for the
   sjis, big5, ucs4, utf8, iso2022 and no_conversion codecs plus the
   mule_decode/mule_encode entry points; none of their definitions are
   in the visible part of this file.  Where a decode/encode prototype
   is complete here, it takes an Lstream, a source byte pointer, a
   destination unsigned_char_dynarr and an unsigned int N.
   NOTE(review): several prototypes below (for example
   detect_coding_sjis and decode_coding_sjis) stop before their last
   parameter and closing parenthesis in this copy.  */
150 EXFUN (Fcopy_coding_system, 2);
152 struct detection_state;
153 static int detect_coding_sjis (struct detection_state *st,
154 CONST unsigned char *src,
156 static void decode_coding_sjis (Lstream *decoding,
157 CONST unsigned char *src,
158 unsigned_char_dynarr *dst,
160 static void encode_coding_sjis (Lstream *encoding,
161 CONST unsigned char *src,
162 unsigned_char_dynarr *dst,
164 static int detect_coding_big5 (struct detection_state *st,
165 CONST unsigned char *src,
167 static void decode_coding_big5 (Lstream *decoding,
168 CONST unsigned char *src,
169 unsigned_char_dynarr *dst, unsigned int n);
170 static void encode_coding_big5 (Lstream *encoding,
171 CONST unsigned char *src,
172 unsigned_char_dynarr *dst, unsigned int n);
173 static int detect_coding_ucs4 (struct detection_state *st,
174 CONST unsigned char *src,
176 static void decode_coding_ucs4 (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static void encode_coding_ucs4 (Lstream *encoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
182 static int detect_coding_utf8 (struct detection_state *st,
183 CONST unsigned char *src,
185 static void decode_coding_utf8 (Lstream *decoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst, unsigned int n);
188 static void encode_coding_utf8 (Lstream *encoding,
189 CONST unsigned char *src,
190 unsigned_char_dynarr *dst, unsigned int n);
191 static int postprocess_iso2022_mask (int mask);
192 static void reset_iso2022 (Lisp_Object coding_system,
193 struct iso2022_decoder *iso);
194 static int detect_coding_iso2022 (struct detection_state *st,
195 CONST unsigned char *src,
197 static void decode_coding_iso2022 (Lstream *decoding,
198 CONST unsigned char *src,
199 unsigned_char_dynarr *dst, unsigned int n);
200 static void encode_coding_iso2022 (Lstream *encoding,
201 CONST unsigned char *src,
202 unsigned_char_dynarr *dst, unsigned int n);
204 static void decode_coding_no_conversion (Lstream *decoding,
205 CONST unsigned char *src,
206 unsigned_char_dynarr *dst,
208 static void encode_coding_no_conversion (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst,
212 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
213 unsigned_char_dynarr *dst, unsigned int n);
214 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
/* Registry of coding-system property symbols.  Fcoding_system_property
   walks the_codesys_prop_dynarr, compares each entry's `sym' with the
   requested property and switches on its `prop_type'
   (CODESYS_PROP_ALL_OK, CODESYS_PROP_ISO2022 or CODESYS_PROP_CCL) to
   reject properties that do not apply to the coding system's type.
   NOTE(review): the body of struct codesys_prop, the start of the
   dynarr typedef and all enumerators except CODESYS_PROP_ISO2022 are
   not present in this copy.  */
217 typedef struct codesys_prop codesys_prop;
226 Dynarr_declare (codesys_prop);
227 } codesys_prop_dynarr;
229 codesys_prop_dynarr *the_codesys_prop_dynarr;
231 enum codesys_prop_enum
234 CODESYS_PROP_ISO2022,
239 /************************************************************************/
240 /* Coding system functions */
241 /************************************************************************/
/* Declares the three static object methods defined just below and
   passes them, together with struct Lisp_Coding_System, to
   DEFINE_LRECORD_IMPLEMENTATION for the "coding-system" record type.
   NOTE(review): the two 0 arguments are presumably unused method
   slots -- confirm against the macro definition.  */
243 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
244 static void print_coding_system (Lisp_Object, Lisp_Object, int);
245 static void finalize_coding_system (void *header, int for_disksave);
247 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
248 mark_coding_system, print_coding_system,
249 finalize_coding_system,
250 0, 0, struct Lisp_Coding_System);
/* Mark method for coding systems.  Calls MARKOBJ on every Lisp_Object
   held by OBJ: the name, doc string, mnemonic and the three EOL
   subsidiary slots, then the type-specific data (for CODESYS_ISO2022
   the four initial charsets and both charsets of every input/output
   conversion spec; also the CCL decode/encode objects), and finally
   the pre-write conversion.  The post-read conversion is returned
   instead of being passed to MARKOBJ.
   NOTE(review): presumably the caller marks the returned object
   itself -- confirm.  The return-type line, the braces, the
   declaration of `i' and the case label in front of the CCL slots are
   not visible in this copy.  */
253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
255 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
/* Slots marked regardless of the coding system's type.  */
257 markobj (CODING_SYSTEM_NAME (codesys));
258 markobj (CODING_SYSTEM_DOC_STRING (codesys));
259 markobj (CODING_SYSTEM_MNEMONIC (codesys));
260 markobj (CODING_SYSTEM_EOL_LF (codesys));
261 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
262 markobj (CODING_SYSTEM_EOL_CR (codesys));
/* Type-specific Lisp data.  */
264 switch (CODING_SYSTEM_TYPE (codesys))
268 case CODESYS_ISO2022:
269 for (i = 0; i < 4; i++)
270 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
/* The conversion dynarrs are optional; a null pointer means none.  */
271 if (codesys->iso2022.input_conv)
273 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
275 struct charset_conversion_spec *ccs =
276 Dynarr_atp (codesys->iso2022.input_conv, i);
277 markobj (ccs->from_charset);
278 markobj (ccs->to_charset);
281 if (codesys->iso2022.output_conv)
283 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
285 struct charset_conversion_spec *ccs =
286 Dynarr_atp (codesys->iso2022.output_conv, i);
287 markobj (ccs->from_charset);
288 markobj (ccs->to_charset);
294 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
295 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
302 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
/* Handed back to the caller rather than marked here.  */
303 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding systems.  Writes "#<coding_system ", the
   coding system's name (via print_internal) and ">" to PRINTCHARFUN.
   The error call reports an attempt to print an object that has no
   readable syntax.
   NOTE(review): the third parameter, the condition guarding the error
   call and that call's argument are not visible in this copy.  */
307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
310 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
312 error ("printing unreadable object #<coding_system 0x%x>",
315 write_c_string ("#<coding_system ", printcharfun);
316 print_internal (c->name, printcharfun, 1);
317 write_c_string (">", printcharfun);
/* Finalizer for coding systems.  HEADER is the Lisp_Coding_System being
   finalized.  Unless FOR_DISKSAVE is non-zero, an ISO2022 coding system
   has its input_conv and output_conv dynarrs released with Dynarr_free
   and the two pointers reset to 0.  No other coding-system type is
   handled in the visible code.  */
321 finalize_coding_system (void *header, int for_disksave)
323 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
324 /* Since coding systems never go away, this function is not
325 necessary. But it would be necessary if we changed things
326 so that coding systems could go away. */
327 if (!for_disksave) /* see comment in lstream.c */
329 switch (CODING_SYSTEM_TYPE (c))
332 case CODESYS_ISO2022:
333 if (c->iso2022.input_conv)
335 Dynarr_free (c->iso2022.input_conv);
336 c->iso2022.input_conv = 0;
338 if (c->iso2022.output_conv)
340 Dynarr_free (c->iso2022.output_conv);
341 c->iso2022.output_conv = 0;
/* Convert the Lisp object SYMBOL to an eol_type value:
     nil -> EOL_AUTODETECT, Qlf -> EOL_LF, Qcrlf -> EOL_CRLF,
     Qcr -> EOL_CR.
   SYMBOL is validated with CHECK_SYMBOL first; any other symbol
   signals "Unrecognized eol type".
   NOTE(review): the return-type line is not visible in this copy.  */
352 symbol_to_eol_type (Lisp_Object symbol)
354 CHECK_SYMBOL (symbol);
355 if (NILP (symbol)) return EOL_AUTODETECT;
356 if (EQ (symbol, Qlf)) return EOL_LF;
357 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
358 if (EQ (symbol, Qcr)) return EOL_CR;
360 signal_simple_error ("Unrecognized eol type", symbol);
361 return EOL_AUTODETECT; /* not reached */
/* Convert an eol_type value to the Lisp object that names it:
     EOL_LF -> Qlf, EOL_CRLF -> Qcrlf, EOL_CR -> Qcr,
     EOL_AUTODETECT -> Qnil.
   NOTE(review): the return-type line, the switch header and whatever
   handles other values are not visible in this copy.  */
365 eol_type_to_symbol (enum eol_type type)
370 case EOL_LF: return Qlf;
371 case EOL_CRLF: return Qcrlf;
372 case EOL_CR: return Qcr;
373 case EOL_AUTODETECT: return Qnil;
/* Create the three end-of-line subsidiaries of CODESYS.  For each of
   the suffixes "-unix", "-dos" and "-mac" the name of CODESYS is
   extended with the suffix, interned, and passed to
   Fcopy_coding_system; the copy's EOL type is set to EOL_LF, EOL_CRLF
   or EOL_CR respectively, and the copy is stored in the matching
   CODING_SYSTEM_EOL_* slot of CODESYS.  When CODESYS has a string
   mnemonic, each copy gets that mnemonic with "", ":T" or ":t"
   appended.

   Both scratch buffers come from alloca with 7 spare bytes, which
   covers the longest suffix written into them ("-unix" plus the
   terminating NUL).

   NOTE(review): codesys_mnemonic stays 0 when the mnemonic is not a
   string, yet the macro lines visible here copy into it and pass it to
   build_string.  The declaration of `mlen' and any guard on those two
   statements are not visible in this copy -- confirm they exist.  */
378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
380 Lisp_Object codesys_obj;
381 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
382 char *codesys_name = (char *) alloca (len + 7);
384 char *codesys_mnemonic=0;
386 Lisp_Object codesys_name_sym, sub_codesys_obj;
390 XSETCODING_SYSTEM (codesys_obj, codesys);
392 memcpy (codesys_name,
393 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
395 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
397 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
398 codesys_mnemonic = (char *) alloca (mlen + 7);
399 memcpy (codesys_mnemonic,
400 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
403 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
404 strcpy (codesys_name + len, "-" op_sys); \
406 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
407 codesys_name_sym = intern (codesys_name); \
408 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
409 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
411 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
412 build_string (codesys_mnemonic); \
413 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
416 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
417 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
418 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp predicate `coding-system-p': yields Qt when OBJECT satisfies
   CODING_SYSTEMP and Qnil otherwise.
   NOTE(review): the docstring terminator, the parameter list and the
   braces are not visible in this copy.  */
421 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
422 Return t if OBJECT is a coding system.
423 A coding system is an object that defines how text containing multiple
424 character sets is encoded into a stream of (typically 8-bit) bytes.
425 The coding system is used to decode the stream into a series of
426 characters (which may be from multiple charsets) when the text is read
427 from a file or process, and is used to encode the text back into the
428 same format when it is written out to a file or process.
430 For example, many ISO2022-compliant coding systems (such as Compound
431 Text, which is used for inter-client data under the X Window System)
432 use escape sequences to switch between different charsets -- Japanese
433 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
434 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
435 `make-coding-system' for more information.
437 Coding systems are normally identified using a symbol, and the
438 symbol is accepted in place of the actual coding system object whenever
439 a coding system is called for. (This is similar to how faces work.)
443 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* `find-coding-system': a coding-system object is returned unchanged.
   Otherwise nil is replaced by Qbinary, the argument is required to be
   a symbol (CHECK_SYMBOL), and it is looked up in
   Vcoding_system_hash_table with Fgethash, using Qnil as the value for
   a missing entry.
   NOTE(review): the docstring terminator and the braces are not
   visible in this copy.  */
446 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
447 Retrieve the coding system of the given name.
449 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
450 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
451 If there is no such coding system, nil is returned. Otherwise the
452 associated coding system object is returned.
454 (coding_system_or_name))
456 if (CODING_SYSTEMP (coding_system_or_name))
457 return coding_system_or_name;
459 if (NILP (coding_system_or_name))
460 coding_system_or_name = Qbinary;
462 CHECK_SYMBOL (coding_system_or_name);
464 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
/* `get-coding-system': runs Ffind_coding_system on NAME and signals
   "No such coding system" (with NAME as the error datum) when the
   lookup yields nil; otherwise returns the coding system found.
   NOTE(review): the docstring terminator, the parameter list and the
   braces are not visible in this copy.  */
467 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
468 Retrieve the coding system of the given name.
469 Same as `find-coding-system' except that if there is no such
470 coding system, an error is signaled instead of returning nil.
474 Lisp_Object coding_system = Ffind_coding_system (name);
476 if (NILP (coding_system))
477 signal_simple_error ("No such coding system", name);
478 return coding_system;
481 /* We store the coding systems in hash tables with the names as the key and the
482 actual coding system object as the value. Occasionally we need to use them
483 in a list format. These routines provide us with that. */
/* Closure handed to elisp_maphash through its untyped argument by
   Fcoding_system_list.  Its only member points at the Lisp list that
   add_coding_system_to_list_mapper prepends to.  */
484 struct coding_system_list_closure
486 Lisp_Object *coding_system_list;
/* Hash-table mapping function used by Fcoding_system_list.  VALUE is a
   coding-system object; its name is consed onto the front of the list
   that the closure (passed as an untyped pointer) points to.  KEY is
   not used in the visible code.
   NOTE(review): the return-type line and the return statement are not
   visible in this copy.  */
490 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
491 void *coding_system_list_closure)
493 /* This function can GC */
494 struct coding_system_list_closure *cscl =
495 (struct coding_system_list_closure *) coding_system_list_closure;
496 Lisp_Object *coding_system_list = cscl->coding_system_list;
498 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
499 *coding_system_list);
/* `coding-system-list': starts from an empty list, points a
   coding_system_list_closure at it and runs
   add_coding_system_to_list_mapper over Vcoding_system_hash_table with
   elisp_maphash.  The list variable is protected with GCPRO1 before
   the mapping starts because the mapper conses.
   NOTE(review): the gcpro structure declaration and the matching
   UNGCPRO are not visible in this copy -- confirm they exist.  */
503 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
504 Return a list of the names of all defined coding systems.
508 Lisp_Object coding_system_list = Qnil;
510 struct coding_system_list_closure coding_system_list_closure;
512 GCPRO1 (coding_system_list);
513 coding_system_list_closure.coding_system_list = &coding_system_list;
514 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
515 &coding_system_list_closure);
518 return coding_system_list;
/* `coding-system-name': resolves CODING-SYSTEM with Fget_coding_system
   (which signals an error for an unknown name) and returns the name
   slot of the resulting object.  */
521 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
522 Return the name of the given coding system.
526 coding_system = Fget_coding_system (coding_system);
527 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a coding-system lcrecord of the given TYPE and NAME.  The
   record is zeroed, then its conversion hooks, EOL subsidiary slots
   and mnemonic are set to Qnil and its EOL type to EOL_AUTODETECT.
   For CODESYS_ISO2022 the four initial charsets are set to Qnil; for
   CODESYS_CCL the decode and encode programs are set to Qnil.
   NOTE(review): the doc-string slot is not assigned here even though
   mark_coding_system marks it; the visible callers fill it in
   afterwards (by assignment in Fmake_coding_system, by memcpy in
   Fcopy_coding_system).  The return statement and the declaration of
   `i' are not visible in this copy.  */
530 static Lisp_Coding_System *
531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
533 Lisp_Coding_System *codesys =
534 alloc_lcrecord_type (Lisp_Coding_System, lrecord_coding_system);
536 zero_lcrecord (codesys);
537 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
538 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
539 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
540 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
541 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
542 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
543 CODING_SYSTEM_TYPE (codesys) = type;
544 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
/* Type-specific Lisp slots.  */
546 if (type == CODESYS_ISO2022)
549 for (i = 0; i < 4; i++)
550 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
552 else if (type == CODESYS_CCL)
554 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
555 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
558 CODING_SYSTEM_NAME (codesys) = name;
564 /* Given a list of charset conversion specs as specified in a Lisp
565 program, parse it into STORE_HERE. */
/* Every element of SPEC_LIST must be a list of exactly two items,
   otherwise "Invalid charset conversion spec" is signaled.  Both items
   are resolved with Fget_charset, and the two charsets must have the
   same XCHARSET_TYPE or "Attempted conversion between different
   charset types" is signaled.  Each accepted pair is appended to
   STORE_HERE with Dynarr_add; nothing already in STORE_HERE is
   removed.  */
568 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
569 Lisp_Object spec_list)
573 EXTERNAL_LIST_LOOP (rest, spec_list)
575 Lisp_Object car = XCAR (rest);
576 Lisp_Object from, to;
577 struct charset_conversion_spec spec;
579 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
580 signal_simple_error ("Invalid charset conversion spec", car);
581 from = Fget_charset (XCAR (car));
582 to = Fget_charset (XCAR (XCDR (car)));
583 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
584 signal_simple_error_2
585 ("Attempted conversion between different charset types",
587 spec.from_charset = from;
588 spec.to_charset = to;
590 Dynarr_add (store_here, spec);
594 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
595 specs, return the equivalent as the Lisp programmer would see it.
597 If LOAD_HERE is 0, return Qnil. */
/* The result is built by consing one (FROM TO) list per entry in
   index order and reversing at the end with Fnreverse, so the returned
   list is in the same order as the dynarr.
   NOTE(review): the null check promised by the comment above and the
   local declarations are not visible in this copy.  */
600 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
607 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
609 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
610 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
613 return Fnreverse (result);
/* `make-coding-system'.  Outline of the visible code:
     1. TYPE is translated to a coding_system_type value (nil and
        Qundecided both give CODESYS_AUTODETECT); an unknown type
        signals "Invalid coding system type".
     2. A record is obtained from allocate_coding_system; DOC-STRING
        defaults to an empty string and must be a string.
     3. PROPS is walked as a property list.  The mnemonic must be a
        string.  For eol-type, subsidiary coding systems are requested
        only when the given value is nil (they are also requested when
        the property is absent, since the flag starts at 1).  The
        post-read and pre-write conversions are stored as given.
        ISO2022 systems accept the charset-g*, force-g*-on-output,
        boolean flag and charset-conversion keys; CCL systems accept
        decode and encode, both of which must be vectors.  Any other
        key signals "Unrecognized property".
     4. If requested, setup_eol_coding_systems creates the subsidiaries,
        and the new object is stored in Vcoding_system_hash_table under
        NAME.
   NOTE(review): each occurrence of input-charset-conversion or
   output-charset-conversion assigns a fresh Dynarr_new result without
   releasing a dynarr stored by an earlier occurrence of the same key,
   so a property list naming the key twice appears to leak the first.
   NOTE(review): the ISO2022 branch tests `ty' while the CCL branch
   tests `type' against Qccl; the two are equivalent given step 1 but
   inconsistent.
   NOTE(review): many lines of this definition (property names in the
   docstring, the docstring terminator, braces, some statements and the
   return) are not visible in this copy.  */
618 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
619 Register symbol NAME as a coding system.
621 TYPE describes the conversion method used and should be one of
624 Automatic conversion. XEmacs attempts to detect the coding system
627 No conversion. Use this for binary files and such. On output,
628 graphic characters that are not in ASCII or Latin-1 will be
629 replaced by a ?. (For a no-conversion-encoded buffer, these
630 characters will only be present if you explicitly insert them.)
632 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
634 ISO 10646 UCS-4 encoding.
636 ISO 10646 UTF-8 encoding.
638 Any ISO2022-compliant encoding. Among other things, this includes
639 JIS (the Japanese encoding commonly used for e-mail), EUC (the
640 standard Unix encoding for Japanese and other languages), and
641 Compound Text (the encoding used in X11). You can specify more
642 specific information about the conversion with the FLAGS argument.
644 Big5 (the encoding commonly used for Taiwanese).
646 The conversion is performed using a user-written pseudo-code
647 program. CCL (Code Conversion Language) is the name of this
650 Write out or read in the raw contents of the memory representing
651 the buffer's text. This is primarily useful for debugging
652 purposes, and is only enabled when XEmacs has been compiled with
653 DEBUG_XEMACS defined (via the --debug configure option).
654 WARNING: Reading in a file using 'internal conversion can result
655 in an internal inconsistency in the memory representing a
656 buffer's text, which will produce unpredictable results and may
657 cause XEmacs to crash. Under normal circumstances you should
658 never use 'internal conversion.
660 DOC-STRING is a string describing the coding system.
662 PROPS is a property list, describing the specific nature of the
663 character set. Recognized properties are:
666 String to be displayed in the modeline when this coding system is
670 End-of-line conversion to be used. It should be one of
673 Automatically detect the end-of-line type (LF, CRLF,
674 or CR). Also generate subsidiary coding systems named
675 `NAME-unix', `NAME-dos', and `NAME-mac', that are
676 identical to this coding system but have an EOL-TYPE
677 value of 'lf, 'crlf, and 'cr, respectively.
679 The end of a line is marked externally using ASCII LF.
680 Since this is also the way that XEmacs represents an
681 end-of-line internally, specifying this option results
682 in no end-of-line conversion. This is the standard
683 format for Unix text files.
685 The end of a line is marked externally using ASCII
686 CRLF. This is the standard format for MS-DOS text
689 The end of a line is marked externally using ASCII CR.
690 This is the standard format for Macintosh text files.
692 Automatically detect the end-of-line type but do not
693 generate subsidiary coding systems. (This value is
694 converted to nil when stored internally, and
695 `coding-system-property' will return nil.)
697 'post-read-conversion
698 Function called after a file has been read in, to perform the
699 decoding. Called with two arguments, BEG and END, denoting
700 a region of the current buffer to be decoded.
702 'pre-write-conversion
703 Function called before a file is written out, to perform the
704 encoding. Called with two arguments, BEG and END, denoting
705 a region of the current buffer to be encoded.
708 The following additional properties are recognized if TYPE is 'iso2022:
714 The character set initially designated to the G0 - G3 registers.
715 The value should be one of
717 -- A charset object (designate that character set)
718 -- nil (do not ever use this register)
719 -- t (no character set is initially designated to
720 the register, but may be later on; this automatically
721 sets the corresponding `force-g*-on-output' property)
727 If non-nil, send an explicit designation sequence on output before
728 using the specified register.
731 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
732 "ESC $ B" on output in place of the full designation sequences
733 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
736 If non-nil, don't designate ASCII to G0 at each end of line on output.
737 Setting this to non-nil also suppresses other state-resetting that
738 normally happens at the end of a line.
741 If non-nil, don't designate ASCII to G0 before control chars on output.
744 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
748 If non-nil, use locking-shift (SO/SI) instead of single-shift
749 or designation by escape sequence.
752 If non-nil, don't use ISO6429's direction specification.
755 If non-nil, literal control characters that are the same as
756 the beginning of a recognized ISO2022 or ISO6429 escape sequence
757 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
758 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
759 so that they can be properly distinguished from an escape sequence.
760 (Note that doing this results in a non-portable encoding.) This
761 encoding flag is used for byte-compiled files. Note that ESC
762 is a good choice for a quoting character because there are no
763 escape sequences whose second byte is a character from the Control-0
764 or Control-1 character sets; this is explicitly disallowed by the
767 'input-charset-conversion
768 A list of conversion specifications, specifying conversion of
769 characters in one charset to another when decoding is performed.
770 Each specification is a list of two elements: the source charset,
771 and the destination charset.
773 'output-charset-conversion
774 A list of conversion specifications, specifying conversion of
775 characters in one charset to another when encoding is performed.
776 The form of each specification is the same as for
777 'input-charset-conversion.
780 The following additional properties are recognized (and required)
784 CCL program used for decoding (converting to internal format).
787 CCL program used for encoding (converting to external format).
789 (name, type, doc_string, props))
791 Lisp_Coding_System *codesys;
792 Lisp_Object rest, key, value;
793 enum coding_system_type ty;
794 int need_to_setup_eol_systems = 1;
796 /* Convert type to constant */
797 if (NILP (type) || EQ (type, Qundecided))
798 { ty = CODESYS_AUTODETECT; }
800 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
801 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
802 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
803 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
804 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
805 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
807 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
809 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
812 signal_simple_error ("Invalid coding system type", type);
816 codesys = allocate_coding_system (ty, name);
818 if (NILP (doc_string))
819 doc_string = build_string ("");
821 CHECK_STRING (doc_string);
822 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
824 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
826 if (EQ (key, Qmnemonic))
829 CHECK_STRING (value);
830 CODING_SYSTEM_MNEMONIC (codesys) = value;
833 else if (EQ (key, Qeol_type))
835 need_to_setup_eol_systems = NILP (value);
838 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
841 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
842 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
844 else if (ty == CODESYS_ISO2022)
846 #define FROB_INITIAL_CHARSET(charset_num) \
847 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
848 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
850 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
851 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
852 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
853 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
855 #define FROB_FORCE_CHARSET(charset_num) \
856 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
858 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
859 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
860 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
861 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
863 #define FROB_BOOLEAN_PROPERTY(prop) \
864 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
866 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
867 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
868 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
869 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
870 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
871 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
872 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
874 else if (EQ (key, Qinput_charset_conversion))
876 codesys->iso2022.input_conv =
877 Dynarr_new (charset_conversion_spec);
878 parse_charset_conversion_specs (codesys->iso2022.input_conv,
881 else if (EQ (key, Qoutput_charset_conversion))
883 codesys->iso2022.output_conv =
884 Dynarr_new (charset_conversion_spec);
885 parse_charset_conversion_specs (codesys->iso2022.output_conv,
889 signal_simple_error ("Unrecognized property", key);
891 else if (EQ (type, Qccl))
893 if (EQ (key, Qdecode))
895 CHECK_VECTOR (value);
896 CODING_SYSTEM_CCL_DECODE (codesys) = value;
898 else if (EQ (key, Qencode))
900 CHECK_VECTOR (value);
901 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
904 signal_simple_error ("Unrecognized property", key);
908 signal_simple_error ("Unrecognized property", key);
911 if (need_to_setup_eol_systems)
912 setup_eol_coding_systems (codesys);
915 Lisp_Object codesys_obj;
916 XSETCODING_SYSTEM (codesys_obj, codesys);
917 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* `copy-coding-system'.  OLD-CODING-SYSTEM is resolved with
   Fget_coding_system (error if unknown) and NEW-NAME is looked up with
   Ffind_coding_system.  If NEW-NAME does not exist yet, a record of
   the same type is allocated and registered in
   Vcoding_system_hash_table.  Everything in the source record after
   its header is then copied over the target with memcpy, and the
   target object is returned.
   NOTE(review): the memcpy also copies the name slot, so the target
   carries the old name unless it is reset afterwards; no such
   statement is visible in this copy.
   NOTE(review): the memcpy copies the iso2022.input_conv and
   iso2022.output_conv pointers as well, so source and target end up
   referring to the same dynarrs, which finalize_coding_system frees
   once per object; any dynarrs a pre-existing target owned are
   overwritten without being freed.  */
922 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
923 Copy OLD-CODING-SYSTEM to NEW-NAME.
924 If NEW-NAME does not name an existing coding system, a new one will
927 (old_coding_system, new_name))
929 Lisp_Object new_coding_system;
930 old_coding_system = Fget_coding_system (old_coding_system);
931 new_coding_system = Ffind_coding_system (new_name);
932 if (NILP (new_coding_system))
934 XSETCODING_SYSTEM (new_coding_system,
935 allocate_coding_system
936 (XCODING_SYSTEM_TYPE (old_coding_system),
938 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
942 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
943 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
944 memcpy (((char *) to ) + sizeof (to->header),
945 ((char *) from) + sizeof (from->header),
946 sizeof (*from) - sizeof (from->header));
949 return new_coding_system;
/* Return the variant of CODING_SYSTEM that uses end-of-line convention
   TYPE.  A coding system whose own EOL type is anything other than
   EOL_AUTODETECT is returned unchanged, as is any coding system when
   TYPE is EOL_AUTODETECT.  Otherwise the EOL_LF, EOL_CR or EOL_CRLF
   subsidiary slot is selected; if that slot is nil, CODING_SYSTEM
   itself is returned.
   NOTE(review): the return-type line and the switch header are not
   visible in this copy.  */
953 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
955 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
956 Lisp_Object new_coding_system;
958 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
959 return coding_system;
963 case EOL_AUTODETECT: return coding_system;
964 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
965 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
966 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
970 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* `subsidiary-coding-system': resolves CODING-SYSTEM with
   Fget_coding_system, converts EOL-TYPE with symbol_to_eol_type (which
   signals an error for an unrecognized symbol) and delegates to
   subsidiary_coding_system.  */
973 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
974 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
976 (coding_system, eol_type))
978 coding_system = Fget_coding_system (coding_system);
980 return subsidiary_coding_system (coding_system,
981 symbol_to_eol_type (eol_type));
985 /************************************************************************/
986 /* Coding system accessors */
987 /************************************************************************/
/* `coding-system-doc-string': resolves CODING-SYSTEM with
   Fget_coding_system (error if unknown) and returns its doc-string
   slot.  */
989 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
990 Return the doc string for CODING-SYSTEM.
994 coding_system = Fget_coding_system (coding_system);
995 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* `coding-system-type': resolves CODING-SYSTEM with Fget_coding_system
   and maps its coding_system_type value back to the type symbol, using
   the same pairs that Fmake_coding_system accepts (CODESYS_AUTODETECT
   gives Qundecided).
   NOTE(review): the handling of any other enum value and the end of
   the function are not visible in this copy.  */
998 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
999 Return the type of CODING-SYSTEM.
1003 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1006 case CODESYS_AUTODETECT: return Qundecided;
1008 case CODESYS_SHIFT_JIS: return Qshift_jis;
1009 case CODESYS_ISO2022: return Qiso2022;
1010 case CODESYS_BIG5: return Qbig5;
1011 case CODESYS_UCS4: return Qucs4;
1012 case CODESYS_UTF8: return Qutf8;
1013 case CODESYS_CCL: return Qccl;
1015 case CODESYS_NO_CONVERSION: return Qno_conversion;
1017 case CODESYS_INTERNAL: return Qinternal;
/* Return the name of the charset stored in initial-charset slot GNUM
   of CODING_SYSTEM, or Qnil when that slot does not hold a charset
   object (the slot may also hold Qnil or Qt, see the FROB_INITIAL_CHARSET
   macro in Fmake_coding_system).
   NOTE(review): GNUM is used as an index with no range check here, and
   the slots are only initialised for ISO2022 coding systems; callers
   must guarantee both.  The declaration of `cs' is not visible in this
   copy.  */
1024 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1027 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1029 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp-visible wrapper around coding_system_charset(): resolves
   CODING-SYSTEM and passes GNUM on after unboxing it with XINT.
   NOTE(review): GNUM is presumably type- and range-checked before the
   XINT call -- confirm. */
1032 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1033 Return initial charset of CODING-SYSTEM designated to GNUM.
1036 (coding_system, gnum))
1038 coding_system = Fget_coding_system (coding_system);
1041 return coding_system_charset (coding_system, XINT (gnum));
/* Implementation outline:
   1. Resolve CODING-SYSTEM and check that PROP is a symbol.
   2. Search the_codesys_prop_dynarr for PROP.  An entry whose prop_type is
      CODESYS_PROP_ISO2022 or CODESYS_PROP_CCL is only accepted when the
      coding system has the matching type; otherwise an error is signalled.
      When the search ends without a match, an unrecognized-property error
      is signalled.
   3. Dispatch on PROP: generic properties first, then the ISO2022-only and
      the CCL-only ones.  FORCE_CHARSET and LISP_BOOLEAN are helper macros
      defined inside the function that turn a C flag into Qt or Qnil. */
1045 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1046 Return the PROP property of CODING-SYSTEM.
1048 (coding_system, prop))
1051 enum coding_system_type type;
1053 coding_system = Fget_coding_system (coding_system);
1054 CHECK_SYMBOL (prop);
1055 type = XCODING_SYSTEM_TYPE (coding_system);
1057 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1058 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1061 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1063 case CODESYS_PROP_ALL_OK:
1066 case CODESYS_PROP_ISO2022:
1067 if (type != CODESYS_ISO2022)
1069 ("Property only valid in ISO2022 coding systems",
1073 case CODESYS_PROP_CCL:
1074 if (type != CODESYS_CCL)
1076 ("Property only valid in CCL coding systems",
1086 signal_simple_error ("Unrecognized property", prop);
1088 if (EQ (prop, Qname))
1089 return XCODING_SYSTEM_NAME (coding_system);
1090 else if (EQ (prop, Qtype))
1091 return Fcoding_system_type (coding_system);
1092 else if (EQ (prop, Qdoc_string))
1093 return XCODING_SYSTEM_DOC_STRING (coding_system);
1094 else if (EQ (prop, Qmnemonic))
1095 return XCODING_SYSTEM_MNEMONIC (coding_system);
1096 else if (EQ (prop, Qeol_type))
1097 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1098 else if (EQ (prop, Qeol_lf))
1099 return XCODING_SYSTEM_EOL_LF (coding_system);
1100 else if (EQ (prop, Qeol_crlf))
1101 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1102 else if (EQ (prop, Qeol_cr))
1103 return XCODING_SYSTEM_EOL_CR (coding_system);
1104 else if (EQ (prop, Qpost_read_conversion))
1105 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1106 else if (EQ (prop, Qpre_write_conversion))
1107 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1109 else if (type == CODESYS_ISO2022)
1111 if (EQ (prop, Qcharset_g0))
1112 return coding_system_charset (coding_system, 0);
1113 else if (EQ (prop, Qcharset_g1))
1114 return coding_system_charset (coding_system, 1);
1115 else if (EQ (prop, Qcharset_g2))
1116 return coding_system_charset (coding_system, 2);
1117 else if (EQ (prop, Qcharset_g3))
1118 return coding_system_charset (coding_system, 3);
1120 #define FORCE_CHARSET(charset_num) \
1121 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1122 (coding_system, charset_num) ? Qt : Qnil)
1124 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1125 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1126 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1127 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1129 #define LISP_BOOLEAN(prop) \
1130 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1132 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1133 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1134 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1135 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1136 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1137 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1138 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1140 else if (EQ (prop, Qinput_charset_conversion))
1142 unparse_charset_conversion_specs
1143 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1144 else if (EQ (prop, Qoutput_charset_conversion))
1146 unparse_charset_conversion_specs
1147 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1151 else if (type == CODESYS_CCL)
1153 if (EQ (prop, Qdecode))
1154 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1155 else if (EQ (prop, Qencode))
1156 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1164 return Qnil; /* not reached */
1168 /************************************************************************/
1169 /* Coding category functions */
1170 /************************************************************************/
/* Look SYMBOL up in coding_category_symbol[] after checking it with
   CHECK_SYMBOL.  Callers use the result as an index into the per-category
   tables.  If no table entry is EQ to SYMBOL, an error is signalled; the
   trailing return only exists to keep compilers quiet. */
1173 decode_coding_category (Lisp_Object symbol)
1177 CHECK_SYMBOL (symbol);
1178 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1179 if (EQ (coding_category_symbol[i], symbol))
1182 signal_simple_error ("Unrecognized coding category", symbol);
1183 return 0; /* not reached */
/* The list is built by consing while walking coding_category_symbol[] from
   index CODING_CATEGORY_LAST down to 0, so the finished list is in
   ascending index order. */
1186 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1187 Return a list of all recognized coding categories.
1192 Lisp_Object list = Qnil;
1194 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1195 list = Fcons (coding_category_symbol[i], list);
/* Works in three passes over a scratch table category_to_priority[]
   (initialised to -1, meaning not yet ranked):
   - categories named in LIST receive consecutive ranks in list order; a
     category that already has a rank triggers the duplicate error;
   - categories still unranked receive the following ranks, visited in
     their current order in coding_category_by_priority[];
   - the table is inverted to rebuild coding_category_by_priority[]. */
1199 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1200 Change the priority order of the coding categories.
1201 LIST should be list of coding categories, in descending order of
1202 priority. Unspecified coding categories will be lower in priority
1203 than all specified ones, in the same relative order they were in
1208 int category_to_priority[CODING_CATEGORY_LAST + 1];
1212 /* First generate a list that maps coding categories to priorities. */
1214 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1215 category_to_priority[i] = -1;
1217 /* Highest priority comes from the specified list. */
1219 EXTERNAL_LIST_LOOP (rest, list)
1221 int cat = decode_coding_category (XCAR (rest));
1223 if (category_to_priority[cat] >= 0)
1224 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1225 category_to_priority[cat] = i++;
1228 /* Now go through the existing categories by priority to retrieve
1229 the categories not yet specified and preserve their priority
1231 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1233 int cat = coding_category_by_priority[j];
1234 if (category_to_priority[cat] < 0)
1235 category_to_priority[cat] = i++;
1238 /* Now we need to construct the inverse of the mapping we just
1241 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1242 coding_category_by_priority[category_to_priority[i]] = i;
1244 /* Phew! That was confusing. */
/* coding_category_by_priority[] is walked from its last slot to its first
   while consing, so the symbol in slot 0 ends up at the head of the
   returned list. */
1248 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1249 Return a list of coding categories in descending order of priority.
1254 Lisp_Object list = Qnil;
1256 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1257 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
/* Decodes CODING-CATEGORY into a table index, resolves CODING-SYSTEM with
   Fget_coding_system, and stores the resulting object in
   coding_category_system[] at that index. */
1262 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1263 Change the coding system associated with a coding category.
1265 (coding_category, coding_system))
1267 int cat = decode_coding_category (coding_category);
1269 coding_system = Fget_coding_system (coding_system);
1270 coding_category_system[cat] = coding_system;
/* Reads coding_category_system[] at the index of CODING-CATEGORY and
   returns the name of the coding system stored there.
   NOTE(review): other code in this file tests these slots with NILP before
   use; presumably a nil guard also precedes the XCODING_SYSTEM_NAME call
   here -- confirm. */
1274 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1275 Return the coding system associated with a coding category.
1279 int cat = decode_coding_category (coding_category);
1280 Lisp_Object sys = coding_category_system[cat];
1283 return XCODING_SYSTEM_NAME (sys);
1288 /************************************************************************/
1289 /* Detecting the encoding of data */
1290 /************************************************************************/
/* Running state of coding-system and end-of-line autodetection.  One
   instance is handed to detect_coding_type for every successive chunk of
   input, so whatever has been learned so far is kept here between calls.
   eol_type is the EOL convention determined so far, or EOL_AUTODETECT
   while it is still unknown. */
1292 struct detection_state
1294 enum eol_type eol_type;
1330 struct iso2022_decoder iso;
1332 int high_byte_count;
1333 unsigned int saw_single_shift:1;
/* Predicate over control character C, used by detect_coding_type: a byte
   below 0x20 that is not accepted here makes the detector stop treating
   the input as plain ASCII. */
1346 acceptable_control_char_p (int c)
1350 /* Allow and ignore control characters that you might
1351 reasonably see in a text file */
1356 case 8: /* backspace */
1357 case 11: /* vertical tab */
1358 case 12: /* form feed */
1359 case 26: /* MS-DOS C-z junk */
1360 case 31: /* '^_' -- for info */
/* Return nonzero if MASK has zero bits or exactly one bit set.
   mask & (mask - 1) clears the lowest set bit of MASK, so the result is 0
   precisely when there was at most one bit to clear. */
1368 mask_has_at_most_one_bit_p (int mask)
1370 /* Perhaps the only thing useful you learn from intensive Microsoft
1371 technical interviews */
1372 return (mask & (mask - 1)) == 0;
/* Examine a chunk of input for end-of-line conventions.  Progress is
   recorded in st->eol.just_saw_cr and st->eol.seen_anything, which live in
   the detection state rather than in locals.  EOL_AUTODETECT is returned
   when the chunk does not allow a decision. */
1375 static enum eol_type
1376 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1385 st->eol.just_saw_cr = 1;
1390 if (st->eol.just_saw_cr)
1392 else if (st->eol.seen_anything)
1395 else if (st->eol.just_saw_cr)
1397 st->eol.just_saw_cr = 0;
1399 st->eol.seen_anything = 1;
1402 return EOL_AUTODETECT;
1405 /* Attempt to determine the encoding and EOL type of the given text.
1406 Before calling this function for the first time, you must initialize
1407 st->eol_type as appropriate and initialize st->mask to ~0.
1409 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1412 st->mask holds the determined coding category mask, or ~0 if only
1413 ASCII has been seen so far.
1417 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1418 is present in st->mask
1419 1 == definitive answers are here for both st->eol_type and st->mask
/* Implementation notes:
   - EOL detection is only run while st->eol_type is still EOL_AUTODETECT.
   - Input is scanned for a byte that is >= 0x80 or a control character
     rejected by acceptable_control_char_p; finding one sets
     st->seen_non_ascii and initialises per-encoding masks such as
     st->shift_jis.mask and st->iso2022.mask to ~0.
   - Each detect_coding_* routine is re-run only while its own mask still
     has more than one bit set; st->mask is the union of those masks.
   - CODING_CATEGORY_NO_CONVERSION_MASK is OR-ed into st->mask after the
     single-bit test, so it does not influence this call's return value. */
1423 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1424 unsigned int n, int just_do_eol)
1428 if (st->eol_type == EOL_AUTODETECT)
1429 st->eol_type = detect_eol_type (st, src, n);
1432 return st->eol_type != EOL_AUTODETECT;
1434 if (!st->seen_non_ascii)
1436 for (; n; n--, src++)
1439 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1441 st->seen_non_ascii = 1;
1443 st->shift_jis.mask = ~0;
1447 st->iso2022.mask = ~0;
1457 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1458 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1459 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1460 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1461 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1462 st->big5.mask = detect_coding_big5 (st, src, n);
1463 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1464 st->utf8.mask = detect_coding_utf8 (st, src, n);
1465 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1466 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1469 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1470 | st->utf8.mask | st->ucs4.mask;
1473 int retval = mask_has_at_most_one_bit_p (st->mask);
1474 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1475 return retval && st->eol_type != EOL_AUTODETECT;
/* Choose a coding system for a detection MASK.
   ASCII-only case: start from the buffer_file_coding_system slot of
   Vbuffer_defaults and look it up with Ffind_coding_system; an invalid
   value produces a warning and the default is reset to Qnil; the
   no-conversion coding system serves as the fallback.
   Otherwise: MASK is passed through postprocess_iso2022_mask, then the
   categories are visited in priority order and the first one whose bit is
   set in MASK and whose coding_category_system[] slot is non-nil wins.  If
   none qualifies, the no-conversion coding system is returned. */
1480 coding_system_from_mask (int mask)
1484 /* If the file was entirely or basically ASCII, use the
1485 default value of `buffer-file-coding-system'. */
1486 Lisp_Object retval =
1487 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1490 retval = Ffind_coding_system (retval);
1494 (Qbad_variable, Qwarning,
1495 "Invalid `default-buffer-file-coding-system', set to nil");
1496 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1500 retval = Fget_coding_system (Qno_conversion);
1508 mask = postprocess_iso2022_mask (mask);
1510 /* Look through the coding categories by priority and find
1511 the first one that is allowed. */
1512 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1514 cat = coding_category_by_priority[i];
1515 if ((mask & (1 << cat)) &&
1516 !NILP (coding_category_system[cat]))
1520 return coding_category_system[cat];
1522 return Fget_coding_system (Qno_conversion);
1526 /* Given a seekable read stream and potential coding system and EOL type
1527 as specified, do any autodetection that is called for. If the
1528 coding system and/or EOL type are not autodetect, they will be left
1529 alone; but this function will never return an autodetect coding system
1532 This function does not automatically fetch subsidiary coding systems;
1533 that should be unnecessary with the explicit eol-type argument. */
/* Implementation notes:
   - An EOL_AUTODETECT request is first replaced by the EOL type recorded
     in the coding system itself, which may still be EOL_AUTODETECT.
   - Detection reads STREAM in chunks of sizeof (random_buffer) bytes and
     feeds each chunk to detect_coding_type; when only the EOL type is
     unknown, that function is told to do EOL detection only.
   - The stream is rewound with Lstream_rewind before returning.
   NOTE(review): detect_coding_type takes its length as unsigned int, so a
   negative result from Lstream_read would be seen as a huge length --
   confirm that nread is checked before the call. */
1536 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1537 enum eol_type *eol_type_in_out)
1539 struct detection_state decst;
1541 if (*eol_type_in_out == EOL_AUTODETECT)
1542 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1545 decst.eol_type = *eol_type_in_out;
1548 /* If autodetection is called for, do it now. */
1549 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1550 *eol_type_in_out == EOL_AUTODETECT)
1555 unsigned char random_buffer[4096];
1558 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1561 if (detect_coding_type (&decst, random_buffer, nread,
1562 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1563 CODESYS_AUTODETECT))
1567 *eol_type_in_out = decst.eol_type;
1568 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1569 *codesys_in_out = coding_system_from_mask (decst.mask);
1572 /* If we absolutely can't determine the EOL type, just assume LF. */
1573 if (*eol_type_in_out == EOL_AUTODETECT)
1574 *eol_type_in_out = EOL_LF;
1576 Lstream_rewind (stream);
/* Implementation: the region is read through a lisp-buffer input stream
   wrapped in an encoding input stream that uses the binary coding system,
   and the resulting bytes are fed to detect_coding_type in chunks of
   sizeof (random_buffer).  The result list is assembled by walking
   coding_category_by_priority[] from its last slot to its first and
   consing an entry for every category whose bit is set in the detected
   mask; non-nil coding systems are replaced by their subsidiary for the
   detected EOL type.  Both streams are deleted before returning. */
1579 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1580 Detect coding system of the text in the region between START and END.
1581 Returned a list of possible coding systems ordered by priority.
1582 If only ASCII characters are found, it returns 'undecided or one of
1583 its subsidiary coding systems according to a detected end-of-line
1584 type. Optional arg BUFFER defaults to the current buffer.
1586 (start, end, buffer))
1588 Lisp_Object val = Qnil;
1589 struct buffer *buf = decode_buffer (buffer, 0);
1591 Lisp_Object instream, lb_instream;
1592 Lstream *istr, *lb_istr;
1593 struct detection_state decst;
1594 struct gcpro gcpro1, gcpro2;
1596 get_buffer_range_char (buf, start, end, &b, &e, 0);
1597 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1598 lb_istr = XLSTREAM (lb_instream);
1599 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1600 istr = XLSTREAM (instream);
1601 GCPRO2 (instream, lb_instream);
1603 decst.eol_type = EOL_AUTODETECT;
1607 unsigned char random_buffer[4096];
1608 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1612 if (detect_coding_type (&decst, random_buffer, nread, 0))
1616 if (decst.mask == ~0)
1617 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1625 decst.mask = postprocess_iso2022_mask (decst.mask);
1627 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1629 int sys = coding_category_by_priority[i];
1630 if (decst.mask & (1 << sys))
1632 Lisp_Object codesys = coding_category_system[sys];
1633 if (!NILP (codesys))
1634 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1635 val = Fcons (codesys, val);
1639 Lstream_close (istr);
1641 Lstream_delete (istr);
1642 Lstream_delete (lb_istr);
1647 /************************************************************************/
1648 /* Converting to internal Mule format ("decoding") */
1649 /************************************************************************/
1651 /* A decoding stream is a stream used for decoding text (i.e.
1652 converting from some external format to internal format).
1653 The decoding-stream object keeps track of the actual coding
1654 stream, the stream that is at the other end, and data that
1655 needs to be persistent across the lifetime of the stream. */
1657 /* Handle the EOL stuff related to just-read-in character C.
1658 EOL_TYPE is the EOL type of the coding stream.
1659 FLAGS is the current value of FLAGS in the coding stream, and may
1660 be modified by this macro. (The macro only looks at the
1661 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1662 bytes are to be written. You need to also define a local goto
1663 label "label_continue_loop" that is at the end of the main
1664 character-reading loop.
1666 If C is a CR character, then this macro handles it entirely and
1667 jumps to label_continue_loop. Otherwise, this macro does not add
1668 anything to DST, and continues normally. You should continue
1669 processing C normally after this macro. */
/* Caution: EOL_TYPE, FLAGS and DST each appear several times in the
   expansion and are not parenthesized, so pass simple side-effect-free
   lvalues/expressions.  FLAGS is modified in place. */
1671 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1675 if (eol_type == EOL_CR) \
1676 Dynarr_add (dst, '\n'); \
1677 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1678 Dynarr_add (dst, c); \
1680 flags |= CODING_STATE_CR; \
1681 goto label_continue_loop; \
1683 else if (flags & CODING_STATE_CR) \
1684 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1686 Dynarr_add (dst, '\r'); \
1687 flags &= ~CODING_STATE_CR; \
1691 /* C should be a binary character in the range 0 - 255; convert
1692 to internal format and add to Dynarr DST. */
/* ASCII bytes are added unchanged; bytes satisfying BYTE_C1_P are added as
   LEADING_BYTE_CONTROL_1 followed by C + 0x20; the remaining branch adds
   LEADING_BYTE_LATIN_ISO8859_1 followed by C.
   Caution: C is expanded more than once, so it must not have side
   effects. */
1694 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1696 if (BYTE_ASCII_P (c)) \
1697 Dynarr_add (dst, c); \
1698 else if (BYTE_C1_P (c)) \
1700 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1701 Dynarr_add (dst, c + 0x20); \
1705 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1706 Dynarr_add (dst, c); \
/* Emit the pending partial character CH as a binary character.
   Caution: the expansion refers to a variable named dst that is not a
   macro parameter, so one must be in scope at the point of use. */
1710 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1714 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* End-of-input handling: when CODING_STATE_END is set in FLAGS, flush the
   partial character CH with DECODE_OUTPUT_PARTIAL_CHAR and, if a CR is
   still pending (CODING_STATE_CR), add it to DST as a literal '\r'. */
1719 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1721 if (flags & CODING_STATE_END) \
1723 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1724 if (flags & CODING_STATE_CR) \
1725 Dynarr_add (dst, '\r'); \
/* Access the struct decoding_stream payload of a decoding lstream. */
1729 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Private state of a decoding lstream.  The lstream implementation for
   this type is declared with sizeof (struct decoding_stream) as its data
   size, and the functions in this file reach the structure through
   DECODING_STREAM_DATA. */
1731 struct decoding_stream
1733 /* Coding system that governs the conversion. */
1734 Lisp_Coding_System *codesys;
1736 /* Stream that we read the encoded data from or
1737 write the decoded data to. */
1740 /* If we are reading, then we can return only a fixed amount of
1741 data, so if the conversion resulted in too much data, we store it
1742 here for retrieval the next time around. */
1743 unsigned_char_dynarr *runoff;
1745 /* FLAGS holds flags indicating the current state of the decoding.
1746 Some of these flags are dependent on the coding system. */
1749 /* CH holds a partially built-up character. Since we only deal
1750 with one- and two-byte characters at the moment, we only use
1751 this to store the first byte of a two-byte character. */
1754 /* EOL_TYPE specifies the type of end-of-line conversion that
1755 currently applies. We need to keep this separate from the
1756 EOL type stored in CODESYS because the latter might indicate
1757 automatic EOL-type detection while the former will always
1758 indicate a particular EOL type. */
1759 enum eol_type eol_type;
1761 /* Additional ISO2022 information. We define the structure above
1762 because it's also needed by the detection routines. */
1763 struct iso2022_decoder iso2022;
1765 /* Additional information (the state of the running CCL program)
1766 used by the CCL decoder. */
1767 struct ccl_program ccl;
/* Autodetection state, passed to detect_coding_type by mule_decode while
   the coding system or the EOL type is still undetermined. */
1769 struct detection_state decst;
/* Forward declarations of the functions that implement the decoding
   lstream type, followed by the definition of the type itself; its
   per-stream data is a struct decoding_stream. */
1772 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1773 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1774 static int decoding_rewinder (Lstream *stream);
1775 static int decoding_seekable_p (Lstream *stream);
1776 static int decoding_flusher (Lstream *stream);
1777 static int decoding_closer (Lstream *stream);
1779 static Lisp_Object decoding_marker (Lisp_Object stream,
1780 void (*markobj) (Lisp_Object));
1782 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1783 sizeof (struct decoding_stream));
/* Mark function for decoding streams.  The only object considered is the
   stream at the other end: if its implementation provides a marker, that
   marker is invoked on it with MARKOBJ and its result is returned. */
1786 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1788 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1789 Lisp_Object str_obj;
1791 /* We do not need to mark the coding systems or charsets stored
1792 within the stream because they are stored in a global list
1793 and automatically marked. */
1795 XSETLSTREAM (str_obj, str);
1797 if (str->imp->marker)
1798 return (str->imp->marker) (str_obj, markobj);
1803 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1804 so we read data from the other end, decode it, and store it into DATA. */
/* Return value: the number of bytes stored into DATA.  If nothing was
   stored, the result is -1 when error_occurred has been set and 0
   otherwise.  DATA doubles as the scratch buffer for the raw bytes read
   from the other end; decoded output always goes through str->runoff
   first and is copied out from there. */
1807 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1809 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1810 unsigned char *orig_data = data;
1812 int error_occurred = 0;
1814 /* We need to interface to mule_decode(), which expects to take some
1815 amount of data and store the result into a Dynarr. We have
1816 mule_decode() store into str->runoff, and take data from there
1819 /* We loop until we have enough data, reading chunks from the other
1820 end and decoding it. */
1823 /* Take data from the runoff if we can. Make sure to take at
1824 most SIZE bytes, and delete the data from the runoff. */
1825 if (Dynarr_length (str->runoff) > 0)
1827 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1828 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1829 Dynarr_delete_many (str->runoff, 0, chunk);
1835 break; /* No more room for data */
1837 if (str->flags & CODING_STATE_END)
1838 /* This means that on the previous iteration, we hit the EOF on
1839 the other end. We loop once more so that mule_decode() can
1840 output any final stuff it may be holding, or any "go back
1841 to a sane state" escape sequences. (This latter makes sense
1842 during encoding.) */
1845 /* Exhausted the runoff, so get some more. DATA has at least
1846 SIZE bytes left of storage in it, so it's OK to read directly
1847 into it. (We'll be overwriting above, after we've decoded it
1848 into the runoff.) */
1849 read_size = Lstream_read (str->other_end, data, size);
1856 /* There might be some more end data produced in the translation.
1857 See the comment above. */
1858 str->flags |= CODING_STATE_END;
1859 mule_decode (stream, data, str->runoff, read_size);
1862 if (data - orig_data == 0)
1863 return error_occurred ? -1 : 0;
1865 return data - orig_data;
/* Write method of the decoding stream: DATA (SIZE bytes of encoded input)
   is decoded into str->runoff, the whole runoff is offered to the stream
   at the other end, and the part that Lstream_write reports as written is
   removed from the runoff. */
1869 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1871 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1874 /* Decode all our data into the runoff, and then attempt to write
1875 it all out to the other end. Remove whatever chunk we succeeded
1877 mule_decode (stream, data, str->runoff, size);
1878 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1879 Dynarr_length (str->runoff));
1881 Dynarr_delete_many (str->runoff, 0, retval);
1882 /* Do NOT return retval. The return value indicates how much
1883 of the incoming data was written, not how many bytes were
/* Put the decoder state of STR back to its initial condition for the
   coding system in str->codesys: ISO2022 state is reset with
   reset_iso2022, a CCL coding system gets its decode program set up again
   with setup_ccl_program, and flags and ch are cleared in every case. */
1889 reset_decoding_stream (struct decoding_stream *str)
1892 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1894 Lisp_Object coding_system;
1895 XSETCODING_SYSTEM (coding_system, str->codesys);
1896 reset_iso2022 (coding_system, &str->iso2022);
1898 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1900 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1903 str->flags = str->ch = 0;
/* Rewind method: resets the decoder state, discards any decoded data still
   waiting in the runoff, and returns the result of rewinding the stream
   at the other end. */
1907 decoding_rewinder (Lstream *stream)
1909 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1910 reset_decoding_stream (str);
1911 Dynarr_reset (str->runoff);
1912 return Lstream_rewind (str->other_end);
/* A decoding stream is seekable exactly when the stream at the other end
   reports that it is. */
1916 decoding_seekable_p (Lstream *stream)
1918 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1919 return Lstream_seekable_p (str->other_end);
/* Flush method: delegates to Lstream_flush on the stream at the other end
   and returns its result. */
1923 decoding_flusher (Lstream *stream)
1925 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1926 return Lstream_flush (str->other_end);
/* Close method.  For a stream opened for writing, CODING_STATE_END is set
   and decoding_writer is called with no data so that the decoder can emit
   whatever it is still holding.  The runoff (and, when composite
   characters are enabled, the composite_chars array) is freed, and the
   result of closing the stream at the other end is returned. */
1930 decoding_closer (Lstream *stream)
1932 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1933 if (stream->flags & LSTREAM_FL_WRITE)
1935 str->flags |= CODING_STATE_END;
1936 decoding_writer (stream, 0, 0);
1938 Dynarr_free (str->runoff);
1940 #ifdef ENABLE_COMPOSITE_CHARS
1941 if (str->iso2022.composite_chars)
1942 Dynarr_free (str->iso2022.composite_chars);
1945 return Lstream_close (str->other_end);
/* Return the coding system currently used by decoding stream STREAM,
   narrowed with subsidiary_coding_system to the EOL type that the stream
   is using (str->eol_type). */
1949 decoding_stream_coding_system (Lstream *stream)
1951 Lisp_Object coding_system;
1952 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1954 XSETCODING_SYSTEM (coding_system, str->codesys);
1955 return subsidiary_coding_system (coding_system, str->eol_type);
/* Switch decoding stream LSTR over to CODESYS.  The stream's own eol_type
   is only overwritten when CODESYS specifies a definite EOL type, so an
   EOL type that was already determined survives a switch to a coding
   system whose EOL type is EOL_AUTODETECT.  The decoder state is reset
   afterwards. */
1959 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1961 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1962 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1964 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1965 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1966 reset_decoding_stream (str);
1969 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
1970 stream for writing, no automatic code detection will be performed.
1971 The reason for this is that automatic code detection requires a
1972 seekable input. Things will also fail if you open a decoding
1973 stream for reading using a non-fully-specified coding system and
1974 a non-seekable input stream. */
/* Common constructor behind make_decoding_input_stream and
   make_decoding_output_stream.  Creates a decoding lstream in MODE on top
   of STREAM with an empty runoff.  Only when MODE is "r" and STREAM is
   seekable is the coding system determined up front with
   determine_real_coding_system; in all other cases detection is left to
   mule_decode, for which decst is primed here (mask ~0, current EOL
   type). */
1977 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1980 Lstream *lstr = Lstream_new (lstream_decoding, mode);
1981 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1985 str->other_end = stream;
1986 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1987 str->eol_type = EOL_AUTODETECT;
1988 if (!strcmp (mode, "r")
1989 && Lstream_seekable_p (stream))
1990 /* We can determine the coding system now. */
1991 determine_real_coding_system (stream, &codesys, &str->eol_type);
1992 set_decoding_stream_coding_system (lstr, codesys);
1993 str->decst.eol_type = str->eol_type;
1994 str->decst.mask = ~0;
1995 XSETLSTREAM (obj, lstr);
/* Create a decoding stream that is read from: data comes from STREAM and
   is decoded according to CODESYS (mode "r"). */
2000 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2002 return make_decoding_stream_1 (stream, codesys, "r");
/* Create a decoding stream that is written to: data written to it is
   decoded according to CODESYS and passed on to STREAM (mode "w"). */
2006 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2008 return make_decoding_stream_1 (stream, codesys, "w");
2011 /* Note: the decode_coding_* functions all take the same
2012 arguments as mule_decode(), which is to say some SRC data of
2013 size N, which is to be stored into dynamic array DST.
2014 DECODING is the stream within which the decoding is
2015 taking place, but no data is actually read from or
2016 written to that stream; that is handled in decoding_reader()
2017 or decoding_writer(). This allows the same functions to
2018 be used for both reading and writing. */
/* Decode N bytes from SRC into DST according to the coding system of the
   DECODING stream.  While the coding system or the EOL type is still
   undetermined, the chunk is first handed to detect_coding_type; if that
   leads to a different coding system, the stream is switched to it with
   set_decoding_stream_coding_system, taking care to keep a previously set
   CODING_STATE_END flag.  The final switch hands the data to the decoder
   for the (possibly updated) coding-system type; CODESYS_INTERNAL copies
   the bytes unchanged and CODESYS_AUTODETECT shares the no-conversion
   decoder. */
2021 mule_decode (Lstream *decoding, CONST unsigned char *src,
2022 unsigned_char_dynarr *dst, unsigned int n)
2024 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2026 /* If necessary, do encoding-detection now. We do this when
2027 we're a writing stream or a non-seekable reading stream,
2028 meaning that we can't just process the whole input,
2029 rewind, and start over. */
2031 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2032 str->eol_type == EOL_AUTODETECT)
2034 Lisp_Object codesys;
2036 XSETCODING_SYSTEM (codesys, str->codesys);
2037 detect_coding_type (&str->decst, src, n,
2038 CODING_SYSTEM_TYPE (str->codesys) !=
2039 CODESYS_AUTODETECT);
2040 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2041 str->decst.mask != ~0)
2042 /* #### This is cheesy. What we really ought to do is
2043 buffer up a certain amount of data so as to get a
2044 less random result. */
2045 codesys = coding_system_from_mask (str->decst.mask);
2046 str->eol_type = str->decst.eol_type;
2047 if (XCODING_SYSTEM (codesys) != str->codesys)
2049 /* Preserve the CODING_STATE_END flag in case it was set.
2050 If we erase it, bad things might happen. */
2051 int was_end = str->flags & CODING_STATE_END;
2052 set_decoding_stream_coding_system (decoding, codesys);
2054 str->flags |= CODING_STATE_END;
2058 switch (CODING_SYSTEM_TYPE (str->codesys))
2061 case CODESYS_INTERNAL:
2062 Dynarr_add_many (dst, src, n);
2065 case CODESYS_AUTODETECT:
2066 /* If we got this far and still haven't decided on the coding
2067 system, then do no conversion. */
2068 case CODESYS_NO_CONVERSION:
2069 decode_coding_no_conversion (decoding, src, dst, n);
2072 case CODESYS_SHIFT_JIS:
2073 decode_coding_sjis (decoding, src, dst, n);
2076 decode_coding_big5 (decoding, src, dst, n);
2079 decode_coding_ucs4 (decoding, src, dst, n);
2082 decode_coding_utf8 (decoding, src, dst, n);
2085 ccl_driver (&str->ccl, src, dst, n, 0);
2087 case CODESYS_ISO2022:
2088 decode_coding_iso2022 (decoding, src, dst, n);
/* Implementation: the region is read through a lisp-buffer input stream
   and written to a chain of output streams (encode as binary, then decode
   with CODING-SYSTEM, then a lisp-buffer output stream created at the
   region start).  Data is moved in chunks of sizeof (tempbuf) bytes, and
   buffer_delete_range is called after each write.  The input stream and
   the outermost output stream are closed, and all four streams are
   deleted, before returning. */
2096 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2097 Decode the text between START and END which is encoded in CODING-SYSTEM.
2098 This is useful if you've read in encoded text from a file without decoding
2099 it (e.g. you read in a JIS-formatted file but used the `binary' or
2100 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2101 Return length of decoded text.
2102 BUFFER defaults to the current buffer if unspecified.
2104 (start, end, coding_system, buffer))
2107 struct buffer *buf = decode_buffer (buffer, 0);
2108 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2109 Lstream *istr, *ostr;
2110 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2112 get_buffer_range_char (buf, start, end, &b, &e, 0);
2114 barf_if_buffer_read_only (buf, b, e);
2116 coding_system = Fget_coding_system (coding_system);
2117 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2118 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2119 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2121 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2122 Fget_coding_system (Qbinary));
2123 istr = XLSTREAM (instream);
2124 ostr = XLSTREAM (outstream);
2125 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2127 /* The chain of streams looks like this:
2129 [BUFFER] <----- send through
2130 ------> [ENCODE AS BINARY]
2131 ------> [DECODE AS SPECIFIED]
2137 char tempbuf[1024]; /* some random amount */
2138 Bufpos newpos, even_newer_pos;
2139 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2140 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2144 newpos = lisp_buffer_stream_startpos (istr);
2145 Lstream_write (ostr, tempbuf, size_in_bytes);
2146 even_newer_pos = lisp_buffer_stream_startpos (istr);
2147 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2150 Lstream_close (istr);
2151 Lstream_close (ostr);
2153 Lstream_delete (istr);
2154 Lstream_delete (ostr);
2155 Lstream_delete (XLSTREAM (de_outstream));
2156 Lstream_delete (XLSTREAM (lb_outstream));
2161 /************************************************************************/
2162 /* Converting to an external encoding ("encoding") */
2163 /************************************************************************/
2165 /* An encoding stream is an output stream. When you create the
2166 stream, you specify the coding system that governs the encoding
2167 and another stream that the resulting encoded data is to be
2168 sent to, and then start sending data to it. */
/* Access the struct encoding_stream payload of an encoding lstream. */
2170 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
/* Private state of an encoding lstream.  The lstream implementation for
   this type is declared with sizeof (struct encoding_stream) as its data
   size, and the functions in this file reach the structure through
   ENCODING_STREAM_DATA. */
2172 struct encoding_stream
2174 /* Coding system that governs the conversion. */
2175 Lisp_Coding_System *codesys;
2177 /* Stream that we read the data to be encoded from or
2178 write the encoded data to. */
2181 /* If we are reading, then we can return only a fixed amount of
2182 data, so if the conversion resulted in too much data, we store it
2183 here for retrieval the next time around. */
2184 unsigned_char_dynarr *runoff;
2186 /* FLAGS holds flags indicating the current state of the encoding.
2187 Some of these flags are dependent on the coding system. */
2190 /* CH holds a partially built-up character. Since we only deal
2191 with one- and two-byte characters at the moment, we only use
2192 this to store the first byte of a two-byte character. */
2195 /* Additional information used by the ISO2022 encoder. */
2198 /* CHARSET holds the character sets currently assigned to the G0
2199 through G3 registers. It is initialized from the array
2200 INITIAL_CHARSET in CODESYS. */
2201 Lisp_Object charset[4];
2203 /* Which registers are currently invoked into the left (GL) and
2204 right (GR) halves of the 8-bit encoding space? */
2205 int register_left, register_right;
2207 /* Whether we need to explicitly designate the charset in the
2208 G? register before using it. It is initialized from the
2209 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2210 unsigned char force_charset_on_output[4];
2212 /* Other state variables that need to be preserved across
2214 Lisp_Object current_charset;
2216 int current_char_boundary;
2219 /* Additional information (the state of the running CCL program)
2220 used by the CCL encoder. */
2221 struct ccl_program ccl;
/* Forward declarations of the functions that implement the encoding
   lstream type, followed by the definition of the type itself; its
   per-stream data is a struct encoding_stream. */
2225 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2226 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2228 static int encoding_rewinder (Lstream *stream);
2229 static int encoding_seekable_p (Lstream *stream);
2230 static int encoding_flusher (Lstream *stream);
2231 static int encoding_closer (Lstream *stream);
2233 static Lisp_Object encoding_marker (Lisp_Object stream,
2234 void (*markobj) (Lisp_Object));
2236 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2237 sizeof (struct encoding_stream));
2240 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2242 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2243 Lisp_Object str_obj;
2245 /* We do not need to mark the coding systems or charsets stored
2246 within the stream because they are stored in a global list
2247 and automatically marked. */
2249 XSETLSTREAM (str_obj, str);
2251 if (str->imp->marker)
2252 return (str->imp->marker) (str_obj, markobj);
2257 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2258 so we read data from the other end, encode it, and store it into DATA. */
2261 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2263 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2264 unsigned char *orig_data = data;
2266 int error_occurred = 0;
2268 /* We need to interface to mule_encode(), which expects to take some
2269 amount of data and store the result into a Dynarr. We have
2270 mule_encode() store into str->runoff, and take data from there
2273 /* We loop until we have enough data, reading chunks from the other
2274 end and encoding it. */
2277 /* Take data from the runoff if we can. Make sure to take at
2278 most SIZE bytes, and delete the data from the runoff. */
2279 if (Dynarr_length (str->runoff) > 0)
2281 int chunk = min ((int) size, Dynarr_length (str->runoff));
2282 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2283 Dynarr_delete_many (str->runoff, 0, chunk);
2289 break; /* No more room for data */
2291 if (str->flags & CODING_STATE_END)
2292 /* This means that on the previous iteration, we hit the EOF on
2293 the other end. We loop once more so that mule_encode() can
2294 output any final stuff it may be holding, or any "go back
2295 to a sane state" escape sequences. (This latter makes sense
2296 during encoding.) */
2299 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2300 left of storage in it, so it's OK to read directly into it.
2301 (We'll be overwriting above, after we've encoded it into the
2303 read_size = Lstream_read (str->other_end, data, size);
2310 /* There might be some more end data produced in the translation.
2311 See the comment above. */
2312 str->flags |= CODING_STATE_END;
2313 mule_encode (stream, data, str->runoff, read_size);
2316 if (data == orig_data)
2317 return error_occurred ? -1 : 0;
2319 return data - orig_data;
2323 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2325 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2328 /* Encode all our data into the runoff, and then attempt to write
2329 it all out to the other end. Remove whatever chunk we succeeded
2331 mule_encode (stream, data, str->runoff, size);
2332 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2333 Dynarr_length (str->runoff));
2335 Dynarr_delete_many (str->runoff, 0, retval);
2336 /* Do NOT return retval. The return value indicates how much
2337 of the incoming data was written, not how many bytes were
2343 reset_encoding_stream (struct encoding_stream *str)
2346 switch (CODING_SYSTEM_TYPE (str->codesys))
2348 case CODESYS_ISO2022:
2352 for (i = 0; i < 4; i++)
2354 str->iso2022.charset[i] =
2355 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2356 str->iso2022.force_charset_on_output[i] =
2357 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2359 str->iso2022.register_left = 0;
2360 str->iso2022.register_right = 1;
2361 str->iso2022.current_charset = Qnil;
2362 str->iso2022.current_half = 0;
2363 str->iso2022.current_char_boundary = 1;
2367 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2374 str->flags = str->ch = 0;
2378 encoding_rewinder (Lstream *stream)
2380 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2381 reset_encoding_stream (str);
2382 Dynarr_reset (str->runoff);
2383 return Lstream_rewind (str->other_end);
2387 encoding_seekable_p (Lstream *stream)
2389 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2390 return Lstream_seekable_p (str->other_end);
2394 encoding_flusher (Lstream *stream)
2396 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2397 return Lstream_flush (str->other_end);
2401 encoding_closer (Lstream *stream)
2403 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2404 if (stream->flags & LSTREAM_FL_WRITE)
2406 str->flags |= CODING_STATE_END;
2407 encoding_writer (stream, 0, 0);
2409 Dynarr_free (str->runoff);
2410 return Lstream_close (str->other_end);
2414 encoding_stream_coding_system (Lstream *stream)
2416 Lisp_Object coding_system;
2417 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2419 XSETCODING_SYSTEM (coding_system, str->codesys);
2420 return coding_system;
2424 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2426 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2427 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2429 reset_encoding_stream (str);
2433 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2436 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2437 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2441 str->runoff = Dynarr_new (unsigned_char);
2442 str->other_end = stream;
2443 set_encoding_stream_coding_system (lstr, codesys);
2444 XSETLSTREAM (obj, lstr);
2449 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2451 return make_encoding_stream_1 (stream, codesys, "r");
2455 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2457 return make_encoding_stream_1 (stream, codesys, "w");
2460 /* Convert N bytes of internally-formatted data stored in SRC to an
2461 external format, according to the encoding stream ENCODING.
2462 Store the encoded data into DST. */
2465 mule_encode (Lstream *encoding, CONST unsigned char *src,
2466 unsigned_char_dynarr *dst, unsigned int n)
2468 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2470 switch (CODING_SYSTEM_TYPE (str->codesys))
2473 case CODESYS_INTERNAL:
2474 Dynarr_add_many (dst, src, n);
2477 case CODESYS_AUTODETECT:
2478 /* If we got this far and still haven't decided on the coding
2479 system, then do no conversion. */
2480 case CODESYS_NO_CONVERSION:
2481 encode_coding_no_conversion (encoding, src, dst, n);
2484 case CODESYS_SHIFT_JIS:
2485 encode_coding_sjis (encoding, src, dst, n);
2488 encode_coding_big5 (encoding, src, dst, n);
2491 encode_coding_ucs4 (encoding, src, dst, n);
2494 encode_coding_utf8 (encoding, src, dst, n);
2497 ccl_driver (&str->ccl, src, dst, n, 0);
2499 case CODESYS_ISO2022:
2500 encode_coding_iso2022 (encoding, src, dst, n);
2508 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2509 Encode the text between START and END using CODING-SYSTEM.
2510 This will, for example, convert Japanese characters into stuff such as
2511 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2512 text. BUFFER defaults to the current buffer if unspecified.
2514 (start, end, coding_system, buffer))
2517 struct buffer *buf = decode_buffer (buffer, 0);
2518 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2519 Lstream *istr, *ostr;
2520 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2522 get_buffer_range_char (buf, start, end, &b, &e, 0);
2524 barf_if_buffer_read_only (buf, b, e);
2526 coding_system = Fget_coding_system (coding_system);
2527 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2528 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2529 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2530 Fget_coding_system (Qbinary));
2531 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2533 istr = XLSTREAM (instream);
2534 ostr = XLSTREAM (outstream);
2535 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2536 /* The chain of streams looks like this:
2538 [BUFFER] <----- send through
2539 ------> [ENCODE AS SPECIFIED]
2540 ------> [DECODE AS BINARY]
2545 char tempbuf[1024]; /* some random amount */
2546 Bufpos newpos, even_newer_pos;
2547 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2548 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2552 newpos = lisp_buffer_stream_startpos (istr);
2553 Lstream_write (ostr, tempbuf, size_in_bytes);
2554 even_newer_pos = lisp_buffer_stream_startpos (istr);
2555 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2561 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2562 Lstream_close (istr);
2563 Lstream_close (ostr);
2565 Lstream_delete (istr);
2566 Lstream_delete (ostr);
2567 Lstream_delete (XLSTREAM (de_outstream));
2568 Lstream_delete (XLSTREAM (lb_outstream));
2569 return make_int (retlen);
2575 /************************************************************************/
2576 /* Shift-JIS methods */
2577 /************************************************************************/
2579 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2580 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2581 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2582 encoded by "position-code + 0x80". A character of JISX0208
2583 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
2584 position-codes are divided and shifted so that they fit in the range
2587 --- CODE RANGE of Shift-JIS ---
2588 (character set) (range)
2590 JISX0201-Kana 0xA0 .. 0xDF
2591 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2592 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2593 -------------------------------
2597 /* Is this the first byte of a Shift-JIS two-byte char? */
2599 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2600 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2602 /* Is this the second byte of a Shift-JIS two-byte char? */
2604 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2605 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2607 #define BYTE_SJIS_KATAKANA_P(c) \
2608 ((c) >= 0xA1 && (c) <= 0xDF)
2611 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2619 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2621 if (st->shift_jis.in_second_byte)
2623 st->shift_jis.in_second_byte = 0;
2627 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2628 st->shift_jis.in_second_byte = 1;
2630 return CODING_CATEGORY_SHIFT_JIS_MASK;
2633 /* Convert Shift-JIS data to internal format. */
2636 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2637 unsigned_char_dynarr *dst, unsigned int n)
2640 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2641 unsigned int flags = str->flags;
2642 unsigned int ch = str->ch;
2643 eol_type_t eol_type = str->eol_type;
2651 /* Previous character was first byte of Shift-JIS Kanji char. */
2652 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2654 unsigned char e1, e2;
2656 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2657 DECODE_SJIS (ch, c, e1, e2);
2658 Dynarr_add (dst, e1);
2659 Dynarr_add (dst, e2);
2663 DECODE_ADD_BINARY_CHAR (ch, dst);
2664 DECODE_ADD_BINARY_CHAR (c, dst);
2670 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2671 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2673 else if (BYTE_SJIS_KATAKANA_P (c))
2675 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2676 Dynarr_add (dst, c);
2679 DECODE_ADD_BINARY_CHAR (c, dst);
2681 label_continue_loop:;
2684 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2690 /* Convert internally-formatted data to Shift-JIS. */
2693 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2694 unsigned_char_dynarr *dst, unsigned int n)
2697 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2698 unsigned int flags = str->flags;
2699 unsigned int ch = str->ch;
2700 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2707 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2708 Dynarr_add (dst, '\r');
2709 if (eol_type != EOL_CR)
2710 Dynarr_add (dst, '\n');
2713 else if (BYTE_ASCII_P (c))
2715 Dynarr_add (dst, c);
2718 else if (BUFBYTE_LEADING_BYTE_P (c))
2719 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2720 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2721 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2724 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2726 Dynarr_add (dst, c);
2729 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2730 ch == LEADING_BYTE_JAPANESE_JISX0208)
2734 unsigned char j1, j2;
2735 ENCODE_SJIS (ch, c, j1, j2);
2736 Dynarr_add (dst, j1);
2737 Dynarr_add (dst, j2);
2747 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2748 Decode a JISX0208 character of Shift-JIS coding-system.
2749 CODE is the character code in Shift-JIS as a cons of two bytes.
2750 Return the corresponding character.
2754 unsigned char c1, c2, s1, s2;
2757 CHECK_INT (XCAR (code));
2758 CHECK_INT (XCDR (code));
2759 s1 = XINT (XCAR (code));
2760 s2 = XINT (XCDR (code));
2761 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2762 BYTE_SJIS_TWO_BYTE_2_P (s2))
2764 DECODE_SJIS (s1, s2, c1, c2);
2765 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2766 c1 & 0x7F, c2 & 0x7F));
2772 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2773 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2774 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2778 Lisp_Object charset;
2781 CHECK_CHAR_COERCE_INT (ch);
2782 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2783 if (EQ (charset, Vcharset_japanese_jisx0208))
2785 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2786 return Fcons (make_int (s1), make_int (s2));
2793 /************************************************************************/
2795 /************************************************************************/
2797 /* BIG5 is a coding system encoding two character sets: ASCII and
2798 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2799 character set and is encoded in two bytes.
2801 --- CODE RANGE of BIG5 ---
2802 (character set) (range)
2804 Big5 (1st byte) 0xA1 .. 0xFE
2805 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2806 --------------------------
2808 Since the number of characters in Big5 is larger than maximum
2809 characters in Emacs' charset (96x96), it can't be handled as one
2810 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2811 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2812 contains frequently used characters and the latter contains less
2813 frequently used characters. */
2815 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2816 ((c) >= 0xA1 && (c) <= 0xFE)
2818 /* Is this the second byte of a Big5 two-byte char? */
2820 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2821 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2823 /* Number of Big5 characters which have the same code in 1st byte. */
2825 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2827 /* Code conversion macros. These are macros because they are used in
2828 inner loops during code conversion.
2830 Note that temporary variables in macros introduce the classic
2831 dynamic-scoping problems with variable names. We use capital-
2832 lettered variables in the assumption that XEmacs does not use
2833 capital letters in variables except in a very formalized way
2836 /* Convert Big5 code (b1, b2) into its internal string representation
2839 /* There is a much simpler way to split the Big5 charset into two.
2840 For the moment I'm going to leave the algorithm as-is because it
2841 claims to separate out the most-used characters into a single
2842 charset, which perhaps will lead to optimizations in various
2845 The way the algorithm works is something like this:
2847 Big5 can be viewed as a 94x157 charset, where the row is
2848 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2849 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2850 the split between low and high column numbers is apparently
2851 meaningless; ascending rows produce less and less frequent chars.
2852 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2853 the first charset, and the upper half (0xC9 .. 0xFE) to the
2854 second. To do the conversion, we convert the character into
2855 a single number where 0 .. 156 is the first row, 157 .. 313
2856 is the second, etc. That way, the characters are ordered by
2857 decreasing frequency. Then we just chop the space in two
2858 and coerce the result into a 94x94 space.
2861 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2863 int B1 = b1, B2 = b2; \
2865 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2869 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2873 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2874 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2876 c1 = I / (0xFF - 0xA1) + 0xA1; \
2877 c2 = I % (0xFF - 0xA1) + 0xA1; \
2880 /* Convert the internal string representation of a Big5 character
2881 (lb, c1, c2) into Big5 code (b1, b2). */
2883 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2885 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2887 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2889 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2891 b1 = I / BIG5_SAME_ROW + 0xA1; \
2892 b2 = I % BIG5_SAME_ROW; \
2893 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2897 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2905 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2906 (c >= 0x80 && c <= 0xA0))
2908 if (st->big5.in_second_byte)
2910 st->big5.in_second_byte = 0;
2911 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2915 st->big5.in_second_byte = 1;
2917 return CODING_CATEGORY_BIG5_MASK;
2920 /* Convert Big5 data to internal format. */
2923 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2924 unsigned_char_dynarr *dst, unsigned int n)
2927 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2928 unsigned int flags = str->flags;
2929 unsigned int ch = str->ch;
2930 eol_type_t eol_type = str->eol_type;
2937 /* Previous character was first byte of Big5 char. */
2938 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2940 unsigned char b1, b2, b3;
2941 DECODE_BIG5 (ch, c, b1, b2, b3);
2942 Dynarr_add (dst, b1);
2943 Dynarr_add (dst, b2);
2944 Dynarr_add (dst, b3);
2948 DECODE_ADD_BINARY_CHAR (ch, dst);
2949 DECODE_ADD_BINARY_CHAR (c, dst);
2955 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2956 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2959 DECODE_ADD_BINARY_CHAR (c, dst);
2961 label_continue_loop:;
2964 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2970 /* Convert internally-formatted data to Big5. */
2973 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2974 unsigned_char_dynarr *dst, unsigned int n)
2977 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2978 unsigned int flags = str->flags;
2979 unsigned int ch = str->ch;
2980 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2987 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2988 Dynarr_add (dst, '\r');
2989 if (eol_type != EOL_CR)
2990 Dynarr_add (dst, '\n');
2992 else if (BYTE_ASCII_P (c))
2995 Dynarr_add (dst, c);
2997 else if (BUFBYTE_LEADING_BYTE_P (c))
2999 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3000 c == LEADING_BYTE_CHINESE_BIG5_2)
3002 /* A recognized leading byte. */
3004 continue; /* not done with this character. */
3006 /* otherwise just ignore this character. */
3008 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3009 ch == LEADING_BYTE_CHINESE_BIG5_2)
3011 /* Previous char was a recognized leading byte. */
3013 continue; /* not done with this character. */
3017 /* Encountering second byte of a Big5 character. */
3018 unsigned char b1, b2;
3020 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3021 Dynarr_add (dst, b1);
3022 Dynarr_add (dst, b2);
3033 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3034 Decode a Big5 character CODE of BIG5 coding-system.
3035 CODE is the character code in BIG5, a cons of two integers.
3036 Return the corresponding character.
3040 unsigned char c1, c2, b1, b2;
3043 CHECK_INT (XCAR (code));
3044 CHECK_INT (XCDR (code));
3045 b1 = XINT (XCAR (code));
3046 b2 = XINT (XCDR (code));
3047 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3048 BYTE_BIG5_TWO_BYTE_2_P (b2))
3051 Lisp_Object charset;
3052 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3053 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3054 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3060 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3061 Encode the Big5 character CH to BIG5 coding-system.
3062 Return the corresponding character code in Big5.
3066 Lisp_Object charset;
3069 CHECK_CHAR_COERCE_INT (ch);
3070 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3071 if (EQ (charset, Vcharset_chinese_big5_1) ||
3072 EQ (charset, Vcharset_chinese_big5_2))
3074 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3076 return Fcons (make_int (b1), make_int (b2));
3083 /************************************************************************/
3086 /* UCS-4 character codes are implemented as nonnegative integers. */
3088 /************************************************************************/
3090 Lisp_Object ucs_to_mule_table[65536];
3091 Lisp_Object mule_to_ucs_table;
3093 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3094 Map UCS-4 code CODE to Mule character CHARACTER.
3096 Return T on success, NIL on failure.
3102 CHECK_CHAR (character);
/* Bound C by the number of entries in ucs_to_mule_table.  A bare
   sizeof yields the table's size in bytes, which is larger than its
   element count and would let the store below land past the end of
   the array. */
3106 if (c < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
3108 ucs_to_mule_table[c] = character;
/* Map UCS-4 code CODE to a Mule character.  Codes that index into
   ucs_to_mule_table are looked up there directly; codes in the block
   beginning at 0xe00000 are mapped onto a left-to-right 94x94 charset
   chosen by final byte, with the two position codes offset by 33. */
3116 ucs_to_char (unsigned long code)
/* Compare against the element count, not the byte size, of the table
   so that the lookup below cannot read past its end. */
3118 if (code < sizeof (ucs_to_mule_table) / sizeof (ucs_to_mule_table[0]))
3120 return ucs_to_mule_table[code];
3122 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3127 c = code % (94 * 94);
3129 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3130 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3131 CHARSET_LEFT_TO_RIGHT),
3132 c / 94 + 33, c % 94 + 33));
3138 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3139 Return Mule character corresponding to UCS code CODE (a positive integer).
3143 CHECK_NATNUM (code);
3144 return ucs_to_char (XINT (code));
3147 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3148 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3152 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3153 Fset_char_ucs is more restrictive on index arg, but should
3154 check code arg in a char_table method. */
3155 CHECK_CHAR (character);
3156 CHECK_NATNUM (code);
3157 return Fput_char_table (character, code, mule_to_ucs_table);
3160 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3161 Return the UCS code (a positive integer) corresponding to CHARACTER.
3165 return Fget_char_table (character, mule_to_ucs_table);
3168 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3169 JIS X 0208 double-width `=' instead.
3170 #### do something more appropriate (use blob?)
3171 Danger, Will Robinson! Data loss. Should we signal user? */
3173 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3175 Lisp_Object chr = ucs_to_char (ch);
3179 Bufbyte work[MAX_EMCHAR_LEN];
3184 simple_set_charptr_emchar (work, ch) :
3185 non_ascii_set_charptr_emchar (work, ch);
3186 Dynarr_add_many (dst, work, len);
3190 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3191 Dynarr_add (dst, 34 + 128);
3192 Dynarr_add (dst, 46 + 128);
3196 static unsigned long
3197 mule_char_to_ucs4 (Lisp_Object charset,
3198 unsigned char h, unsigned char l)
3201 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3208 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3209 (XCHARSET_CHARS (charset) == 94) )
3211 unsigned char final = XCHARSET_FINAL (charset);
3213 if ( ('@' <= final) && (final < 0x7f) )
3215 return 0xe00000 + (final - '@') * 94 * 94
3216 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3230 encode_ucs4 (Lisp_Object charset,
3231 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3233 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3234 Dynarr_add (dst, code >> 24);
3235 Dynarr_add (dst, (code >> 16) & 255);
3236 Dynarr_add (dst, (code >> 8) & 255);
3237 Dynarr_add (dst, code & 255);
3241 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3247 switch (st->ucs4.in_byte)
3256 st->ucs4.in_byte = 0;
3262 return CODING_CATEGORY_UCS4_MASK;
3266 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3267 unsigned_char_dynarr *dst, unsigned int n)
3269 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3270 unsigned int flags = str->flags;
3271 unsigned int ch = str->ch;
3275 unsigned char c = *src++;
3283 decode_ucs4 ( ( ch << 8 ) | c, dst);
3288 ch = ( ch << 8 ) | c;
3292 if (flags & CODING_STATE_END)
3293 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3300 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3301 unsigned_char_dynarr *dst, unsigned int n)
3303 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3304 unsigned int flags = str->flags;
3305 unsigned int ch = str->ch;
3306 unsigned char char_boundary = str->iso2022.current_char_boundary;
3307 Lisp_Object charset = str->iso2022.current_charset;
3309 #ifdef ENABLE_COMPOSITE_CHARS
3310 /* flags for handling composite chars. We do a little switcharoo
3311 on the source while we're outputting the composite char. */
3312 unsigned int saved_n = 0;
3313 CONST unsigned char *saved_src = NULL;
3314 int in_composite = 0;
3321 unsigned char c = *src++;
3323 if (BYTE_ASCII_P (c))
3324 { /* Processing ASCII character */
3326 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3329 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3330 { /* Processing Leading Byte */
3332 charset = CHARSET_BY_LEADING_BYTE (c);
3333 if (LEADING_BYTE_PREFIX_P(c))
3338 { /* Processing Non-ASCII character */
3340 if (EQ (charset, Vcharset_control_1))
3342 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3346 switch (XCHARSET_REP_BYTES (charset))
3349 encode_ucs4 (charset, c, 0, dst);
3352 if (XCHARSET_PRIVATE_P (charset))
3354 encode_ucs4 (charset, c, 0, dst);
3359 #ifdef ENABLE_COMPOSITE_CHARS
3360 if (EQ (charset, Vcharset_composite))
3364 /* #### Bother! We don't know how to
3366 Dynarr_add (dst, 0);
3367 Dynarr_add (dst, 0);
3368 Dynarr_add (dst, 0);
3369 Dynarr_add (dst, '~');
3373 Emchar emch = MAKE_CHAR (Vcharset_composite,
3374 ch & 0x7F, c & 0x7F);
3375 Lisp_Object lstr = composite_char_string (emch);
3379 src = XSTRING_DATA (lstr);
3380 n = XSTRING_LENGTH (lstr);
3384 #endif /* ENABLE_COMPOSITE_CHARS */
3386 encode_ucs4(charset, ch, c, dst);
3399 encode_ucs4 (charset, ch, c, dst);
3415 #ifdef ENABLE_COMPOSITE_CHARS
3421 goto back_to_square_n; /* Wheeeeeeeee ..... */
3423 #endif /* ENABLE_COMPOSITE_CHARS */
3427 str->iso2022.current_char_boundary = char_boundary;
3428 str->iso2022.current_charset = charset;
3430 /* Verbum caro factum est! */
3434 /************************************************************************/
3436 /************************************************************************/
3439 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3444 unsigned char c = *src++;
3445 switch (st->utf8.in_byte)
3448 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3451 st->utf8.in_byte = 5;
3453 st->utf8.in_byte = 4;
3455 st->utf8.in_byte = 3;
3457 st->utf8.in_byte = 2;
3459 st->utf8.in_byte = 1;
3464 if ((c & 0xc0) != 0x80)
3470 return CODING_CATEGORY_UTF8_MASK;
3474 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3475 unsigned_char_dynarr *dst, unsigned int n)
3477 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3478 unsigned int flags = str->flags;
3479 unsigned int ch = str->ch;
3480 eol_type_t eol_type = str->eol_type;
3484 unsigned char c = *src++;
3493 else if ( c >= 0xf8 )
3498 else if ( c >= 0xf0 )
3503 else if ( c >= 0xe0 )
3508 else if ( c >= 0xc0 )
3515 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3516 decode_ucs4 (c, dst);
3520 ch = ( ch << 6 ) | ( c & 0x3f );
3521 decode_ucs4 (ch, dst);
3526 ch = ( ch << 6 ) | ( c & 0x3f );
3529 label_continue_loop:;
3532 if (flags & CODING_STATE_END)
3533 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3540 encode_utf8 (Lisp_Object charset,
3541 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3543 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3546 Dynarr_add (dst, code);
3548 else if ( code <= 0x7ff )
3550 Dynarr_add (dst, (code >> 6) | 0xc0);
3551 Dynarr_add (dst, (code & 0x3f) | 0x80);
3553 else if ( code <= 0xffff )
3555 Dynarr_add (dst, (code >> 12) | 0xe0);
3556 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3557 Dynarr_add (dst, (code & 0x3f) | 0x80);
3559 else if ( code <= 0x1fffff )
3561 Dynarr_add (dst, (code >> 18) | 0xf0);
3562 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3563 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3564 Dynarr_add (dst, (code & 0x3f) | 0x80);
3566 else if ( code <= 0x3ffffff )
3568 Dynarr_add (dst, (code >> 24) | 0xf8);
3569 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3570 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3571 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3572 Dynarr_add (dst, (code & 0x3f) | 0x80);
3576 Dynarr_add (dst, (code >> 30) | 0xfc);
3577 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3578 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3579 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3580 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3581 Dynarr_add (dst, (code & 0x3f) | 0x80);
3586 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3587 unsigned_char_dynarr *dst, unsigned int n)
3589 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3590 unsigned int flags = str->flags;
3591 unsigned int ch = str->ch;
3592 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3593 unsigned char char_boundary = str->iso2022.current_char_boundary;
3594 Lisp_Object charset = str->iso2022.current_charset;
3596 #ifdef ENABLE_COMPOSITE_CHARS
3597 /* flags for handling composite chars. We do a little switcharoo
3598 on the source while we're outputting the composite char. */
3599 unsigned int saved_n = 0;
3600 CONST unsigned char *saved_src = NULL;
3601 int in_composite = 0;
3604 #endif /* ENABLE_COMPOSITE_CHARS */
3608 unsigned char c = *src++;
3610 if (BYTE_ASCII_P (c))
3611 { /* Processing ASCII character */
3615 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3616 Dynarr_add (dst, '\r');
3617 if (eol_type != EOL_CR)
3618 Dynarr_add (dst, c);
3621 encode_utf8 (Vcharset_ascii, c, 0, dst);
3624 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3625 { /* Processing Leading Byte */
3627 charset = CHARSET_BY_LEADING_BYTE (c);
3628 if (LEADING_BYTE_PREFIX_P(c))
3633 { /* Processing Non-ASCII character */
3635 if (EQ (charset, Vcharset_control_1))
3637 encode_utf8 (Vcharset_control_1, c, 0, dst);
3641 switch (XCHARSET_REP_BYTES (charset))
3644 encode_utf8 (charset, c, 0, dst);
3647 if (XCHARSET_PRIVATE_P (charset))
3649 encode_utf8 (charset, c, 0, dst);
3654 #ifdef ENABLE_COMPOSITE_CHARS
3655 if (EQ (charset, Vcharset_composite))
3659 /* #### Bother! We don't know how to
3661 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3665 Emchar emch = MAKE_CHAR (Vcharset_composite,
3666 ch & 0x7F, c & 0x7F);
3667 Lisp_Object lstr = composite_char_string (emch);
3671 src = XSTRING_DATA (lstr);
3672 n = XSTRING_LENGTH (lstr);
3676 #endif /* ENABLE_COMPOSITE_CHARS */
3678 encode_utf8 (charset, ch, c, dst);
3691 encode_utf8 (charset, ch, c, dst);
3707 #ifdef ENABLE_COMPOSITE_CHARS
3713 goto back_to_square_n; /* Wheeeeeeeee ..... */
3719 str->iso2022.current_char_boundary = char_boundary;
3720 str->iso2022.current_charset = charset;
3722 /* Verbum caro factum est! */
3726 /************************************************************************/
3727 /* ISO2022 methods */
3728 /************************************************************************/
3730 /* The following note describes the coding system ISO2022 briefly.
3731 Since the intention of this note is to help understand the
3732 functions in this file, some parts are NOT ACCURATE or OVERLY
3733 SIMPLIFIED. For thorough understanding, please refer to the
3734 original document of ISO2022.
3736 ISO2022 provides many mechanisms to encode several character sets
3737 in 7-bit and 8-bit environments. For 7-bit environments, all text
3738 is encoded using bytes less than 128. This may make the encoded
3739 text a little bit longer, but the text passes more easily through
3740 several gateways, some of which strip off MSB (Most Significant Bit).
3742 There are two kinds of character sets: control character set and
3743 graphic character set. The former contains control characters such
3744 as `newline' and `escape' to provide control functions (control
3745 functions are also provided by escape sequences). The latter
3746 contains graphic characters such as 'A' and '-'. Emacs recognizes
3747 two control character sets and many graphic character sets.
3749 Graphic character sets are classified into one of the following
3750 four classes, according to the number of bytes (DIMENSION) and
3751 number of characters in one dimension (CHARS) of the set:
3752 - DIMENSION1_CHARS94
3753 - DIMENSION1_CHARS96
3754 - DIMENSION2_CHARS94
3755 - DIMENSION2_CHARS96
3757 In addition, each character set is assigned an identification tag,
3758 unique for each set, called "final character" (denoted as <F>
3759 hereafter). The <F> of each character set is decided by ECMA(*)
3760 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3761 (0x30..0x3F are for private use only).
3763 Note (*): ECMA = European Computer Manufacturers Association
3765 Here are examples of graphic character set [NAME(<F>)]:
3766 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3767 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3768 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3769 o DIMENSION2_CHARS96 -- none for the moment
3771 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3772 C0 [0x00..0x1F] -- control character plane 0
3773 GL [0x20..0x7F] -- graphic character plane 0
3774 C1 [0x80..0x9F] -- control character plane 1
3775 GR [0xA0..0xFF] -- graphic character plane 1
3777 A control character set is directly designated and invoked to C0 or
3778 C1 by an escape sequence. The most common case is that:
3779 - ISO646's control character set is designated/invoked to C0, and
3780 - ISO6429's control character set is designated/invoked to C1,
3781 and usually these designations/invocations are omitted in encoded
3782 text. In a 7-bit environment, only C0 can be used, and a control
3783 character for C1 is encoded by an appropriate escape sequence to
3784 fit into the environment. All control characters for C1 are
3785 defined to have corresponding escape sequences.
3787 A graphic character set is at first designated to one of four
3788 graphic registers (G0 through G3), then these graphic registers are
3789 invoked to GL or GR. These designations and invocations can be
3790 done independently. The most common case is that G0 is invoked to
3791 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3792 these invocations and designations are omitted in encoded text.
3793 In a 7-bit environment, only GL can be used.
3795 When a graphic character set of CHARS94 is invoked to GL, codes
3796 0x20 and 0x7F of the GL area work as control characters SPACE and
3797 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used.
3800 There are two ways of invocation: locking-shift and single-shift.
3801 With locking-shift, the invocation lasts until the next different
3802 invocation, whereas with single-shift, the invocation affects the
3803 following character only and doesn't affect the locking-shift
3804 state. Invocations are done by the following control characters or escape sequences:
3807 ----------------------------------------------------------------------
3808 abbrev function cntrl escape seq description
3809 ----------------------------------------------------------------------
3810 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3811 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3812 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3813 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3814 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3815 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3816 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3817 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3818 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3819 ----------------------------------------------------------------------
3820 (*) These are not used by any known coding system.
3822 Control characters for these functions are defined by macros
3823 ISO_CODE_XXX in `coding.h'.
3825 Designations are done by the following escape sequences:
3826 ----------------------------------------------------------------------
3827 escape sequence description
3828 ----------------------------------------------------------------------
3829 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3830 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3831 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3832 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3833 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3834 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3835 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3836 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3837 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3838 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3839 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3840 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3841 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3842 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3843 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3844 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3845 ----------------------------------------------------------------------
3847 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3848 of dimension 1, chars 94, and final character <F>, etc...
3850 Note (*): Although these designations are not allowed in ISO2022,
3851 Emacs accepts them on decoding, and produces them on encoding
3852 CHARS96 character sets in a coding system which is characterized as
3853 7-bit environment, non-locking-shift, and non-single-shift.
3855 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3856 '(' can be omitted. We refer to this as "short-form" hereafter.
3858 Now you may notice that there are a lot of ways for encoding the
3859 same multilingual text in ISO2022. Actually, there exist many
3860 coding systems such as Compound Text (used in X11's inter-client
3861 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3862 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3863 localized platforms), and all of these are variants of ISO2022.
3865 In addition to the above, Emacs handles two more kinds of escape
3866 sequences: ISO6429's direction specification and Emacs' private
3867 sequence for specifying character composition.
3869 ISO6429's direction specification takes the following form:
3870 o CSI ']' -- end of the current direction
3871 o CSI '0' ']' -- end of the current direction
3872 o CSI '1' ']' -- start of left-to-right text
3873 o CSI '2' ']' -- start of right-to-left text
3874 The control character CSI (0x9B: control sequence introducer) is
3875 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3877 Character composition specification takes the following form:
3878 o ESC '0' -- start character composition
3879 o ESC '1' -- end character composition
3880 Since these are not standard escape sequences of any ISO standard,
3881 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its starting configuration.
   CODING_SYSTEM may be nil (detect_coding_iso2022 passes Qnil); the
   per-register initial charset is only looked up when it is non-nil. */
3884 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
/* Walk the four graphic registers. */
3888 for (i = 0; i < 4; i++)
3890 if (!NILP (coding_system))
3892 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
/* Qt is stored as a non-charset placeholder; the code that later invokes
   a register tests it with CHARSETP first. */
3894 iso->charset[i] = Qt;
3895 iso->invalid_designated[i] = 0;
/* No escape sequence is in progress and no escape bytes are buffered. */
3897 iso->esc = ISO_ESC_NOTHING;
3898 iso->esc_bytes_index = 0;
/* Register 0 is invoked into the left half, register 1 into the right. */
3899 iso->register_left = 0;
3900 iso->register_right = 1;
/* Clear the bookkeeping used for preserving direction-switch and invalid
   designation sequences literally. */
3901 iso->switched_dir_and_no_valid_charset_yet = 0;
3902 iso->invalid_switch_dir = 0;
3903 iso->output_direction_sequence = 0;
3904 iso->output_literally = 0;
3905 #ifdef ENABLE_COMPOSITE_CHARS
/* Empty the composite-character buffer if one has been allocated. */
3906 if (iso->composite_chars)
3907 Dynarr_reset (iso->composite_chars);
3912 fit_to_be_escape_quoted (unsigned char c)
3929 /* Parse one byte of an ISO2022 escape sequence.
3930 If the result is an invalid escape sequence, return 0 and
3931 do not change anything in STR. Otherwise, if the result is
3932 an incomplete escape sequence, update ISO2022.ESC and
3933 ISO2022.ESC_BYTES and return -1. Otherwise, update
3934 all the state variables (but not ISO2022.ESC_BYTES) and
3937 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3938 or invocation of an invalid character set and treat that as
3939 an unrecognized escape sequence. */
/* Return-value contract as relied on by the callers visible in this file:
   zero means "not a valid escape sequence" (decode_coding_iso2022 then
   emits the raw bytes itself); nonzero means the byte was consumed.
   NOTE(review): the `return' statements for the success paths are not
   part of this excerpt; confirm the exact positive value against the
   full function.

   CODESYS is nil during coding-system detection.  ISO holds the decoder
   state that is updated; C is the byte being parsed; *FLAGS is the
   stream's CODING_STATE_* bit set and is updated in place. */
3942 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3943 unsigned char c, unsigned int *flags,
3944 int check_invalid_charsets)
3946 /* (1) If we're at the end of a designation sequence, CS is the
3947 charset being designated and REG is the register to designate
3950 (2) If we're at the end of a locking-shift sequence, REG is
3951 the register to invoke and HALF (0 == left, 1 == right) is
3952 the half to invoke it into.
3954 (3) If we're at the end of a single-shift sequence, REG is
3955 the register to invoke. */
3956 Lisp_Object cs = Qnil;
3959 /* NOTE: This code does goto's all over the fucking place.
3960 The reason for this is that we're basically implementing
3961 a state machine here, and hierarchical languages like C
3962 don't really provide a clean way of doing this. */
3964 if (! (*flags & CODING_STATE_ESCAPE))
3965 /* At beginning of escape sequence; we need to reset our
3966 escape-state variables. */
3967 iso->esc = ISO_ESC_NOTHING;
3969 iso->output_literally = 0;
3970 iso->output_direction_sequence = 0;
3974 case ISO_ESC_NOTHING:
3975 iso->esc_bytes_index = 0;
3978 case ISO_CODE_ESC: /* Start escape sequence */
3979 *flags |= CODING_STATE_ESCAPE;
3983 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3984 *flags |= CODING_STATE_ESCAPE;
3985 iso->esc = ISO_ESC_5_11;
3988 case ISO_CODE_SO: /* locking shift 1 */
3991 case ISO_CODE_SI: /* locking shift 0 */
3995 case ISO_CODE_SS2: /* single shift */
3998 case ISO_CODE_SS3: /* single shift */
4002 default: /* Other control characters */
4009 /**** single shift ****/
4011 case 'N': /* single shift 2 */
4014 case 'O': /* single shift 3 */
4018 /**** locking shift ****/
4020 case '~': /* locking shift 1 right */
4023 case 'n': /* locking shift 2 */
4026 case '}': /* locking shift 2 right */
4029 case 'o': /* locking shift 3 */
4032 case '|': /* locking shift 3 right */
4036 #ifdef ENABLE_COMPOSITE_CHARS
4037 /**** composite ****/
4040 iso->esc = ISO_ESC_START_COMPOSITE;
4041 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4042 CODING_STATE_COMPOSITE;
4046 iso->esc = ISO_ESC_END_COMPOSITE;
4047 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4048 ~CODING_STATE_COMPOSITE;
4050 #endif /* ENABLE_COMPOSITE_CHARS */
4052 /**** directionality ****/
4055 iso->esc = ISO_ESC_5_11;
4058 /**** designation ****/
4060 case '$': /* multibyte charset prefix */
4061 iso->esc = ISO_ESC_2_4;
4065 if (0x28 <= c && c <= 0x2F)
4067 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4071 /* This function is called with CODESYS equal to nil when
4072 doing coding-system detection. */
4074 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4075 && fit_to_be_escape_quoted (c))
4077 iso->esc = ISO_ESC_LITERAL;
4078 *flags &= CODING_STATE_ISO2022_LOCK;
4088 /**** directionality ****/
4090 case ISO_ESC_5_11: /* ISO6429 direction control */
4093 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4094 goto directionality;
4096 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4097 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4098 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4102 case ISO_ESC_5_11_0:
4105 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4106 goto directionality;
4110 case ISO_ESC_5_11_1:
/* Mask FLAGS rather than overwrite it: keep whichever lock-state bits
   are currently set and clear only the right-to-left bit, exactly as
   the ISO_ESC_5_11 and ISO_ESC_5_11_0 cases above do.  A plain
   assignment would switch on every lock bit other than R2L regardless
   of the previous state. */
4113 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4114 goto directionality;
4118 case ISO_ESC_5_11_2:
4121 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4122 goto directionality;
4127 iso->esc = ISO_ESC_DIRECTIONALITY;
4128 /* Various junk here to attempt to preserve the direction sequences
4129 literally in the text if they would otherwise be swallowed due
4130 to invalid designations that don't show up as actual charset
4131 changes in the text. */
4132 if (iso->invalid_switch_dir)
4134 /* We already inserted a direction switch literally into the
4135 text. We assume (#### this may not be right) that the
4136 next direction switch is the one going the other way,
4137 and we need to output that literally as well. */
4138 iso->output_literally = 1;
4139 iso->invalid_switch_dir = 0;
4145 /* If we are in the thrall of an invalid designation,
4146 then stick the directionality sequence literally into the
4147 output stream so it ends up in the original text again. */
4148 for (jj = 0; jj < 4; jj++)
4149 if (iso->invalid_designated[jj])
4153 iso->output_literally = 1;
4154 iso->invalid_switch_dir = 1;
4157 /* Indicate that we haven't yet seen a valid designation,
4158 so that if a switch-dir is directly followed by an
4159 invalid designation, both get inserted literally. */
4160 iso->switched_dir_and_no_valid_charset_yet = 1;
4165 /**** designation ****/
4168 if (0x28 <= c && c <= 0x2F)
4170 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
/* Final bytes '@', 'A' and 'B' directly after ESC '$' are the short
   form of a 94x94 designation. */
4173 if (0x40 <= c && c <= 0x42)
4175 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4176 *flags & CODING_STATE_R2L ?
4177 CHARSET_RIGHT_TO_LEFT :
4178 CHARSET_LEFT_TO_RIGHT);
4188 if (c < '0' || c > '~')
4189 return 0; /* bad final byte */
/* The intermediate byte recorded in iso->esc selects both the charset
   type (94 vs. 96, one- vs. two-dimensional) and, via its low two
   bits, the target register. */
4191 if (iso->esc >= ISO_ESC_2_8 &&
4192 iso->esc <= ISO_ESC_2_15)
4194 type = ((iso->esc >= ISO_ESC_2_12) ?
4195 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4196 reg = (iso->esc - ISO_ESC_2_8) & 3;
4198 else if (iso->esc >= ISO_ESC_2_4_8 &&
4199 iso->esc <= ISO_ESC_2_4_15)
4201 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4202 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4203 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4207 /* Can this ever be reached? -slb */
4211 cs = CHARSET_BY_ATTRIBUTES (type, c,
4212 *flags & CODING_STATE_R2L ?
4213 CHARSET_RIGHT_TO_LEFT :
4214 CHARSET_LEFT_TO_RIGHT);
/* Remember the byte so an incomplete or invalid sequence can later be
   reproduced literally. */
4220 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4224 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4225 /* can't invoke something that ain't there. */
4227 iso->esc = ISO_ESC_SINGLE_SHIFT;
4228 *flags &= CODING_STATE_ISO2022_LOCK;
4230 *flags |= CODING_STATE_SS2;
4232 *flags |= CODING_STATE_SS3;
4236 if (check_invalid_charsets &&
4237 !CHARSETP (iso->charset[reg]))
4238 /* can't invoke something that ain't there. */
4241 iso->register_right = reg;
4243 iso->register_left = reg;
4244 *flags &= CODING_STATE_ISO2022_LOCK;
4245 iso->esc = ISO_ESC_LOCKING_SHIFT;
4249 if (NILP (cs) && check_invalid_charsets)
4251 iso->invalid_designated[reg] = 1;
4252 iso->charset[reg] = Vcharset_ascii;
4253 iso->esc = ISO_ESC_DESIGNATE;
4254 *flags &= CODING_STATE_ISO2022_LOCK;
4255 iso->output_literally = 1;
4256 if (iso->switched_dir_and_no_valid_charset_yet)
4258 /* We encountered a switch-direction followed by an
4259 invalid designation. Ensure that the switch-direction
4260 gets outputted; otherwise it will probably get eaten
4261 when the text is written out again. */
4262 iso->switched_dir_and_no_valid_charset_yet = 0;
4263 iso->output_direction_sequence = 1;
4264 /* And make sure that the switch-dir going the other
4265 way gets outputted, as well. */
4266 iso->invalid_switch_dir = 1;
4270 /* This function is called with CODESYS equal to nil when
4271 doing coding-system detection. */
4272 if (!NILP (codesys))
4274 charset_conversion_spec_dynarr *dyn =
4275 XCODING_SYSTEM (codesys)->iso2022.input_conv;
/* Apply the coding system's input charset conversions to CS. */
4281 for (i = 0; i < Dynarr_length (dyn); i++)
4283 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4284 if (EQ (cs, spec->from_charset))
4285 cs = spec->to_charset;
4290 iso->charset[reg] = cs;
4291 iso->esc = ISO_ESC_DESIGNATE;
4292 *flags &= CODING_STATE_ISO2022_LOCK;
4293 if (iso->invalid_designated[reg])
4295 iso->invalid_designated[reg] = 0;
4296 iso->output_literally = 1;
4298 if (iso->switched_dir_and_no_valid_charset_yet)
4299 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Feed bytes from SRC to the ISO2022 detector kept in ST, narrowing the
   set of ISO2022 coding categories that are still possible.  The running
   state lives in st->iso2022 and is set up on the first call (while
   st->iso2022.initted is zero).  Escape sequences are parsed with
   parse_iso2022_esc using a nil coding system and with invalid-charset
   checking turned off. */
4304 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4309 /* #### There are serious deficiencies in the recognition mechanism
4310 here. This needs to be much smarter if it's going to cut it.
4311 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4312 it should be detected as Latin-1.
4313 All the ISO2022 stuff in this file should be synced up with the
4314 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4315 Perhaps we should wait till R2L works in FSF Emacs? */
4317 if (!st->iso2022.initted)
4319 reset_iso2022 (Qnil, &st->iso2022.iso);
/* Start out with every ISO2022 category still possible. */
4320 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4321 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4322 CODING_CATEGORY_ISO_8_1_MASK |
4323 CODING_CATEGORY_ISO_8_2_MASK |
4324 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4325 st->iso2022.flags = 0;
4326 st->iso2022.high_byte_count = 0;
4327 st->iso2022.saw_single_shift = 0;
4328 st->iso2022.initted = 1;
4331 mask = st->iso2022.mask;
/* High-byte bookkeeping (the test guarding this branch is not visible in
   this excerpt): the 7-bit category is dropped and the byte is counted. */
4338 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4339 st->iso2022.high_byte_count++;
/* A run of high bytes has ended; its parity is only meaningful when no
   single shift was seen during the run. */
4343 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4345 if (st->iso2022.high_byte_count & 1)
4346 /* odd number of high bytes; assume not iso-8-2 */
4347 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4349 st->iso2022.high_byte_count = 0;
4350 st->iso2022.saw_single_shift = 0;
4352 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4354 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4355 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4356 { /* control chars */
4359 /* Allow and ignore control characters that you might
4360 reasonably see in a text file */
4365 case 8: /* backspace */
4366 case 11: /* vertical tab */
4367 case 12: /* form feed */
4368 case 26: /* MS-DOS C-z junk */
4369 case 31: /* '^_' -- for info */
4370 goto label_continue_loop;
4377 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
/* A nonzero result means the byte was accepted as part of an escape
   sequence; the resulting parser state decides which categories to drop. */
4380 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4381 &st->iso2022.flags, 0))
4383 switch (st->iso2022.iso.esc)
4385 case ISO_ESC_DESIGNATE:
4386 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4387 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4389 case ISO_ESC_LOCKING_SHIFT:
/* A locking shift settles the question: only the lock-shift category
   remains and scanning stops. */
4390 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4391 goto ran_out_of_chars;
4392 case ISO_ESC_SINGLE_SHIFT:
4393 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4394 st->iso2022.saw_single_shift = 1;
4403 goto ran_out_of_chars;
4406 label_continue_loop:;
/* Tidy up a detection MASK of CODING_CATEGORY_ISO_* bits: when the 7-bit
   category is still set, the three 8-bit categories (8-designate, 8-1
   and 8-2) are cleared.  Other bits are left alone. */
4415 postprocess_iso2022_mask (int mask)
4417 /* #### kind of cheesy */
4418 /* If seven-bit ISO is allowed, then assume that the encoding is
4419 entirely seven-bit and turn off the eight-bit ones. */
4420 if (mask & CODING_CATEGORY_ISO_7_MASK)
4421 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4422 CODING_CATEGORY_ISO_8_1_MASK |
4423 CODING_CATEGORY_ISO_8_2_MASK);
4427 /* If FLAGS is a null pointer or specifies right-to-left motion,
4428 output a switch-dir-to-left-to-right sequence to DST.
4429 Also update FLAGS if it is not a null pointer.
4430 If INTERNAL_P is set, we are outputting in internal format and
4431 need to handle the CSI differently. */
/* The sequence written is the introducer followed by '0' ']'.  The
   introducer is ESC '[' for a seven-bit coding system, otherwise the
   CSI byte, which goes through DECODE_ADD_BINARY_CHAR when INTERNAL_P
   is set and is added raw when it is not. */
4434 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4435 unsigned_char_dynarr *dst,
4436 unsigned int *flags,
4439 if (!flags || (*flags & CODING_STATE_R2L))
4441 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4443 Dynarr_add (dst, ISO_CODE_ESC);
4444 Dynarr_add (dst, '[');
4446 else if (internal_p)
4447 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4449 Dynarr_add (dst, ISO_CODE_CSI);
4450 Dynarr_add (dst, '0');
4451 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may legitimately be null here (see the test above);
   the guard for this store is not visible in this excerpt -- confirm it
   exists in the full source. */
4453 *flags &= ~CODING_STATE_R2L;
4457 /* If FLAGS is a null pointer or specifies a direction different from
4458 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4459 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4460 sequence to DST. Also update FLAGS if it is not a null pointer.
4461 If INTERNAL_P is set, we are outputting in internal format and
4462 need to handle the CSI differently. */
/* Switching to left-to-right is delegated to
   restore_left_to_right_direction.  Switching to right-to-left writes
   the introducer followed by '2' ']', and is suppressed altogether when
   the coding system has its no-ISO6429 property set. */
4465 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4466 unsigned_char_dynarr *dst, unsigned int *flags,
4469 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4470 direction == CHARSET_LEFT_TO_RIGHT)
4471 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4472 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4473 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4474 direction == CHARSET_RIGHT_TO_LEFT)
4476 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4478 Dynarr_add (dst, ISO_CODE_ESC);
4479 Dynarr_add (dst, '[');
4481 else if (internal_p)
4482 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4484 Dynarr_add (dst, ISO_CODE_CSI);
4485 Dynarr_add (dst, '2');
4486 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null on this path (decode_coding_iso2022
   passes 0); the guard for this store is not visible in this excerpt --
   confirm it exists in the full source. */
4488 *flags |= CODING_STATE_R2L;
/* Decoder entry point for ISO2022 streams.  Reads N bytes from SRC and
   appends the decoded result to DST.  Per-stream state (flags, pending
   partial character, EOL type and the iso2022 register state) is taken
   from the decoding_stream attached to DECODING.  Each input byte falls
   into one of three branches: continuation of an escape sequence,
   control character, or graphic character. */
4492 /* Convert ISO2022-format data to internal format. */
4495 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4496 unsigned_char_dynarr *dst, unsigned int n)
4498 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4499 unsigned int flags = str->flags;
4500 unsigned int ch = str->ch;
4501 eol_type_t eol_type = str->eol_type;
4502 #ifdef ENABLE_COMPOSITE_CHARS
4503 unsigned_char_dynarr *real_dst = dst;
4505 Lisp_Object coding_system;
4507 XSETCODING_SYSTEM (coding_system, str->codesys);
4509 #ifdef ENABLE_COMPOSITE_CHARS
/* While a composite character is being collected, output is diverted
   into the composite buffer instead of the real destination. */
4510 if (flags & CODING_STATE_COMPOSITE)
4511 dst = str->iso2022.composite_chars;
4512 #endif /* ENABLE_COMPOSITE_CHARS */
4516 unsigned char c = *src++;
4517 if (flags & CODING_STATE_ESCAPE)
4518 { /* Within ESC sequence */
4519 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4524 switch (str->iso2022.esc)
4526 #ifdef ENABLE_COMPOSITE_CHARS
4527 case ISO_ESC_START_COMPOSITE:
/* Reuse the composite buffer if there is one, otherwise create it. */
4528 if (str->iso2022.composite_chars)
4529 Dynarr_reset (str->iso2022.composite_chars);
4531 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4532 dst = str->iso2022.composite_chars;
4534 case ISO_ESC_END_COMPOSITE:
4536 Bufbyte comstr[MAX_EMCHAR_LEN];
4538 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4539 Dynarr_length (dst));
4541 len = set_charptr_emchar (comstr, emch);
4542 Dynarr_add_many (dst, comstr, len);
4545 #endif /* ENABLE_COMPOSITE_CHARS */
4547 case ISO_ESC_LITERAL:
4548 DECODE_ADD_BINARY_CHAR (c, dst);
4552 /* Everything else handled already */
4557 /* Attempted error recovery. */
4558 if (str->iso2022.output_direction_sequence)
4559 ensure_correct_direction (flags & CODING_STATE_R2L ?
4560 CHARSET_RIGHT_TO_LEFT :
4561 CHARSET_LEFT_TO_RIGHT,
4562 str->codesys, dst, 0, 1);
4563 /* More error recovery. */
4564 if (!retval || str->iso2022.output_literally)
4566 /* Output the (possibly invalid) sequence */
4568 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4569 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4570 flags &= CODING_STATE_ISO2022_LOCK;
4572 n++, src--;/* Repeat the loop with the same character. */
4575 /* No sense in reprocessing the final byte of the
4576 escape sequence; it could mess things up anyway.
4578 DECODE_ADD_BINARY_CHAR (c, dst);
4583 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4584 { /* Control characters */
4586 /***** Error-handling *****/
4588 /* If we were in the middle of a character, dump out the
4589 partial character. */
4590 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4592 /* If we just saw a single-shift character, dump it out.
4593 This may dump out the wrong sort of single-shift character,
4594 but at least it will give an indication that something went
4596 if (flags & CODING_STATE_SS2)
4598 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4599 flags &= ~CODING_STATE_SS2;
4601 if (flags & CODING_STATE_SS3)
4603 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4604 flags &= ~CODING_STATE_SS3;
4607 /***** Now handle the control characters. *****/
4610 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4612 flags &= CODING_STATE_ISO2022_LOCK;
/* A control byte the escape parser rejects is passed through as a
   binary character. */
4614 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4615 DECODE_ADD_BINARY_CHAR (c, dst);
4618 { /* Graphic characters */
4619 Lisp_Object charset;
4623 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4625 /* Now determine the charset. */
/* A pending single shift takes priority; otherwise the high bit of the
   byte selects the right-half or left-half register. */
4626 reg = ((flags & CODING_STATE_SS2) ? 2
4627 : (flags & CODING_STATE_SS3) ? 3
4628 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4629 : str->iso2022.register_left);
4630 charset = str->iso2022.charset[reg];
4632 /* Error checking: */
4633 if (! CHARSETP (charset)
4634 || str->iso2022.invalid_designated[reg]
4635 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4636 && XCHARSET_CHARS (charset) == 94))
4637 /* Mrmph. We are trying to invoke a register that has no
4638 or an invalid charset in it, or trying to add a character
4639 outside the range of the charset. Insert that char literally
4640 to preserve it for the output. */
4642 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4643 DECODE_ADD_BINARY_CHAR (c, dst);
4648 /* Things are probably hunky-dory. */
4650 /* Fetch reverse charset, maybe. */
4651 if (((flags & CODING_STATE_R2L) &&
4652 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4654 (!(flags & CODING_STATE_R2L) &&
4655 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4657 Lisp_Object new_charset =
4658 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4659 if (!NILP (new_charset))
4660 charset = new_charset;
/* Emit the internal representation: how many bytes are written depends
   on the charset's representation length and on whether it is private. */
4663 lb = XCHARSET_LEADING_BYTE (charset);
4664 switch (XCHARSET_REP_BYTES (charset))
4667 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4668 Dynarr_add (dst, c & 0x7F);
4671 case 2: /* one-byte official */
4672 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4673 Dynarr_add (dst, lb);
4674 Dynarr_add (dst, c | 0x80);
4677 case 3: /* one-byte private or two-byte official */
4678 if (XCHARSET_PRIVATE_P (charset))
4680 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4681 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4682 Dynarr_add (dst, lb);
4683 Dynarr_add (dst, c | 0x80);
4689 Dynarr_add (dst, lb);
4690 Dynarr_add (dst, ch | 0x80);
4691 Dynarr_add (dst, c | 0x80);
4699 default: /* two-byte private */
4702 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4703 Dynarr_add (dst, lb);
4704 Dynarr_add (dst, ch | 0x80);
4705 Dynarr_add (dst, c | 0x80);
4714 flags &= CODING_STATE_ISO2022_LOCK;
4717 label_continue_loop:;
/* At end of stream, flush any half-read character. */
4720 if (flags & CODING_STATE_END)
4721 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4728 /***** ISO2022 encoder *****/
4730 /* Designate CHARSET into register REG. */
/* The encoder state in STR is updated first (str->iso2022.charset[reg]
   is overwritten unconditionally), then the designation escape sequence
   is appended to DST: ESC, the intermediate byte(s) chosen by charset
   type and register, and the charset's final byte. */
4733 iso2022_designate (Lisp_Object charset, unsigned char reg,
4734 struct encoding_stream *str, unsigned_char_dynarr *dst)
/* Intermediate bytes indexed by register number, for 94- and 96-character
   sets respectively. */
4736 static CONST char inter94[] = "()*+";
4737 static CONST char inter96[] = ",-./";
4739 unsigned char final;
4740 Lisp_Object old_charset = str->iso2022.charset[reg];
4742 str->iso2022.charset[reg] = charset;
4743 if (!CHARSETP (charset))
4744 /* charset might be an initial nil or t. */
4746 type = XCHARSET_TYPE (charset);
4747 final = XCHARSET_FINAL (charset);
/* Compare against what the register held before: same type and same
   final byte, with no forced re-designation pending. */
4748 if (!str->iso2022.force_charset_on_output[reg] &&
4749 CHARSETP (old_charset) &&
4750 XCHARSET_TYPE (old_charset) == type &&
4751 XCHARSET_FINAL (old_charset) == final)
4754 str->iso2022.force_charset_on_output[reg] = 0;
4757 charset_conversion_spec_dynarr *dyn =
4758 str->codesys->iso2022.output_conv;
/* NOTE(review): TYPE and FINAL were read from the pre-conversion charset
   above, and no recomputation after this substitution is visible in this
   excerpt -- confirm whether the converted charset is meant to affect
   the sequence that is emitted. */
4764 for (i = 0; i < Dynarr_length (dyn); i++)
4766 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4767 if (EQ (charset, spec->from_charset))
4768 charset = spec->to_charset;
4773 Dynarr_add (dst, ISO_CODE_ESC);
4776 case CHARSET_TYPE_94:
4777 Dynarr_add (dst, inter94[reg]);
4779 case CHARSET_TYPE_96:
4780 Dynarr_add (dst, inter96[reg]);
4782 case CHARSET_TYPE_94X94:
4783 Dynarr_add (dst, '$');
4785 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4788 Dynarr_add (dst, inter94[reg]);
4790 case CHARSET_TYPE_96X96:
4791 Dynarr_add (dst, '$');
4792 Dynarr_add (dst, inter96[reg]);
4795 Dynarr_add (dst, final);
/* Make register 0 the one invoked into the left half: if the encoder
   state in STR records a different left-half register, append an SI
   (shift-in) byte to DST and record register 0. */
4799 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4801 if (str->iso2022.register_left != 0)
4803 Dynarr_add (dst, ISO_CODE_SI);
4804 str->iso2022.register_left = 0;
/* Make register 1 the one invoked into the left half: if the encoder
   state in STR records a different left-half register, append an SO
   (shift-out) byte to DST and record register 1. */
4809 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4811 if (str->iso2022.register_left != 1)
4813 Dynarr_add (dst, ISO_CODE_SO);
4814 str->iso2022.register_left = 1;
4818 /* Convert internally-formatted data to ISO2022 format. */
4821 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4822 unsigned_char_dynarr *dst, unsigned int n)
4824 unsigned char charmask, c;
4825 unsigned char char_boundary;
4826 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4827 unsigned int flags = str->flags;
4828 unsigned int ch = str->ch;
4829 Lisp_Coding_System *codesys = str->codesys;
4830 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4832 Lisp_Object charset;
4835 #ifdef ENABLE_COMPOSITE_CHARS
4836 /* flags for handling composite chars. We do a little switcharoo
4837 on the source while we're outputting the composite char. */
4838 unsigned int saved_n = 0;
4839 CONST unsigned char *saved_src = NULL;
4840 int in_composite = 0;
4841 #endif /* ENABLE_COMPOSITE_CHARS */
4843 char_boundary = str->iso2022.current_char_boundary;
4844 charset = str->iso2022.current_charset;
4845 half = str->iso2022.current_half;
4847 #ifdef ENABLE_COMPOSITE_CHARS
4854 if (BYTE_ASCII_P (c))
4855 { /* Processing ASCII character */
4858 restore_left_to_right_direction (codesys, dst, &flags, 0);
4860 /* Make sure G0 contains ASCII */
4861 if ((c > ' ' && c < ISO_CODE_DEL) ||
4862 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4864 ensure_normal_shift (str, dst);
4865 iso2022_designate (Vcharset_ascii, 0, str, dst);
4868 /* If necessary, restore everything to the default state
4871 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4873 restore_left_to_right_direction (codesys, dst, &flags, 0);
4875 ensure_normal_shift (str, dst);
4877 for (i = 0; i < 4; i++)
4879 Lisp_Object initial_charset =
4880 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4881 iso2022_designate (initial_charset, i, str, dst);
4886 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4887 Dynarr_add (dst, '\r');
4888 if (eol_type != EOL_CR)
4889 Dynarr_add (dst, c);
4893 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4894 && fit_to_be_escape_quoted (c))
4895 Dynarr_add (dst, ISO_CODE_ESC);
4896 Dynarr_add (dst, c);
4901 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4902 { /* Processing Leading Byte */
4904 charset = CHARSET_BY_LEADING_BYTE (c);
4905 if (LEADING_BYTE_PREFIX_P(c))
4907 else if (!EQ (charset, Vcharset_control_1)
4908 #ifdef ENABLE_COMPOSITE_CHARS
4909 && !EQ (charset, Vcharset_composite)
4915 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4916 codesys, dst, &flags, 0);
4918 /* Now determine which register to use. */
4920 for (i = 0; i < 4; i++)
4922 if (EQ (charset, str->iso2022.charset[i]) ||
4924 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4933 if (XCHARSET_GRAPHIC (charset) != 0)
4935 if (!NILP (str->iso2022.charset[1]) &&
4936 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4937 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4939 else if (!NILP (str->iso2022.charset[2]))
4941 else if (!NILP (str->iso2022.charset[3]))
4950 iso2022_designate (charset, reg, str, dst);
4952 /* Now invoke that register. */
4956 ensure_normal_shift (str, dst);
4961 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4963 ensure_shift_out (str, dst);
4971 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4973 Dynarr_add (dst, ISO_CODE_ESC);
4974 Dynarr_add (dst, 'N');
4979 Dynarr_add (dst, ISO_CODE_SS2);
4985 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4987 Dynarr_add (dst, ISO_CODE_ESC);
4988 Dynarr_add (dst, 'O');
4993 Dynarr_add (dst, ISO_CODE_SS3);
5005 { /* Processing Non-ASCII character */
5006 charmask = (half == 0 ? 0x7F : 0xFF);
5008 if (EQ (charset, Vcharset_control_1))
5010 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5011 && fit_to_be_escape_quoted (c))
5012 Dynarr_add (dst, ISO_CODE_ESC);
5013 /* you asked for it ... */
5014 Dynarr_add (dst, c - 0x20);
5018 switch (XCHARSET_REP_BYTES (charset))
5021 Dynarr_add (dst, c & charmask);
5024 if (XCHARSET_PRIVATE_P (charset))
5026 Dynarr_add (dst, c & charmask);
5031 #ifdef ENABLE_COMPOSITE_CHARS
5032 if (EQ (charset, Vcharset_composite))
5036 /* #### Bother! We don't know how to
5038 Dynarr_add (dst, '~');
5042 Emchar emch = MAKE_CHAR (Vcharset_composite,
5043 ch & 0x7F, c & 0x7F);
5044 Lisp_Object lstr = composite_char_string (emch);
5048 src = XSTRING_DATA (lstr);
5049 n = XSTRING_LENGTH (lstr);
5050 Dynarr_add (dst, ISO_CODE_ESC);
5051 Dynarr_add (dst, '0'); /* start composing */
5055 #endif /* ENABLE_COMPOSITE_CHARS */
5057 Dynarr_add (dst, ch & charmask);
5058 Dynarr_add (dst, c & charmask);
5071 Dynarr_add (dst, ch & charmask);
5072 Dynarr_add (dst, c & charmask);
5088 #ifdef ENABLE_COMPOSITE_CHARS
5094 Dynarr_add (dst, ISO_CODE_ESC);
5095 Dynarr_add (dst, '1'); /* end composing */
5096 goto back_to_square_n; /* Wheeeeeeeee ..... */
5098 #endif /* ENABLE_COMPOSITE_CHARS */
5100 if (char_boundary && flags & CODING_STATE_END)
5102 restore_left_to_right_direction (codesys, dst, &flags, 0);
5103 ensure_normal_shift (str, dst);
5104 for (i = 0; i < 4; i++)
5106 Lisp_Object initial_charset =
5107 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5108 iso2022_designate (initial_charset, i, str, dst);
5114 str->iso2022.current_char_boundary = char_boundary;
5115 str->iso2022.current_charset = charset;
5116 str->iso2022.current_half = half;
5118 /* Verbum caro factum est! */
5122 /************************************************************************/
5123 /* No-conversion methods */
5124 /************************************************************************/
5126 /* This is used when reading in "binary" files -- i.e. files that may
5127 contain all 256 possible byte values and that are not to be
5128 interpreted as being in any particular encoding. */
/* Decoder for the no-conversion coding system.

   DECODING is the decoding lstream; its per-stream data supplies the
   saved FLAGS, the pending byte CH and the EOL type.  DST is the dynarr
   that receives the output.  SRC and N are presumably the input bytes
   and their count -- TODO confirm against the caller.

   A byte C is first handed to DECODE_HANDLE_EOL_TYPE and then appended
   with DECODE_ADD_BINARY_CHAR.  DECODE_HANDLE_END_OF_CONVERSION is
   invoked after the per-byte code.

   NOTE(review): label_continue_loop is presumably the jump target used
   by DECODE_HANDLE_EOL_TYPE when it has already dealt with the byte --
   confirm against the macro definition. */
5130 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5131 unsigned_char_dynarr *dst, unsigned int n)
5134 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5135 unsigned int flags = str->flags;
5136 unsigned int ch = str->ch;
5137 eol_type_t eol_type = str->eol_type;
5143 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5144 DECODE_ADD_BINARY_CHAR (c, dst);
5145 label_continue_loop:;
5148 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encoder for the no-conversion coding system.

   ENCODING is the encoding lstream; its per-stream data supplies the
   saved FLAGS, the pending leading byte CH and, through the stream's
   coding system, the EOL type.  DST is the dynarr that receives the
   output.  SRC and N are presumably the input bytes and their count --
   TODO confirm against the caller.

   Output rules visible here:
   - line ending: '\r' is written unless the EOL type is LF or
     autodetect, and '\n' is written unless the EOL type is CR;
   - ASCII bytes are copied unchanged;
   - the leading bytes for Latin-1 and control-1 are recognised
     specially, and '~' is written as the placeholder for an
     untranslatable character;
   - a byte following a Latin-1 leading byte is copied, one following a
     control-1 leading byte is written minus 0x20, and any other
     trailing byte is dropped. */
5155 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5156 unsigned_char_dynarr *dst, unsigned int n)
5159 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5160 unsigned int flags = str->flags;
5161 unsigned int ch = str->ch;
5162 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Emit the line ending selected by eol_type: CR, LF, or both. */
5169 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5170 Dynarr_add (dst, '\r');
5171 if (eol_type != EOL_CR)
5172 Dynarr_add (dst, '\n');
5175 else if (BYTE_ASCII_P (c))
5178 Dynarr_add (dst, c);
5180 else if (BUFBYTE_LEADING_BYTE_P (c))
5183 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5184 c == LEADING_BYTE_CONTROL_1)
5187 Dynarr_add (dst, '~'); /* untranslatable character */
5191 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5192 Dynarr_add (dst, c);
5193 else if (ch == LEADING_BYTE_CONTROL_1)
/* NOTE(review): the 0x20 subtraction presumably undoes an offset applied
   to control-1 bytes in the internal representation -- confirm against
   DECODE_ADD_BINARY_CHAR. */
5196 Dynarr_add (dst, c - 0x20);
5198 /* else it should be the second or third byte of an
5199 untranslatable character, so ignore it */
5209 /************************************************************************/
5210 /* Simple internal/external functions */
5211 /************************************************************************/
5213 static Extbyte_dynarr *conversion_out_dynarr;
5214 static Bufbyte_dynarr *conversion_in_dynarr;
5216 /* Determine coding system from coding format */
5218 /* #### not correct for all values of `fmt'! */
/* Choose the coding system to use for external data format FMT.

   FORMAT_FILENAME and FORMAT_TERMINAL share one branch that is driven by
   Vfile_name_coding_system: the values Qnil and Qbinary are singled out
   by the test below, and any other value is resolved with
   Fget_coding_system.  The other visible outcome is the coding system
   named by Qctext.

   NOTE(review): FORMAT_TERMINAL consults Vfile_name_coding_system here,
   not Vterminal_coding_system -- confirm that this is intended. */
5220 external_data_format_to_coding_system (enum external_data_format fmt)
5224 case FORMAT_FILENAME:
5225 case FORMAT_TERMINAL:
5226 if (EQ (Vfile_name_coding_system, Qnil) ||
5227 EQ (Vfile_name_coding_system, Qbinary))
5230 return Fget_coding_system (Vfile_name_coding_system);
5233 return Fget_coding_system (Qctext);
/* Convert LEN bytes of internal text at PTR to the external format FMT.

   The coding system is obtained from
   external_data_format_to_coding_system (FMT).  The result is built in
   the file-static conversion_out_dynarr, which is created on first use
   and reset at the start of every call.  *LEN_OUT receives the number
   of bytes produced; a 0 byte is appended after that length is taken,
   so the terminator is not counted.  The return value is the address of
   element 0 of the dynarr.

   Because the dynarr is shared and reset on each call, the returned
   data is overwritten by the next call; callers must use or copy it
   before converting again.

   When the coding system is nil the bytes are translated inline: ASCII
   bytes pass through, and for the control-1 and Latin-1 leading bytes
   the value is taken from the following byte (minus 0x20 for
   control-1).  Otherwise the input is pumped through an encoding output
   stream layered on a dynarr output stream. */
5241 convert_to_external_format (CONST Bufbyte *ptr,
5244 enum external_data_format fmt)
5246 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the shared output dynarr on first use, then empty it. */
5248 if (!conversion_out_dynarr)
5249 conversion_out_dynarr = Dynarr_new (Extbyte);
5251 Dynarr_reset (conversion_out_dynarr);
/* No coding system: translate directly, without any streams. */
5253 if (NILP (coding_system))
5255 CONST Bufbyte *end = ptr + len;
5260 (BYTE_ASCII_P (*ptr)) ? *ptr :
5261 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5262 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5265 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5269 #ifdef ERROR_CHECK_BUFPOS
5270 assert (ptr == end);
5275 Lisp_Object instream, outstream, da_outstream;
5276 Lstream *istr, *ostr;
5277 struct gcpro gcpro1, gcpro2, gcpro3;
5278 char tempbuf[1024]; /* some random amount */
/* Chain: fixed-buffer input -> encoding stream -> dynarr output. */
5280 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5281 da_outstream = make_dynarr_output_stream
5282 ((unsigned_char_dynarr *) conversion_out_dynarr);
5284 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5285 istr = XLSTREAM (instream);
5286 ostr = XLSTREAM (outstream);
5287 GCPRO3 (instream, outstream, da_outstream);
/* Copy the input across in tempbuf-sized pieces. */
5290 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5293 Lstream_write (ostr, tempbuf, size_in_bytes);
5295 Lstream_close (istr);
5296 Lstream_close (ostr);
5298 Lstream_delete (istr);
5299 Lstream_delete (ostr);
5300 Lstream_delete (XLSTREAM (da_outstream));
/* Record the length before the terminator is added. */
5303 *len_out = Dynarr_length (conversion_out_dynarr);
5304 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5305 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert LEN bytes of external data at PTR, in format FMT, to internal
   text.

   The coding system is obtained from
   external_data_format_to_coding_system (FMT).  The result is built in
   the file-static conversion_in_dynarr, which is created on first use
   and reset at the start of every call.  *LEN_OUT receives the number
   of bytes produced; a 0 byte is appended after that length is taken,
   so the terminator is not counted.  The return value is the address of
   element 0 of the dynarr.

   Because the dynarr is shared and reset on each call, the returned
   data is overwritten by the next call; callers must use or copy it
   before converting again.

   When the coding system is nil each input byte is added with
   DECODE_ADD_BINARY_CHAR.  Otherwise the input is pumped through a
   decoding output stream layered on a dynarr output stream. */
5309 convert_from_external_format (CONST Extbyte *ptr,
5312 enum external_data_format fmt)
5314 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the shared input dynarr on first use, then empty it. */
5316 if (!conversion_in_dynarr)
5317 conversion_in_dynarr = Dynarr_new (Bufbyte);
5319 Dynarr_reset (conversion_in_dynarr);
/* No coding system: treat the input as binary, byte by byte. */
5321 if (NILP (coding_system))
5323 CONST Extbyte *end = ptr + len;
5324 for (; ptr < end; ptr++)
5327 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5332 Lisp_Object instream, outstream, da_outstream;
5333 Lstream *istr, *ostr;
5334 struct gcpro gcpro1, gcpro2, gcpro3;
5335 char tempbuf[1024]; /* some random amount */
/* Chain: fixed-buffer input -> decoding stream -> dynarr output. */
5337 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5338 da_outstream = make_dynarr_output_stream
5339 ((unsigned_char_dynarr *) conversion_in_dynarr);
5341 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5342 istr = XLSTREAM (instream);
5343 ostr = XLSTREAM (outstream);
5344 GCPRO3 (instream, outstream, da_outstream);
/* Copy the input across in tempbuf-sized pieces. */
5347 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5350 Lstream_write (ostr, tempbuf, size_in_bytes);
5352 Lstream_close (istr);
5353 Lstream_close (ostr);
5355 Lstream_delete (istr);
5356 Lstream_delete (ostr);
5357 Lstream_delete (XLSTREAM (da_outstream));
/* Record the length before the terminator is added. */
5360 *len_out = Dynarr_length (conversion_in_dynarr);
5361 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5362 return Dynarr_atp (conversion_in_dynarr, 0);
5366 /************************************************************************/
5367 /* Initialization */
5368 /************************************************************************/
/* Set up the symbols, the error type and the Lisp primitives of this
   module.

   - deferror defines `coding-system-error' with the message
     "Coding-system error"; Qio_error is passed as the last argument
     (presumably the parent error -- confirm against deferror).
   - Each DEFSUBR makes one primitive available to Lisp.
   - The defsymbol calls create the symbols used for coding system
     types, property names and EOL conventions.
   - One symbol is stored in coding_category_symbol[] for each coding
     category listed at the end.

   Takes no arguments. */
5371 syms_of_mule_coding (void)
5373 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5374 deferror (&Qcoding_system_error, "coding-system-error",
5375 "Coding-system error", Qio_error);
5377 DEFSUBR (Fcoding_system_p);
5378 DEFSUBR (Ffind_coding_system);
5379 DEFSUBR (Fget_coding_system);
5380 DEFSUBR (Fcoding_system_list);
5381 DEFSUBR (Fcoding_system_name);
5382 DEFSUBR (Fmake_coding_system);
5383 DEFSUBR (Fcopy_coding_system);
5384 DEFSUBR (Fsubsidiary_coding_system);
5386 DEFSUBR (Fcoding_system_type);
5387 DEFSUBR (Fcoding_system_doc_string);
5389 DEFSUBR (Fcoding_system_charset);
5391 DEFSUBR (Fcoding_system_property);
5393 DEFSUBR (Fcoding_category_list);
5394 DEFSUBR (Fset_coding_priority_list);
5395 DEFSUBR (Fcoding_priority_list);
5396 DEFSUBR (Fset_coding_category_system);
5397 DEFSUBR (Fcoding_category_system);
5399 DEFSUBR (Fdetect_coding_region);
5400 DEFSUBR (Fdecode_coding_region);
5401 DEFSUBR (Fencode_coding_region);
5403 DEFSUBR (Fdecode_shift_jis_char);
5404 DEFSUBR (Fencode_shift_jis_char);
5405 DEFSUBR (Fdecode_big5_char);
5406 DEFSUBR (Fencode_big5_char);
5407 DEFSUBR (Fset_ucs_char);
5408 DEFSUBR (Fucs_char);
5409 DEFSUBR (Fset_char_ucs);
5410 DEFSUBR (Fchar_ucs);
5412 defsymbol (&Qcoding_system_p, "coding-system-p");
5413 defsymbol (&Qno_conversion, "no-conversion");
5415 defsymbol (&Qbig5, "big5");
5416 defsymbol (&Qshift_jis, "shift-jis");
5417 defsymbol (&Qucs4, "ucs-4");
5418 defsymbol (&Qutf8, "utf-8");
5419 defsymbol (&Qccl, "ccl");
5420 defsymbol (&Qiso2022, "iso2022");
5422 defsymbol (&Qmnemonic, "mnemonic");
5423 defsymbol (&Qeol_type, "eol-type");
5424 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5425 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5427 defsymbol (&Qcr, "cr");
5428 defsymbol (&Qlf, "lf");
5429 defsymbol (&Qcrlf, "crlf");
5430 defsymbol (&Qeol_cr, "eol-cr");
5431 defsymbol (&Qeol_lf, "eol-lf");
5432 defsymbol (&Qeol_crlf, "eol-crlf");
5434 defsymbol (&Qcharset_g0, "charset-g0");
5435 defsymbol (&Qcharset_g1, "charset-g1");
5436 defsymbol (&Qcharset_g2, "charset-g2");
5437 defsymbol (&Qcharset_g3, "charset-g3");
5438 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5439 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5440 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5441 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5442 defsymbol (&Qno_iso6429, "no-iso6429");
5443 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5444 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5446 defsymbol (&Qshort, "short");
5447 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5448 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5449 defsymbol (&Qseven, "seven");
5450 defsymbol (&Qlock_shift, "lock-shift");
5451 defsymbol (&Qescape_quoted, "escape-quoted");
5453 defsymbol (&Qencode, "encode");
5454 defsymbol (&Qdecode, "decode");
5457 defsymbol (&Qctext, "ctext");
5458 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5460 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5462 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5464 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5466 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5468 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5470 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5472 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5474 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5477 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare the lstream methods provided by the two stream types of this
   module.  Both the `decoding' and the `encoding' type register the same
   seven methods: reader, writer, rewinder, seekable_p, flusher, closer
   and marker.  Takes no arguments. */
5482 lstream_type_create_mule_coding (void)
5484 LSTREAM_HAS_METHOD (decoding, reader);
5485 LSTREAM_HAS_METHOD (decoding, writer);
5486 LSTREAM_HAS_METHOD (decoding, rewinder);
5487 LSTREAM_HAS_METHOD (decoding, seekable_p);
5488 LSTREAM_HAS_METHOD (decoding, flusher);
5489 LSTREAM_HAS_METHOD (decoding, closer);
5490 LSTREAM_HAS_METHOD (decoding, marker);
5492 LSTREAM_HAS_METHOD (encoding, reader);
5493 LSTREAM_HAS_METHOD (encoding, writer);
5494 LSTREAM_HAS_METHOD (encoding, rewinder);
5495 LSTREAM_HAS_METHOD (encoding, seekable_p);
5496 LSTREAM_HAS_METHOD (encoding, flusher);
5497 LSTREAM_HAS_METHOD (encoding, closer);
5498 LSTREAM_HAS_METHOD (encoding, marker);
/* Initialize the coding-category tables and declare the Lisp-visible
   variables of this module.

   Every coding_category_system[] entry starts out as Qnil and
   coding_category_by_priority[] starts out as the identity ordering.
   The `file-coding' feature is provided.  The coding-system variables
   are declared with DEFVAR_LISP and set to Qnil, and
   enable-multibyte-characters is declared with DEFVAR_BOOL and set
   to 1.  Takes no arguments. */
5502 vars_of_mule_coding (void)
5506 /* Initialize to something reasonable ... */
5507 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5509 coding_category_system[i] = Qnil;
5510 coding_category_by_priority[i] = i;
5513 Fprovide (intern ("file-coding"));
5515 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5516 Coding system used for TTY keyboard input.
5517 Not used under a windowing system.
5519 Vkeyboard_coding_system = Qnil;
5521 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5522 Coding system used for TTY display output.
5523 Not used under a windowing system.
5525 Vterminal_coding_system = Qnil;
5527 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5528 Overriding coding system used when reading from a file or process.
5529 You should *bind* this, not set it. If this is non-nil, it specifies
5530 the coding system that will be used when a file or process is read
5531 in, and overrides `buffer-file-coding-system-for-read',
5532 `insert-file-contents-pre-hook', etc. Use those variables instead of
5533 this one for permanent changes to the environment.
5535 Vcoding_system_for_read = Qnil;
5537 DEFVAR_LISP ("coding-system-for-write",
5538 &Vcoding_system_for_write /*
5539 Overriding coding system used when writing a file or process.
5540 You should *bind* this, not set it. If this is non-nil, it specifies
5541 the coding system that will be used when a file or process is written
5542 out, and overrides `buffer-file-coding-system',
5543 `write-region-pre-hook', etc. Use those variables instead of this one
5544 for permanent changes to the environment.
5546 Vcoding_system_for_write = Qnil;
5548 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5549 Coding system used to convert pathnames when accessing files.
5551 Vfile_name_coding_system = Qnil;
5553 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5554 Non-nil means the buffer contents are regarded as multi-byte form
5555 of characters, not a binary code. This affects the display, file I/O,
5556 and behaviors of various editing commands.
5558 Setting this to nil does not do anything.
5560 enable_multibyte_characters = 1;
5564 complex_vars_of_mule_coding (void)
5566 staticpro (&Vcoding_system_hash_table);
5567 Vcoding_system_hash_table =
5568 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5570 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5572 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5574 struct codesys_prop csp; \
5576 csp.prop_type = (Prop_Type); \
5577 Dynarr_add (the_codesys_prop_dynarr, csp); \
5580 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5581 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5582 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5583 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5584 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5585 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5586 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5588 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5589 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5590 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5591 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5592 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5593 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5594 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5595 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5596 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5597 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5598 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5599 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5600 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5601 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5602 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5603 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5604 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5606 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5607 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5609 /* Need to create this here or we're really screwed. */
5610 Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
5611 list2 (Qmnemonic, build_string ("Noconv")));
5613 Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
5616 /* Need this for bootstrapping */
5617 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5618 Fget_coding_system (Qno_conversion);
5624 for (i = 0; i < 65536; i++)
5625 ucs_to_mule_table[i] = Qnil;
5627 staticpro (&mule_to_ucs_table);
5628 mule_to_ucs_table = Fmake_char_table(Qgeneric);