1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
50 /* Coding system currently associated with each coding category. */
51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
53 /* Table of all coding categories in decreasing order of priority.
54 This describes a permutation of the possible coding categories. */
55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
57 Lisp_Object Qcoding_system_p;
59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
60 /* Qinternal in general.c */
62 Lisp_Object Qmnemonic, Qeol_type;
63 Lisp_Object Qcr, Qcrlf, Qlf;
64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
65 Lisp_Object Qpost_read_conversion;
66 Lisp_Object Qpre_write_conversion;
69 Lisp_Object Qucs4, Qutf8;
70 Lisp_Object Qbig5, Qshift_jis;
71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
74 Lisp_Object Qno_iso6429;
75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
76 Lisp_Object Qctext, Qescape_quoted;
77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
79 Lisp_Object Qencode, Qdecode;
81 Lisp_Object Vcoding_system_hash_table;
83 int enable_multibyte_characters;
86 /* Additional information used by the ISO2022 decoder and detector. */
87 struct iso2022_decoder
89 /* CHARSET holds the character sets currently assigned to the G0
90 through G3 variables. It is initialized from the array
91 INITIAL_CHARSET in CODESYS. */
92 Lisp_Object charset[4];
94 /* Which registers are currently invoked into the left (GL) and
95 right (GR) halves of the 8-bit encoding space? */
96 int register_left, register_right;
98 /* ISO_ESC holds a value indicating part of an escape sequence
99 that has already been seen. */
100 enum iso_esc_flag esc;
102 /* This records the bytes we've seen so far in an escape sequence,
103 in case the sequence is invalid (we spit out the bytes unchanged). */
104 unsigned char esc_bytes[8];
106 /* Index for next byte to store in ISO escape sequence. */
109 #ifdef ENABLE_COMPOSITE_CHARS
110 /* Stuff seen so far when composing a string. */
111 unsigned_char_dynarr *composite_chars;
114 /* If we saw an invalid designation sequence for a particular
115 register, we flag it here and switch to ASCII. The next time we
116 see a valid designation for this register, we turn off the flag
117 and do the designation normally, but pretend the sequence was
118 invalid. The effect of all this is that (most of the time) the
119 escape sequences for both the switch to the unknown charset, and
120 the switch back to the known charset, get inserted literally into
121 the buffer and saved out as such. The hope is that we can
122 preserve the escape sequences so that the resulting written out
123 file makes sense. If we don't do any of this, the designation
124 to the invalid charset will be preserved but that switch back
125 to the known charset will probably get eaten because it was
126 the same charset that was already present in the register. */
127 unsigned char invalid_designated[4];
129 /* We try to do similar things as above for direction-switching
130 sequences. If we encountered a direction switch while an
131 invalid designation was present, or an invalid designation
132 just after a direction switch (i.e. no valid designation
133 encountered yet), we insert the direction-switch escape
134 sequence literally into the output stream, and later on
135 insert the corresponding direction-restoring escape sequence
literally as well. */
137 unsigned int switched_dir_and_no_valid_charset_yet :1;
138 unsigned int invalid_switch_dir :1;
140 /* Tells the decoder to output the escape sequence literally
141 even though it was valid. Used in the games we play to
142 avoid lossage when we encounter invalid designations. */
143 unsigned int output_literally :1;
144 /* We encountered a direction switch followed by an invalid
145 designation. We didn't output the direction switch
146 literally because we didn't know about the invalid designation;
147 but we have to do so now. */
148 unsigned int output_direction_sequence :1;
151 EXFUN (Fcopy_coding_system, 2);
153 struct detection_state;
156 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
157 unsigned_char_dynarr *dst, unsigned int n);
159 static int detect_coding_sjis (struct detection_state *st,
160 CONST unsigned char *src,
162 static void decode_coding_sjis (Lstream *decoding,
163 CONST unsigned char *src,
164 unsigned_char_dynarr *dst,
166 static void encode_coding_sjis (Lstream *encoding,
167 CONST unsigned char *src,
168 unsigned_char_dynarr *dst,
170 static int detect_coding_big5 (struct detection_state *st,
171 CONST unsigned char *src,
173 static void decode_coding_big5 (Lstream *decoding,
174 CONST unsigned char *src,
175 unsigned_char_dynarr *dst, unsigned int n);
176 static void encode_coding_big5 (Lstream *encoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static int detect_coding_ucs4 (struct detection_state *st,
180 CONST unsigned char *src,
182 static void decode_coding_ucs4 (Lstream *decoding,
183 CONST unsigned char *src,
184 unsigned_char_dynarr *dst, unsigned int n);
185 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
186 unsigned_char_dynarr *dst, unsigned int *flags);
187 void char_finish_ucs4 (struct encoding_stream *str,
188 unsigned_char_dynarr *dst, unsigned int *flags);
190 static int detect_coding_utf8 (struct detection_state *st,
191 CONST unsigned char *src,
193 static void decode_coding_utf8 (Lstream *decoding,
194 CONST unsigned char *src,
195 unsigned_char_dynarr *dst, unsigned int n);
196 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
197 unsigned_char_dynarr *dst, unsigned int *flags);
198 void char_finish_utf8 (struct encoding_stream *str,
199 unsigned_char_dynarr *dst, unsigned int *flags);
201 static int postprocess_iso2022_mask (int mask);
202 static void reset_iso2022 (Lisp_Object coding_system,
203 struct iso2022_decoder *iso);
204 static int detect_coding_iso2022 (struct detection_state *st,
205 CONST unsigned char *src,
207 static void decode_coding_iso2022 (Lstream *decoding,
208 CONST unsigned char *src,
209 unsigned_char_dynarr *dst, unsigned int n);
210 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
211 unsigned_char_dynarr *dst, unsigned int *flags);
212 void char_finish_iso2022 (struct encoding_stream *str,
213 unsigned_char_dynarr *dst, unsigned int *flags);
215 static void decode_coding_no_conversion (Lstream *decoding,
216 CONST unsigned char *src,
217 unsigned_char_dynarr *dst,
219 static void encode_coding_no_conversion (Lstream *encoding,
220 CONST unsigned char *src,
221 unsigned_char_dynarr *dst,
223 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
224 unsigned_char_dynarr *dst, unsigned int n);
225 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
226 unsigned_char_dynarr *dst, unsigned int n);
228 typedef struct codesys_prop codesys_prop;
237 Dynarr_declare (codesys_prop);
238 } codesys_prop_dynarr;
240 codesys_prop_dynarr *the_codesys_prop_dynarr;
242 enum codesys_prop_enum
245 CODESYS_PROP_ISO2022,
250 /************************************************************************/
251 /* Coding system functions */
252 /************************************************************************/
254 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
255 static void print_coding_system (Lisp_Object, Lisp_Object, int);
256 static void finalize_coding_system (void *header, int for_disksave);
259 static const struct lrecord_description ccs_description_1[] = {
260 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
264 static const struct struct_description ccs_description = {
265 sizeof(charset_conversion_spec),
269 static const struct lrecord_description ccsd_description_1[] = {
270 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
274 static const struct struct_description ccsd_description = {
275 sizeof(charset_conversion_spec_dynarr),
280 static const struct lrecord_description coding_system_description[] = {
281 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
282 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
283 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
285 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
286 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
287 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
288 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
/* Define the "coding-system" lrecord type for struct Lisp_Coding_System,
   installing mark_coding_system, print_coding_system and
   finalize_coding_system as its methods and coding_system_description
   as its description table.
   NOTE(review): the two literal 0 arguments are presumably the unused
   equal and hash methods -- confirm against the macro's definition.  */
293 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
294 mark_coding_system, print_coding_system,
295 finalize_coding_system,
296 0, 0, coding_system_description,
297 struct Lisp_Coding_System);
/* Mark method of the coding-system lrecord type.

   Calls MARKOBJ on every Lisp_Object slot reachable from OBJ: the
   slots common to all coding systems, then the type-specific ones
   selected by CODING_SYSTEM_TYPE.  The post-read-conversion slot is
   not passed to MARKOBJ; it is returned instead (presumably so that
   the caller marks it -- confirm against the lrecord marker
   protocol).  */
300 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
302 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
/* Slots present regardless of the coding system type.  */
304 markobj (CODING_SYSTEM_NAME (codesys));
305 markobj (CODING_SYSTEM_DOC_STRING (codesys));
306 markobj (CODING_SYSTEM_MNEMONIC (codesys));
/* The three end-of-line subsidiaries (see setup_eol_coding_systems).  */
307 markobj (CODING_SYSTEM_EOL_LF (codesys));
308 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
309 markobj (CODING_SYSTEM_EOL_CR (codesys));
311 switch (CODING_SYSTEM_TYPE (codesys))
315 case CODESYS_ISO2022:
/* Initial charsets of the four registers G0..G3.  */
316 for (i = 0; i < 4; i++)
317 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
/* The conversion tables are optional (null pointer when absent); each
   entry holds a from/to pair of charsets that must be marked.  */
318 if (codesys->iso2022.input_conv)
320 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
322 struct charset_conversion_spec *ccs =
323 Dynarr_atp (codesys->iso2022.input_conv, i);
324 markobj (ccs->from_charset);
325 markobj (ccs->to_charset);
328 if (codesys->iso2022.output_conv)
330 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
332 struct charset_conversion_spec *ccs =
333 Dynarr_atp (codesys->iso2022.output_conv, i);
334 markobj (ccs->from_charset);
335 markobj (ccs->to_charset);
/* CCL decode and encode programs.  NOTE(review): the case label that
   selects these two lines is not visible in this excerpt.  */
341 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
342 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
/* Common tail: mark one conversion hook, hand the other back.  */
349 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
350 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method of the coding-system lrecord type.

   Writes OBJ to PRINTCHARFUN as "#<coding_system " followed by the
   printed form of the coding system's name and a closing ">".
   The error call reports that such an object has no readable printed
   representation.  NOTE(review): the condition guarding that call and
   its final argument are not visible in this excerpt.  */
354 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
357 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
359 error ("printing unreadable object #<coding_system 0x%x>",
/* Normal output path: opening text, name, closing bracket.  */
362 write_c_string ("#<coding_system ", printcharfun);
363 print_internal (c->name, printcharfun, 1);
364 write_c_string (">", printcharfun);
/* Finalize method of the coding-system lrecord type.

   HEADER points at the Lisp_Coding_System being finalized.  Unless
   FOR_DISKSAVE is set, an ISO2022 coding system has its input and
   output charset-conversion dynarrs freed, and each pointer is zeroed
   after its dynarr is released.  */
368 finalize_coding_system (void *header, int for_disksave)
370 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
371 /* Since coding systems never go away, this function is not
372 necessary. But it would be necessary if we changed things
373 so that coding systems could go away. */
374 if (!for_disksave) /* see comment in lstream.c */
376 switch (CODING_SYSTEM_TYPE (c))
379 case CODESYS_ISO2022:
/* Both tables are optional; only free the ones that were created.  */
380 if (c->iso2022.input_conv)
382 Dynarr_free (c->iso2022.input_conv);
383 c->iso2022.input_conv = 0;
385 if (c->iso2022.output_conv)
387 Dynarr_free (c->iso2022.output_conv);
388 c->iso2022.output_conv = 0;
/* Translate the Lisp object SYMBOL into an eol type.

   SYMBOL must satisfy CHECK_SYMBOL.  nil maps to EOL_AUTODETECT, Qlf
   to EOL_LF, Qcrlf to EOL_CRLF and Qcr to EOL_CR.  Any other symbol
   signals the error "Unrecognized eol type" with SYMBOL as its
   datum.  */
399 symbol_to_eol_type (Lisp_Object symbol)
401 CHECK_SYMBOL (symbol);
402 if (NILP (symbol)) return EOL_AUTODETECT;
403 if (EQ (symbol, Qlf)) return EOL_LF;
404 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
405 if (EQ (symbol, Qcr)) return EOL_CR;
407 signal_simple_error ("Unrecognized eol type", symbol);
408 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: return the Lisp object that stands
   for TYPE.  EOL_LF, EOL_CRLF and EOL_CR yield Qlf, Qcrlf and Qcr;
   EOL_AUTODETECT yields Qnil.  */
412 eol_type_to_symbol (enum eol_type type)
417 case EOL_LF: return Qlf;
418 case EOL_CRLF: return Qcrlf;
419 case EOL_CR: return Qcr;
420 case EOL_AUTODETECT: return Qnil;
/* Create the three end-of-line subsidiaries of CODESYS.

   For each of the suffixes "-unix", "-dos" and "-mac" the name of
   CODESYS is extended with the suffix, interned, and used as the
   target of Fcopy_coding_system.  The copy gets the matching eol type
   (EOL_LF, EOL_CRLF, EOL_CR) and a mnemonic built from the parent's
   mnemonic plus "", ":T" or ":t", and is stored in the corresponding
   CODING_SYSTEM_EOL_* slot of CODESYS.

   NOTE(review): codesys_mnemonic stays 0 when the parent's mnemonic is
   not a string.  The declaration of mlen and the guards that should
   skip the mnemonic steps in that case are not visible in this
   excerpt -- confirm they exist.  */
425 setup_eol_coding_systems (Lisp_Coding_System *codesys)
427 Lisp_Object codesys_obj;
428 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* len + 7 leaves room for the longest suffix appended below ("-unix",
   five characters) and the terminating NUL.  */
429 char *codesys_name = (char *) alloca (len + 7);
431 char *codesys_mnemonic=0;
433 Lisp_Object codesys_name_sym, sub_codesys_obj;
437 XSETCODING_SYSTEM (codesys_obj, codesys);
/* Copy the name without a terminator; the strcpy in the macro below
   writes the suffix and its NUL starting at offset len.  */
439 memcpy (codesys_name,
440 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
442 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
444 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
/* Same scheme for the mnemonic; the suffixes here are at most two
   characters (":T", ":t").  */
445 codesys_mnemonic = (char *) alloca (mlen + 7);
446 memcpy (codesys_mnemonic,
447 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
/* DEFINE_SUB_CODESYS (op_sys, op_sys_abbr, Type): build one subsidiary.
   Type is both the eol type assigned to the copy and, through token
   pasting, the name of the CODING_SYSTEM_<Type> slot that receives
   it.  */
450 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
451 strcpy (codesys_name + len, "-" op_sys); \
453 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
454 codesys_name_sym = intern (codesys_name); \
455 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
456 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
458 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
459 build_string (codesys_mnemonic); \
460 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
463 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
464 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
465 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
468 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
469 Return t if OBJECT is a coding system.
470 A coding system is an object that defines how text containing multiple
471 character sets is encoded into a stream of (typically 8-bit) bytes.
472 The coding system is used to decode the stream into a series of
473 characters (which may be from multiple charsets) when the text is read
474 from a file or process, and is used to encode the text back into the
475 same format when it is written out to a file or process.
477 For example, many ISO2022-compliant coding systems (such as Compound
478 Text, which is used for inter-client data under the X Window System)
479 use escape sequences to switch between different charsets -- Japanese
480 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
481 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
482 `make-coding-system' for more information.
484 Coding systems are normally identified using a symbol, and the
485 symbol is accepted in place of the actual coding system object whenever
486 a coding system is called for. (This is similar to how faces work.)
490 return CODING_SYSTEMP (object) ? Qt : Qnil;
493 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
494 Retrieve the coding system of the given name.
496 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
497 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
498 If there is no such coding system, nil is returned. Otherwise the
499 associated coding system object is returned.
501 (coding_system_or_name))
503 if (CODING_SYSTEMP (coding_system_or_name))
504 return coding_system_or_name;
506 if (NILP (coding_system_or_name))
507 coding_system_or_name = Qbinary;
509 CHECK_SYMBOL (coding_system_or_name);
511 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
514 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
515 Retrieve the coding system of the given name.
516 Same as `find-coding-system' except that if there is no such
517 coding system, an error is signaled instead of returning nil.
521 Lisp_Object coding_system = Ffind_coding_system (name);
523 if (NILP (coding_system))
524 signal_simple_error ("No such coding system", name);
525 return coding_system;
528 /* We store the coding systems in hash tables with the names as the key and the
529 actual coding system object as the value. Occasionally we need to use them
530 in a list format. These routines provide us with that. */
/* Closure handed through elisp_maphash to
   add_coding_system_to_list_mapper: it carries a pointer to the Lisp
   list that the mapper extends.  */
531 struct coding_system_list_closure
533 Lisp_Object *coding_system_list;
/* elisp_maphash callback used by Fcoding_system_list.

   VALUE is a coding system object from the hash table; its name is
   consed onto the front of the list pointed to by the closure.  KEY is
   not used by the visible lines.  Because the coding system's own name
   is pushed rather than KEY, an object registered under several keys
   (see Fdefine_coding_system_alias) contributes its name once per
   key.  */
537 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
538 void *coding_system_list_closure)
540 /* This function can GC */
541 struct coding_system_list_closure *cscl =
542 (struct coding_system_list_closure *) coding_system_list_closure;
543 Lisp_Object *coding_system_list = cscl->coding_system_list;
545 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
546 *coding_system_list);
550 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
551 Return a list of the names of all defined coding systems.
555 Lisp_Object coding_system_list = Qnil;
557 struct coding_system_list_closure coding_system_list_closure;
559 GCPRO1 (coding_system_list);
560 coding_system_list_closure.coding_system_list = &coding_system_list;
561 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
562 &coding_system_list_closure);
565 return coding_system_list;
568 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
569 Return the name of the given coding system.
573 coding_system = Fget_coding_system (coding_system);
574 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a coding-system record of type TYPE named NAME.

   The record is zeroed, then every Lisp_Object slot touched here is
   set to Qnil, the eol type is set to EOL_AUTODETECT, and the
   type-specific Lisp slots (the four ISO2022 initial charsets, or the
   CCL decode and encode programs) are cleared as well.  NAME is not
   entered into Vcoding_system_hash_table here; callers in this file do
   that with Fputhash.

   NOTE(review): no visible line initializes the doc-string slot.
   Fmake_coding_system assigns it afterwards and Fcopy_coding_system
   overwrites the whole record body -- confirm no other caller relies
   on it.  */
577 static Lisp_Coding_System *
578 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
580 Lisp_Coding_System *codesys =
581 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
583 zero_lcrecord (codesys);
584 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
585 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
586 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
587 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
588 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
589 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
590 CODING_SYSTEM_TYPE (codesys) = type;
591 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
/* Type-specific Lisp slots.  */
593 if (type == CODESYS_ISO2022)
596 for (i = 0; i < 4; i++)
597 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
599 else if (type == CODESYS_CCL)
601 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
602 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
605 CODING_SYSTEM_NAME (codesys) = name;
611 /* Given a list of charset conversion specs as specified in a Lisp
612 program, parse it into STORE_HERE. */
/* Each element of SPEC_LIST must be a proper list of exactly two
   items, (FROM TO); anything else signals "Invalid charset conversion
   spec".  Both items are resolved with Fget_charset, and the two
   resulting charsets must have the same XCHARSET_TYPE, otherwise an
   error is signaled.  Accepted pairs are appended to STORE_HERE in
   list order.  Entries appended before an error is signaled stay in
   STORE_HERE.  */
615 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
616 Lisp_Object spec_list)
620 EXTERNAL_LIST_LOOP (rest, spec_list)
622 Lisp_Object car = XCAR (rest);
623 Lisp_Object from, to;
624 struct charset_conversion_spec spec;
/* Shape check: a cons whose cdr is a cons whose cdr is nil.  */
626 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
627 signal_simple_error ("Invalid charset conversion spec", car);
628 from = Fget_charset (XCAR (car));
629 to = Fget_charset (XCAR (XCDR (car)));
630 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
631 signal_simple_error_2
632 ("Attempted conversion between different charset types",
634 spec.from_charset = from;
635 spec.to_charset = to;
637 Dynarr_add (store_here, spec);
641 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
642 specs, return the equivalent as the Lisp programmer would see it.
644 If LOAD_HERE is 0, return Qnil. */
/* Each stored spec becomes a two-element list (FROM TO).  The lists
   are consed onto the front of the result and the result is reversed
   at the end, so the returned list is in dynarr order.
   NOTE(review): the check for a null LOAD_HERE promised above is not
   visible in this excerpt.  */
647 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
654 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
656 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
657 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
660 return Fnreverse (result);
665 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
666 Register symbol NAME as a coding system.
668 TYPE describes the conversion method used and should be one of
671 Automatic conversion. XEmacs attempts to detect the coding system
674 No conversion. Use this for binary files and such. On output,
675 graphic characters that are not in ASCII or Latin-1 will be
676 replaced by a ?. (For a no-conversion-encoded buffer, these
677 characters will only be present if you explicitly insert them.)
679 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
681 ISO 10646 UCS-4 encoding.
683 ISO 10646 UTF-8 encoding.
685 Any ISO2022-compliant encoding. Among other things, this includes
686 JIS (the Japanese encoding commonly used for e-mail), EUC (the
687 standard Unix encoding for Japanese and other languages), and
688 Compound Text (the encoding used in X11). You can specify more
689 specific information about the conversion with the FLAGS argument.
691 Big5 (the encoding commonly used for Taiwanese).
693 The conversion is performed using a user-written pseudo-code
694 program. CCL (Code Conversion Language) is the name of this
697 Write out or read in the raw contents of the memory representing
698 the buffer's text. This is primarily useful for debugging
699 purposes, and is only enabled when XEmacs has been compiled with
700 DEBUG_XEMACS defined (via the --debug configure option).
701 WARNING: Reading in a file using 'internal conversion can result
702 in an internal inconsistency in the memory representing a
703 buffer's text, which will produce unpredictable results and may
704 cause XEmacs to crash. Under normal circumstances you should
705 never use 'internal conversion.
707 DOC-STRING is a string describing the coding system.
709 PROPS is a property list, describing the specific nature of the
710 coding system. Recognized properties are:
713 String to be displayed in the modeline when this coding system is
717 End-of-line conversion to be used. It should be one of
720 Automatically detect the end-of-line type (LF, CRLF,
721 or CR). Also generate subsidiary coding systems named
722 `NAME-unix', `NAME-dos', and `NAME-mac', that are
723 identical to this coding system but have an EOL-TYPE
724 value of 'lf, 'crlf, and 'cr, respectively.
726 The end of a line is marked externally using ASCII LF.
727 Since this is also the way that XEmacs represents an
728 end-of-line internally, specifying this option results
729 in no end-of-line conversion. This is the standard
730 format for Unix text files.
732 The end of a line is marked externally using ASCII
733 CRLF. This is the standard format for MS-DOS text
736 The end of a line is marked externally using ASCII CR.
737 This is the standard format for Macintosh text files.
739 Automatically detect the end-of-line type but do not
740 generate subsidiary coding systems. (This value is
741 converted to nil when stored internally, and
742 `coding-system-property' will return nil.)
744 'post-read-conversion
745 Function called after a file has been read in, to perform the
746 decoding. Called with two arguments, BEG and END, denoting
747 a region of the current buffer to be decoded.
749 'pre-write-conversion
750 Function called before a file is written out, to perform the
751 encoding. Called with two arguments, BEG and END, denoting
752 a region of the current buffer to be encoded.
755 The following additional properties are recognized if TYPE is 'iso2022:
761 The character set initially designated to the G0 - G3 registers.
762 The value should be one of
764 -- A charset object (designate that character set)
765 -- nil (do not ever use this register)
766 -- t (no character set is initially designated to
767 the register, but may be later on; this automatically
768 sets the corresponding `force-g*-on-output' property)
774 If non-nil, send an explicit designation sequence on output before
775 using the specified register.
778 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
779 "ESC $ B" on output in place of the full designation sequences
780 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
783 If non-nil, don't designate ASCII to G0 at each end of line on output.
784 Setting this to non-nil also suppresses other state-resetting that
785 normally happens at the end of a line.
788 If non-nil, don't designate ASCII to G0 before control chars on output.
791 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
795 If non-nil, use locking-shift (SO/SI) instead of single-shift
796 or designation by escape sequence.
799 If non-nil, don't use ISO6429's direction specification.
802 If non-nil, literal control characters that are the same as
803 the beginning of a recognized ISO2022 or ISO6429 escape sequence
804 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
805 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
806 so that they can be properly distinguished from an escape sequence.
807 (Note that doing this results in a non-portable encoding.) This
808 encoding flag is used for byte-compiled files. Note that ESC
809 is a good choice for a quoting character because there are no
810 escape sequences whose second byte is a character from the Control-0
811 or Control-1 character sets; this is explicitly disallowed by the
814 'input-charset-conversion
815 A list of conversion specifications, specifying conversion of
816 characters in one charset to another when decoding is performed.
817 Each specification is a list of two elements: the source charset,
818 and the destination charset.
820 'output-charset-conversion
821 A list of conversion specifications, specifying conversion of
822 characters in one charset to another when encoding is performed.
823 The form of each specification is the same as for
824 'input-charset-conversion.
827 The following additional properties are recognized (and required)
831 CCL program used for decoding (converting to internal format).
834 CCL program used for encoding (converting to external format).
836 (name, type, doc_string, props))
838 Lisp_Coding_System *codesys;
839 Lisp_Object rest, key, value;
840 enum coding_system_type ty;
841 int need_to_setup_eol_systems = 1;
843 /* Convert type to constant */
844 if (NILP (type) || EQ (type, Qundecided))
845 { ty = CODESYS_AUTODETECT; }
847 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
848 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
849 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
850 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
851 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
852 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
854 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
856 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
859 signal_simple_error ("Invalid coding system type", type);
863 codesys = allocate_coding_system (ty, name);
865 if (NILP (doc_string))
866 doc_string = build_string ("");
868 CHECK_STRING (doc_string);
869 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
872 if (ty == CODESYS_NO_CONVERSION)
873 codesys->fixed.size = 1;
875 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
877 if (EQ (key, Qmnemonic))
880 CHECK_STRING (value);
881 CODING_SYSTEM_MNEMONIC (codesys) = value;
884 else if (EQ (key, Qeol_type))
886 need_to_setup_eol_systems = NILP (value);
889 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
892 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
893 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
895 else if (ty == CODESYS_ISO2022)
897 #define FROB_INITIAL_CHARSET(charset_num) \
898 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
899 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
901 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
902 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
903 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
904 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
906 #define FROB_FORCE_CHARSET(charset_num) \
907 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
909 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
910 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
911 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
912 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
914 #define FROB_BOOLEAN_PROPERTY(prop) \
915 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
917 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
918 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
919 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
920 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
921 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
922 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
923 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
925 else if (EQ (key, Qinput_charset_conversion))
927 codesys->iso2022.input_conv =
928 Dynarr_new (charset_conversion_spec);
929 parse_charset_conversion_specs (codesys->iso2022.input_conv,
932 else if (EQ (key, Qoutput_charset_conversion))
934 codesys->iso2022.output_conv =
935 Dynarr_new (charset_conversion_spec);
936 parse_charset_conversion_specs (codesys->iso2022.output_conv,
940 signal_simple_error ("Unrecognized property", key);
942 else if (EQ (type, Qccl))
944 if (EQ (key, Qdecode))
946 CHECK_VECTOR (value);
947 CODING_SYSTEM_CCL_DECODE (codesys) = value;
949 else if (EQ (key, Qencode))
951 CHECK_VECTOR (value);
952 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
955 signal_simple_error ("Unrecognized property", key);
959 signal_simple_error ("Unrecognized property", key);
962 if (need_to_setup_eol_systems)
963 setup_eol_coding_systems (codesys);
966 Lisp_Object codesys_obj;
967 XSETCODING_SYSTEM (codesys_obj, codesys);
968 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
973 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
974 Copy OLD-CODING-SYSTEM to NEW-NAME.
975 If NEW-NAME does not name an existing coding system, a new one will
be created.
*/
978 (old_coding_system, new_name))
980 Lisp_Object new_coding_system;
981 old_coding_system = Fget_coding_system (old_coding_system);
982 new_coding_system = Ffind_coding_system (new_name);
983 if (NILP (new_coding_system))
985 XSETCODING_SYSTEM (new_coding_system,
986 allocate_coding_system
987 (XCODING_SYSTEM_TYPE (old_coding_system),
989 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
993 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
994 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
995 memcpy (((char *) to ) + sizeof (to->header),
996 ((char *) from) + sizeof (from->header),
997 sizeof (*from) - sizeof (from->header));
1000 return new_coding_system;
/* Lisp primitive `define-coding-system-alias'.
   Signals an error if ALIAS already names a coding system.  Otherwise
   ALIAS is entered into Vcoding_system_hash_table as an additional key
   for the resolved CODING-SYSTEM.  When the target's EOL type is
   EOL_AUTODETECT, the FROB macro recursively defines an alias for each
   non-nil EOL subsidiary, named by appending a suffix (e.g. "-dos" for
   the CRLF subsidiary) to the name of ALIAS. */
1003 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1004 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1006 (alias, coding_system))
1008 CHECK_SYMBOL (alias);
1009 if (!NILP (Ffind_coding_system (alias)))
1010 signal_simple_error ("Symbol already names a coding system", alias);
1011 coding_system = Fget_coding_system (coding_system);
1012 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1014 /* Set up aliases for subsidiaries. */
1015 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1018 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1019 #define FROB(type, name) \
1021 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1022 if (!NILP (subsidiary)) \
1023 Fdefine_coding_system_alias \
1024 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1027 FROB (CRLF, "-dos");
1031 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1032 but it doesn't look intentional, so I'd rather return something
1033 meaningful or nothing at all. */
/* Return the variant of CODING_SYSTEM that corresponds to EOL type TYPE.
   A coding system whose own EOL type is not EOL_AUTODETECT has no
   subsidiaries to choose from and is returned unchanged, as is any
   coding system when TYPE is EOL_AUTODETECT.  Otherwise the LF, CR or
   CRLF subsidiary stored in the coding system is selected; if that slot
   is nil, CODING_SYSTEM itself is returned. */
1038 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1040 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1041 Lisp_Object new_coding_system;
1043 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1044 return coding_system;
1048 case EOL_AUTODETECT: return coding_system;
1049 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1050 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1051 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1055 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* Lisp primitive `subsidiary-coding-system'.
   Resolves CODING-SYSTEM with Fget_coding_system, converts the EOL-TYPE
   symbol with symbol_to_eol_type, and delegates to the C function
   subsidiary_coding_system. */
1058 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1059 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1061 (coding_system, eol_type))
1063 coding_system = Fget_coding_system (coding_system);
1065 return subsidiary_coding_system (coding_system,
1066 symbol_to_eol_type (eol_type));
1070 /************************************************************************/
1071 /* Coding system accessors */
1072 /************************************************************************/
/* Lisp primitive `coding-system-doc-string'.
   Resolves CODING-SYSTEM with Fget_coding_system and returns the doc
   string stored in the coding system object. */
1074 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1075 Return the doc string for CODING-SYSTEM.
1079 coding_system = Fget_coding_system (coding_system);
1080 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* Lisp primitive `coding-system-type'.
   Resolves CODING-SYSTEM with Fget_coding_system and maps its internal
   type code (CODESYS_*) to the corresponding Lisp symbol. */
1083 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1084 Return the type of CODING-SYSTEM.
1088 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1091 case CODESYS_AUTODETECT: return Qundecided;
1093 case CODESYS_SHIFT_JIS: return Qshift_jis;
1094 case CODESYS_ISO2022: return Qiso2022;
1095 case CODESYS_BIG5: return Qbig5;
1096 case CODESYS_UCS4: return Qucs4;
1097 case CODESYS_UTF8: return Qutf8;
1098 case CODESYS_CCL: return Qccl;
1100 case CODESYS_NO_CONVERSION: return Qno_conversion;
1102 case CODESYS_INTERNAL: return Qinternal;
/* Return the name of the initial charset designated to register GNUM of
   the ISO2022 coding system CODING_SYSTEM, or Qnil when that slot does
   not hold a charset object. */
1109 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1112 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1114 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp primitive `coding-system-charset'.
   Resolves CODING-SYSTEM with Fget_coding_system and delegates to
   coding_system_charset, passing GNUM converted to a C int with XINT. */
1117 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1118 Return initial charset of CODING-SYSTEM designated to GNUM.
1121 (coding_system, gnum))
1123 coding_system = Fget_coding_system (coding_system);
1126 return coding_system_charset (coding_system, XINT (gnum));
/* Lisp primitive `coding-system-property'.
   The function works in two phases.  First PROP is validated against
   the_codesys_prop_dynarr: a property that is not listed there signals
   "Unrecognized property", and a property restricted to ISO2022 or CCL
   coding systems signals an error when CODING-SYSTEM is of another type.
   Then PROP is compared against the known property symbols and the
   matching slot of the coding system is returned.  Generic properties
   are tried first, then the ISO2022-specific ones (charsets, force-on-
   output flags, boolean flags, charset conversion specs), then the CCL
   decode/encode programs.  Boolean slots are returned as Qt or Qnil. */
1130 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1131 Return the PROP property of CODING-SYSTEM.
1133 (coding_system, prop))
1136 enum coding_system_type type;
1138 coding_system = Fget_coding_system (coding_system);
1139 CHECK_SYMBOL (prop);
1140 type = XCODING_SYSTEM_TYPE (coding_system);
1142 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1143 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1146 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1148 case CODESYS_PROP_ALL_OK:
1151 case CODESYS_PROP_ISO2022:
1152 if (type != CODESYS_ISO2022)
1154 ("Property only valid in ISO2022 coding systems",
1158 case CODESYS_PROP_CCL:
1159 if (type != CODESYS_CCL)
1161 ("Property only valid in CCL coding systems",
1171 signal_simple_error ("Unrecognized property", prop);
1173 if (EQ (prop, Qname))
1174 return XCODING_SYSTEM_NAME (coding_system);
1175 else if (EQ (prop, Qtype))
1176 return Fcoding_system_type (coding_system);
1177 else if (EQ (prop, Qdoc_string))
1178 return XCODING_SYSTEM_DOC_STRING (coding_system);
1179 else if (EQ (prop, Qmnemonic))
1180 return XCODING_SYSTEM_MNEMONIC (coding_system);
1181 else if (EQ (prop, Qeol_type))
1182 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1183 else if (EQ (prop, Qeol_lf))
1184 return XCODING_SYSTEM_EOL_LF (coding_system);
1185 else if (EQ (prop, Qeol_crlf))
1186 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1187 else if (EQ (prop, Qeol_cr))
1188 return XCODING_SYSTEM_EOL_CR (coding_system);
1189 else if (EQ (prop, Qpost_read_conversion))
1190 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1191 else if (EQ (prop, Qpre_write_conversion))
1192 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1194 else if (type == CODESYS_ISO2022)
1196 if (EQ (prop, Qcharset_g0))
1197 return coding_system_charset (coding_system, 0);
1198 else if (EQ (prop, Qcharset_g1))
1199 return coding_system_charset (coding_system, 1);
1200 else if (EQ (prop, Qcharset_g2))
1201 return coding_system_charset (coding_system, 2);
1202 else if (EQ (prop, Qcharset_g3))
1203 return coding_system_charset (coding_system, 3);
1205 #define FORCE_CHARSET(charset_num) \
1206 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1207 (coding_system, charset_num) ? Qt : Qnil)
1209 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1210 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1211 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1212 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1214 #define LISP_BOOLEAN(prop) \
1215 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1217 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1218 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1219 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1220 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1221 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1222 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1223 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1225 else if (EQ (prop, Qinput_charset_conversion))
1227 unparse_charset_conversion_specs
1228 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1229 else if (EQ (prop, Qoutput_charset_conversion))
1231 unparse_charset_conversion_specs
1232 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1236 else if (type == CODESYS_CCL)
1238 if (EQ (prop, Qdecode))
1239 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1240 else if (EQ (prop, Qencode))
1241 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1249 return Qnil; /* not reached */
1253 /************************************************************************/
1254 /* Coding category functions */
1255 /************************************************************************/
/* Translate the coding category symbol SYMBOL into its index in
   coding_category_symbol[].  Signals an error if SYMBOL is not a symbol
   or does not match any entry of the table. */
1258 decode_coding_category (Lisp_Object symbol)
1262 CHECK_SYMBOL (symbol);
1263 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1264 if (EQ (coding_category_symbol[i], symbol))
1267 signal_simple_error ("Unrecognized coding category", symbol);
1268 return 0; /* not reached */
/* Lisp primitive `coding-category-list'.
   Conses the entries of coding_category_symbol[] onto a list, walking
   from CODING_CATEGORY_LAST down to 0 so that the finished list is in
   table order. */
1271 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1272 Return a list of all recognized coding categories.
1277 Lisp_Object list = Qnil;
1279 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1280 list = Fcons (coding_category_symbol[i], list);
/* Lisp primitive `set-coding-priority-list'.
   category_to_priority[] is a scratch map from category index to its new
   priority; -1 marks a category that has not been assigned yet.
   Categories named in LIST receive priorities in list order, and a
   category that occurs twice signals an error.  The remaining categories
   are then appended in their current priority order.  Finally the map is
   inverted into the global coding_category_by_priority[]. */
1284 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1285 Change the priority order of the coding categories.
1286 LIST should be a list of coding categories, in descending order of
1287 priority. Unspecified coding categories will be lower in priority
1288 than all specified ones, in the same relative order they were in
1293 int category_to_priority[CODING_CATEGORY_LAST + 1];
1297 /* First generate a list that maps coding categories to priorities. */
1299 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1300 category_to_priority[i] = -1;
1302 /* Highest priority comes from the specified list. */
1304 EXTERNAL_LIST_LOOP (rest, list)
1306 int cat = decode_coding_category (XCAR (rest));
1308 if (category_to_priority[cat] >= 0)
1309 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1310 category_to_priority[cat] = i++;
1313 /* Now go through the existing categories by priority to retrieve
1314 the categories not yet specified and preserve their priority
1316 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1318 int cat = coding_category_by_priority[j];
1319 if (category_to_priority[cat] < 0)
1320 category_to_priority[cat] = i++;
1323 /* Now we need to construct the inverse of the mapping we just
1326 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1327 coding_category_by_priority[category_to_priority[i]] = i;
1329 /* Phew! That was confusing. */
/* Lisp primitive `coding-priority-list'.
   Walks coding_category_by_priority[] from the last entry to the first
   while consing, so the finished list starts with the category stored at
   index 0 of that table. */
1333 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1334 Return a list of coding categories in descending order of priority.
1339 Lisp_Object list = Qnil;
1341 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1342 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
/* Lisp primitive `set-coding-category-system'.
   Decodes CODING-CATEGORY to its table index, resolves CODING-SYSTEM
   with Fget_coding_system, and stores the result in
   coding_category_system[] at that index. */
1347 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1348 Change the coding system associated with a coding category.
1350 (coding_category, coding_system))
1352 int cat = decode_coding_category (coding_category);
1354 coding_system = Fget_coding_system (coding_system);
1355 coding_category_system[cat] = coding_system;
/* Lisp primitive `coding-category-system'.
   Decodes CODING-CATEGORY to its table index and returns the name of the
   coding system stored for it in coding_category_system[]. */
1359 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1360 Return the coding system associated with a coding category.
1364 int cat = decode_coding_category (coding_category);
1365 Lisp_Object sys = coding_category_system[cat];
1368 return XCODING_SYSTEM_NAME (sys);
1373 /************************************************************************/
1374 /* Detecting the encoding of data */
1375 /************************************************************************/
/* State handed to detect_eol_type, detect_coding_type and the
   per-encoding detectors.  It is kept by the caller between calls so
   that input can be examined one chunk at a time. */
1377 struct detection_state
/* EOL type determined so far; EOL_AUTODETECT while still unknown. */
1379 enum eol_type eol_type;
1415 struct iso2022_decoder iso;
1417 int high_byte_count;
1418 unsigned int saw_single_shift:1;
/* Predicate on the control character C, used by detect_coding_type to
   decide whether a byte below 0x20 still counts as plain text.
   NOTE(review): presumably returns nonzero for the characters listed in
   the switch and zero otherwise -- confirm against the full source. */
1431 acceptable_control_char_p (int c)
1435 /* Allow and ignore control characters that you might
1436 reasonably see in a text file */
1441 case 8: /* backspace */
1442 case 11: /* vertical tab */
1443 case 12: /* form feed */
1444 case 26: /* MS-DOS C-z junk */
1445 case 31: /* '^_' -- for info */
/* Return nonzero if MASK has zero or exactly one bit set.
   MASK & (MASK - 1) clears the lowest set bit of MASK, so the result is
   zero precisely when there was at most one bit to clear. */
1453 mask_has_at_most_one_bit_p (int mask)
1455 /* Perhaps the only thing useful you learn from intensive Microsoft
1456 technical interviews */
1457 return (mask & (mask - 1)) == 0;
/* Examine bytes of SRC to determine the end-of-line convention.
   st->eol.just_saw_cr remembers that the previous byte was a CR and
   st->eol.seen_anything that at least one byte has been examined; both
   live in ST so the scan can continue across calls.  Returns
   EOL_AUTODETECT when no decision has been reached.
   NOTE(review): the branches that return a definite EOL type are not
   part of this excerpt -- confirm against the full source. */
1460 static enum eol_type
1461 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1470 st->eol.just_saw_cr = 1;
1475 if (st->eol.just_saw_cr)
1477 else if (st->eol.seen_anything)
1480 else if (st->eol.just_saw_cr)
1482 st->eol.just_saw_cr = 0;
1484 st->eol.seen_anything = 1;
1487 return EOL_AUTODETECT;
1490 /* Attempt to determine the encoding and EOL type of the given text.
1491 Before calling this function for the first time, you must initialize
1492 st->eol_type as appropriate and initialize st->mask to ~0.
1494 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1497 st->mask holds the determined coding category mask, or ~0 if only
1498 ASCII has been seen so far.
1502 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1503 is present in st->mask
1504 1 == definitive answers are here for both st->eol_type and st->mask
1508 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1509 unsigned int n, int just_do_eol)
/* EOL detection is only run while the EOL type is still undetermined. */
1513 if (st->eol_type == EOL_AUTODETECT)
1514 st->eol_type = detect_eol_type (st, src, n);
1517 return st->eol_type != EOL_AUTODETECT;
/* Until something suspicious has been seen, scan for the first byte that
   is either a disallowed control character or has its high bit set. */
1519 if (!st->seen_non_ascii)
1521 for (; n; n--, src++)
1524 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1526 st->seen_non_ascii = 1;
1528 st->shift_jis.mask = ~0;
1532 st->iso2022.mask = ~0;
/* Each detector is re-run only while its own mask still admits more than
   one category. */
1542 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1543 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1544 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1545 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1546 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1547 st->big5.mask = detect_coding_big5 (st, src, n);
1548 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1549 st->utf8.mask = detect_coding_utf8 (st, src, n);
1550 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1551 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1554 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1555 | st->utf8.mask | st->ucs4.mask;
/* The answer counts as definitive only if at most one category was left
   before the no-conversion bit is added and the EOL type is known. */
1558 int retval = mask_has_at_most_one_bit_p (st->mask);
1559 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1560 return retval && st->eol_type != EOL_AUTODETECT;
/* Choose a coding system for the detection result MASK.
   For (basically) ASCII input the default value of
   `buffer-file-coding-system' is used; if that value does not name a
   coding system a warning is issued, the default is reset to nil, and
   raw-text is used.  Otherwise the coding categories are tried in
   priority order and the coding system of the first category that is
   present in MASK and has a coding system assigned is returned.  If no
   category qualifies, raw-text is returned. */
1565 coding_system_from_mask (int mask)
1569 /* If the file was entirely or basically ASCII, use the
1570 default value of `buffer-file-coding-system'. */
1571 Lisp_Object retval =
1572 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1575 retval = Ffind_coding_system (retval);
1579 (Qbad_variable, Qwarning,
1580 "Invalid `default-buffer-file-coding-system', set to nil");
1581 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1585 retval = Fget_coding_system (Qraw_text);
1593 mask = postprocess_iso2022_mask (mask);
1595 /* Look through the coding categories by priority and find
1596 the first one that is allowed. */
1597 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1599 cat = coding_category_by_priority[i];
1600 if ((mask & (1 << cat)) &&
1601 !NILP (coding_category_system[cat]))
1605 return coding_category_system[cat];
1607 return Fget_coding_system (Qraw_text);
1611 /* Given a seekable read stream and potential coding system and EOL type
1612 as specified, do any autodetection that is called for. If the
1613 coding system and/or EOL type are not autodetect, they will be left
1614 alone; but this function will never return an autodetect coding system
1617 This function does not automatically fetch subsidiary coding systems;
1618 that should be unnecessary with the explicit eol-type argument. */
1621 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1622 enum eol_type *eol_type_in_out)
1624 struct detection_state decst;
/* An unspecified EOL type defaults to the coding system's own EOL type. */
1626 if (*eol_type_in_out == EOL_AUTODETECT)
1627 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1630 decst.eol_type = *eol_type_in_out;
1633 /* If autodetection is called for, do it now. */
1634 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1635 *eol_type_in_out == EOL_AUTODETECT)
1637 unsigned char random_buffer[4096];
1639 Lisp_Object coding_system = Qnil;
1641 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1644 unsigned char *cp = random_buffer;
/* Search the first chunk for the literal text "coding:"; every step of
   the match re-checks that CP is still inside the data that was read. */
1646 while (cp < random_buffer + nread)
1648 if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1649 (*cp++ == 'o') && (cp < random_buffer + nread) &&
1650 (*cp++ == 'd') && (cp < random_buffer + nread) &&
1651 (*cp++ == 'i') && (cp < random_buffer + nread) &&
1652 (*cp++ == 'n') && (cp < random_buffer + nread) &&
1653 (*cp++ == 'g') && (cp < random_buffer + nread) &&
1654 (*cp++ == ':') && (cp < random_buffer + nread))
1656 unsigned char coding_system_name[4096 - 6];
1657 unsigned char *np = coding_system_name;
/* Skip blanks after the colon, then take the name up to the next space,
   tab or semicolon and look it up as a coding system. */
1659 while ( (cp < random_buffer + nread)
1660 && ((*cp == ' ') || (*cp == '\t')) )
1664 while ( (cp < random_buffer + nread) &&
1665 (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1671 = Ffind_coding_system (intern (coding_system_name));
/* No coding system was named explicitly: run detection on the data.
   Only the EOL type is detected when the coding system itself is
   already specified. */
1675 if (EQ(coding_system, Qnil))
1677 if (detect_coding_type (&decst, random_buffer, nread,
1678 XCODING_SYSTEM_TYPE (*codesys_in_out)
1679 != CODESYS_AUTODETECT))
1681 nread = Lstream_read (stream,
1682 random_buffer, sizeof (random_buffer));
1687 *eol_type_in_out = decst.eol_type;
1688 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1690 if (EQ(coding_system, Qnil))
1691 *codesys_in_out = coding_system_from_mask (decst.mask);
1693 *codesys_in_out = coding_system;
1696 /* If we absolutely can't determine the EOL type, just assume LF. */
1697 if (*eol_type_in_out == EOL_AUTODETECT)
1698 *eol_type_in_out = EOL_LF;
/* Rewind so the caller reads the stream from the beginning again. */
1700 Lstream_rewind (stream);
/* Lisp primitive `detect-coding-region'.
   The region is read through a Lisp buffer input stream wrapped in an
   encoding stream that uses the `binary' coding system, and the
   resulting bytes are fed to detect_coding_type in chunks of up to 4096
   bytes.  If the mask is still ~0 afterwards, the result is derived from
   the `undecided' coding system.  Otherwise a list is consed, walking
   the priority table from lowest to highest priority, containing the
   coding system of every category present in the mask (replaced by its
   EOL subsidiary when non-nil).  Both streams are closed/deleted before
   returning. */
1703 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1704 Detect coding system of the text in the region between START and END.
1705 Return a list of possible coding systems ordered by priority.
1706 If only ASCII characters are found, it returns 'undecided or one of
1707 its subsidiary coding systems according to a detected end-of-line
1708 type. Optional arg BUFFER defaults to the current buffer.
1710 (start, end, buffer))
1712 Lisp_Object val = Qnil;
1713 struct buffer *buf = decode_buffer (buffer, 0);
1715 Lisp_Object instream, lb_instream;
1716 Lstream *istr, *lb_istr;
1717 struct detection_state decst;
1718 struct gcpro gcpro1, gcpro2;
1720 get_buffer_range_char (buf, start, end, &b, &e, 0);
1721 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1722 lb_istr = XLSTREAM (lb_instream);
1723 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1724 istr = XLSTREAM (instream);
1725 GCPRO2 (instream, lb_instream);
1727 decst.eol_type = EOL_AUTODETECT;
1731 unsigned char random_buffer[4096];
1732 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1736 if (detect_coding_type (&decst, random_buffer, nread, 0))
1740 if (decst.mask == ~0)
1741 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1749 decst.mask = postprocess_iso2022_mask (decst.mask);
1751 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1753 int sys = coding_category_by_priority[i];
1754 if (decst.mask & (1 << sys))
1756 Lisp_Object codesys = coding_category_system[sys];
1757 if (!NILP (codesys))
1758 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1759 val = Fcons (codesys, val);
1763 Lstream_close (istr);
1765 Lstream_delete (istr);
1766 Lstream_delete (lb_istr);
1771 /************************************************************************/
1772 /* Converting to internal Mule format ("decoding") */
1773 /************************************************************************/
1775 /* A decoding stream is a stream used for decoding text (i.e.
1776 converting from some external format to internal format).
1777 The decoding-stream object keeps track of the actual coding
1778 stream, the stream that is at the other end, and data that
1779 needs to be persistent across the lifetime of the stream. */
1781 /* Handle the EOL stuff related to just-read-in character C.
1782 EOL_TYPE is the EOL type of the coding stream.
1783 FLAGS is the current value of FLAGS in the coding stream, and may
1784 be modified by this macro. (The macro only looks at the
1785 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1786 bytes are to be written. You need to also define a local goto
1787 label "label_continue_loop" that is at the end of the main
1788 character-reading loop.
1790 If C is a CR character, then this macro handles it entirely and
1791 jumps to label_continue_loop. Otherwise, this macro does not add
1792 anything to DST, and continues normally. You should continue
1793 processing C normally after this macro.

In the CR-handling branch: with EOL_CR a '\n' is written to DST; with
EOL_CRLF and no CR pending, the CR is only recorded by setting
CODING_STATE_CR; in every other case C is written to DST as is.  In
the remaining branch, taken while CODING_STATE_CR is pending, the flag
is cleared again (this branch also contains the code that writes the
held-back '\r'). */
1795 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1799 if (eol_type == EOL_CR) \
1800 Dynarr_add (dst, '\n'); \
1801 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1802 Dynarr_add (dst, c); \
1804 flags |= CODING_STATE_CR; \
1805 goto label_continue_loop; \
1807 else if (flags & CODING_STATE_CR) \
1808 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1810 Dynarr_add (dst, '\r'); \
1811 flags &= ~CODING_STATE_CR; \
1815 /* C should be a binary character in the range 0 - 255; convert
1816 to internal format and add to Dynarr DST.
In this variant an ASCII byte is added unchanged and any other byte
is added as two bytes: (c >> 6) | 0xc0 followed by
(c & 0x3f) | 0x80. */
1819 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1821 if (BYTE_ASCII_P (c)) \
1822 Dynarr_add (dst, c); \
1825 Dynarr_add (dst, (c >> 6) | 0xc0); \
1826 Dynarr_add (dst, (c & 0x3f) | 0x80); \
/* Append character code C to DST as a variable-length byte sequence.
   The sequence is a lead byte (0xc0, 0xe0, 0xf0, 0xf8 or 0xfc combined
   with the top bits of C) followed by continuation bytes that each carry
   six bits of C combined with 0x80; the first branch adds C as a single
   byte.  The length is selected by the range checks below. */
1831 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1835 Dynarr_add (dst, c);
/* Up to 11 bits: two bytes. */
1837 else if ( c <= 0x7ff )
1839 Dynarr_add (dst, (c >> 6) | 0xc0);
1840 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Up to 16 bits: three bytes. */
1842 else if ( c <= 0xffff )
1844 Dynarr_add (dst, (c >> 12) | 0xe0);
1845 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1846 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Up to 21 bits: four bytes. */
1848 else if ( c <= 0x1fffff )
1850 Dynarr_add (dst, (c >> 18) | 0xf0);
1851 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1852 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1853 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Up to 26 bits: five bytes. */
1855 else if ( c <= 0x3ffffff )
1857 Dynarr_add (dst, (c >> 24) | 0xf8);
1858 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1859 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1860 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1861 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Anything larger: six bytes. */
1865 Dynarr_add (dst, (c >> 30) | 0xfc);
1866 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1867 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1868 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1869 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1870 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Alternative definition of DECODE_ADD_BINARY_CHAR: add byte C to DST.
   An ASCII byte is added unchanged; a byte satisfying BYTE_C1_P is added
   as LEADING_BYTE_CONTROL_1 followed by C + 0x20; any other byte is
   added as LEADING_BYTE_LATIN_ISO8859_1 followed by C itself. */
1874 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1876 if (BYTE_ASCII_P (c)) \
1877 Dynarr_add (dst, c); \
1878 else if (BYTE_C1_P (c)) \
1880 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1881 Dynarr_add (dst, c + 0x20); \
1885 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1886 Dynarr_add (dst, c); \
/* Write the partially built-up character CH to the Dynarr named `dst' in
   the expanding scope, using DECODE_ADD_BINARY_CHAR. */
1891 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1895 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* End-of-input handling for the decoders: when CODING_STATE_END is set
   in FLAGS, output whatever partial character CH holds and, if a CR is
   still pending (CODING_STATE_CR), add that '\r' to DST. */
1900 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1902 if (flags & CODING_STATE_END) \
1904 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1905 if (flags & CODING_STATE_CR) \
1906 Dynarr_add (dst, '\r'); \
/* Fetch the struct decoding_stream that is stored as the type-specific
   data of the decoding Lstream STREAM. */
1910 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Per-stream data of a decoding Lstream (see DECODING_STREAM_DATA). */
1912 struct decoding_stream
1914 /* Coding system that governs the conversion. */
1915 Lisp_Coding_System *codesys;
1917 /* Stream that we read the encoded data from or
1918 write the decoded data to. */
1921 /* If we are reading, then we can return only a fixed amount of
1922 data, so if the conversion resulted in too much data, we store it
1923 here for retrieval the next time around. */
1924 unsigned_char_dynarr *runoff;
1926 /* FLAGS holds flags indicating the current state of the decoding.
1927 Some of these flags are dependent on the coding system. */
1930 /* CH holds a partially built-up character. Since we only deal
1931 with one- and two-byte characters at the moment, we only use
1932 this to store the first byte of a two-byte character. */
1935 /* EOL_TYPE specifies the type of end-of-line conversion that
1936 currently applies. We need to keep this separate from the
1937 EOL type stored in CODESYS because the latter might indicate
1938 automatic EOL-type detection while the former will always
1939 indicate a particular EOL type. */
1940 enum eol_type eol_type;
1942 /* Additional ISO2022 information. We define the structure above
1943 because it's also needed by the detection routines. */
1944 struct iso2022_decoder iso2022;
1946 /* Additional information (the state of the running CCL program)
1947 used by the CCL decoder. */
1948 struct ccl_program ccl;
1950 /* counter for UTF-8 or UCS-4 */
1951 unsigned char counter;
/* Detection state used by mule_decode() when the coding system or the
   EOL type still has to be detected while data is being processed. */
1953 struct detection_state decst;
/* Forward declarations of the Lstream methods implemented below for
   decoding streams, followed by the definition of the "decoding" Lstream
   type, whose per-stream data is a struct decoding_stream. */
1956 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1957 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1958 static int decoding_rewinder (Lstream *stream);
1959 static int decoding_seekable_p (Lstream *stream);
1960 static int decoding_flusher (Lstream *stream);
1961 static int decoding_closer (Lstream *stream);
1963 static Lisp_Object decoding_marker (Lisp_Object stream,
1964 void (*markobj) (Lisp_Object));
1966 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1967 sizeof (struct decoding_stream));
/* Marking method of decoding streams.  The stream at the other end is
   wrapped in a Lisp object and, if its implementation provides a marker
   method, that method is called with MARKOBJ and its result returned. */
1970 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1972 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1973 Lisp_Object str_obj;
1975 /* We do not need to mark the coding systems or charsets stored
1976 within the stream because they are stored in a global list
1977 and automatically marked. */
1979 XSETLSTREAM (str_obj, str);
1981 if (str->imp->marker)
1982 return (str->imp->marker) (str_obj, markobj);
1987 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1988 so we read data from the other end, decode it, and store it into DATA.
Returns the number of bytes stored into DATA.  When nothing was
stored, returns -1 if an error occurred and 0 otherwise. */
1991 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1993 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1994 unsigned char *orig_data = data;
1996 int error_occurred = 0;
1998 /* We need to interface to mule_decode(), which expects to take some
1999 amount of data and store the result into a Dynarr. We have
2000 mule_decode() store into str->runoff, and take data from there
2003 /* We loop until we have enough data, reading chunks from the other
2004 end and decoding it. */
2007 /* Take data from the runoff if we can. Make sure to take at
2008 most SIZE bytes, and delete the data from the runoff. */
2009 if (Dynarr_length (str->runoff) > 0)
2011 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2012 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2013 Dynarr_delete_many (str->runoff, 0, chunk);
2019 break; /* No more room for data */
2021 if (str->flags & CODING_STATE_END)
2022 /* This means that on the previous iteration, we hit the EOF on
2023 the other end. We loop once more so that mule_decode() can
2024 output any final stuff it may be holding, or any "go back
2025 to a sane state" escape sequences. (This latter makes sense
2026 during encoding.) */
2029 /* Exhausted the runoff, so get some more. DATA has at least
2030 SIZE bytes left of storage in it, so it's OK to read directly
2031 into it. (We'll be overwriting above, after we've decoded it
2032 into the runoff.) */
2033 read_size = Lstream_read (str->other_end, data, size);
2040 /* There might be some more end data produced in the translation.
2041 See the comment above. */
2042 str->flags |= CODING_STATE_END;
2043 mule_decode (stream, data, str->runoff, read_size);
/* DATA has been advanced past everything delivered, so the difference
   from ORIG_DATA is the number of bytes handed to the caller. */
2046 if (data - orig_data == 0)
2047 return error_occurred ? -1 : 0;
2049 return data - orig_data;
/* Write method of decoding streams: decode SIZE bytes of DATA into
   str->runoff with mule_decode, write the runoff to the stream at the
   other end, and drop from the runoff the part that Lstream_write
   reports as written. */
2053 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2055 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2058 /* Decode all our data into the runoff, and then attempt to write
2059 it all out to the other end. Remove whatever chunk we succeeded
2061 mule_decode (stream, data, str->runoff, size);
2062 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2063 Dynarr_length (str->runoff));
2065 Dynarr_delete_many (str->runoff, 0, retval);
2066 /* Do NOT return retval. The return value indicates how much
2067 of the incoming data was written, not how many bytes were
/* Put STR back into its initial decoding state.  For an ISO2022 coding
   system the ISO2022 decoder state is reset; for a CCL coding system the
   CCL program is set up again from the coding system's decode program.
   The flags and the partial character are cleared. */
2073 reset_decoding_stream (struct decoding_stream *str)
2076 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2078 Lisp_Object coding_system;
2079 XSETCODING_SYSTEM (coding_system, str->codesys);
2080 reset_iso2022 (coding_system, &str->iso2022);
2082 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2084 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2088 str->flags = str->ch = 0;
/* Rewind method of decoding streams: reset the decoding state, discard
   any decoded data still held in the runoff, and rewind the stream at
   the other end, returning the result of that rewind. */
2092 decoding_rewinder (Lstream *stream)
2094 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2095 reset_decoding_stream (str);
2096 Dynarr_reset (str->runoff);
2097 return Lstream_rewind (str->other_end);
/* A decoding stream is seekable exactly when the stream at its other end
   is seekable. */
2101 decoding_seekable_p (Lstream *stream)
2103 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2104 return Lstream_seekable_p (str->other_end);
/* Flush method of decoding streams: flushes the stream at the other end
   and returns the result of that flush. */
2108 decoding_flusher (Lstream *stream)
2110 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2111 return Lstream_flush (str->other_end);
/* Close method of decoding streams.  For a stream open for writing,
   CODING_STATE_END is set and a final zero-length write is issued so the
   decoder is run once more in the end state.  The runoff (and, with
   ENABLE_COMPOSITE_CHARS, the ISO2022 composite-character Dynarr if
   present) is freed, and the stream at the other end is closed. */
2115 decoding_closer (Lstream *stream)
2117 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2118 if (stream->flags & LSTREAM_FL_WRITE)
2120 str->flags |= CODING_STATE_END;
2121 decoding_writer (stream, 0, 0);
2123 Dynarr_free (str->runoff);
2125 #ifdef ENABLE_COMPOSITE_CHARS
2126 if (str->iso2022.composite_chars)
2127 Dynarr_free (str->iso2022.composite_chars);
2130 return Lstream_close (str->other_end);
/* Return the coding system in effect on decoding stream STREAM: the
   subsidiary of the stream's coding system that matches the stream's
   current EOL type (see subsidiary_coding_system). */
2134 decoding_stream_coding_system (Lstream *stream)
2136 Lisp_Object coding_system;
2137 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2139 XSETCODING_SYSTEM (coding_system, str->codesys);
2140 return subsidiary_coding_system (coding_system, str->eol_type);
/* Switch decoding stream LSTR over to coding system CODESYS.  The
   stream's EOL type is overwritten only when CODESYS specifies a
   definite one (i.e. not EOL_AUTODETECT); afterwards the decoding state
   is reset. */
2144 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2146 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2147 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2149 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2150 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2151 reset_decoding_stream (str);
2154 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2155 stream for writing, no automatic code detection will be performed.
2156 The reason for this is that automatic code detection requires a
2157 seekable input. Things will also fail if you open a decoding
2158 stream for reading using a non-fully-specified coding system and
2159 a non-seekable input stream. */
/* Common constructor behind make_decoding_input_stream and
   make_decoding_output_stream: create a decoding Lstream on top of
   STREAM that uses CODESYS. */
2162 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2165 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2166 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2170 str->other_end = stream;
2171 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2172 str->eol_type = EOL_AUTODETECT;
/* Up-front detection needs a stream that can be read and then rewound,
   so it is only done for seekable streams opened with mode "r". */
2173 if (!strcmp (mode, "r")
2174 && Lstream_seekable_p (stream))
2175 /* We can determine the coding system now. */
2176 determine_real_coding_system (stream, &codesys, &str->eol_type);
2177 set_decoding_stream_coding_system (lstr, codesys);
/* Prime the detection state used by mule_decode(): start from the EOL
   type known so far and a mask of ~0. */
2178 str->decst.eol_type = str->eol_type;
2179 str->decst.mask = ~0;
2180 XSETLSTREAM (obj, lstr);
/* Create a decoding stream on top of STREAM, opened with mode "r", that
   decodes according to CODESYS. */
2185 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2187 return make_decoding_stream_1 (stream, codesys, "r");
/* Create a decoding stream on top of STREAM, opened with mode "w", that
   decodes according to CODESYS. */
2191 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2193 return make_decoding_stream_1 (stream, codesys, "w");
2196 /* Note: the decode_coding_* functions all take the same
2197 arguments as mule_decode(), which is to say some SRC data of
2198 size N, which is to be stored into dynamic array DST.
2199 DECODING is the stream within which the decoding is
2200 taking place, but no data is actually read from or
2201 written to that stream; that is handled in decoding_reader()
2202 or decoding_writer(). This allows the same functions to
2203 be used for both reading and writing. */
/* Decode N bytes of SRC into DST according to the coding system of the
   decoding stream DECODING, first running coding-system and/or EOL
   detection on the data if either is still undetermined. */
2206 mule_decode (Lstream *decoding, CONST unsigned char *src,
2207 unsigned_char_dynarr *dst, unsigned int n)
2209 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2211 /* If necessary, do encoding-detection now. We do this when
2212 we're a writing stream or a non-seekable reading stream,
2213 meaning that we can't just process the whole input,
2214 rewind, and start over. */
2216 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2217 str->eol_type == EOL_AUTODETECT)
2219 Lisp_Object codesys;
2221 XSETCODING_SYSTEM (codesys, str->codesys);
/* Only the EOL type is detected when the coding system itself is
   already specified. */
2222 detect_coding_type (&str->decst, src, n,
2223 CODING_SYSTEM_TYPE (str->codesys) !=
2224 CODESYS_AUTODETECT);
2225 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2226 str->decst.mask != ~0)
2227 /* #### This is cheesy. What we really ought to do is
2228 buffer up a certain amount of data so as to get a
2229 less random result. */
2230 codesys = coding_system_from_mask (str->decst.mask);
2231 str->eol_type = str->decst.eol_type;
2232 if (XCODING_SYSTEM (codesys) != str->codesys)
2234 /* Preserve the CODING_STATE_END flag in case it was set.
2235 If we erase it, bad things might happen. */
2236 int was_end = str->flags & CODING_STATE_END;
2237 set_decoding_stream_coding_system (decoding, codesys);
2239 str->flags |= CODING_STATE_END;
/* Hand the data to the decoder that matches the coding system type. */
2243 switch (CODING_SYSTEM_TYPE (str->codesys))
2246 case CODESYS_INTERNAL:
2247 Dynarr_add_many (dst, src, n);
2250 case CODESYS_AUTODETECT:
2251 /* If we got this far and still haven't decided on the coding
2252 system, then do no conversion. */
2253 case CODESYS_NO_CONVERSION:
2254 decode_coding_no_conversion (decoding, src, dst, n);
2257 case CODESYS_SHIFT_JIS:
2258 decode_coding_sjis (decoding, src, dst, n);
2261 decode_coding_big5 (decoding, src, dst, n);
2264 decode_coding_ucs4 (decoding, src, dst, n);
2267 decode_coding_utf8 (decoding, src, dst, n);
2270 str->ccl.last_block = str->flags & CODING_STATE_END;
2271 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2273 case CODESYS_ISO2022:
2274 decode_coding_iso2022 (decoding, src, dst, n);
2282 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2283 Decode the text between START and END which is encoded in CODING-SYSTEM.
2284 This is useful if you've read in encoded text from a file without decoding
2285 it (e.g. you read in a JIS-formatted file but used the `binary' or
2286 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2287 Return length of decoded text.
2288 BUFFER defaults to the current buffer if unspecified.
2290 (start, end, coding_system, buffer))
2293 struct buffer *buf = decode_buffer (buffer, 0);
2294 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2295 Lstream *istr, *ostr;
2296 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2298 get_buffer_range_char (buf, start, end, &b, &e, 0);
2300 barf_if_buffer_read_only (buf, b, e);
2302 coding_system = Fget_coding_system (coding_system);
2303 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2304 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2305 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2307 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2308 Fget_coding_system (Qbinary));
2309 istr = XLSTREAM (instream);
2310 ostr = XLSTREAM (outstream);
2311 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2313 /* The chain of streams looks like this:
2315 [BUFFER] <----- send through
2316 ------> [ENCODE AS BINARY]
2317 ------> [DECODE AS SPECIFIED]
2323 char tempbuf[1024]; /* some random amount */
2324 Bufpos newpos, even_newer_pos;
2325 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2326 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2330 newpos = lisp_buffer_stream_startpos (istr);
2331 Lstream_write (ostr, tempbuf, size_in_bytes);
2332 even_newer_pos = lisp_buffer_stream_startpos (istr);
2333 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2336 Lstream_close (istr);
2337 Lstream_close (ostr);
2339 Lstream_delete (istr);
2340 Lstream_delete (ostr);
2341 Lstream_delete (XLSTREAM (de_outstream));
2342 Lstream_delete (XLSTREAM (lb_outstream));
2347 /************************************************************************/
2348 /* Converting to an external encoding ("encoding") */
2349 /************************************************************************/
2351 /* An encoding stream is an output stream. When you create the
2352 stream, you specify the coding system that governs the encoding
2353 and another stream that the resulting encoded data is to be
2354 sent to, and then start sending data to it. */
2356 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2358 struct encoding_stream
2360 /* Coding system that governs the conversion. */
2361 Lisp_Coding_System *codesys;
2363 /* Stream that we read the unencoded data from or
2364 write the encoded data to. */
2367 /* If we are reading, then we can return only a fixed amount of
2368 data, so if the conversion resulted in too much data, we store it
2369 here for retrieval the next time around. */
2370 unsigned_char_dynarr *runoff;
2372 /* FLAGS holds flags indicating the current state of the encoding.
2373 Some of these flags are dependent on the coding system. */
2376 /* CH holds a partially built-up character. Since we only deal
2377 with one- and two-byte characters at the moment, we only use
2378 this to store the first byte of a two-byte character. */
2381 /* Additional information used by the ISO2022 encoder. */
2384 /* CHARSET holds the character sets currently assigned to the G0
2385 through G3 registers. It is initialized from the array
2386 INITIAL_CHARSET in CODESYS. */
2387 Lisp_Object charset[4];
2389 /* Which registers are currently invoked into the left (GL) and
2390 right (GR) halves of the 8-bit encoding space? */
2391 int register_left, register_right;
2393 /* Whether we need to explicitly designate the charset in the
2394 G? register before using it. It is initialized from the
2395 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2396 unsigned char force_charset_on_output[4];
2398 /* Other state variables that need to be preserved across
2400 Lisp_Object current_charset;
2402 int current_char_boundary;
2405 void (*encode_char) (struct encoding_stream *str, Emchar c,
2406 unsigned_char_dynarr *dst, unsigned int *flags);
2407 void (*finish) (struct encoding_stream *str,
2408 unsigned_char_dynarr *dst, unsigned int *flags);
2410 /* Additional information (the state of the running CCL program)
2411 used by the CCL encoder. */
2412 struct ccl_program ccl;
2416 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2417 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2419 static int encoding_rewinder (Lstream *stream);
2420 static int encoding_seekable_p (Lstream *stream);
2421 static int encoding_flusher (Lstream *stream);
2422 static int encoding_closer (Lstream *stream);
2424 static Lisp_Object encoding_marker (Lisp_Object stream,
2425 void (*markobj) (Lisp_Object));
2427 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2428 sizeof (struct encoding_stream));
/* GC marker method for an encoding stream.  Wraps the stream at the
   other end as a Lisp object and, when that stream's implementation
   provides a marker method, forwards STREAM's MARKOBJ to it and returns
   its result. */
2431 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2433 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2434 Lisp_Object str_obj;
2436 /* We do not need to mark the coding systems or charsets stored
2437 within the stream because they are stored in a global list
2438 and automatically marked. */
2440 XSETLSTREAM (str_obj, str);
2442 if (str->imp->marker)
2443 return (str->imp->marker) (str_obj, markobj);
2448 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2449 so we read data from the other end, encode it, and store it into DATA. */
2452 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2454 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2455 unsigned char *orig_data = data;
2457 int error_occurred = 0;
2459 /* We need to interface to mule_encode(), which expects to take some
2460 amount of data and store the result into a Dynarr. We have
2461 mule_encode() store into str->runoff, and take data from there
2464 /* We loop until we have enough data, reading chunks from the other
2465 end and encoding it. */
2468 /* Take data from the runoff if we can. Make sure to take at
2469 most SIZE bytes, and delete the data from the runoff. */
2470 if (Dynarr_length (str->runoff) > 0)
2472 int chunk = min ((int) size, Dynarr_length (str->runoff));
2473 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2474 Dynarr_delete_many (str->runoff, 0, chunk);
2480 break; /* No more room for data */
2482 if (str->flags & CODING_STATE_END)
2483 /* This means that on the previous iteration, we hit the EOF on
2484 the other end. We loop once more so that mule_encode() can
2485 output any final stuff it may be holding, or any "go back
2486 to a sane state" escape sequences. (This latter makes sense
2487 during encoding.) */
2490 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2491 left of storage in it, so it's OK to read directly into it.
2492 (We'll be overwriting above, after we've encoded it into the
2494 read_size = Lstream_read (str->other_end, data, size);
2501 /* There might be some more end data produced in the translation.
2502 See the comment above. */
2503 str->flags |= CODING_STATE_END;
2504 mule_encode (stream, data, str->runoff, read_size);
2507 if (data == orig_data)
2508 return error_occurred ? -1 : 0;
2510 return data - orig_data;
2514 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2516 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2519 /* Encode all our data into the runoff, and then attempt to write
2520 it all out to the other end. Remove whatever chunk we succeeded
2522 mule_encode (stream, data, str->runoff, size);
2523 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2524 Dynarr_length (str->runoff));
2526 Dynarr_delete_many (str->runoff, 0, retval);
2527 /* Do NOT return retval. The return value indicates how much
2528 of the incoming data was written, not how many bytes were
2534 reset_encoding_stream (struct encoding_stream *str)
2537 switch (CODING_SYSTEM_TYPE (str->codesys))
2539 case CODESYS_ISO2022:
2543 str->encode_char = &char_encode_iso2022;
2544 str->finish = &char_finish_iso2022;
2545 for (i = 0; i < 4; i++)
2547 str->iso2022.charset[i] =
2548 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2549 str->iso2022.force_charset_on_output[i] =
2550 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2552 str->iso2022.register_left = 0;
2553 str->iso2022.register_right = 1;
2554 str->iso2022.current_charset = Qnil;
2555 str->iso2022.current_half = 0;
2559 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2562 str->encode_char = &char_encode_utf8;
2563 str->finish = &char_finish_utf8;
2568 str->iso2022.current_char_boundary = 0;
2569 str->flags = str->ch = 0;
/* Rewind method: put the encoder back into its initial state with
   reset_encoding_stream(), throw away anything still pending in the
   runoff, and return the result of rewinding the stream at the other
   end. */
2573 encoding_rewinder (Lstream *stream)
2575 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2576 reset_encoding_stream (str);
2577 Dynarr_reset (str->runoff);
2578 return Lstream_rewind (str->other_end);
/* Seekable-p method: an encoding stream reports whatever
   Lstream_seekable_p() returns for the stream at the other end. */
2582 encoding_seekable_p (Lstream *stream)
2584 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2585 return Lstream_seekable_p (str->other_end);
/* Flush method: delegates to Lstream_flush() on the stream at the
   other end and returns its result. */
2589 encoding_flusher (Lstream *stream)
2591 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2592 return Lstream_flush (str->other_end);
/* Close method.  For a stream opened for writing, first set
   CODING_STATE_END and push an empty write through encoding_writer().
   Then free the runoff buffer and return the result of closing the
   stream at the other end. */
2596 encoding_closer (Lstream *stream)
2598 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2599 if (stream->flags & LSTREAM_FL_WRITE)
2601 str->flags |= CODING_STATE_END;
2602 encoding_writer (stream, 0, 0);
2604 Dynarr_free (str->runoff);
2605 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system currently stored in the
   encoding stream STREAM. */
2609 encoding_stream_coding_system (Lstream *stream)
2611 Lisp_Object coding_system;
2612 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2614 XSETCODING_SYSTEM (coding_system, str->codesys);
2615 return coding_system;
2619 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2621 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2622 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2624 reset_encoding_stream (str);
2628 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2631 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2632 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2636 str->runoff = Dynarr_new (unsigned_char);
2637 str->other_end = stream;
2638 set_encoding_stream_coding_system (lstr, codesys);
2639 XSETLSTREAM (obj, lstr);
/* Create an encoding stream over STREAM governed by CODESYS, opened
   with mode "r" (reading). */
2644 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2646 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream over STREAM governed by CODESYS, opened
   with mode "w" (writing). */
2650 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2652 return make_encoding_stream_1 (stream, codesys, "w");
2655 /* Convert N bytes of internally-formatted data stored in SRC to an
2656 external format, according to the encoding stream ENCODING.
2657 Store the encoded data into DST. */
2660 mule_encode (Lstream *encoding, CONST unsigned char *src,
2661 unsigned_char_dynarr *dst, unsigned int n)
2663 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2665 switch (CODING_SYSTEM_TYPE (str->codesys))
2668 case CODESYS_INTERNAL:
2669 Dynarr_add_many (dst, src, n);
2672 case CODESYS_AUTODETECT:
2673 /* If we got this far and still haven't decided on the coding
2674 system, then do no conversion. */
2675 case CODESYS_NO_CONVERSION:
2676 encode_coding_no_conversion (encoding, src, dst, n);
2679 case CODESYS_SHIFT_JIS:
2680 encode_coding_sjis (encoding, src, dst, n);
2683 encode_coding_big5 (encoding, src, dst, n);
2686 str->ccl.last_block = str->flags & CODING_STATE_END;
2687 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2691 text_encode_generic (encoding, src, dst, n);
2695 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2696 Encode the text between START and END using CODING-SYSTEM.
2697 This will, for example, convert Japanese characters into stuff such as
2698 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2699 text. BUFFER defaults to the current buffer if unspecified.
2701 (start, end, coding_system, buffer))
2704 struct buffer *buf = decode_buffer (buffer, 0);
2705 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2706 Lstream *istr, *ostr;
2707 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2709 get_buffer_range_char (buf, start, end, &b, &e, 0);
2711 barf_if_buffer_read_only (buf, b, e);
2713 coding_system = Fget_coding_system (coding_system);
2714 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2715 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2716 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2717 Fget_coding_system (Qbinary));
2718 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2720 istr = XLSTREAM (instream);
2721 ostr = XLSTREAM (outstream);
2722 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2723 /* The chain of streams looks like this:
2725 [BUFFER] <----- send through
2726 ------> [ENCODE AS SPECIFIED]
2727 ------> [DECODE AS BINARY]
2732 char tempbuf[1024]; /* some random amount */
2733 Bufpos newpos, even_newer_pos;
2734 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2735 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2739 newpos = lisp_buffer_stream_startpos (istr);
2740 Lstream_write (ostr, tempbuf, size_in_bytes);
2741 even_newer_pos = lisp_buffer_stream_startpos (istr);
2742 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2748 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2749 Lstream_close (istr);
2750 Lstream_close (ostr);
2752 Lstream_delete (istr);
2753 Lstream_delete (ostr);
2754 Lstream_delete (XLSTREAM (de_outstream));
2755 Lstream_delete (XLSTREAM (lb_outstream));
2756 return make_int (retlen);
2763 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2764 unsigned_char_dynarr *dst, unsigned int n)
2767 unsigned char char_boundary;
2768 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2769 unsigned int flags = str->flags;
2770 Emchar ch = str->ch;
2772 char_boundary = str->iso2022.current_char_boundary;
2778 switch (char_boundary)
2786 else if ( c >= 0xf8 )
2791 else if ( c >= 0xf0 )
2796 else if ( c >= 0xe0 )
2801 else if ( c >= 0xc0 )
2808 (*str->encode_char) (str, c, dst, &flags);
2814 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2819 ch = ( ch << 6 ) | ( c & 0x3f );
2824 if ( (char_boundary == 0) && flags & CODING_STATE_END)
2826 (*str->finish) (str, dst, &flags);
2831 str->iso2022.current_char_boundary = char_boundary;
2833 /* Verbum caro factum est! */
2837 /************************************************************************/
2838 /* Shift-JIS methods */
2839 /************************************************************************/
2841 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2842 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2843 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2844 encoded by "position-code + 0x80". A character of JISX0208
2845 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2846 position-codes are divided and shifted so that they fit in the range
2849 --- CODE RANGE of Shift-JIS ---
2850 (character set) (range)
2852 JISX0201-Kana 0xA0 .. 0xDF
2853 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2854 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2855 -------------------------------
2859 /* Is this the first byte of a Shift-JIS two-byte char? */
2861 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2862 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2864 /* Is this the second byte of a Shift-JIS two-byte char? */
2866 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2867 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2869 #define BYTE_SJIS_KATAKANA_P(c) \
2870 ((c) >= 0xA1 && (c) <= 0xDF)
2873 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2881 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2883 if (st->shift_jis.in_second_byte)
2885 st->shift_jis.in_second_byte = 0;
2889 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2890 st->shift_jis.in_second_byte = 1;
2892 return CODING_CATEGORY_SHIFT_JIS_MASK;
2895 /* Convert Shift-JIS data to internal format. */
2898 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2899 unsigned_char_dynarr *dst, unsigned int n)
2902 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2903 unsigned int flags = str->flags;
2904 unsigned int ch = str->ch;
2905 eol_type_t eol_type = str->eol_type;
2913 /* Previous character was first byte of Shift-JIS Kanji char. */
2914 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2916 unsigned char e1, e2;
2918 DECODE_SJIS (ch, c, e1, e2);
2920 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2924 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2925 Dynarr_add (dst, e1);
2926 Dynarr_add (dst, e2);
2931 DECODE_ADD_BINARY_CHAR (ch, dst);
2932 DECODE_ADD_BINARY_CHAR (c, dst);
2938 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2939 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2941 else if (BYTE_SJIS_KATAKANA_P (c))
2944 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2947 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2948 Dynarr_add (dst, c);
2952 DECODE_ADD_BINARY_CHAR (c, dst);
2954 label_continue_loop:;
2957 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2963 /* Convert internally-formatted data to Shift-JIS. */
2966 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2967 unsigned_char_dynarr *dst, unsigned int n)
2970 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2971 unsigned int flags = str->flags;
2972 unsigned int ch = str->ch;
2973 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2975 unsigned char char_boundary = str->iso2022.current_char_boundary;
2982 switch (char_boundary)
2990 else if ( c >= 0xf8 )
2995 else if ( c >= 0xf0 )
3000 else if ( c >= 0xe0 )
3005 else if ( c >= 0xc0 )
3015 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3016 Dynarr_add (dst, '\r');
3017 if (eol_type != EOL_CR)
3018 Dynarr_add (dst, c);
3021 Dynarr_add (dst, c);
3026 ch = ( ch << 6 ) | ( c & 0x3f );
3028 Lisp_Object charset;
3029 unsigned int c1, c2, s1, s2;
3031 BREAKUP_CHAR (ch, charset, c1, c2);
3032 if (EQ(charset, Vcharset_katakana_jisx0201))
3034 Dynarr_add (dst, c1 | 0x80);
3036 else if (EQ(charset, Vcharset_japanese_jisx0208))
3038 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3039 Dynarr_add (dst, s1);
3040 Dynarr_add (dst, s2);
3046 ch = ( ch << 6 ) | ( c & 0x3f );
3052 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3053 Dynarr_add (dst, '\r');
3054 if (eol_type != EOL_CR)
3055 Dynarr_add (dst, '\n');
3058 else if (BYTE_ASCII_P (c))
3060 Dynarr_add (dst, c);
3063 else if (BUFBYTE_LEADING_BYTE_P (c))
3064 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3065 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3066 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3069 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3071 Dynarr_add (dst, c);
3074 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3075 ch == LEADING_BYTE_JAPANESE_JISX0208)
3079 unsigned char j1, j2;
3080 ENCODE_SJIS (ch, c, j1, j2);
3081 Dynarr_add (dst, j1);
3082 Dynarr_add (dst, j2);
3092 str->iso2022.current_char_boundary = char_boundary;
3096 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3097 Decode a JISX0208 character of Shift-JIS coding-system.
3098 CODE is the character code in Shift-JIS as a cons of two bytes.
3099 Return the corresponding character.
3103 unsigned char c1, c2, s1, s2;
3106 CHECK_INT (XCAR (code));
3107 CHECK_INT (XCDR (code));
3108 s1 = XINT (XCAR (code));
3109 s2 = XINT (XCDR (code));
3110 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3111 BYTE_SJIS_TWO_BYTE_2_P (s2))
3113 DECODE_SJIS (s1, s2, c1, c2);
3114 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3115 c1 & 0x7F, c2 & 0x7F));
3121 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3122 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3123 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3127 Lisp_Object charset;
3130 CHECK_CHAR_COERCE_INT (ch);
3131 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3132 if (EQ (charset, Vcharset_japanese_jisx0208))
3134 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3135 return Fcons (make_int (s1), make_int (s2));
3142 /************************************************************************/
3144 /************************************************************************/
3146 /* BIG5 is a coding system encoding two character sets: ASCII and
3147 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3148 character set and is encoded in two-byte.
3150 --- CODE RANGE of BIG5 ---
3151 (character set) (range)
3153 Big5 (1st byte) 0xA1 .. 0xFE
3154 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3155 --------------------------
3157 Since the number of characters in Big5 is larger than maximum
3158 characters in Emacs' charset (96x96), it can't be handled as one
3159 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3160 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3161 contains frequently used characters and the latter contains less
3162 frequently used characters. */
3164 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3165 ((c) >= 0xA1 && (c) <= 0xFE)
3167 /* Is this the second byte of a Big5 two-byte char? */
3169 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3170 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3172 /* Number of Big5 characters which have the same code in 1st byte. */
3174 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3176 /* Code conversion macros. These are macros because they are used in
3177 inner loops during code conversion.
3179 Note that temporary variables in macros introduce the classic
3180 dynamic-scoping problems with variable names. We use capital-
3181 lettered variables in the assumption that XEmacs does not use
3182 capital letters in variables except in a very formalized way
3185 /* Convert Big5 code (b1, b2) into its internal string representation
3188 /* There is a much simpler way to split the Big5 charset into two.
3189 For the moment I'm going to leave the algorithm as-is because it
3190 claims to separate out the most-used characters into a single
3191 charset, which perhaps will lead to optimizations in various
3194 The way the algorithm works is something like this:
3196 Big5 can be viewed as a 94x157 charset, where the row is
3197 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3198 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3199 the split between low and high column numbers is apparently
3200 meaningless; ascending rows produce less and less frequent chars.
3201 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3202 the first charset, and the upper half (0xC9 .. 0xFE) to the
3203 second. To do the conversion, we convert the character into
3204 a single number where 0 .. 156 is the first row, 157 .. 313
3205 is the second, etc. That way, the characters are ordered by
3206 decreasing frequency. Then we just chop the space in two
3207 and coerce the result into a 94x94 space.
3210 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3212 int B1 = b1, B2 = b2; \
3214 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3218 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3222 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3223 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3225 c1 = I / (0xFF - 0xA1) + 0xA1; \
3226 c2 = I % (0xFF - 0xA1) + 0xA1; \
3229 /* Convert the internal string representation of a Big5 character
3230 (lb, c1, c2) into Big5 code (b1, b2). */
3232 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3234 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3236 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3238 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3240 b1 = I / BIG5_SAME_ROW + 0xA1; \
3241 b2 = I % BIG5_SAME_ROW; \
3242 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3246 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3254 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3255 (c >= 0x80 && c <= 0xA0))
3257 if (st->big5.in_second_byte)
3259 st->big5.in_second_byte = 0;
3260 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3264 st->big5.in_second_byte = 1;
3266 return CODING_CATEGORY_BIG5_MASK;
3269 /* Convert Big5 data to internal format. */
3272 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3273 unsigned_char_dynarr *dst, unsigned int n)
3276 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3277 unsigned int flags = str->flags;
3278 unsigned int ch = str->ch;
3279 eol_type_t eol_type = str->eol_type;
3286 /* Previous character was first byte of Big5 char. */
3287 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3289 unsigned char b1, b2, b3;
3290 DECODE_BIG5 (ch, c, b1, b2, b3);
3291 Dynarr_add (dst, b1);
3292 Dynarr_add (dst, b2);
3293 Dynarr_add (dst, b3);
3297 DECODE_ADD_BINARY_CHAR (ch, dst);
3298 DECODE_ADD_BINARY_CHAR (c, dst);
3304 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3305 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3308 DECODE_ADD_BINARY_CHAR (c, dst);
3310 label_continue_loop:;
3313 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3319 /* Convert internally-formatted data to Big5. */
3322 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3323 unsigned_char_dynarr *dst, unsigned int n)
3327 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3328 unsigned int flags = str->flags;
3329 unsigned int ch = str->ch;
3330 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3337 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3338 Dynarr_add (dst, '\r');
3339 if (eol_type != EOL_CR)
3340 Dynarr_add (dst, '\n');
3342 else if (BYTE_ASCII_P (c))
3345 Dynarr_add (dst, c);
3347 else if (BUFBYTE_LEADING_BYTE_P (c))
3349 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3350 c == LEADING_BYTE_CHINESE_BIG5_2)
3352 /* A recognized leading byte. */
3354 continue; /* not done with this character. */
3356 /* otherwise just ignore this character. */
3358 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3359 ch == LEADING_BYTE_CHINESE_BIG5_2)
3361 /* Previous char was a recognized leading byte. */
3363 continue; /* not done with this character. */
3367 /* Encountering second byte of a Big5 character. */
3368 unsigned char b1, b2;
3370 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3371 Dynarr_add (dst, b1);
3372 Dynarr_add (dst, b2);
3384 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3385 Decode a Big5 character CODE of BIG5 coding-system.
3386 CODE is the character code in BIG5, a cons of two integers.
3387 Return the corresponding character.
3391 unsigned char c1, c2, b1, b2;
3394 CHECK_INT (XCAR (code));
3395 CHECK_INT (XCDR (code));
3396 b1 = XINT (XCAR (code));
3397 b2 = XINT (XCDR (code));
3398 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3399 BYTE_BIG5_TWO_BYTE_2_P (b2))
3401 Charset_ID leading_byte;
3402 Lisp_Object charset;
3403 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3404 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3405 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3411 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3412 Encode the Big5 character CH to BIG5 coding-system.
3413 Return the corresponding character code in Big5.
3417 Lisp_Object charset;
3420 CHECK_CHAR_COERCE_INT (ch);
3421 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3422 if (EQ (charset, Vcharset_chinese_big5_1) ||
3423 EQ (charset, Vcharset_chinese_big5_2))
3425 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3427 return Fcons (make_int (b1), make_int (b2));
3434 /************************************************************************/
3436 /************************************************************************/
3439 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3445 switch (st->ucs4.in_byte)
3454 st->ucs4.in_byte = 0;
3460 return CODING_CATEGORY_UCS4_MASK;
/* Decode N bytes of UCS-4 data from SRC into DST.  The partially
   assembled character CH and the byte counter COUNTER are loaded from
   the decoding stream on entry, and COUNTER is stored back on exit.
   Input bytes are shifted into CH eight bits at a time. */
3464 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3465 unsigned_char_dynarr *dst, unsigned int n)
3467 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3468 unsigned int flags = str->flags;
3469 unsigned int ch = str->ch;
3470 unsigned char counter = str->counter;
3474 unsigned char c = *src++;
3482 DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3487 ch = ( ch << 8 ) | c;
/* CODING_STATE_END is a bit of FLAGS; testing it against the byte
   counter would depend on the counter's value, not on end of input. */
3491 if (flags & CODING_STATE_END)
3492 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3496 str->counter = counter;
/* Append the character code CH to DST as four bytes, highest-order
   byte first.  STR and FLAGS are not referenced in the lines shown. */
3500 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3501 unsigned_char_dynarr *dst, unsigned int *flags)
3503 Dynarr_add (dst, ch >> 24);
3504 Dynarr_add (dst, (ch >> 16) & 255);
3505 Dynarr_add (dst, (ch >> 8) & 255);
3506 Dynarr_add (dst, ch & 255);
3510 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3511 unsigned int *flags)
3516 /************************************************************************/
3518 /************************************************************************/
3521 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3526 unsigned char c = *src++;
3527 switch (st->utf8.in_byte)
3530 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3533 st->utf8.in_byte = 5;
3535 st->utf8.in_byte = 4;
3537 st->utf8.in_byte = 3;
3539 st->utf8.in_byte = 2;
3541 st->utf8.in_byte = 1;
3546 if ((c & 0xc0) != 0x80)
3552 return CODING_CATEGORY_UTF8_MASK;
3556 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3557 unsigned_char_dynarr *dst, unsigned int n)
3559 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3560 unsigned int flags = str->flags;
3561 unsigned int ch = str->ch;
3562 eol_type_t eol_type = str->eol_type;
3563 unsigned char counter = str->counter;
3567 unsigned char c = *src++;
3576 else if ( c >= 0xf8 )
3581 else if ( c >= 0xf0 )
3586 else if ( c >= 0xe0 )
3591 else if ( c >= 0xc0 )
3598 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3599 DECODE_ADD_UCS_CHAR (c, dst);
3603 ch = ( ch << 6 ) | ( c & 0x3f );
3604 DECODE_ADD_UCS_CHAR (ch, dst);
3609 ch = ( ch << 6 ) | ( c & 0x3f );
3612 label_continue_loop:;
3615 if (flags & CODING_STATE_END)
3616 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3620 str->counter = counter;
/* Append the UTF-8 style byte sequence for the character code CODE to
   DST.  The sequence length is picked from the magnitude of CODE: a
   lead byte carrying the high-order bits and a length marker (0xc0,
   0xe0, 0xf0, 0xf8 or 0xfc), followed by continuation bytes that each
   carry six bits tagged with 0x80.
   NOTE(review): the condition guarding the single-byte case is not
   among the lines shown; presumably it is the 7-bit range -- confirm. */
3624 char_encode_utf8 (struct encoding_stream *str, Emchar code,
3625 unsigned_char_dynarr *dst, unsigned int *flags)
/* One byte: CODE is emitted unchanged. */
3629 Dynarr_add (dst, code);
/* Two bytes: codes up to 0x7ff. */
3631 else if ( code <= 0x7ff )
3633 Dynarr_add (dst, (code >> 6) | 0xc0);
3634 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Three bytes: codes up to 0xffff. */
3636 else if ( code <= 0xffff )
3638 Dynarr_add (dst, (code >> 12) | 0xe0);
3639 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3640 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Four bytes: codes up to 0x1fffff. */
3642 else if ( code <= 0x1fffff )
3644 Dynarr_add (dst, (code >> 18) | 0xf0);
3645 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3646 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3647 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Five bytes: codes up to 0x3ffffff. */
3649 else if ( code <= 0x3ffffff )
3651 Dynarr_add (dst, (code >> 24) | 0xf8);
3652 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3653 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3654 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3655 Dynarr_add (dst, (code & 0x3f) | 0x80);
/* Six bytes: everything larger. */
3659 Dynarr_add (dst, (code >> 30) | 0xfc);
3660 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3661 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3662 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3663 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3664 Dynarr_add (dst, (code & 0x3f) | 0x80);
3669 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3670 unsigned int *flags)
3675 /************************************************************************/
3676 /* ISO2022 methods */
3677 /************************************************************************/
3679 /* The following note describes the coding system ISO2022 briefly.
3680 Since the intention of this note is to help understand the
3681 functions in this file, some parts are NOT ACCURATE or OVERLY
3682 SIMPLIFIED. For thorough understanding, please refer to the
3683 original document of ISO2022.
3685 ISO2022 provides many mechanisms to encode several character sets
3686 in 7-bit and 8-bit environments. For 7-bit environments, all text
3687 is encoded using bytes less than 128. This may make the encoded
3688 text a little bit longer, but the text passes more easily through
3689 several gateways, some of which strip off MSB (Most Significant Bit).
3691 There are two kinds of character sets: control character set and
3692 graphic character set. The former contains control characters such
3693 as `newline' and `escape' to provide control functions (control
3694 functions are also provided by escape sequences). The latter
3695 contains graphic characters such as 'A' and '-'. Emacs recognizes
3696 two control character sets and many graphic character sets.
3698 Graphic character sets are classified into one of the following
3699 four classes, according to the number of bytes (DIMENSION) and
3700 number of characters in one dimension (CHARS) of the set:
3701 - DIMENSION1_CHARS94
3702 - DIMENSION1_CHARS96
3703 - DIMENSION2_CHARS94
3704 - DIMENSION2_CHARS96
3706 In addition, each character set is assigned an identification tag,
3707 unique for each set, called "final character" (denoted as <F>
3708 hereafter). The <F> of each character set is decided by ECMA(*)
3709 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3710 (0x30..0x3F are for private use only).
3712 Note (*): ECMA = European Computer Manufacturers Association
3714 Here are examples of graphic character set [NAME(<F>)]:
3715 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3716 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3717 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3718 o DIMENSION2_CHARS96 -- none for the moment
3720 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3721 C0 [0x00..0x1F] -- control character plane 0
3722 GL [0x20..0x7F] -- graphic character plane 0
3723 C1 [0x80..0x9F] -- control character plane 1
3724 GR [0xA0..0xFF] -- graphic character plane 1
3726 A control character set is directly designated and invoked to C0 or
3727 C1 by an escape sequence. The most common case is that:
3728 - ISO646's control character set is designated/invoked to C0, and
3729 - ISO6429's control character set is designated/invoked to C1,
3730 and usually these designations/invocations are omitted in encoded
3731 text. In a 7-bit environment, only C0 can be used, and a control
3732 character for C1 is encoded by an appropriate escape sequence to
3733 fit into the environment. All control characters for C1 are
3734 defined to have corresponding escape sequences.
3736 A graphic character set is at first designated to one of four
3737 graphic registers (G0 through G3), then these graphic registers are
3738 invoked to GL or GR. These designations and invocations can be
3739 done independently. The most common case is that G0 is invoked to
3740 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3741 these invocations and designations are omitted in encoded text.
3742 In a 7-bit environment, only GL can be used.
3744 When a graphic character set of CHARS94 is invoked to GL, codes
3745 0x20 and 0x7F of the GL area work as control characters SPACE and
3746 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used.
3749 There are two ways of invocation: locking-shift and single-shift.
3750 With locking-shift, the invocation lasts until the next different
3751 invocation, whereas with single-shift, the invocation affects the
3752 following character only and doesn't affect the locking-shift
3753 state. Invocations are done by the following control characters or escape sequences:
3756 ----------------------------------------------------------------------
3757 abbrev function cntrl escape seq description
3758 ----------------------------------------------------------------------
3759 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3760 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3761 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3762 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3763 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3764 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3765 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3766 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3767 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3768 ----------------------------------------------------------------------
3769 (*) These are not used by any known coding system.
3771 Control characters for these functions are defined by macros
3772 ISO_CODE_XXX in `coding.h'.
3774 Designations are done by the following escape sequences:
3775 ----------------------------------------------------------------------
3776 escape sequence description
3777 ----------------------------------------------------------------------
3778 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3779 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3780 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3781 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3782 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3783 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3784 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3785 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3786 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3787 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3788 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3789 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3790 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3791 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3792 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3793 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3794 ----------------------------------------------------------------------
3796 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3797 of dimension 1, chars 94, and final character <F>, etc...
3799 Note (*): Although these designations are not allowed in ISO2022,
3800 Emacs accepts them on decoding, and produces them on encoding
3801 CHARS96 character sets in a coding system which is characterized as
3802 7-bit environment, non-locking-shift, and non-single-shift.
3804 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3805 '(' can be omitted. We refer to this as "short-form" hereafter.
3807 Now you may notice that there are a lot of ways for encoding the
3808 same multilingual text in ISO2022. Actually, there exist many
3809 coding systems such as Compound Text (used in X11's inter-client
3810 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3811 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3812 localized platforms), and all of these are variants of ISO2022.
3814 In addition to the above, Emacs handles two more kinds of escape
3815 sequences: ISO6429's direction specification and Emacs' private
3816 sequence for specifying character composition.
3818 ISO6429's direction specification takes the following form:
3819 o CSI ']' -- end of the current direction
3820 o CSI '0' ']' -- end of the current direction
3821 o CSI '1' ']' -- start of left-to-right text
3822 o CSI '2' ']' -- start of right-to-left text
3823 The control character CSI (0x9B: control sequence introducer) is
3824 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3826 Character composition specification takes the following form:
3827 o ESC '0' -- start character composition
3828 o ESC '1' -- end character composition
3829 Since these are not standard escape sequences of any ISO standard,
3830 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its starting
   configuration: set up the four charset registers, clear the
   escape-sequence parser state and the error-recovery flags, and
   select register 0 for the left half and register 1 for the right
   half. */
3833 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
/* For each of the four registers the initial charset of the coding
   system is looked up when CODING_SYSTEM is non-nil.  The Qt assignment
   is presumably the fallback for a nil CODING_SYSTEM; the else line is
   not visible in this excerpt -- confirm. */
3837 for (i = 0; i < 4; i++)
3839 if (!NILP (coding_system))
3841 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3843 iso->charset[i] = Qt;
3844 iso->invalid_designated[i] = 0;
3846 iso->esc = ISO_ESC_NOTHING;
3847 iso->esc_bytes_index = 0;
3848 iso->register_left = 0;
3849 iso->register_right = 1;
3850 iso->switched_dir_and_no_valid_charset_yet = 0;
3851 iso->invalid_switch_dir = 0;
3852 iso->output_direction_sequence = 0;
3853 iso->output_literally = 0;
/* Reset the composite-character buffer if one exists. */
3854 #ifdef ENABLE_COMPOSITE_CHARS
3855 if (iso->composite_chars)
3856 Dynarr_reset (iso->composite_chars);
3861 fit_to_be_escape_quoted (unsigned char c)
3878 /* Parse one byte of an ISO2022 escape sequence.
3879 If the result is an invalid escape sequence, return 0 and
3880 do not change anything in STR. Otherwise, if the result is
3881 an incomplete escape sequence, update ISO2022.ESC and
3882 ISO2022.ESC_BYTES and return -1. Otherwise, update
3883 all the state variables (but not ISO2022.ESC_BYTES) and
3886 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3887 or invocation of an invalid character set and treat that as
3888 an unrecognized escape sequence. */
3891 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3892 unsigned char c, unsigned int *flags,
3893 int check_invalid_charsets)
3895 /* (1) If we're at the end of a designation sequence, CS is the
3896 charset being designated and REG is the register to designate
3899 (2) If we're at the end of a locking-shift sequence, REG is
3900 the register to invoke and HALF (0 == left, 1 == right) is
3901 the half to invoke it into.
3903 (3) If we're at the end of a single-shift sequence, REG is
3904 the register to invoke. */
3905 Lisp_Object cs = Qnil;
3908 /* NOTE: This code does goto's all over the place.
3909 The reason for this is that we're basically implementing
3910 a state machine here, and hierarchical languages like C
3911 don't really provide a clean way of doing this. */
3913 if (! (*flags & CODING_STATE_ESCAPE))
3914 /* At beginning of escape sequence; we need to reset our
3915 escape-state variables. */
3916 iso->esc = ISO_ESC_NOTHING;
3918 iso->output_literally = 0;
3919 iso->output_direction_sequence = 0;
/* Not inside a sequence yet: C is a control character, which may open
   an escape sequence (ESC, CSI) or be a shift function (SO, SI, SS2,
   SS3). */
3923 case ISO_ESC_NOTHING:
3924 iso->esc_bytes_index = 0;
3927 case ISO_CODE_ESC: /* Start escape sequence */
3928 *flags |= CODING_STATE_ESCAPE;
3932 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3933 *flags |= CODING_STATE_ESCAPE;
3934 iso->esc = ISO_ESC_5_11;
3937 case ISO_CODE_SO: /* locking shift 1 */
3940 case ISO_CODE_SI: /* locking shift 0 */
3944 case ISO_CODE_SS2: /* single shift */
3947 case ISO_CODE_SS3: /* single shift */
3951 default: /* Other control characters */
3958 /**** single shift ****/
3960 case 'N': /* single shift 2 */
3963 case 'O': /* single shift 3 */
3967 /**** locking shift ****/
3969 case '~': /* locking shift 1 right */
3972 case 'n': /* locking shift 2 */
3975 case '}': /* locking shift 2 right */
3978 case 'o': /* locking shift 3 */
3981 case '|': /* locking shift 3 right */
3985 #ifdef ENABLE_COMPOSITE_CHARS
3986 /**** composite ****/
3989 iso->esc = ISO_ESC_START_COMPOSITE;
3990 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3991 CODING_STATE_COMPOSITE;
3995 iso->esc = ISO_ESC_END_COMPOSITE;
3996 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3997 ~CODING_STATE_COMPOSITE;
3999 #endif /* ENABLE_COMPOSITE_CHARS */
4001 /**** directionality ****/
4004 iso->esc = ISO_ESC_5_11;
4007 /**** designation ****/
4009 case '$': /* multibyte charset prefix */
4010 iso->esc = ISO_ESC_2_4;
/* Intermediate bytes 0x28..0x2F each map to their own designation
   state, counted upwards from ISO_ESC_2_8. */
4014 if (0x28 <= c && c <= 0x2F)
4016 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4020 /* This function is called with CODESYS equal to nil when
4021 doing coding-system detection. */
4023 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4024 && fit_to_be_escape_quoted (c))
4026 iso->esc = ISO_ESC_LITERAL;
4027 *flags &= CODING_STATE_ISO2022_LOCK;
4037 /**** directionality ****/
4039 case ISO_ESC_5_11: /* ISO6429 direction control */
4042 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4043 goto directionality;
4045 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4046 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4047 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4051 case ISO_ESC_5_11_0:
4054 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4055 goto directionality;
4059 case ISO_ESC_5_11_1:
/* NOTE(review): the next statement is a plain assignment, whereas the
   ISO_ESC_5_11 and ISO_ESC_5_11_0 cases above use AND-assignment (&=)
   with the same right-hand side.  As written it discards the incoming
   flags and stores the whole lock mask minus R2L instead of merely
   clearing bits.  This looks unintended -- confirm and consider &=. */
4062 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4063 goto directionality;
4067 case ISO_ESC_5_11_2:
4070 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4071 goto directionality;
4076 iso->esc = ISO_ESC_DIRECTIONALITY;
4077 /* Various junk here to attempt to preserve the direction sequences
4078 literally in the text if they would otherwise be swallowed due
4079 to invalid designations that don't show up as actual charset
4080 changes in the text. */
4081 if (iso->invalid_switch_dir)
4083 /* We already inserted a direction switch literally into the
4084 text. We assume (#### this may not be right) that the
4085 next direction switch is the one going the other way,
4086 and we need to output that literally as well. */
4087 iso->output_literally = 1;
4088 iso->invalid_switch_dir = 0;
4094 /* If we are in the thrall of an invalid designation,
4095 then stick the directionality sequence literally into the
4096 output stream so it ends up in the original text again. */
4097 for (jj = 0; jj < 4; jj++)
4098 if (iso->invalid_designated[jj])
4102 iso->output_literally = 1;
4103 iso->invalid_switch_dir = 1;
4106 /* Indicate that we haven't yet seen a valid designation,
4107 so that if a switch-dir is directly followed by an
4108 invalid designation, both get inserted literally. */
4109 iso->switched_dir_and_no_valid_charset_yet = 1;
4114 /**** designation ****/
/* After the multibyte prefix: an intermediate byte 0x28..0x2F selects
   one of the two-byte designation states, counted from ISO_ESC_2_4_8. */
4117 if (0x28 <= c && c <= 0x2F)
4119 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
/* Bytes 0x40..0x42 directly after the prefix are used as the final
   byte of a 94x94 charset with no intermediate byte; the ISO2022
   overview in this file calls this the short form. */
4122 if (0x40 <= c && c <= 0x42)
4124 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4125 *flags & CODING_STATE_R2L ?
4126 CHARSET_RIGHT_TO_LEFT :
4127 CHARSET_LEFT_TO_RIGHT);
4137 if (c < '0' || c > '~')
4138 return 0; /* bad final byte */
/* Derive the charset type and the target register from the designation
   state: within each group of eight states the upper four mean a
   96-character set and the lower four a 94-character set, and the low
   two bits of the offset give the register number 0..3. */
4140 if (iso->esc >= ISO_ESC_2_8 &&
4141 iso->esc <= ISO_ESC_2_15)
4143 type = ((iso->esc >= ISO_ESC_2_12) ?
4144 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4145 reg = (iso->esc - ISO_ESC_2_8) & 3;
4147 else if (iso->esc >= ISO_ESC_2_4_8 &&
4148 iso->esc <= ISO_ESC_2_4_15)
4150 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4151 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4152 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4156 /* Can this ever be reached? -slb */
4160 cs = CHARSET_BY_ATTRIBUTES (type, c,
4161 *flags & CODING_STATE_R2L ?
4162 CHARSET_RIGHT_TO_LEFT :
4163 CHARSET_LEFT_TO_RIGHT);
/* Record C as the next byte of the sequence collected so far. */
4169 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4173 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4174 /* can't invoke something that ain't there. */
4176 iso->esc = ISO_ESC_SINGLE_SHIFT;
4177 *flags &= CODING_STATE_ISO2022_LOCK;
4179 *flags |= CODING_STATE_SS2;
4181 *flags |= CODING_STATE_SS3;
4185 if (check_invalid_charsets &&
4186 !CHARSETP (iso->charset[reg]))
4187 /* can't invoke something that ain't there. */
4190 iso->register_right = reg;
4192 iso->register_left = reg;
4193 *flags &= CODING_STATE_ISO2022_LOCK;
4194 iso->esc = ISO_ESC_LOCKING_SHIFT;
/* No charset was found and checking is enabled: mark the register as
   invalidly designated, put ASCII in it, and request that the sequence
   be output literally. */
4198 if (NILP (cs) && check_invalid_charsets)
4200 iso->invalid_designated[reg] = 1;
4201 iso->charset[reg] = Vcharset_ascii;
4202 iso->esc = ISO_ESC_DESIGNATE;
4203 *flags &= CODING_STATE_ISO2022_LOCK;
4204 iso->output_literally = 1;
4205 if (iso->switched_dir_and_no_valid_charset_yet)
4207 /* We encountered a switch-direction followed by an
4208 invalid designation. Ensure that the switch-direction
4209 gets outputted; otherwise it will probably get eaten
4210 when the text is written out again. */
4211 iso->switched_dir_and_no_valid_charset_yet = 0;
4212 iso->output_direction_sequence = 1;
4213 /* And make sure that the switch-dir going the other
4214 way gets outputted, as well. */
4215 iso->invalid_switch_dir = 1;
4219 /* This function is called with CODESYS equal to nil when
4220 doing coding-system detection. */
4221 if (!NILP (codesys))
4223 charset_conversion_spec_dynarr *dyn =
4224 XCODING_SYSTEM (codesys)->iso2022.input_conv;
/* Apply the input charset conversions of the coding system: each spec
   whose from_charset equals CS replaces CS with its to_charset. */
4230 for (i = 0; i < Dynarr_length (dyn); i++)
4232 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4233 if (EQ (cs, spec->from_charset))
4234 cs = spec->to_charset;
4239 iso->charset[reg] = cs;
4240 iso->esc = ISO_ESC_DESIGNATE;
4241 *flags &= CODING_STATE_ISO2022_LOCK;
4242 if (iso->invalid_designated[reg])
4244 iso->invalid_designated[reg] = 0;
4245 iso->output_literally = 1;
4247 if (iso->switched_dir_and_no_valid_charset_yet)
4248 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Detection pass for the ISO2022 family over input bytes from SRC.
   On first use the detection state in ST is initialised with all five
   ISO category bits set in its mask; the visible code then clears bits
   from the mask as input bytes rule categories out.
   NOTE(review): the rest of the parameter list, the loop header and
   the return statement are on lines not visible in this excerpt. */
4253 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4258 /* #### There are serious deficiencies in the recognition mechanism
4259 here. This needs to be much smarter if it's going to cut it.
4260 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4261 it should be detected as Latin-1.
4262 All the ISO2022 stuff in this file should be synced up with the
4263 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4264 Perhaps we should wait till R2L works in FSF Emacs? */
4266 if (!st->iso2022.initted)
4268 reset_iso2022 (Qnil, &st->iso2022.iso);
4269 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4270 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4271 CODING_CATEGORY_ISO_8_1_MASK |
4272 CODING_CATEGORY_ISO_8_2_MASK |
4273 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4274 st->iso2022.flags = 0;
4275 st->iso2022.high_byte_count = 0;
4276 st->iso2022.saw_single_shift = 0;
4277 st->iso2022.initted = 1;
4280 mask = st->iso2022.mask;
/* Rules out the seven-bit category and counts one more high byte.  The
   condition guarding these two statements is not visible here. */
4287 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4288 st->iso2022.high_byte_count++;
/* If high bytes have been counted and no single shift was seen, an odd
   count rules out iso-8-2.  Both counters are cleared afterwards. */
4292 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4294 if (st->iso2022.high_byte_count & 1)
4295 /* odd number of high bytes; assume not iso-8-2 */
4296 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4298 st->iso2022.high_byte_count = 0;
4299 st->iso2022.saw_single_shift = 0;
4301 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4303 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4304 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4305 { /* control chars */
4308 /* Allow and ignore control characters that you might
4309 reasonably see in a text file */
4314 case 8: /* backspace */
4315 case 11: /* vertical tab */
4316 case 12: /* form feed */
4317 case 26: /* MS-DOS C-z junk */
4318 case 31: /* '^_' -- for info */
4319 goto label_continue_loop;
4326 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
/* Feed the byte to the escape parser with a nil coding system and with
   invalid-charset checking turned off, then adjust the mask according
   to the kind of sequence that was completed. */
4329 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4330 &st->iso2022.flags, 0))
4332 switch (st->iso2022.iso.esc)
4334 case ISO_ESC_DESIGNATE:
4335 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4336 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
/* A locking shift narrows the mask to the lock-shift category alone
   and control jumps to ran_out_of_chars. */
4338 case ISO_ESC_LOCKING_SHIFT:
4339 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4340 goto ran_out_of_chars;
4341 case ISO_ESC_SINGLE_SHIFT:
4342 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4343 st->iso2022.saw_single_shift = 1;
4352 goto ran_out_of_chars;
4355 label_continue_loop:;
/* Post-process a detection MASK: when the seven-bit ISO category is
   still set, clear the three eight-bit ISO categories (8-designate,
   8-1 and 8-2) from it.  The lock-shift category is left untouched. */
4364 postprocess_iso2022_mask (int mask)
4366 /* #### kind of cheesy */
4367 /* If seven-bit ISO is allowed, then assume that the encoding is
4368 entirely seven-bit and turn off the eight-bit ones. */
4369 if (mask & CODING_CATEGORY_ISO_7_MASK)
4370 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4371 CODING_CATEGORY_ISO_8_1_MASK |
4372 CODING_CATEGORY_ISO_8_2_MASK);
4376 /* If FLAGS is a null pointer or specifies right-to-left motion,
4377 output a switch-dir-to-left-to-right sequence to DST.
4378 Also update FLAGS if it is not a null pointer.
4379 If INTERNAL_P is set, we are outputting in internal format and
4380 need to handle the CSI differently. */
4383 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4384 unsigned_char_dynarr *dst,
4385 unsigned int *flags,
4388 if (!flags || (*flags & CODING_STATE_R2L))
/* The introducer is written in one of three forms: ESC followed by [
   for seven-bit coding systems, CSI through DECODE_ADD_BINARY_CHAR when
   INTERNAL_P is set, and the plain CSI byte otherwise. */
4390 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4392 Dynarr_add (dst, ISO_CODE_ESC);
4393 Dynarr_add (dst, '[');
4395 else if (internal_p)
4396 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4398 Dynarr_add (dst, ISO_CODE_CSI);
/* The bytes 0 and ] complete the sequence; the ISO2022 overview in
   this file lists CSI 0 ] as the end of the current direction. */
4399 Dynarr_add (dst, '0');
4400 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null according to the entry test above;
   the guard for this store is presumably on a line not visible in this
   excerpt -- confirm. */
4402 *flags &= ~CODING_STATE_R2L;
4406 /* If FLAGS is a null pointer or specifies a direction different from
4407 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4408 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4409 sequence to DST. Also update FLAGS if it is not a null pointer.
4410 If INTERNAL_P is set, we are outputting in internal format and
4411 need to handle the CSI differently. */
4414 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4415 unsigned_char_dynarr *dst, unsigned int *flags,
/* Switching back to left-to-right is delegated to
   restore_left_to_right_direction. */
4418 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4419 direction == CHARSET_LEFT_TO_RIGHT)
4420 restore_left_to_right_direction (codesys, dst, flags, internal_p);
/* Switching to right-to-left is only done when the coding system does
   not have the NO_ISO6429 property. */
4421 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4422 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4423 direction == CHARSET_RIGHT_TO_LEFT)
4425 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4427 Dynarr_add (dst, ISO_CODE_ESC);
4428 Dynarr_add (dst, '[');
4430 else if (internal_p)
4431 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4433 Dynarr_add (dst, ISO_CODE_CSI);
/* The bytes 2 and ] complete the sequence; the ISO2022 overview in
   this file lists CSI 2 ] as the start of right-to-left text. */
4434 Dynarr_add (dst, '2');
4435 Dynarr_add (dst, ']');
/* NOTE(review): FLAGS may be null according to the tests above; the
   guard for this store is presumably on a line not visible in this
   excerpt -- confirm. */
4437 *flags |= CODING_STATE_R2L;
4441 /* Convert ISO2022-format data to internal format. */
4444 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4445 unsigned_char_dynarr *dst, unsigned int n)
4447 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4448 unsigned int flags = str->flags;
4449 unsigned int ch = str->ch;
4450 eol_type_t eol_type = str->eol_type;
4451 #ifdef ENABLE_COMPOSITE_CHARS
4452 unsigned_char_dynarr *real_dst = dst;
4454 Lisp_Object coding_system;
4456 XSETCODING_SYSTEM (coding_system, str->codesys);
/* If the flags loaded from the stream say a composition is in
   progress, output goes to the composite buffer instead of DST. */
4458 #ifdef ENABLE_COMPOSITE_CHARS
4459 if (flags & CODING_STATE_COMPOSITE)
4460 dst = str->iso2022.composite_chars;
4461 #endif /* ENABLE_COMPOSITE_CHARS */
4465 unsigned char c = *src++;
4466 if (flags & CODING_STATE_ESCAPE)
4467 { /* Within ESC sequence */
4468 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4473 switch (str->iso2022.esc)
4475 #ifdef ENABLE_COMPOSITE_CHARS
4476 case ISO_ESC_START_COMPOSITE:
4477 if (str->iso2022.composite_chars)
4478 Dynarr_reset (str->iso2022.composite_chars);
4480 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4481 dst = str->iso2022.composite_chars;
4483 case ISO_ESC_END_COMPOSITE:
4485 Bufbyte comstr[MAX_EMCHAR_LEN];
4487 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4488 Dynarr_length (dst));
4490 len = set_charptr_emchar (comstr, emch);
4491 Dynarr_add_many (dst, comstr, len);
4494 #endif /* ENABLE_COMPOSITE_CHARS */
4496 case ISO_ESC_LITERAL:
4497 DECODE_ADD_BINARY_CHAR (c, dst);
4501 /* Everything else handled already */
4506 /* Attempted error recovery. */
4507 if (str->iso2022.output_direction_sequence)
4508 ensure_correct_direction (flags & CODING_STATE_R2L ?
4509 CHARSET_RIGHT_TO_LEFT :
4510 CHARSET_LEFT_TO_RIGHT,
4511 str->codesys, dst, 0, 1);
4512 /* More error recovery. */
4513 if (!retval || str->iso2022.output_literally)
4515 /* Output the (possibly invalid) sequence */
4517 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4518 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4519 flags &= CODING_STATE_ISO2022_LOCK;
4521 n++, src--;/* Repeat the loop with the same character. */
4524 /* No sense in reprocessing the final byte of the
4525 escape sequence; it could mess things up anyway.
4527 DECODE_ADD_BINARY_CHAR (c, dst);
4532 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4533 { /* Control characters */
4535 /***** Error-handling *****/
4537 /* If we were in the middle of a character, dump out the
4538 partial character. */
4539 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4541 /* If we just saw a single-shift character, dump it out.
4542 This may dump out the wrong sort of single-shift character,
4543 but at least it will give an indication that something went
4545 if (flags & CODING_STATE_SS2)
4547 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4548 flags &= ~CODING_STATE_SS2;
4550 if (flags & CODING_STATE_SS3)
4552 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4553 flags &= ~CODING_STATE_SS3;
4556 /***** Now handle the control characters. *****/
4559 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4561 flags &= CODING_STATE_ISO2022_LOCK;
4563 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4564 DECODE_ADD_BINARY_CHAR (c, dst);
4567 { /* Graphic characters */
4568 Lisp_Object charset;
4574 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4576 /* Now determine the charset. */
/* A pending single shift takes priority and selects register 2 or 3.
   Otherwise a byte for which BYTE_ASCII_P is false uses the register
   currently invoked on the right, and any other byte the register
   invoked on the left. */
4577 reg = ((flags & CODING_STATE_SS2) ? 2
4578 : (flags & CODING_STATE_SS3) ? 3
4579 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4580 : str->iso2022.register_left);
4581 charset = str->iso2022.charset[reg];
4583 /* Error checking: */
4584 if (! CHARSETP (charset)
4585 || str->iso2022.invalid_designated[reg]
4586 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4587 && XCHARSET_CHARS (charset) == 94))
4588 /* Mrmph. We are trying to invoke a register that has no
4589 or an invalid charset in it, or trying to add a character
4590 outside the range of the charset. Insert that char literally
4591 to preserve it for the output. */
4593 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4594 DECODE_ADD_BINARY_CHAR (c, dst);
4599 /* Things are probably hunky-dory. */
4601 /* Fetch reverse charset, maybe. */
4602 if (((flags & CODING_STATE_R2L) &&
4603 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4605 (!(flags & CODING_STATE_R2L) &&
4606 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4608 Lisp_Object new_charset =
4609 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4610 if (!NILP (new_charset))
4611 charset = new_charset;
4615 if (XCHARSET_DIMENSION (charset) == 1)
4617 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4619 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4624 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
/* Leading-byte output path: the number of representation bytes of the
   charset decides which prefix bytes are written before the character
   bytes.  In the multi-byte cases the character bytes are stored with
   the high bit set. */
4630 lb = XCHARSET_LEADING_BYTE (charset);
4631 switch (XCHARSET_REP_BYTES (charset))
4634 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4635 Dynarr_add (dst, c & 0x7F);
4638 case 2: /* one-byte official */
4639 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4640 Dynarr_add (dst, lb);
4641 Dynarr_add (dst, c | 0x80);
4644 case 3: /* one-byte private or two-byte official */
4645 if (XCHARSET_PRIVATE_P (charset))
4647 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4648 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4649 Dynarr_add (dst, lb);
4650 Dynarr_add (dst, c | 0x80);
4656 Dynarr_add (dst, lb);
4657 Dynarr_add (dst, ch | 0x80);
4658 Dynarr_add (dst, c | 0x80);
4666 default: /* two-byte private */
4669 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4670 Dynarr_add (dst, lb);
4671 Dynarr_add (dst, ch | 0x80);
4672 Dynarr_add (dst, c | 0x80);
4682 flags &= CODING_STATE_ISO2022_LOCK;
4685 label_continue_loop:;
/* When CODING_STATE_END is set, pass whatever is pending in CH to
   DECODE_OUTPUT_PARTIAL_CHAR. */
4688 if (flags & CODING_STATE_END)
4689 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4696 /***** ISO2022 encoder *****/
4698 /* Designate CHARSET into register REG. */
4701 iso2022_designate (Lisp_Object charset, unsigned char reg,
4702 struct encoding_stream *str, unsigned_char_dynarr *dst)
/* Intermediate bytes indexed by register number 0..3, one table for
   94-character sets and one for 96-character sets. */
4704 static CONST char inter94[] = "()*+";
4705 static CONST char inter96[] = ",-./";
4707 unsigned char final;
4708 Lisp_Object old_charset = str->iso2022.charset[reg];
/* The register contents are updated first, before anything is
   written to DST. */
4710 str->iso2022.charset[reg] = charset;
4711 if (!CHARSETP (charset))
4712 /* charset might be an initial nil or t. */
4714 type = XCHARSET_TYPE (charset);
4715 final = XCHARSET_FINAL (charset);
/* Test whether the register previously held a charset with the same
   type and final byte and output is not being forced for it.  The
   action taken is not visible here; presumably nothing is emitted in
   that case -- confirm. */
4716 if (!str->iso2022.force_charset_on_output[reg] &&
4717 CHARSETP (old_charset) &&
4718 XCHARSET_TYPE (old_charset) == type &&
4719 XCHARSET_FINAL (old_charset) == final)
4722 str->iso2022.force_charset_on_output[reg] = 0;
/* NOTE(review): TYPE and FINAL were read from CHARSET above, before
   the output conversion below may replace CHARSET.  They are not
   recomputed in the visible lines, and the bytes emitted afterwards
   depend only on TYPE, REG and FINAL, so the converted charset appears
   to have no effect on the output -- confirm whether that is intended. */
4725 charset_conversion_spec_dynarr *dyn =
4726 str->codesys->iso2022.output_conv;
4732 for (i = 0; i < Dynarr_length (dyn); i++)
4734 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4735 if (EQ (charset, spec->from_charset))
4736 charset = spec->to_charset;
/* Emit ESC, then the intermediate byte or bytes chosen by charset type
   and register, then the final byte. */
4741 Dynarr_add (dst, ISO_CODE_ESC);
4744 case CHARSET_TYPE_94:
4745 Dynarr_add (dst, inter94[reg]);
4747 case CHARSET_TYPE_96:
4748 Dynarr_add (dst, inter96[reg]);
4750 case CHARSET_TYPE_94X94:
4751 Dynarr_add (dst, '$');
4753 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4756 Dynarr_add (dst, inter94[reg]);
4758 case CHARSET_TYPE_96X96:
4759 Dynarr_add (dst, '$');
4760 Dynarr_add (dst, inter96[reg]);
4763 Dynarr_add (dst, final);
/* Make register 0 the one invoked on the left half of the encoder
   state STR: if another register is recorded there, append SI to DST
   and record register 0.  The ISO2022 overview in this file describes
   SI as invoking G0 into GL. */
4767 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4769 if (str->iso2022.register_left != 0)
4771 Dynarr_add (dst, ISO_CODE_SI);
4772 str->iso2022.register_left = 0;
/* Make register 1 the one invoked on the left half of the encoder
   state STR: if another register is recorded there, append SO to DST
   and record register 1.  The ISO2022 overview in this file describes
   SO as invoking G1 into GL. */
4777 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4779 if (str->iso2022.register_left != 1)
4781 Dynarr_add (dst, ISO_CODE_SO);
4782 str->iso2022.register_left = 1;
/* Encode the single character CH in ISO2022 form, appending the result
   to DST.  The charset and half used by the previous call are loaded
   from STR on entry and the ones used for CH are stored back at the
   end.
   NOTE(review): many lines of this function, including the conditions
   that separate its main cases, are not visible in this excerpt. */
4787 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4788 unsigned_char_dynarr *dst, unsigned int *flags)
4790 unsigned char charmask;
4791 Lisp_Coding_System* codesys = str->codesys;
4792 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4794 Lisp_Object charset = str->iso2022.current_charset;
4795 int half = str->iso2022.current_half;
4796 unsigned int byte1, byte2;
4800 restore_left_to_right_direction (codesys, dst, flags, 0);
4802 /* Make sure G0 contains ASCII */
4803 if ((ch > ' ' && ch < ISO_CODE_DEL)
4804 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4806 ensure_normal_shift (str, dst);
4807 iso2022_designate (Vcharset_ascii, 0, str, dst);
4810 /* If necessary, restore everything to the default state
4812 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4814 restore_left_to_right_direction (codesys, dst, flags, 0);
4816 ensure_normal_shift (str, dst);
4818 for (i = 0; i < 4; i++)
4820 Lisp_Object initial_charset =
4821 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4822 iso2022_designate (initial_charset, i, str, dst);
4827 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4828 Dynarr_add (dst, '\r');
4829 if (eol_type != EOL_CR)
4830 Dynarr_add (dst, ch);
4834 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4835 && fit_to_be_escape_quoted (ch))
4836 Dynarr_add (dst, ISO_CODE_ESC);
4837 Dynarr_add (dst, ch);
4840 else if ( (0x80 <= ch) && (ch <= 0x9f) )
4842 charmask = (half == 0 ? 0x00 : 0x80);
4844 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4845 && fit_to_be_escape_quoted (ch))
4846 Dynarr_add (dst, ISO_CODE_ESC);
4847 /* you asked for it ... */
4848 Dynarr_add (dst, ch);
/* General case: split CH into its charset and position bytes and make
   sure the output direction matches the direction of that charset. */
4854 BREAKUP_CHAR (ch, charset, byte1, byte2);
4855 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4856 codesys, dst, flags, 0);
4858 /* Now determine which register to use. */
4860 for (i = 0; i < 4; i++)
4862 if (EQ (charset, str->iso2022.charset[i]) ||
4864 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4873 if (XCHARSET_GRAPHIC (charset) != 0)
4875 if (!NILP (str->iso2022.charset[1]) &&
4876 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4877 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4879 else if (!NILP (str->iso2022.charset[2]))
4881 else if (!NILP (str->iso2022.charset[3]))
4890 iso2022_designate (charset, reg, str, dst);
4892 /* Now invoke that register. */
4896 ensure_normal_shift (str, dst);
4900 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4902 ensure_shift_out (str, dst);
/* Single shift 2: ESC N for seven-bit coding systems, otherwise the
   SS2 control byte. */
4909 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4911 Dynarr_add (dst, ISO_CODE_ESC);
4912 Dynarr_add (dst, 'N');
4917 Dynarr_add (dst, ISO_CODE_SS2);
/* Single shift 3: ESC O for seven-bit coding systems, otherwise the
   SS3 control byte. */
4922 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4924 Dynarr_add (dst, ISO_CODE_ESC);
4925 Dynarr_add (dst, 'O');
4930 Dynarr_add (dst, ISO_CODE_SS3);
/* HALF equal to 0 leaves the position bytes as they are; any other
   value sets the high bit on each byte written. */
4938 charmask = (half == 0 ? 0x00 : 0x80);
4940 switch (XCHARSET_DIMENSION (charset))
4943 Dynarr_add (dst, byte1 | charmask);
4946 Dynarr_add (dst, byte1 | charmask);
4947 Dynarr_add (dst, byte2 | charmask);
/* Store the charset and half back into the stream state, from where
   the next call loads them. */
4953 str->iso2022.current_charset = charset;
4954 str->iso2022.current_half = half;
/* Emit whatever is needed to leave an ISO 2022 encoding stream in its
   initial state.

   STR   is the encoding stream whose coding system and ISO 2022 state
         are consulted.
   DST   receives any bytes produced by the helpers called below.
   FLAGS is passed through to restore_left_to_right_direction().

   The steps are: restore left-to-right direction, return to the normal
   shift state, then designate the coding system's initial charset into
   each of the four registers (i = 0..3).  The newline handling of the
   character encoder earlier in this file performs the same three steps.  */
4958 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4959 unsigned int *flags)
4961 Lisp_Coding_System* codesys = str->codesys;
4964 restore_left_to_right_direction (codesys, dst, flags, 0);
4965 ensure_normal_shift (str, dst);
/* Put every register back to the charset the coding system starts with.  */
4966 for (i = 0; i < 4; i++)
4968 Lisp_Object initial_charset
4969 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4970 iso2022_designate (initial_charset, i, str, dst);
4975 /************************************************************************/
4976 /* No-conversion methods */
4977 /************************************************************************/
4979 /* This is used when reading in "binary" files -- i.e. files that may
4980 contain all 256 possible byte values and that are not to be
4981 interpreted as being in any particular encoding. */
/* Decoder for the no-conversion method.

   DECODING is the decoding lstream; its private data supplies the saved
   flags, pending character and end-of-line type.
   SRC / N  describe the input bytes (NOTE(review): the loop header that
            walks SRC is not visible in this excerpt).
   DST      receives the decoded output.

   Each input byte first goes through end-of-line handling and is then
   appended with DECODE_ADD_BINARY_CHAR.  */
4983 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4984 unsigned_char_dynarr *dst, unsigned int n)
4987 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
/* Work on local copies of the state saved in the stream.  */
4988 unsigned int flags = str->flags;
4989 unsigned int ch = str->ch;
4990 eol_type_t eol_type = str->eol_type;
/* Per-byte work: end-of-line processing, then store the byte.  */
4996 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4997 DECODE_ADD_BINARY_CHAR (c, dst);
/* Jump target; presumably used by DECODE_HANDLE_EOL_TYPE to skip the
   rest of an iteration -- confirm against the macro definition.  */
4998 label_continue_loop:;
/* Runs after the byte processing above; passes FLAGS, CH and DST to the
   end-of-conversion macro.  */
5001 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encoder for the no-conversion method.

   ENCODING is the encoding lstream; its private data supplies the saved
   flags, the partially assembled character CH and the coding system.
   SRC / N  describe the internal-format input bytes (NOTE(review): the
            loop header that walks SRC is not visible in this excerpt).
   DST      receives the external bytes.

   Two implementations are selected by the preprocessor; the `#else'
   below is labelled "not UTF2000", so the first half is presumably the
   UTF2000 build.  The first half tracks its position inside a multi-byte
   sequence in CHAR_BOUNDARY, which it loads from and stores back to
   str->iso2022.current_char_boundary.  */
5008 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5009 unsigned_char_dynarr *dst, unsigned int n)
5012 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5013 unsigned int flags = str->flags;
5014 unsigned int ch = str->ch;
5015 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Position within the current multi-byte sequence, carried across calls.  */
5017 unsigned char char_boundary = str->iso2022.current_char_boundary;
5024 switch (char_boundary)
/* Classify the byte by magnitude.  The thresholds 0xf8 / 0xf0 / 0xe0 /
   0xc0 coincide with the UTF-8 lead-byte ranges for 5-, 4-, 3- and
   2-byte sequences; the branch bodies are not visible here, presumably
   they set CHAR_BOUNDARY and seed CH -- TODO confirm.  */
5032 else if ( c >= 0xf8 )
5037 else if ( c >= 0xf0 )
5042 else if ( c >= 0xe0 )
5047 else if ( c >= 0xc0 )
/* End-of-line output: a CR is written unless the EOL type is LF or
   autodetect, and the character itself is written unless the EOL type
   is CR.  */
5058 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5059 Dynarr_add (dst, '\r');
5060 if (eol_type != EOL_CR)
5061 Dynarr_add (dst, c);
/* Byte copied through unchanged.  */
5064 Dynarr_add (dst, c);
/* Fold the low six bits of this byte into the character being built.  */
5069 ch = ( ch << 6 ) | ( c & 0x3f );
/* Write the assembled code most-significant byte first.  The groups
   below emit 1, 2, 3 and 4 bytes; the `case' labels are not visible,
   presumably they match fixed.size values 1..4 -- TODO confirm.  */
5070 switch ( str->codesys->fixed.size )
5073 Dynarr_add (dst, ch & 0xff);
5076 Dynarr_add (dst, (ch >> 8) & 0xff);
5077 Dynarr_add (dst, ch & 0xff);
5080 Dynarr_add (dst, (ch >> 16) & 0xff);
5081 Dynarr_add (dst, (ch >> 8) & 0xff);
5082 Dynarr_add (dst, ch & 0xff);
5085 Dynarr_add (dst, (ch >> 24) & 0xff);
5086 Dynarr_add (dst, (ch >> 16) & 0xff);
5087 Dynarr_add (dst, (ch >> 8) & 0xff);
5088 Dynarr_add (dst, ch & 0xff);
/* Diagnostic for a fixed.size not handled above (presumably the
   `default' case); the size is reported on stderr.  */
5091 fprintf(stderr, "It seems %d bytes stream.\n",
5092 str->codesys->fixed.size);
/* Same six-bit accumulation as above, on a path that emits nothing
   here (presumably a sequence that is not yet complete).  */
5098 ch = ( ch << 6 ) | ( c & 0x3f );
5101 #else /* not UTF2000 */
/* Newline: a CR is written unless the EOL type is LF or autodetect, and
   a literal LF is written unless the EOL type is CR.  */
5104 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5105 Dynarr_add (dst, '\r');
5106 if (eol_type != EOL_CR)
5107 Dynarr_add (dst, '\n');
/* ASCII bytes are copied as they are.  */
5110 else if (BYTE_ASCII_P (c))
5113 Dynarr_add (dst, c);
/* Leading byte: Latin-1 and Control-1 are singled out (body not
   visible, presumably the leading byte is remembered in CH); for any
   other leading byte a `~' is written instead.  */
5115 else if (BUFBYTE_LEADING_BYTE_P (c))
5118 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5119 c == LEADING_BYTE_CONTROL_1)
5122 Dynarr_add (dst, '~'); /* untranslatable character */
/* Byte following a remembered leading byte: after the Latin-1 leading
   byte it is written unchanged, after the Control-1 leading byte it is
   written with 0x20 subtracted.  */
5126 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5127 Dynarr_add (dst, c);
5128 else if (ch == LEADING_BYTE_CONTROL_1)
5131 Dynarr_add (dst, c - 0x20);
5133 /* else it should be the second or third byte of an
5134 untranslatable character, so ignore it */
5137 #endif /* not UTF2000 */
/* Save the sequence position for the next call.  */
5143 str->iso2022.current_char_boundary = char_boundary;
5148 /************************************************************************/
5149 /* Simple internal/external functions */
5150 /************************************************************************/
5152 static Extbyte_dynarr *conversion_out_dynarr;
5153 static Bufbyte_dynarr *conversion_in_dynarr;
5155 /* Determine coding system from coding format */
5157 /* #### not correct for all values of `fmt'! */
/* FMT selects which coding system to use for data crossing the
   internal/external boundary.

   For FORMAT_FILENAME and FORMAT_TERMINAL the choice is driven by
   Vfile_name_coding_system: when that variable is nil or `binary' a
   separate path is taken (its statement is not visible here; both
   callers below test the result with NILP and then convert byte by byte,
   so it presumably returns Qnil -- TODO confirm); otherwise the named
   coding system is looked up with Fget_coding_system.

   The `ctext' coding system is returned on a further path whose `case'
   label is not visible in this excerpt.  */
5159 external_data_format_to_coding_system (enum external_data_format fmt)
5163 case FORMAT_FILENAME:
5164 case FORMAT_TERMINAL:
5165 if (EQ (Vfile_name_coding_system, Qnil) ||
5166 EQ (Vfile_name_coding_system, Qbinary))
5169 return Fget_coding_system (Vfile_name_coding_system);
5172 return Fget_coding_system (Qctext);
/* Convert internal-format text starting at PTR into the external
   representation selected by FMT.

   The result is built in the file-static conversion_out_dynarr, which is
   created on first use and reset at the start of every call.  The
   returned pointer is the address of element 0 of that dynarr, so it is
   only good until the next call of this function.  *len_out is set to
   the length before the terminating zero byte is appended, i.e. the
   terminator is not counted.

   NOTE(review): `len' and `len_out' are used below but are declared on
   parameter lines not visible in this excerpt.  */
5180 convert_to_external_format (CONST Bufbyte *ptr,
5183 enum external_data_format fmt)
5185 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Lazily create the shared output buffer, then empty it.  */
5187 if (!conversion_out_dynarr)
5188 conversion_out_dynarr = Dynarr_new (Extbyte);
5190 Dynarr_reset (conversion_out_dynarr);
/* No coding system: convert by hand, one character at a time.  */
5192 if (NILP (coding_system))
5194 CONST Bufbyte *end = ptr + len;
/* First variant: bytes below 0xc0 are taken as they are, otherwise the
   low five bits of this byte and the low six bits of the next byte are
   combined into one value.  */
5200 (*ptr < 0xc0) ? *ptr :
5201 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
/* Second variant: ASCII is taken as it is; after the Control-1 leading
   byte the next byte minus 0x20 is used; after the Latin-1 leading byte
   the next byte is used (the final alternative is not visible here).  */
5204 (BYTE_ASCII_P (*ptr)) ? *ptr :
5205 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5206 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5209 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
/* Debug-only check that the scan stopped exactly at the end.  */
5213 #ifdef ERROR_CHECK_BUFPOS
5214 assert (ptr == end);
/* A coding system is available: run the text through an encoding
   stream that writes into the dynarr.  */
5219 Lisp_Object instream, outstream, da_outstream;
5220 Lstream *istr, *ostr;
5221 struct gcpro gcpro1, gcpro2, gcpro3;
5222 char tempbuf[1024]; /* some random amount */
/* Input stream over the caller's buffer; the encoding stream is layered
   on a dynarr output stream that appends to conversion_out_dynarr.  */
5224 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5225 da_outstream = make_dynarr_output_stream
5226 ((unsigned_char_dynarr *) conversion_out_dynarr);
5228 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5229 istr = XLSTREAM (instream);
5230 ostr = XLSTREAM (outstream);
/* Protect the three stream objects from garbage collection.  */
5231 GCPRO3 (instream, outstream, da_outstream);
/* Copy through TEMPBUF one chunk at a time (loop header and its
   termination test are not visible here).  */
5234 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5237 Lstream_write (ostr, tempbuf, size_in_bytes);
/* Close both ends, then delete all three streams.  */
5239 Lstream_close (istr);
5240 Lstream_close (ostr);
5242 Lstream_delete (istr);
5243 Lstream_delete (ostr);
5244 Lstream_delete (XLSTREAM (da_outstream));
/* Report the length first so the terminator added next is not counted.  */
5247 *len_out = Dynarr_length (conversion_out_dynarr);
5248 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5249 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert external data starting at PTR, in the representation selected
   by FMT, into internal format.

   The result is built in the file-static conversion_in_dynarr, which is
   created on first use and reset at the start of every call.  The
   returned pointer is the address of element 0 of that dynarr, so it is
   only good until the next call of this function.  *len_out is set to
   the length before the terminating zero byte is appended, i.e. the
   terminator is not counted.

   NOTE(review): `len' and `len_out' are used below but are declared on
   parameter lines not visible in this excerpt.  */
5253 convert_from_external_format (CONST Extbyte *ptr,
5256 enum external_data_format fmt)
5258 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Lazily create the shared input buffer, then empty it.  */
5260 if (!conversion_in_dynarr)
5261 conversion_in_dynarr = Dynarr_new (Bufbyte);
5263 Dynarr_reset (conversion_in_dynarr);
/* No coding system: append every external byte with
   DECODE_ADD_BINARY_CHAR.  */
5265 if (NILP (coding_system))
5267 CONST Extbyte *end = ptr + len;
5268 for (; ptr < end; ptr++)
5271 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
/* A coding system is available: run the data through a decoding stream
   that writes into the dynarr.  */
5276 Lisp_Object instream, outstream, da_outstream;
5277 Lstream *istr, *ostr;
5278 struct gcpro gcpro1, gcpro2, gcpro3;
5279 char tempbuf[1024]; /* some random amount */
/* Input stream over the caller's buffer; the decoding stream is layered
   on a dynarr output stream that appends to conversion_in_dynarr.  */
5281 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5282 da_outstream = make_dynarr_output_stream
5283 ((unsigned_char_dynarr *) conversion_in_dynarr);
5285 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5286 istr = XLSTREAM (instream);
5287 ostr = XLSTREAM (outstream);
/* Protect the three stream objects from garbage collection.  */
5288 GCPRO3 (instream, outstream, da_outstream);
/* Copy through TEMPBUF one chunk at a time (loop header and its
   termination test are not visible here).  */
5291 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5294 Lstream_write (ostr, tempbuf, size_in_bytes);
/* Close both ends, then delete all three streams.  */
5296 Lstream_close (istr);
5297 Lstream_close (ostr);
5299 Lstream_delete (istr);
5300 Lstream_delete (ostr);
5301 Lstream_delete (XLSTREAM (da_outstream));
/* Report the length first so the terminator added next is not counted.  */
5304 *len_out = Dynarr_length (conversion_in_dynarr);
5305 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5306 return Dynarr_atp (conversion_in_dynarr, 0);
5310 /************************************************************************/
5311 /* Initialization */
5312 /************************************************************************/
/* Symbol-level initialization for this file: defines the Lisp symbols
   and the `coding-system-error' error, and registers the Lisp primitives
   with DEFSUBR.  Takes no arguments.  */
5315 syms_of_file_coding (void)
5317 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
/* `coding-system-error' is defined with Qio_error as its parent.  */
5318 deferror (&Qcoding_system_error, "coding-system-error",
5319 "Coding-system error", Qio_error);
/* Primitives that query, create and alias coding systems.  */
5321 DEFSUBR (Fcoding_system_p);
5322 DEFSUBR (Ffind_coding_system);
5323 DEFSUBR (Fget_coding_system);
5324 DEFSUBR (Fcoding_system_list);
5325 DEFSUBR (Fcoding_system_name);
5326 DEFSUBR (Fmake_coding_system);
5327 DEFSUBR (Fcopy_coding_system);
5328 DEFSUBR (Fdefine_coding_system_alias);
5329 DEFSUBR (Fsubsidiary_coding_system);
5331 DEFSUBR (Fcoding_system_type);
5332 DEFSUBR (Fcoding_system_doc_string);
5334 DEFSUBR (Fcoding_system_charset);
5336 DEFSUBR (Fcoding_system_property);
/* Primitives for coding categories and their priority order.  */
5338 DEFSUBR (Fcoding_category_list);
5339 DEFSUBR (Fset_coding_priority_list);
5340 DEFSUBR (Fcoding_priority_list);
5341 DEFSUBR (Fset_coding_category_system);
5342 DEFSUBR (Fcoding_category_system);
/* Primitives that operate on a region.  */
5344 DEFSUBR (Fdetect_coding_region);
5345 DEFSUBR (Fdecode_coding_region);
5346 DEFSUBR (Fencode_coding_region);
/* Single-character Shift-JIS and Big5 conversions.  */
5348 DEFSUBR (Fdecode_shift_jis_char);
5349 DEFSUBR (Fencode_shift_jis_char);
5350 DEFSUBR (Fdecode_big5_char);
5351 DEFSUBR (Fencode_big5_char);
/* Predicate symbol and the names of the coding-system types.  */
5353 defsymbol (&Qcoding_system_p, "coding-system-p");
5354 defsymbol (&Qno_conversion, "no-conversion");
5355 defsymbol (&Qraw_text, "raw-text");
5357 defsymbol (&Qbig5, "big5");
5358 defsymbol (&Qshift_jis, "shift-jis");
5359 defsymbol (&Qucs4, "ucs-4");
5360 defsymbol (&Qutf8, "utf-8");
5361 defsymbol (&Qccl, "ccl");
5362 defsymbol (&Qiso2022, "iso2022");
/* Property names accepted by every coding system.  */
5364 defsymbol (&Qmnemonic, "mnemonic");
5365 defsymbol (&Qeol_type, "eol-type");
5366 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5367 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
/* End-of-line type values and the matching eol-* property names.  */
5369 defsymbol (&Qcr, "cr");
5370 defsymbol (&Qlf, "lf");
5371 defsymbol (&Qcrlf, "crlf");
5372 defsymbol (&Qeol_cr, "eol-cr");
5373 defsymbol (&Qeol_lf, "eol-lf");
5374 defsymbol (&Qeol_crlf, "eol-crlf");
/* ISO 2022 property names.  */
5376 defsymbol (&Qcharset_g0, "charset-g0");
5377 defsymbol (&Qcharset_g1, "charset-g1");
5378 defsymbol (&Qcharset_g2, "charset-g2");
5379 defsymbol (&Qcharset_g3, "charset-g3");
5380 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5381 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5382 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5383 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5384 defsymbol (&Qno_iso6429, "no-iso6429");
5385 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5386 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5388 defsymbol (&Qshort, "short");
5389 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5390 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5391 defsymbol (&Qseven, "seven");
5392 defsymbol (&Qlock_shift, "lock-shift");
5393 defsymbol (&Qescape_quoted, "escape-quoted");
/* CCL property names.  */
5395 defsymbol (&Qencode, "encode");
5396 defsymbol (&Qdecode, "decode");
5399 defsymbol (&Qctext, "ctext");
/* One symbol per coding category, stored in coding_category_symbol[].
   The name-string argument of each call below is on a line that is not
   visible in this excerpt.  */
5400 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5402 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5404 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5406 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5408 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5410 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5412 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5414 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5416 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5419 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare which lstream methods the `decoding' and `encoding' stream
   types provide.  Both types get the same seven methods: reader, writer,
   rewinder, seekable_p, flusher, closer and marker.  Takes no
   arguments.  */
5424 lstream_type_create_file_coding (void)
/* Methods of the decoding stream type.  */
5426 LSTREAM_HAS_METHOD (decoding, reader);
5427 LSTREAM_HAS_METHOD (decoding, writer);
5428 LSTREAM_HAS_METHOD (decoding, rewinder);
5429 LSTREAM_HAS_METHOD (decoding, seekable_p);
5430 LSTREAM_HAS_METHOD (decoding, flusher);
5431 LSTREAM_HAS_METHOD (decoding, closer);
5432 LSTREAM_HAS_METHOD (decoding, marker);
/* Methods of the encoding stream type.  */
5434 LSTREAM_HAS_METHOD (encoding, reader);
5435 LSTREAM_HAS_METHOD (encoding, writer);
5436 LSTREAM_HAS_METHOD (encoding, rewinder);
5437 LSTREAM_HAS_METHOD (encoding, seekable_p);
5438 LSTREAM_HAS_METHOD (encoding, flusher);
5439 LSTREAM_HAS_METHOD (encoding, closer);
5440 LSTREAM_HAS_METHOD (encoding, marker);
5444 vars_of_file_coding (void)
5448 /* Initialize to something reasonable ... */
5449 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5451 coding_category_system[i] = Qnil;
5452 coding_category_by_priority[i] = i;
5455 Fprovide (intern ("file-coding"));
5457 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5458 Coding system used for TTY keyboard input.
5459 Not used under a windowing system.
5461 Vkeyboard_coding_system = Qnil;
5463 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5464 Coding system used for TTY display output.
5465 Not used under a windowing system.
5467 Vterminal_coding_system = Qnil;
5469 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5470 Overriding coding system used when reading a file or process.
5471 You should *bind* this, not set it. If this is non-nil, it specifies
5472 the coding system that will be used when a file or process is read
5473 in, and overrides `buffer-file-coding-system-for-read',
5474 `insert-file-contents-pre-hook', etc. Use those variables instead of
5475 this one for permanent changes to the environment.
5477 Vcoding_system_for_read = Qnil;
5479 DEFVAR_LISP ("coding-system-for-write",
5480 &Vcoding_system_for_write /*
5481 Overriding coding system used when writing a file or process.
5482 You should *bind* this, not set it. If this is non-nil, it specifies
5483 the coding system that will be used when a file or process is written
5484 to, and overrides `buffer-file-coding-system',
5485 `write-region-pre-hook', etc. Use those variables instead of this one
5486 for permanent changes to the environment.
5488 Vcoding_system_for_write = Qnil;
5490 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5491 Coding system used to convert pathnames when accessing files.
5493 Vfile_name_coding_system = Qnil;
5495 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5496 Non-nil means the buffer contents are regarded as multi-byte form
5497 of characters, not a binary code. This affects the display, file I/O,
5498 and behaviors of various editing commands.
5500 Setting this to nil does not do anything.
5502 enable_multibyte_characters = 1;
5506 complex_vars_of_file_coding (void)
5508 staticpro (&Vcoding_system_hash_table);
5509 Vcoding_system_hash_table =
5510 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5512 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5514 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5516 struct codesys_prop csp; \
5518 csp.prop_type = (Prop_Type); \
5519 Dynarr_add (the_codesys_prop_dynarr, csp); \
5522 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5523 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5524 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5525 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5526 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5527 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5528 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5530 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5531 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5532 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5533 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5534 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5535 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5536 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5537 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5538 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5539 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5540 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5541 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5542 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5543 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5544 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5545 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5546 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5548 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5549 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5551 /* Need to create this here or we're really screwed. */
5553 (Qraw_text, Qno_conversion,
5554 build_string ("Raw text, which means it converts only line-break-codes."),
5555 list2 (Qmnemonic, build_string ("Raw")));
5558 (Qbinary, Qno_conversion,
5559 build_string ("Binary, which means it does not convert anything."),
5560 list4 (Qeol_type, Qlf,
5561 Qmnemonic, build_string ("Binary")));
5566 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5567 list2 (Qmnemonic, build_string ("UTF8")));
5570 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5572 /* Need this for bootstrapping */
5573 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5574 Fget_coding_system (Qraw_text);
5577 coding_category_system[CODING_CATEGORY_UTF8]
5578 = Fget_coding_system (Qutf8);