1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
/* Global Lisp objects and tables shared by the coding-system code in this
   file.  */
39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
/* NOTE(review): judging by their names these hold the coding systems used
   for keyboard input, terminal output, reading, writing and file names;
   none of them is referenced in this part of the file -- confirm against
   their users.  */
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
50 /* Coding system currently associated with each coding category. */
51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
53 /* Table of all coding categories in decreasing order of priority.
54 This describes a permutation of the possible coding categories. */
55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
57 Lisp_Object Qcoding_system_p;
/* Qno_conversion, Qccl and Qiso2022 are among the values that
   make-coding-system compares its TYPE argument against.  */
59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
60 /* Qinternal in general.c */
/* Qmnemonic, Qeol_type, Qpost_read_conversion and Qpre_write_conversion
   are property keys recognized by make-coding-system for every coding
   system type.  Qcr, Qcrlf and Qlf are the eol-type values that
   symbol_to_eol_type and eol_type_to_symbol translate to and from
   enum eol_type.  */
62 Lisp_Object Qmnemonic, Qeol_type;
63 Lisp_Object Qcr, Qcrlf, Qlf;
64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
65 Lisp_Object Qpost_read_conversion;
66 Lisp_Object Qpre_write_conversion;
/* More values that make-coding-system compares its TYPE argument
   against.  */
69 Lisp_Object Qucs4, Qutf8;
70 Lisp_Object Qbig5, Qshift_jis;
/* The following are property keys checked by make-coding-system when the
   type is iso2022 (Qctext is not referenced in this part of the file).  */
71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
74 Lisp_Object Qno_iso6429;
75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
76 Lisp_Object Qctext, Qescape_quoted;
77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
/* Property keys that make-coding-system accepts for CCL coding systems;
   each value must be a vector.  */
79 Lisp_Object Qencode, Qdecode;
/* Maps a coding-system name (a symbol) to its coding-system object; read
   with Fgethash and written with Fputhash throughout this file.  */
81 Lisp_Object Vcoding_system_hash_table;
83 int enable_multibyte_characters;
/* Decoder-side ISO2022 state: the charsets held in the four G registers,
   which registers are invoked into GL and GR, progress through a pending
   escape sequence, and the flags used to pass invalid designations and
   direction switches through to the output literally.  */
86 /* Additional information used by the ISO2022 decoder and detector. */
87 struct iso2022_decoder
89 /* CHARSET holds the character sets currently assigned to the G0
90 through G3 variables. It is initialized from the array
91 INITIAL_CHARSET in CODESYS. */
92 Lisp_Object charset[4];
94 /* Which registers are currently invoked into the left (GL) and
95 right (GR) halves of the 8-bit encoding space? */
96 int register_left, register_right;
98 /* ISO_ESC holds a value indicating part of an escape sequence
99 that has already been seen. */
100 enum iso_esc_flag esc;
102 /* This records the bytes we've seen so far in an escape sequence,
103 in case the sequence is invalid (we spit out the bytes unchanged). */
104 unsigned char esc_bytes[8];
106 /* Index for next byte to store in ISO escape sequence. */
109 #ifdef ENABLE_COMPOSITE_CHARS
110 /* Stuff seen so far when composing a string. */
111 unsigned_char_dynarr *composite_chars;
114 /* If we saw an invalid designation sequence for a particular
115 register, we flag it here and switch to ASCII. The next time we
116 see a valid designation for this register, we turn off the flag
117 and do the designation normally, but pretend the sequence was
118 invalid. The effect of all this is that (most of the time) the
119 escape sequences for both the switch to the unknown charset, and
120 the switch back to the known charset, get inserted literally into
121 the buffer and saved out as such. The hope is that we can
122 preserve the escape sequences so that the resulting written out
123 file makes sense. If we don't do any of this, the designation
124 to the invalid charset will be preserved but that switch back
125 to the known charset will probably get eaten because it was
126 the same charset that was already present in the register. */
127 unsigned char invalid_designated[4];
129 /* We try to do similar things as above for direction-switching
130 sequences. If we encountered a direction switch while an
131 invalid designation was present, or an invalid designation
132 just after a direction switch (i.e. no valid designation
133 encountered yet), we insert the direction-switch escape
134 sequence literally into the output stream, and later on
135 insert the corresponding direction-restoring escape sequence
137 unsigned int switched_dir_and_no_valid_charset_yet :1;
138 unsigned int invalid_switch_dir :1;
140 /* Tells the decoder to output the escape sequence literally
141 even though it was valid. Used in the games we play to
142 avoid lossage when we encounter invalid designations. */
143 unsigned int output_literally :1;
144 /* We encountered a direction switch followed by an invalid
145 designation. We didn't output the direction switch
146 literally because we didn't know about the invalid designation;
147 but we have to do so now. */
148 unsigned int output_direction_sequence :1;
/* Forward declarations.  Fcopy_coding_system is needed early because
   setup_eol_coding_systems calls it before its definition.  */
151 EXFUN (Fcopy_coding_system, 2);
/* Opaque here; only pointers to it are passed to the detect_coding_*
   routines declared below.  */
153 struct detection_state;
/* The routines below follow a common shape: detectors take a
   detection_state and source bytes, decoders and encoders take an Lstream,
   source bytes SRC and a destination byte dynarr DST, and the
   char_encode_* and char_finish_* routines take an encoding_stream, a
   destination dynarr and a flags word.  */
156 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
157 unsigned_char_dynarr *dst, unsigned int n);
/* Shift-JIS.  */
159 static int detect_coding_sjis (struct detection_state *st,
160 CONST unsigned char *src,
162 static void decode_coding_sjis (Lstream *decoding,
163 CONST unsigned char *src,
164 unsigned_char_dynarr *dst,
166 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
167 unsigned_char_dynarr *dst, unsigned int *flags);
168 void char_finish_shift_jis (struct encoding_stream *str,
169 unsigned_char_dynarr *dst, unsigned int *flags);
/* Big5.  */
171 static int detect_coding_big5 (struct detection_state *st,
172 CONST unsigned char *src,
174 static void decode_coding_big5 (Lstream *decoding,
175 CONST unsigned char *src,
176 unsigned_char_dynarr *dst, unsigned int n);
177 static void encode_coding_big5 (Lstream *encoding,
178 CONST unsigned char *src,
179 unsigned_char_dynarr *dst, unsigned int n);
/* UCS-4.  */
180 static int detect_coding_ucs4 (struct detection_state *st,
181 CONST unsigned char *src,
183 static void decode_coding_ucs4 (Lstream *decoding,
184 CONST unsigned char *src,
185 unsigned_char_dynarr *dst, unsigned int n);
186 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
187 unsigned_char_dynarr *dst, unsigned int *flags);
188 void char_finish_ucs4 (struct encoding_stream *str,
189 unsigned_char_dynarr *dst, unsigned int *flags);
/* UTF-8.  */
191 static int detect_coding_utf8 (struct detection_state *st,
192 CONST unsigned char *src,
194 static void decode_coding_utf8 (Lstream *decoding,
195 CONST unsigned char *src,
196 unsigned_char_dynarr *dst, unsigned int n);
197 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
198 unsigned_char_dynarr *dst, unsigned int *flags);
199 void char_finish_utf8 (struct encoding_stream *str,
200 unsigned_char_dynarr *dst, unsigned int *flags);
/* ISO2022, including helpers that work on struct iso2022_decoder.  */
202 static int postprocess_iso2022_mask (int mask);
203 static void reset_iso2022 (Lisp_Object coding_system,
204 struct iso2022_decoder *iso);
205 static int detect_coding_iso2022 (struct detection_state *st,
206 CONST unsigned char *src,
208 static void decode_coding_iso2022 (Lstream *decoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst, unsigned int n);
211 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
212 unsigned_char_dynarr *dst, unsigned int *flags);
213 void char_finish_iso2022 (struct encoding_stream *str,
214 unsigned_char_dynarr *dst, unsigned int *flags);
/* No-conversion coding systems.  */
216 static void decode_coding_no_conversion (Lstream *decoding,
217 CONST unsigned char *src,
218 unsigned_char_dynarr *dst,
220 static void encode_coding_no_conversion (Lstream *encoding,
221 CONST unsigned char *src,
222 unsigned_char_dynarr *dst,
/* NOTE(review): presumably the top-level entry points that dispatch to the
   per-format routines above; their bodies are not in this part of the
   file -- confirm.  */
224 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
225 unsigned_char_dynarr *dst, unsigned int n);
226 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
227 unsigned_char_dynarr *dst, unsigned int n);
/* Declares the codesys_prop record type, a dynarr type holding such
   records (codesys_prop_dynarr), the global dynarr pointer
   the_codesys_prop_dynarr, and enum codesys_prop_enum, one of whose
   members is CODESYS_PROP_ISO2022.
   NOTE(review): what a codesys_prop record contains and how the dynarr is
   populated is not visible here -- confirm before relying on it.  */
229 typedef struct codesys_prop codesys_prop;
238 Dynarr_declare (codesys_prop);
239 } codesys_prop_dynarr;
241 codesys_prop_dynarr *the_codesys_prop_dynarr;
243 enum codesys_prop_enum
246 CODESYS_PROP_ISO2022,
251 /************************************************************************/
252 /* Coding system functions */
253 /************************************************************************/
/* Prototypes for the mark, print and finalize methods that are handed to
   DEFINE_LRECORD_IMPLEMENTATION below.  */
255 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
256 static void print_coding_system (Lisp_Object, Lisp_Object, int);
257 static void finalize_coding_system (void *header, int for_disksave);
/* Description of one charset_conversion_spec: Lisp_Object data starting at
   the from_charset member.
   NOTE(review): the trailing number in each XD_LISP_OBJECT entry looks like
   a count of consecutive Lisp_Object slots -- confirm against the
   lrecord_description definition.  */
260 static const struct lrecord_description ccs_description_1[] = {
261 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
265 static const struct struct_description ccs_description = {
266 sizeof(charset_conversion_spec),
/* Description of a dynarr of charset_conversion_spec elements, each
   described by ccs_description.  */
270 static const struct lrecord_description ccsd_description_1[] = {
271 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
275 static const struct struct_description ccsd_description = {
276 sizeof(charset_conversion_spec_dynarr),
/* Description of struct Lisp_Coding_System: the Lisp_Object members plus
   the two ISO2022 conversion dynarrs (input_conv and output_conv), which
   are described as pointers to ccsd_description structures.  */
281 static const struct lrecord_description coding_system_description[] = {
282 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
283 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
284 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
286 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
287 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
288 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
289 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
/* Registers the coding-system lrecord type with its mark, print and
   finalize methods and the description table above.  */
294 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
295 mark_coding_system, print_coding_system,
296 finalize_coding_system,
297 0, 0, coding_system_description,
298 struct Lisp_Coding_System);
/* Mark method for coding-system objects.  Calls MARKOBJ on the name, doc
   string, mnemonic, the three eol subsidiaries, the type-specific Lisp
   slots and the pre-write-conversion function of OBJ.  The
   post-read-conversion function is returned instead of being passed to
   MARKOBJ (presumably so that the caller marks it -- confirm against the
   lrecord mark protocol).  */
301 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
303 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
305 markobj (CODING_SYSTEM_NAME (codesys));
306 markobj (CODING_SYSTEM_DOC_STRING (codesys));
307 markobj (CODING_SYSTEM_MNEMONIC (codesys));
308 markobj (CODING_SYSTEM_EOL_LF (codesys));
309 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
310 markobj (CODING_SYSTEM_EOL_CR (codesys));
/* Slots that exist only for particular coding system types.  */
312 switch (CODING_SYSTEM_TYPE (codesys))
/* ISO2022: the four initial charsets, then both charsets of every entry in
   the input and output conversion dynarrs.  Either dynarr pointer may be
   null, hence the checks.  */
316 case CODESYS_ISO2022:
317 for (i = 0; i < 4; i++)
318 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
319 if (codesys->iso2022.input_conv)
321 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
323 struct charset_conversion_spec *ccs =
324 Dynarr_atp (codesys->iso2022.input_conv, i);
325 markobj (ccs->from_charset);
326 markobj (ccs->to_charset);
329 if (codesys->iso2022.output_conv)
331 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
333 struct charset_conversion_spec *ccs =
334 Dynarr_atp (codesys->iso2022.output_conv, i);
335 markobj (ccs->from_charset);
336 markobj (ccs->to_charset);
/* The CCL decode and encode programs.  */
342 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
343 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
350 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
351 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding-system objects.  Writes the text
   #<coding_system NAME> to PRINTCHARFUN, where NAME is the name slot of
   OBJ printed with print_internal.  The error call reports that the object
   has no readable printed form.
   NOTE(review): the condition guarding the error call, and the remaining
   parameter of this function, are not visible here -- confirm.  */
355 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
358 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
360 error ("printing unreadable object #<coding_system 0x%x>",
363 write_c_string ("#<coding_system ", printcharfun);
364 print_internal (c->name, printcharfun, 1);
365 write_c_string (">", printcharfun);
/* Finalize method for coding-system objects.  HEADER points at the
   Lisp_Coding_System being finalized.  When FOR_DISKSAVE is zero and the
   coding system is of type ISO2022, the input and output charset
   conversion dynarrs are freed (if present) and their pointers reset to 0
   so that they cannot be freed twice through this object.  */
369 finalize_coding_system (void *header, int for_disksave)
371 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
372 /* Since coding systems never go away, this function is not
373 necessary. But it would be necessary if we changed things
374 so that coding systems could go away. */
375 if (!for_disksave) /* see comment in lstream.c */
377 switch (CODING_SYSTEM_TYPE (c))
380 case CODESYS_ISO2022:
381 if (c->iso2022.input_conv)
383 Dynarr_free (c->iso2022.input_conv);
384 c->iso2022.input_conv = 0;
386 if (c->iso2022.output_conv)
388 Dynarr_free (c->iso2022.output_conv);
389 c->iso2022.output_conv = 0;
/* Translate the Lisp value SYMBOL into an enum eol_type.
   SYMBOL is first checked with CHECK_SYMBOL.  The mapping is:
     nil   -> EOL_AUTODETECT
     Qlf   -> EOL_LF
     Qcrlf -> EOL_CRLF
     Qcr   -> EOL_CR
   Any other symbol is reported through signal_simple_error.  */
400 symbol_to_eol_type (Lisp_Object symbol)
402 CHECK_SYMBOL (symbol);
403 if (NILP (symbol)) return EOL_AUTODETECT;
404 if (EQ (symbol, Qlf)) return EOL_LF;
405 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
406 if (EQ (symbol, Qcr)) return EOL_CR;
408 signal_simple_error ("Unrecognized eol type", symbol);
409 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: return the Lisp value that stands for
   TYPE.  EOL_LF, EOL_CRLF and EOL_CR map to Qlf, Qcrlf and Qcr;
   EOL_AUTODETECT maps to Qnil.  */
413 eol_type_to_symbol (enum eol_type type)
418 case EOL_LF: return Qlf;
419 case EOL_CRLF: return Qcrlf;
420 case EOL_CR: return Qcr;
421 case EOL_AUTODETECT: return Qnil;
/* Create the three eol-specific subsidiaries of CODESYS.  For each of the
   suffixes -unix, -dos and -mac a copy of CODESYS is made under the name
   formed by appending the suffix to the name of CODESYS, its eol type is
   set to EOL_LF, EOL_CRLF or EOL_CR respectively, and the copy is stored
   in the matching eol slot of CODESYS.  The mnemonic of each copy is the
   mnemonic of CODESYS with nothing, :T or :t appended.  */
426 setup_eol_coding_systems (Lisp_Coding_System *codesys)
428 Lisp_Object codesys_obj;
429 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* The 7 spare bytes cover the longest suffix appended below (-unix, five
   characters) plus the terminating NUL written by strcpy.  */
430 char *codesys_name = (char *) alloca (len + 7);
432 char *codesys_mnemonic=0;
434 Lisp_Object codesys_name_sym, sub_codesys_obj;
438 XSETCODING_SYSTEM (codesys_obj, codesys);
440 memcpy (codesys_name,
441 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
/* A mnemonic buffer is only set up when the mnemonic is a string.
   NOTE(review): codesys_mnemonic stays null otherwise; the guard that
   keeps the macro below from using it in that case is not visible here --
   confirm.  */
443 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
445 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
446 codesys_mnemonic = (char *) alloca (mlen + 7);
447 memcpy (codesys_mnemonic,
448 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
/* One expansion per eol flavor: append the suffix to the name buffer and
   the abbreviation to the mnemonic buffer, intern the new name, copy
   CODESYS under it with Fcopy_coding_system, then set the eol type and
   mnemonic of the copy and record the copy in CODESYS.  */
451 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
452 strcpy (codesys_name + len, "-" op_sys); \
454 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
455 codesys_name_sym = intern (codesys_name); \
456 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
457 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
459 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
460 build_string (codesys_mnemonic); \
461 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
464 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
465 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
466 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp primitive coding-system-p.  Implementation: returns Qt when OBJECT
   satisfies the CODING_SYSTEMP predicate and Qnil otherwise; a symbol that
   merely names a coding system therefore yields Qnil.  */
469 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
470 Return t if OBJECT is a coding system.
471 A coding system is an object that defines how text containing multiple
472 character sets is encoded into a stream of (typically 8-bit) bytes.
473 The coding system is used to decode the stream into a series of
474 characters (which may be from multiple charsets) when the text is read
475 from a file or process, and is used to encode the text back into the
476 same format when it is written out to a file or process.
478 For example, many ISO2022-compliant coding systems (such as Compound
479 Text, which is used for inter-client data under the X Window System)
480 use escape sequences to switch between different charsets -- Japanese
481 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
482 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
483 `make-coding-system' for more information.
485 Coding systems are normally identified using a symbol, and the
486 symbol is accepted in place of the actual coding system object whenever
487 a coding system is called for. (This is similar to how faces work.)
491 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* Lisp primitive find-coding-system.  Implementation notes:
   - a coding-system object is returned unchanged;
   - a nil argument is replaced by Qbinary before the lookup;
   - anything else must pass CHECK_SYMBOL and is looked up in
     Vcoding_system_hash_table, with Qnil returned when there is no
     entry.  */
494 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
495 Retrieve the coding system of the given name.
497 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
498 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
499 If there is no such coding system, nil is returned. Otherwise the
500 associated coding system object is returned.
502 (coding_system_or_name))
504 if (CODING_SYSTEMP (coding_system_or_name))
505 return coding_system_or_name;
507 if (NILP (coding_system_or_name))
508 coding_system_or_name = Qbinary;
510 CHECK_SYMBOL (coding_system_or_name);
512 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
/* Lisp primitive get-coding-system.  Implementation: calls
   Ffind_coding_system on NAME and, when that returns nil, reports the
   failure through signal_simple_error with NAME as the offending object;
   otherwise the coding system found is returned.  */
515 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
516 Retrieve the coding system of the given name.
517 Same as `find-coding-system' except that if there is no such
518 coding system, an error is signaled instead of returning nil.
522 Lisp_Object coding_system = Ffind_coding_system (name);
524 if (NILP (coding_system))
525 signal_simple_error ("No such coding system", name);
526 return coding_system;
529 /* We store the coding systems in hash tables with the names as the key and the
530 actual coding system object as the value. Occasionally we need to use them
531 in a list format. These routines provide us with that. */
/* Closure handed to the hash-table mapper below; it carries a pointer to
   the list being accumulated.  */
532 struct coding_system_list_closure
534 Lisp_Object *coding_system_list;
/* Mapper applied to every entry of the coding-system hash table.  VALUE is
   a coding-system object; its name slot is consed onto the front of the
   list that the closure points at.  KEY is not used, so the name comes
   from the object rather than from the table key.  */
538 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
539 void *coding_system_list_closure)
541 /* This function can GC */
542 struct coding_system_list_closure *cscl =
543 (struct coding_system_list_closure *) coding_system_list_closure;
544 Lisp_Object *coding_system_list = cscl->coding_system_list;
546 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
547 *coding_system_list);
/* Lisp primitive coding-system-list.  Implementation: starts from an empty
   list, protects it with GCPRO1, and lets elisp_maphash apply
   add_coding_system_to_list_mapper to every entry of
   Vcoding_system_hash_table; the closure gives the mapper the address of
   the local list so that it can push names onto it.  */
551 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
552 Return a list of the names of all defined coding systems.
556 Lisp_Object coding_system_list = Qnil;
558 struct coding_system_list_closure coding_system_list_closure;
560 GCPRO1 (coding_system_list);
561 coding_system_list_closure.coding_system_list = &coding_system_list;
562 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
563 &coding_system_list_closure);
566 return coding_system_list;
/* Lisp primitive coding-system-name.  Implementation: resolves the
   argument with Fget_coding_system (which signals an error when nothing is
   found) and returns the name slot of the resulting object.  */
569 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
570 Return the name of the given coding system.
574 coding_system = Fget_coding_system (coding_system);
575 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a new coding-system lrecord of type TYPE named NAME.
   The record is zeroed, then the conversion functions, the three eol
   subsidiaries and the mnemonic are set to Qnil and the eol type to
   EOL_AUTODETECT.  ISO2022 coding systems also get their four initial
   charsets set to Qnil, and CCL coding systems their decode and encode
   programs.  The doc-string slot is not assigned here beyond the zeroing;
   make-coding-system and copy-coding-system fill it in afterwards.  */
578 static Lisp_Coding_System *
579 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
581 Lisp_Coding_System *codesys =
582 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
584 zero_lcrecord (codesys);
585 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
586 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
587 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
588 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
589 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
590 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
591 CODING_SYSTEM_TYPE (codesys) = type;
592 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
/* Type-specific Lisp slots.  */
594 if (type == CODESYS_ISO2022)
597 for (i = 0; i < 4; i++)
598 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
600 else if (type == CODESYS_CCL)
602 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
603 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
606 CODING_SYSTEM_NAME (codesys) = name;
612 /* Given a list of charset conversion specs as specified in a Lisp
613 program, parse it into STORE_HERE. */
/* Each element of SPEC_LIST must be a list of exactly two elements, the
   source and the destination charset.  Both are resolved with
   Fget_charset and must have the same XCHARSET_TYPE; a malformed element
   or a type mismatch is reported through signal_simple_error or
   signal_simple_error_2.  Valid pairs are appended to STORE_HERE in list
   order.  */
616 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
617 Lisp_Object spec_list)
621 EXTERNAL_LIST_LOOP (rest, spec_list)
623 Lisp_Object car = XCAR (rest);
624 Lisp_Object from, to;
625 struct charset_conversion_spec spec;
/* Accept only a proper two-element list.  */
627 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
628 signal_simple_error ("Invalid charset conversion spec", car);
629 from = Fget_charset (XCAR (car));
630 to = Fget_charset (XCAR (XCDR (car)));
631 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
632 signal_simple_error_2
633 ("Attempted conversion between different charset types",
635 spec.from_charset = from;
636 spec.to_charset = to;
638 Dynarr_add (store_here, spec);
642 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
643 specs, return the equivalent as the Lisp programmer would see it.
645 If LOAD_HERE is 0, return Qnil. */
/* Each dynarr entry becomes a two-element list holding its source and
   destination charset.  */
648 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
655 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
657 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
658 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
/* The loop pushes onto the front of RESULT, so reverse it to restore the
   order of the dynarr.  */
661 return Fnreverse (result);
/* Lisp primitive make-coding-system.  Implementation outline:
   1. TYPE is translated to an enum coding_system_type; nil and
      Qundecided both select CODESYS_AUTODETECT, and an unknown value is
      reported as an invalid coding system type.
   2. A fresh object is obtained from allocate_coding_system, and
      DOC-STRING (an empty string when nil) is checked with CHECK_STRING
      and stored.
   3. PROPS is walked as a property list.  The mnemonic, eol-type and the
      two conversion-function keys are accepted for every type; the
      remaining keys are accepted only for ISO2022 or CCL coding systems,
      and any other key is reported as an unrecognized property.  Each
      input or output charset conversion property allocates a new dynarr
      and fills it with parse_charset_conversion_specs.
   4. The -unix, -dos and -mac subsidiaries are created with
      setup_eol_coding_systems unless an eol-type property with a non-nil
      value was seen.
   5. The object is entered in Vcoding_system_hash_table under NAME.
   NOTE(review): NAME is stored and used as a hash key without a visible
   symbol check in these lines -- confirm that one exists.  */
666 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
667 Register symbol NAME as a coding system.
669 TYPE describes the conversion method used and should be one of
672 Automatic conversion. XEmacs attempts to detect the coding system
675 No conversion. Use this for binary files and such. On output,
676 graphic characters that are not in ASCII or Latin-1 will be
677 replaced by a ?. (For a no-conversion-encoded buffer, these
678 characters will only be present if you explicitly insert them.)
680 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
682 ISO 10646 UCS-4 encoding.
684 ISO 10646 UTF-8 encoding.
686 Any ISO2022-compliant encoding. Among other things, this includes
687 JIS (the Japanese encoding commonly used for e-mail), EUC (the
688 standard Unix encoding for Japanese and other languages), and
689 Compound Text (the encoding used in X11). You can specify more
690 specific information about the conversion with the FLAGS argument.
692 Big5 (the encoding commonly used for Taiwanese).
694 The conversion is performed using a user-written pseudo-code
695 program. CCL (Code Conversion Language) is the name of this
698 Write out or read in the raw contents of the memory representing
699 the buffer's text. This is primarily useful for debugging
700 purposes, and is only enabled when XEmacs has been compiled with
701 DEBUG_XEMACS defined (via the --debug configure option).
702 WARNING: Reading in a file using 'internal conversion can result
703 in an internal inconsistency in the memory representing a
704 buffer's text, which will produce unpredictable results and may
705 cause XEmacs to crash. Under normal circumstances you should
706 never use 'internal conversion.
708 DOC-STRING is a string describing the coding system.
710 PROPS is a property list, describing the specific nature of the
711 character set. Recognized properties are:
714 String to be displayed in the modeline when this coding system is
718 End-of-line conversion to be used. It should be one of
721 Automatically detect the end-of-line type (LF, CRLF,
722 or CR). Also generate subsidiary coding systems named
723 `NAME-unix', `NAME-dos', and `NAME-mac', that are
724 identical to this coding system but have an EOL-TYPE
725 value of 'lf, 'crlf, and 'cr, respectively.
727 The end of a line is marked externally using ASCII LF.
728 Since this is also the way that XEmacs represents an
729 end-of-line internally, specifying this option results
730 in no end-of-line conversion. This is the standard
731 format for Unix text files.
733 The end of a line is marked externally using ASCII
734 CRLF. This is the standard format for MS-DOS text
737 The end of a line is marked externally using ASCII CR.
738 This is the standard format for Macintosh text files.
740 Automatically detect the end-of-line type but do not
741 generate subsidiary coding systems. (This value is
742 converted to nil when stored internally, and
743 `coding-system-property' will return nil.)
745 'post-read-conversion
746 Function called after a file has been read in, to perform the
747 decoding. Called with two arguments, BEG and END, denoting
748 a region of the current buffer to be decoded.
750 'pre-write-conversion
751 Function called before a file is written out, to perform the
752 encoding. Called with two arguments, BEG and END, denoting
753 a region of the current buffer to be encoded.
756 The following additional properties are recognized if TYPE is 'iso2022:
762 The character set initially designated to the G0 - G3 registers.
763 The value should be one of
765 -- A charset object (designate that character set)
766 -- nil (do not ever use this register)
767 -- t (no character set is initially designated to
768 the register, but may be later on; this automatically
769 sets the corresponding `force-g*-on-output' property)
775 If non-nil, send an explicit designation sequence on output before
776 using the specified register.
779 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
780 "ESC $ B" on output in place of the full designation sequences
781 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
784 If non-nil, don't designate ASCII to G0 at each end of line on output.
785 Setting this to non-nil also suppresses other state-resetting that
786 normally happens at the end of a line.
789 If non-nil, don't designate ASCII to G0 before control chars on output.
792 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
796 If non-nil, use locking-shift (SO/SI) instead of single-shift
797 or designation by escape sequence.
800 If non-nil, don't use ISO6429's direction specification.
803 If non-nil, literal control characters that are the same as
804 the beginning of a recognized ISO2022 or ISO6429 escape sequence
805 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
806 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
807 so that they can be properly distinguished from an escape sequence.
808 (Note that doing this results in a non-portable encoding.) This
809 encoding flag is used for byte-compiled files. Note that ESC
810 is a good choice for a quoting character because there are no
811 escape sequences whose second byte is a character from the Control-0
812 or Control-1 character sets; this is explicitly disallowed by the
815 'input-charset-conversion
816 A list of conversion specifications, specifying conversion of
817 characters in one charset to another when decoding is performed.
818 Each specification is a list of two elements: the source charset,
819 and the destination charset.
821 'output-charset-conversion
822 A list of conversion specifications, specifying conversion of
823 characters in one charset to another when encoding is performed.
824 The form of each specification is the same as for
825 'input-charset-conversion.
828 The following additional properties are recognized (and required)
832 CCL program used for decoding (converting to internal format).
835 CCL program used for encoding (converting to external format).
837 (name, type, doc_string, props))
839 Lisp_Coding_System *codesys;
840 Lisp_Object rest, key, value;
841 enum coding_system_type ty;
842 int need_to_setup_eol_systems = 1;
844 /* Convert type to constant */
845 if (NILP (type) || EQ (type, Qundecided))
846 { ty = CODESYS_AUTODETECT; }
848 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
849 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
850 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
851 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
852 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
853 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
855 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
857 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
860 signal_simple_error ("Invalid coding system type", type);
864 codesys = allocate_coding_system (ty, name);
866 if (NILP (doc_string))
867 doc_string = build_string ("");
869 CHECK_STRING (doc_string);
870 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
872 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
874 if (EQ (key, Qmnemonic))
877 CHECK_STRING (value);
878 CODING_SYSTEM_MNEMONIC (codesys) = value;
881 else if (EQ (key, Qeol_type))
883 need_to_setup_eol_systems = NILP (value);
886 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
889 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
890 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
892 else if (ty == CODESYS_ISO2022)
894 #define FROB_INITIAL_CHARSET(charset_num) \
895 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
896 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
898 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
899 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
900 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
901 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
903 #define FROB_FORCE_CHARSET(charset_num) \
904 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
906 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
907 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
908 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
909 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
911 #define FROB_BOOLEAN_PROPERTY(prop) \
912 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
914 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
915 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
916 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
917 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
918 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
919 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
920 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
922 else if (EQ (key, Qinput_charset_conversion))
924 codesys->iso2022.input_conv =
925 Dynarr_new (charset_conversion_spec);
926 parse_charset_conversion_specs (codesys->iso2022.input_conv,
929 else if (EQ (key, Qoutput_charset_conversion))
931 codesys->iso2022.output_conv =
932 Dynarr_new (charset_conversion_spec);
933 parse_charset_conversion_specs (codesys->iso2022.output_conv,
937 signal_simple_error ("Unrecognized property", key);
939 else if (EQ (type, Qccl))
941 if (EQ (key, Qdecode))
943 CHECK_VECTOR (value);
944 CODING_SYSTEM_CCL_DECODE (codesys) = value;
946 else if (EQ (key, Qencode))
948 CHECK_VECTOR (value);
949 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
952 signal_simple_error ("Unrecognized property", key);
956 signal_simple_error ("Unrecognized property", key);
959 if (need_to_setup_eol_systems)
960 setup_eol_coding_systems (codesys);
963 Lisp_Object codesys_obj;
964 XSETCODING_SYSTEM (codesys_obj, codesys);
965 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* Lisp primitive copy-coding-system.  Implementation outline:
   - OLD-CODING-SYSTEM is resolved with Fget_coding_system, so an unknown
     name signals an error;
   - NEW-NAME is looked up with Ffind_coding_system; when nothing is found
     a coding system of the same type is allocated and entered in
     Vcoding_system_hash_table under NEW-NAME;
   - everything after the lrecord header is then copied from the old
     object to the new one with memcpy.
   NOTE(review): the memcpy also copies the name slot and the ISO2022
   input_conv and output_conv dynarr pointers.  Confirm that the name is
   reset to NEW-NAME afterwards, and that sharing the dynarrs between both
   objects is safe given that finalize_coding_system frees them per
   object.  */
970 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
971 Copy OLD-CODING-SYSTEM to NEW-NAME.
972 If NEW-NAME does not name an existing coding system, a new one will
975 (old_coding_system, new_name))
977 Lisp_Object new_coding_system;
978 old_coding_system = Fget_coding_system (old_coding_system);
979 new_coding_system = Ffind_coding_system (new_name);
980 if (NILP (new_coding_system))
982 XSETCODING_SYSTEM (new_coding_system,
983 allocate_coding_system
984 (XCODING_SYSTEM_TYPE (old_coding_system),
986 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
990 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
991 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
992 memcpy (((char *) to ) + sizeof (to->header),
993 ((char *) from) + sizeof (from->header),
994 sizeof (*from) - sizeof (from->header));
997 return new_coding_system;
/* Lisp primitive define-coding-system-alias.  Implementation outline:
   ALIAS must be a symbol that does not already name a coding system;
   CODING-SYSTEM is resolved with Fget_coding_system and entered in
   Vcoding_system_hash_table under ALIAS, so the alias maps to the very
   same object.  When the coding system has eol type EOL_AUTODETECT, the
   function calls itself to alias its non-nil subsidiaries under the alias
   name with the matching suffix appended (for example -dos for the CRLF
   subsidiary).  */
1000 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1001 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1003 (alias, coding_system))
1005 CHECK_SYMBOL (alias);
1006 if (!NILP (Ffind_coding_system (alias)))
1007 signal_simple_error ("Symbol already names a coding system", alias);
1008 coding_system = Fget_coding_system (coding_system);
1009 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1011 /* Set up aliases for subsidiaries. */
1012 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1015 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1016 #define FROB(type, name) \
1018 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1019 if (!NILP (subsidiary)) \
1020 Fdefine_coding_system_alias \
1021 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1024 FROB (CRLF, "-dos");
1028 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1029 but it doesn't look intentional, so I'd rather return something
1030 meaningful or nothing at all. */
/* Return the variant of CODING_SYSTEM that corresponds to end-of-line
   convention TYPE.

   CODING_SYSTEM itself is returned when its own EOL type is not
   EOL_AUTODETECT, when TYPE is EOL_AUTODETECT, or when the requested
   subsidiary slot (EOL_LF / EOL_CR / EOL_CRLF) is nil.  */
1035 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1037 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1038 Lisp_Object new_coding_system;
/* A coding system with a fixed EOL type is returned as is. */
1040 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1041 return coding_system;
1045 case EOL_AUTODETECT: return coding_system;
1046 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1047 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1048 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
/* Fall back to the parent when the chosen subsidiary slot is nil. */
1052 return NILP (new_coding_system) ? coding_system : new_coding_system;
/* Lisp-level wrapper around subsidiary_coding_system(): CODING-SYSTEM
   is passed through Fget_coding_system() and EOL-TYPE is converted
   from a symbol with symbol_to_eol_type().  */
1055 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1056 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1058 (coding_system, eol_type))
1060 coding_system = Fget_coding_system (coding_system);
1062 return subsidiary_coding_system (coding_system,
1063 symbol_to_eol_type (eol_type));
1067 /************************************************************************/
1068 /* Coding system accessors */
1069 /************************************************************************/
/* Accessor: passes CODING-SYSTEM through Fget_coding_system() and
   returns the value read by XCODING_SYSTEM_DOC_STRING.  */
1071 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1072 Return the doc string for CODING-SYSTEM.
1076 coding_system = Fget_coding_system (coding_system);
1077 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* Accessor: maps the internal coding system type (a CODESYS_* value)
   to the corresponding type symbol, e.g. CODESYS_AUTODETECT ->
   Qundecided and CODESYS_NO_CONVERSION -> Qno_conversion.  */
1080 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1081 Return the type of CODING-SYSTEM.
1085 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1088 case CODESYS_AUTODETECT: return Qundecided;
1090 case CODESYS_SHIFT_JIS: return Qshift_jis;
1091 case CODESYS_ISO2022: return Qiso2022;
1092 case CODESYS_BIG5: return Qbig5;
1093 case CODESYS_UCS4: return Qucs4;
1094 case CODESYS_UTF8: return Qutf8;
1095 case CODESYS_CCL: return Qccl;
1097 case CODESYS_NO_CONVERSION: return Qno_conversion;
1099 case CODESYS_INTERNAL: return Qinternal;
/* Return the name of the initial charset recorded for register GNUM of
   the ISO2022 coding system CODING_SYSTEM, or Qnil when the stored
   value is not a charset object.  */
1106 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1109 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1111 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp-level wrapper around coding_system_charset(): CODING-SYSTEM is
   passed through Fget_coding_system() and GNUM is unboxed with XINT.
   NOTE(review): presumably GNUM is type-checked before the XINT --
   confirm.  */
1114 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1115 Return initial charset of CODING-SYSTEM designated to GNUM.
1118 (coding_system, gnum))
1120 coding_system = Fget_coding_system (coding_system);
1123 return coding_system_charset (coding_system, XINT (gnum));
/* Implementation notes: PROP is first looked up in
   the_codesys_prop_dynarr.  A property classed as ISO2022-only or
   CCL-only is rejected for a coding system of another type (see the
   "Property only valid in ..." messages), and a property that is not
   accepted at all is reported as "Unrecognized property".  The value
   is then selected by comparing PROP against the known property
   symbols: the generic properties first, then -- for ISO2022 coding
   systems -- the initial charsets, force-on-output flags, boolean
   flags and charset conversion specs, and -- for CCL coding systems
   -- the decode and encode programs.  */
1127 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1128 Return the PROP property of CODING-SYSTEM.
1130 (coding_system, prop))
1133 enum coding_system_type type;
1135 coding_system = Fget_coding_system (coding_system);
1136 CHECK_SYMBOL (prop);
1137 type = XCODING_SYSTEM_TYPE (coding_system);
1139 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1140 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1143 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1145 case CODESYS_PROP_ALL_OK:
1148 case CODESYS_PROP_ISO2022:
1149 if (type != CODESYS_ISO2022)
1151 ("Property only valid in ISO2022 coding systems",
1155 case CODESYS_PROP_CCL:
1156 if (type != CODESYS_CCL)
1158 ("Property only valid in CCL coding systems",
1168 signal_simple_error ("Unrecognized property", prop);
1170 if (EQ (prop, Qname))
1171 return XCODING_SYSTEM_NAME (coding_system);
1172 else if (EQ (prop, Qtype))
1173 return Fcoding_system_type (coding_system);
1174 else if (EQ (prop, Qdoc_string))
1175 return XCODING_SYSTEM_DOC_STRING (coding_system);
1176 else if (EQ (prop, Qmnemonic))
1177 return XCODING_SYSTEM_MNEMONIC (coding_system);
1178 else if (EQ (prop, Qeol_type))
1179 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1180 else if (EQ (prop, Qeol_lf))
1181 return XCODING_SYSTEM_EOL_LF (coding_system);
1182 else if (EQ (prop, Qeol_crlf))
1183 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1184 else if (EQ (prop, Qeol_cr))
1185 return XCODING_SYSTEM_EOL_CR (coding_system);
1186 else if (EQ (prop, Qpost_read_conversion))
1187 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1188 else if (EQ (prop, Qpre_write_conversion))
1189 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1191 else if (type == CODESYS_ISO2022)
1193 if (EQ (prop, Qcharset_g0))
1194 return coding_system_charset (coding_system, 0);
1195 else if (EQ (prop, Qcharset_g1))
1196 return coding_system_charset (coding_system, 1);
1197 else if (EQ (prop, Qcharset_g2))
1198 return coding_system_charset (coding_system, 2);
1199 else if (EQ (prop, Qcharset_g3))
1200 return coding_system_charset (coding_system, 3);
1202 #define FORCE_CHARSET(charset_num) \
1203 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1204 (coding_system, charset_num) ? Qt : Qnil)
1206 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1207 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1208 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1209 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1211 #define LISP_BOOLEAN(prop) \
1212 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1214 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1215 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1216 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1217 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1218 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1219 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1220 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1222 else if (EQ (prop, Qinput_charset_conversion))
1224 unparse_charset_conversion_specs
1225 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1226 else if (EQ (prop, Qoutput_charset_conversion))
1228 unparse_charset_conversion_specs
1229 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1233 else if (type == CODESYS_CCL)
1235 if (EQ (prop, Qdecode))
1236 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1237 else if (EQ (prop, Qencode))
1238 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1246 return Qnil; /* not reached */
1250 /************************************************************************/
1251 /* Coding category functions */
1252 /************************************************************************/
/* Translate the coding category symbol SYMBOL into a category index by
   scanning coding_category_symbol[] over 0..CODING_CATEGORY_LAST.
   SYMBOL must be a symbol; when no table entry matches,
   "Unrecognized coding category" is signalled.
   NOTE(review): presumably the index of the matching entry is returned
   -- confirm.  */
1255 decode_coding_category (Lisp_Object symbol)
1259 CHECK_SYMBOL (symbol);
1260 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1261 if (EQ (coding_category_symbol[i], symbol))
1264 signal_simple_error ("Unrecognized coding category", symbol);
1265 return 0; /* not reached */
/* Implementation note: the list is built by consing while walking
   coding_category_symbol[] from index CODING_CATEGORY_LAST down to 0,
   so the resulting list is in ascending index order.  */
1268 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1269 Return a list of all recognized coding categories.
1274 Lisp_Object list = Qnil;
1276 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1277 list = Fcons (coding_category_symbol[i], list);
/* Implementation notes: category_to_priority[] maps each category
   index to its new priority, with -1 meaning "not assigned yet".
   Categories named in LIST receive consecutive priorities in list
   order; naming a category twice signals "Duplicate coding category
   in list".  The categories that remain unassigned are then numbered
   in the order in which they currently appear in
   coding_category_by_priority[].  Finally the mapping is inverted and
   stored back into coding_category_by_priority[].  */
1281 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1282 Change the priority order of the coding categories.
1283 LIST should be list of coding categories, in descending order of
1284 priority. Unspecified coding categories will be lower in priority
1285 than all specified ones, in the same relative order they were in
1290 int category_to_priority[CODING_CATEGORY_LAST + 1];
1294 /* First generate a list that maps coding categories to priorities. */
1296 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1297 category_to_priority[i] = -1;
1299 /* Highest priority comes from the specified list. */
1301 EXTERNAL_LIST_LOOP (rest, list)
1303 int cat = decode_coding_category (XCAR (rest));
1305 if (category_to_priority[cat] >= 0)
1306 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1307 category_to_priority[cat] = i++;
1310 /* Now go through the existing categories by priority to retrieve
1311 the categories not yet specified and preserve their priority
1313 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1315 int cat = coding_category_by_priority[j];
1316 if (category_to_priority[cat] < 0)
1317 category_to_priority[cat] = i++;
1320 /* Now we need to construct the inverse of the mapping we just
1323 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1324 coding_category_by_priority[category_to_priority[i]] = i;
1326 /* Phew! That was confusing. */
/* Implementation note: the list is built by consing while walking
   coding_category_by_priority[] from index CODING_CATEGORY_LAST down
   to 0, so the category at index 0 ends up first in the result.  */
1330 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1331 Return a list of coding categories in descending order of priority.
1336 Lisp_Object list = Qnil;
1338 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1339 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
/* Implementation note: CODING-CATEGORY is converted to an index with
   decode_coding_category(), CODING-SYSTEM is passed through
   Fget_coding_system(), and the result is stored in
   coding_category_system[].  */
1344 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1345 Change the coding system associated with a coding category.
1347 (coding_category, coding_system))
1349 int cat = decode_coding_category (coding_category);
1351 coding_system = Fget_coding_system (coding_system);
1352 coding_category_system[cat] = coding_system;
/* Implementation note: reads the coding_category_system[] entry for
   the decoded category and returns that coding system's name.
   NOTE(review): presumably an unset (nil) entry is handled before
   XCODING_SYSTEM_NAME is applied -- confirm.  */
1356 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1357 Return the coding system associated with a coding category.
1361 int cat = decode_coding_category (coding_category);
1362 Lisp_Object sys = coding_category_system[cat];
1365 return XCODING_SYSTEM_NAME (sys);
1370 /************************************************************************/
1371 /* Detecting the encoding of data */
1372 /************************************************************************/
/* State that is carried from one call of the detection routines to the
   next.  detect_coding_type() reads and updates the EOL type, the
   overall category mask and the per-encoding sub-states stored here.  */
1374 struct detection_state
1376 enum eol_type eol_type;
1412 struct iso2022_decoder iso;
1414 int high_byte_count;
1415 unsigned int saw_single_shift:1;
/* Predicate on the byte value C, used by detect_coding_type() for bytes
   below 0x20: a control character that is not accepted here makes the
   text count as non-ASCII.  The values listed below are the accepted
   ones.  */
1428 acceptable_control_char_p (int c)
1432 /* Allow and ignore control characters that you might
1433 reasonably see in a text file */
1438 case 8: /* backspace */
1439 case 11: /* vertical tab */
1440 case 12: /* form feed */
1441 case 26: /* MS-DOS C-z junk */
1442 case 31: /* '^_' -- for info */
/* Return non-zero when MASK has no bit or exactly one bit set.
   MASK & (MASK - 1) clears the lowest set bit of MASK, so the result
   is zero exactly when there was at most one bit to begin with.  */
1450 mask_has_at_most_one_bit_p (int mask)
1452 /* Perhaps the only thing useful you learn from intensive Microsoft
1453 technical interviews */
1454 return (mask & (mask - 1)) == 0;
/* Examine the bytes at SRC for an end-of-line convention.  The state
   in ST->eol persists between calls: `just_saw_cr' records that the
   previous byte was a CR and `seen_anything' records that at least
   one byte has been examined.  EOL_AUTODETECT is returned when no
   decision has been reached.
   NOTE(review): confirm which branch yields which definite EOL type.  */
1457 static enum eol_type
1458 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1467 st->eol.just_saw_cr = 1;
1472 if (st->eol.just_saw_cr)
1474 else if (st->eol.seen_anything)
1477 else if (st->eol.just_saw_cr)
1479 st->eol.just_saw_cr = 0;
1481 st->eol.seen_anything = 1;
1484 return EOL_AUTODETECT;
1487 /* Attempt to determine the encoding and EOL type of the given text.
1488 Before calling this function for the first time, you must initialize
1489 st->eol_type as appropriate and initialize st->mask to ~0.
1491 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1494 st->mask holds the determined coding category mask, or ~0 if only
1495 ASCII has been seen so far.
1499 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1500 is present in st->mask
1501 1 == definitive answers are here for both st->eol_type and st->mask
1505 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1506 unsigned int n, int just_do_eol)
/* EOL detection only runs while the EOL type is still undetermined. */
1510 if (st->eol_type == EOL_AUTODETECT)
1511 st->eol_type = detect_eol_type (st, src, n);
/* NOTE(review): presumably this early return is taken when JUST_DO_EOL
   is set -- confirm. */
1514 return st->eol_type != EOL_AUTODETECT;
/* Until a non-ASCII byte or an unacceptable control character has been
   seen, scan for one.  Seeing it sets the per-encoding masks to ~0,
   which enables the detectors below. */
1516 if (!st->seen_non_ascii)
1518 for (; n; n--, src++)
1521 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1523 st->seen_non_ascii = 1;
1525 st->shift_jis.mask = ~0;
1529 st->iso2022.mask = ~0;
/* Each detector is run only while its mask still has more than one
   bit set. */
1539 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1540 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1541 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1542 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1543 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1544 st->big5.mask = detect_coding_big5 (st, src, n);
1545 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1546 st->utf8.mask = detect_coding_utf8 (st, src, n);
1547 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1548 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
/* The overall result is the union of the per-detector masks. */
1551 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1552 | st->utf8.mask | st->ucs4.mask;
/* The "definitive answer" test is made before the no-conversion bit is
   OR-ed into the mask. */
1555 int retval = mask_has_at_most_one_bit_p (st->mask);
1556 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1557 return retval && st->eol_type != EOL_AUTODETECT;
/* Choose a coding system for the coding category mask MASK as produced
   by detect_coding_type().  The first branch starts from the default
   `buffer-file-coding-system' and uses `raw-text' as a fallback; the
   second walks the categories by priority and returns the system of
   the first category that is both present in MASK and has a non-nil
   coding system, with `raw-text' as the final fallback.  */
1562 coding_system_from_mask (int mask)
1566 /* If the file was entirely or basically ASCII, use the
1567 default value of `buffer-file-coding-system'. */
1568 Lisp_Object retval =
1569 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1572 retval = Ffind_coding_system (retval);
1576 (Qbad_variable, Qwarning,
1577 "Invalid `default-buffer-file-coding-system', set to nil");
1578 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1582 retval = Fget_coding_system (Qraw_text);
1590 mask = postprocess_iso2022_mask (mask);
1592 /* Look through the coding categories by priority and find
1593 the first one that is allowed. */
1594 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1596 cat = coding_category_by_priority[i];
1597 if ((mask & (1 << cat)) &&
1598 !NILP (coding_category_system[cat]))
1602 return coding_category_system[cat];
/* Nothing suitable was found above. */
1604 return Fget_coding_system (Qraw_text);
1608 /* Given a seekable read stream and potential coding system and EOL type
1609 as specified, do any autodetection that is called for. If the
1610 coding system and/or EOL type are not autodetect, they will be left
1611 alone; but this function will never return an autodetect coding system
1614 This function does not automatically fetch subsidiary coding systems;
1615 that should be unnecessary with the explicit eol-type argument. */
/* Both *CODESYS_IN_OUT and *EOL_TYPE_IN_OUT are updated in place, and
   STREAM is rewound before returning. */
1618 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1619 enum eol_type *eol_type_in_out)
1621 struct detection_state decst;
1623 if (*eol_type_in_out == EOL_AUTODETECT)
1624 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1627 decst.eol_type = *eol_type_in_out;
1630 /* If autodetection is called for, do it now. */
1631 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1632 *eol_type_in_out == EOL_AUTODETECT)
1634 unsigned char random_buffer[4096];
1636 Lisp_Object coding_system = Qnil;
1638 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
/* Look for the literal text "coding:" in the first chunk.  The token
   that follows it, after optional spaces and tabs and up to the next
   space, tab or ';', is tried as a coding system name. */
1641 unsigned char *cp = random_buffer;
1643 while (cp < random_buffer + nread)
1645 if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1646 (*cp++ == 'o') && (cp < random_buffer + nread) &&
1647 (*cp++ == 'd') && (cp < random_buffer + nread) &&
1648 (*cp++ == 'i') && (cp < random_buffer + nread) &&
1649 (*cp++ == 'n') && (cp < random_buffer + nread) &&
1650 (*cp++ == 'g') && (cp < random_buffer + nread) &&
1651 (*cp++ == ':') && (cp < random_buffer + nread))
1653 unsigned char coding_system_name[4096 - 6];
1654 unsigned char *np = coding_system_name;
1656 while ( (cp < random_buffer + nread)
1657 && ((*cp == ' ') || (*cp == '\t')) )
1661 while ( (cp < random_buffer + nread) &&
1662 (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1668 = Ffind_coding_system (intern (coding_system_name));
/* No coding system was named in the text: fall back to
   detect_coding_type(), reading further chunks from the stream. */
1672 if (EQ(coding_system, Qnil))
1674 if (detect_coding_type (&decst, random_buffer, nread,
1675 XCODING_SYSTEM_TYPE (*codesys_in_out)
1676 != CODESYS_AUTODETECT))
1678 nread = Lstream_read (stream,
1679 random_buffer, sizeof (random_buffer));
1684 *eol_type_in_out = decst.eol_type;
1685 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1687 if (EQ(coding_system, Qnil))
1688 *codesys_in_out = coding_system_from_mask (decst.mask);
1690 *codesys_in_out = coding_system;
1693 /* If we absolutely can't determine the EOL type, just assume LF. */
1694 if (*eol_type_in_out == EOL_AUTODETECT)
1695 *eol_type_in_out = EOL_LF;
/* Rewind the stream now that the sampled data has been consumed. */
1697 Lstream_rewind (stream);
/* Implementation notes: the region is read through a Lisp buffer input
   stream wrapped in an encoding stream that uses the `binary' coding
   system, in chunks of up to 4096 bytes that are fed to
   detect_coding_type().  If the mask is still ~0 afterwards, the
   result is derived from the `undecided' coding system; otherwise the
   result list is consed up by walking coding_category_by_priority[]
   from the last index to the first, taking for each category present
   in the mask its coding system (replaced by the subsidiary for the
   detected EOL type when non-nil).  Both streams are deleted before
   returning.  */
1700 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1701 Detect coding system of the text in the region between START and END.
1702 Return a list of possible coding systems ordered by priority.
1703 If only ASCII characters are found, it returns 'undecided or one of
1704 its subsidiary coding systems according to a detected end-of-line
1705 type. Optional arg BUFFER defaults to the current buffer.
1707 (start, end, buffer))
1709 Lisp_Object val = Qnil;
1710 struct buffer *buf = decode_buffer (buffer, 0);
1712 Lisp_Object instream, lb_instream;
1713 Lstream *istr, *lb_istr;
1714 struct detection_state decst;
1715 struct gcpro gcpro1, gcpro2;
1717 get_buffer_range_char (buf, start, end, &b, &e, 0);
1718 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1719 lb_istr = XLSTREAM (lb_instream);
1720 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1721 istr = XLSTREAM (instream);
1722 GCPRO2 (instream, lb_instream);
1724 decst.eol_type = EOL_AUTODETECT;
1728 unsigned char random_buffer[4096];
1729 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1733 if (detect_coding_type (&decst, random_buffer, nread, 0))
1737 if (decst.mask == ~0)
1738 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1746 decst.mask = postprocess_iso2022_mask (decst.mask);
1748 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1750 int sys = coding_category_by_priority[i];
1751 if (decst.mask & (1 << sys))
1753 Lisp_Object codesys = coding_category_system[sys];
1754 if (!NILP (codesys))
1755 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1756 val = Fcons (codesys, val);
1760 Lstream_close (istr);
1762 Lstream_delete (istr);
1763 Lstream_delete (lb_istr);
1768 /************************************************************************/
1769 /* Converting to internal Mule format ("decoding") */
1770 /************************************************************************/
1772 /* A decoding stream is a stream used for decoding text (i.e.
1773 converting from some external format to internal format).
1774 The decoding-stream object keeps track of the actual coding
1775 stream, the stream that is at the other end, and data that
1776 needs to be persistent across the lifetime of the stream. */
1778 /* Handle the EOL stuff related to just-read-in character C.
1779 EOL_TYPE is the EOL type of the coding stream.
1780 FLAGS is the current value of FLAGS in the coding stream, and may
1781 be modified by this macro. (The macro only looks at the
1782 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1783 bytes are to be written. You need to also define a local goto
1784 label "label_continue_loop" that is at the end of the main
1785 character-reading loop.
1787 If C is a CR character, then this macro handles it entirely and
1788 jumps to label_continue_loop. Otherwise, this macro does not add
1789 anything to DST, and continues normally. You should continue
1790 processing C normally after this macro.

   EOL_TYPE and FLAGS are expanded without parentheses and more than
   once, so pass plain variables rather than expressions with side
   effects. */
1792 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1796 if (eol_type == EOL_CR) \
1797 Dynarr_add (dst, '\n'); \
1798 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1799 Dynarr_add (dst, c); \
1801 flags |= CODING_STATE_CR; \
1802 goto label_continue_loop; \
1804 else if (flags & CODING_STATE_CR) \
1805 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1807 Dynarr_add (dst, '\r'); \
1808 flags &= ~CODING_STATE_CR; \
1812 /* C should be a binary character in the range 0 - 255; convert
1813 to internal format and add to Dynarr DST. */
/* This definition passes an ASCII byte through unchanged and emits any
   other byte as two bytes: 0xc0 | (c >> 6) followed by
   0x80 | (c & 0x3f). */
1816 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1818 if (BYTE_ASCII_P (c)) \
1819 Dynarr_add (dst, c); \
1822 Dynarr_add (dst, (c >> 6) | 0xc0); \
1823 Dynarr_add (dst, (c & 0x3f) | 0x80); \
/* Append the character code C to DST as a variable-length byte
   sequence: a lead byte that carries the top bits of C, followed by
   continuation bytes of the form 0x80 | (six bits of C).  Codes up to
   0x7ff take two bytes, up to 0xffff three, up to 0x1fffff four, up
   to 0x3ffffff five, and anything larger six.
   NOTE(review): presumably the first branch, which adds C as a single
   byte, is guarded by an ASCII-range test -- confirm.  */
1828 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1832 Dynarr_add (dst, c);
1834 else if ( c <= 0x7ff )
1836 Dynarr_add (dst, (c >> 6) | 0xc0);
1837 Dynarr_add (dst, (c & 0x3f) | 0x80);
1839 else if ( c <= 0xffff )
1841 Dynarr_add (dst, (c >> 12) | 0xe0);
1842 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1843 Dynarr_add (dst, (c & 0x3f) | 0x80);
1845 else if ( c <= 0x1fffff )
1847 Dynarr_add (dst, (c >> 18) | 0xf0);
1848 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1849 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1850 Dynarr_add (dst, (c & 0x3f) | 0x80);
1852 else if ( c <= 0x3ffffff )
1854 Dynarr_add (dst, (c >> 24) | 0xf8);
1855 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1856 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1857 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1858 Dynarr_add (dst, (c & 0x3f) | 0x80);
1862 Dynarr_add (dst, (c >> 30) | 0xfc);
1863 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1864 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1865 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1866 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1867 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* This definition passes an ASCII byte through unchanged, emits a C1
   byte as LEADING_BYTE_CONTROL_1 followed by c + 0x20, and emits any
   other byte as LEADING_BYTE_LATIN_ISO8859_1 followed by the byte
   itself.
   NOTE(review): presumably selected by a preprocessor conditional as
   the alternative to the other DECODE_ADD_BINARY_CHAR -- confirm. */
1871 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1873 if (BYTE_ASCII_P (c)) \
1874 Dynarr_add (dst, c); \
1875 else if (BYTE_C1_P (c)) \
1877 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1878 Dynarr_add (dst, c + 0x20); \
1882 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1883 Dynarr_add (dst, c); \
/* Emit the pending partial character CH through
   DECODE_ADD_BINARY_CHAR.  Note that the expansion refers to a
   variable named `dst' that is not a macro parameter, so one must be
   in scope wherever this macro is used. */
1888 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1892 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* End-of-input handling for the decoders.  When CODING_STATE_END is
   set in FLAGS, the pending partial character CH is emitted via
   DECODE_OUTPUT_PARTIAL_CHAR, and a '\r' is added to DST if
   CODING_STATE_CR is still set. */
1897 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1899 if (flags & CODING_STATE_END) \
1901 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1902 if (flags & CODING_STATE_CR) \
1903 Dynarr_add (dst, '\r'); \
/* Access the `decoding' type data of an Lstream; the functions below
   assign the result to a struct decoding_stream pointer. */
1907 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Per-stream data of a decoding lstream (see DECODING_STREAM_DATA). */
1909 struct decoding_stream
1911 /* Coding system that governs the conversion. */
1912 Lisp_Coding_System *codesys;
1914 /* Stream that we read the encoded data from or
1915 write the decoded data to. */
1918 /* If we are reading, then we can return only a fixed amount of
1919 data, so if the conversion resulted in too much data, we store it
1920 here for retrieval the next time around. */
1921 unsigned_char_dynarr *runoff;
1923 /* FLAGS holds flags indicating the current state of the decoding.
1924 Some of these flags are dependent on the coding system. */
1927 /* CH holds a partially built-up character. Since we only deal
1928 with one- and two-byte characters at the moment, we only use
1929 this to store the first byte of a two-byte character. */
1932 /* EOL_TYPE specifies the type of end-of-line conversion that
1933 currently applies. We need to keep this separate from the
1934 EOL type stored in CODESYS because the latter might indicate
1935 automatic EOL-type detection while the former will always
1936 indicate a particular EOL type. */
1937 enum eol_type eol_type;
1939 /* Additional ISO2022 information. We define the structure above
1940 because it's also needed by the detection routines. */
1941 struct iso2022_decoder iso2022;
1943 /* Additional information (the state of the running CCL program)
1944 used by the CCL decoder. */
1945 struct ccl_program ccl;
1947 /* counter for UTF-8 or UCS-4 */
1948 unsigned char counter;
/* Detection state used by mule_decode() when the coding system or EOL
   type has to be detected while the data is being decoded. */
1950 struct detection_state decst;
/* Forward declarations of the functions that implement decoding
   streams, followed by the definition of the "decoding" lstream type,
   whose per-stream data is a struct decoding_stream. */
1953 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1954 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1955 static int decoding_rewinder (Lstream *stream);
1956 static int decoding_seekable_p (Lstream *stream);
1957 static int decoding_flusher (Lstream *stream);
1958 static int decoding_closer (Lstream *stream);
1960 static Lisp_Object decoding_marker (Lisp_Object stream,
1961 void (*markobj) (Lisp_Object));
1963 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1964 sizeof (struct decoding_stream));
/* Marker function for decoding streams.  The only thing followed from
   here is the stream at the other end: when its implementation has a
   marker function, that function is called on it and its result is
   returned. */
1967 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1969 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1970 Lisp_Object str_obj;
1972 /* We do not need to mark the coding systems or charsets stored
1973 within the stream because they are stored in a global list
1974 and automatically marked. */
1976 XSETLSTREAM (str_obj, str);
1978 if (str->imp->marker)
1979 return (str->imp->marker) (str_obj, markobj);
1984 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1985 so we read data from the other end, decode it, and store it into DATA. */
/* Returns the number of bytes stored into DATA.  When that number is
   zero, the result is -1 if error_occurred is set and 0 otherwise. */
1988 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1990 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1991 unsigned char *orig_data = data;
1993 int error_occurred = 0;
1995 /* We need to interface to mule_decode(), which expects to take some
1996 amount of data and store the result into a Dynarr. We have
1997 mule_decode() store into str->runoff, and take data from there
2000 /* We loop until we have enough data, reading chunks from the other
2001 end and decoding it. */
2004 /* Take data from the runoff if we can. Make sure to take at
2005 most SIZE bytes, and delete the data from the runoff. */
2006 if (Dynarr_length (str->runoff) > 0)
2008 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2009 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2010 Dynarr_delete_many (str->runoff, 0, chunk);
2016 break; /* No more room for data */
2018 if (str->flags & CODING_STATE_END)
2019 /* This means that on the previous iteration, we hit the EOF on
2020 the other end. We loop once more so that mule_decode() can
2021 output any final stuff it may be holding, or any "go back
2022 to a sane state" escape sequences. (This latter makes sense
2023 during encoding.) */
2026 /* Exhausted the runoff, so get some more. DATA has at least
2027 SIZE bytes left of storage in it, so it's OK to read directly
2028 into it. (We'll be overwriting above, after we've decoded it
2029 into the runoff.) */
2030 read_size = Lstream_read (str->other_end, data, size);
2037 /* There might be some more end data produced in the translation.
2038 See the comment above. */
2039 str->flags |= CODING_STATE_END;
2040 mule_decode (stream, data, str->runoff, read_size);
2043 if (data - orig_data == 0)
2044 return error_occurred ? -1 : 0;
2046 return data - orig_data;
/* Write method of a decoding stream: SIZE bytes at DATA are decoded
   with mule_decode() into str->runoff, the whole runoff is offered to
   the stream at the other end, and the amount reported as written by
   Lstream_write() is removed from the runoff. */
2050 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2052 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2055 /* Decode all our data into the runoff, and then attempt to write
2056 it all out to the other end. Remove whatever chunk we succeeded
2058 mule_decode (stream, data, str->runoff, size);
2059 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2060 Dynarr_length (str->runoff));
2062 Dynarr_delete_many (str->runoff, 0, retval);
2063 /* Do NOT return retval. The return value indicates how much
2064 of the incoming data was written, not how many bytes were
/* Put the decoder state of STR back to its initial condition for the
   coding system currently stored in STR->codesys. */
2070 reset_decoding_stream (struct decoding_stream *str)
/* ISO2022 and CCL coding systems keep extra state that must be
   re-initialized as well. */
2073 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2075 Lisp_Object coding_system;
2076 XSETCODING_SYSTEM (coding_system, str->codesys);
2077 reset_iso2022 (coding_system, &str->iso2022);
2079 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2081 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
/* Clear the state flags and the partially built-up character. */
2085 str->flags = str->ch = 0;
/* Rewind method of a decoding stream: resets the decoder state, empties
   the runoff, and returns the result of rewinding the stream at the
   other end. */
2089 decoding_rewinder (Lstream *stream)
2091 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2092 reset_decoding_stream (str);
2093 Dynarr_reset (str->runoff);
2094 return Lstream_rewind (str->other_end);
/* A decoding stream is seekable exactly when the stream at the other
   end reports that it is. */
2098 decoding_seekable_p (Lstream *stream)
2100 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2101 return Lstream_seekable_p (str->other_end);
/* Flush method of a decoding stream: returns the result of flushing the
   stream at the other end. */
2105 decoding_flusher (Lstream *stream)
2107 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2108 return Lstream_flush (str->other_end);
/* Close method of a decoding stream.  For a stream opened for writing,
   CODING_STATE_END is set and decoding_writer() is called once more
   with no data, which gives mule_decode() the chance to output
   whatever it is still holding.  The runoff (and, with
   ENABLE_COMPOSITE_CHARS, the composite character buffer) is freed,
   and the result of closing the other end is returned. */
2112 decoding_closer (Lstream *stream)
2114 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2115 if (stream->flags & LSTREAM_FL_WRITE)
2117 str->flags |= CODING_STATE_END;
2118 decoding_writer (stream, 0, 0);
2120 Dynarr_free (str->runoff);
2122 #ifdef ENABLE_COMPOSITE_CHARS
2123 if (str->iso2022.composite_chars)
2124 Dynarr_free (str->iso2022.composite_chars);
2127 return Lstream_close (str->other_end);
/* Return the coding system currently used by decoding stream STREAM,
   narrowed by subsidiary_coding_system() to the stream's current EOL
   type. */
2131 decoding_stream_coding_system (Lstream *stream)
2133 Lisp_Object coding_system;
2134 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2136 XSETCODING_SYSTEM (coding_system, str->codesys);
2137 return subsidiary_coding_system (coding_system, str->eol_type);
/* Switch decoding stream LSTR over to CODESYS.  The stream's EOL type
   is overridden only when CODESYS specifies a definite one (i.e. not
   EOL_AUTODETECT); the decoder state is reset afterwards. */
2141 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2143 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2144 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2146 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2147 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2148 reset_decoding_stream (str);
2151 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2152 stream for writing, no automatic code detection will be performed.
2153 The reason for this is that automatic code detection requires a
2154 seekable input. Things will also fail if you open a decoding
2155 stream for reading using a non-fully-specified coding system and
2156 a non-seekable input stream. */
/* Create a decoding lstream in mode MODE on top of STREAM, using
   CODESYS. */
2159 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2162 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2163 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2167 str->other_end = stream;
2168 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2169 str->eol_type = EOL_AUTODETECT;
2170 if (!strcmp (mode, "r")
2171 && Lstream_seekable_p (stream))
2172 /* We can determine the coding system now. */
2173 determine_real_coding_system (stream, &codesys, &str->eol_type);
2174 set_decoding_stream_coding_system (lstr, codesys);
/* Seed the detection state that mule_decode() uses: start from the EOL
   type known so far and from an all-ones category mask. */
2175 str->decst.eol_type = str->eol_type;
2176 str->decst.mask = ~0;
2177 XSETLSTREAM (obj, lstr);
/* Create a decoding stream on top of STREAM, opened in mode "r". */
2182 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2184 return make_decoding_stream_1 (stream, codesys, "r");
/* Create a decoding stream on top of STREAM, opened in mode "w". */
2188 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2190 return make_decoding_stream_1 (stream, codesys, "w");
2193 /* Note: the decode_coding_* functions all take the same
2194 arguments as mule_decode(), which is to say some SRC data of
2195 size N, which is to be stored into dynamic array DST.
2196 DECODING is the stream within which the decoding is
2197 taking place, but no data is actually read from or
2198 written to that stream; that is handled in decoding_reader()
2199 or decoding_writer(). This allows the same functions to
2200 be used for both reading and writing. */
/* Decode the N bytes at SRC into DST according to the coding system of
   the stream DECODING.  Detection is run first while the coding system
   or the EOL type is still undetermined; the data is then handed to
   the decoder that matches the coding system type. */
2203 mule_decode (Lstream *decoding, CONST unsigned char *src,
2204 unsigned_char_dynarr *dst, unsigned int n)
2206 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2208 /* If necessary, do encoding-detection now. We do this when
2209 we're a writing stream or a non-seekable reading stream,
2210 meaning that we can't just process the whole input,
2211 rewind, and start over. */
2213 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2214 str->eol_type == EOL_AUTODETECT)
2216 Lisp_Object codesys;
2218 XSETCODING_SYSTEM (codesys, str->codesys);
2219 detect_coding_type (&str->decst, src, n,
2220 CODING_SYSTEM_TYPE (str->codesys) !=
2221 CODESYS_AUTODETECT);
2222 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2223 str->decst.mask != ~0)
2224 /* #### This is cheesy. What we really ought to do is
2225 buffer up a certain amount of data so as to get a
2226 less random result. */
2227 codesys = coding_system_from_mask (str->decst.mask);
/* Adopt whatever EOL type the detection has arrived at so far. */
2228 str->eol_type = str->decst.eol_type;
2229 if (XCODING_SYSTEM (codesys) != str->codesys)
2231 /* Preserve the CODING_STATE_END flag in case it was set.
2232 If we erase it, bad things might happen. */
2233 int was_end = str->flags & CODING_STATE_END;
2234 set_decoding_stream_coding_system (decoding, codesys);
2236 str->flags |= CODING_STATE_END;
/* Dispatch on the (possibly just updated) coding system type. */
2240 switch (CODING_SYSTEM_TYPE (str->codesys))
2243 case CODESYS_INTERNAL:
2244 Dynarr_add_many (dst, src, n);
2247 case CODESYS_AUTODETECT:
2248 /* If we got this far and still haven't decided on the coding
2249 system, then do no conversion. */
2250 case CODESYS_NO_CONVERSION:
2251 decode_coding_no_conversion (decoding, src, dst, n);
2254 case CODESYS_SHIFT_JIS:
2255 decode_coding_sjis (decoding, src, dst, n);
2258 decode_coding_big5 (decoding, src, dst, n);
2261 decode_coding_ucs4 (decoding, src, dst, n);
2264 decode_coding_utf8 (decoding, src, dst, n);
2267 str->ccl.last_block = str->flags & CODING_STATE_END;
2268 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2270 case CODESYS_ISO2022:
2271 decode_coding_iso2022 (decoding, src, dst, n);
/* Lisp primitive `decode-coding-region'.  The text of the region is
   read from a buffer input stream in chunks and written to an
   encode-as-binary stream, which feeds a decoding stream for
   CODING-SYSTEM, which in turn writes into the same buffer at the
   start of the region.  Each chunk of original text is deleted from
   the buffer after it has been sent through.  */
2279 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2280 Decode the text between START and END which is encoded in CODING-SYSTEM.
2281 This is useful if you've read in encoded text from a file without decoding
2282 it (e.g. you read in a JIS-formatted file but used the `binary' or
2283 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2284 Return length of decoded text.
2285 BUFFER defaults to the current buffer if unspecified.
2287 (start, end, coding_system, buffer))
2290 struct buffer *buf = decode_buffer (buffer, 0);
2291 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2292 Lstream *istr, *ostr;
2293 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2295 get_buffer_range_char (buf, start, end, &b, &e, 0);
2297 barf_if_buffer_read_only (buf, b, e);
2299 coding_system = Fget_coding_system (coding_system);
2300 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2301 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2302 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2304 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2305 Fget_coding_system (Qbinary));
2306 istr = XLSTREAM (instream);
2307 ostr = XLSTREAM (outstream);
2308 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2310 /* The chain of streams looks like this:
2312 [BUFFER] <----- send through
2313 ------> [ENCODE AS BINARY]
2314 ------> [DECODE AS SPECIFIED]
2320 char tempbuf[1024]; /* some random amount */
2321 Bufpos newpos, even_newer_pos;
2322 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2323 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
/* The input stream's start position is sampled before the read
   (oldpos), after the read (newpos) and again after the write
   (even_newer_pos).  The range deleted below starts at
   even_newer_pos - (newpos - oldpos), i.e. it is computed relative to
   the position as it stands after the write.  */
2327 newpos = lisp_buffer_stream_startpos (istr);
2328 Lstream_write (ostr, tempbuf, size_in_bytes);
2329 even_newer_pos = lisp_buffer_stream_startpos (istr);
2330 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
/* OSTR is the head of the output chain; see encoding_closer() for what
   closing an encoding output stream does before closing its other
   end.  */
2333 Lstream_close (istr);
2334 Lstream_close (ostr);
2336 Lstream_delete (istr);
2337 Lstream_delete (ostr);
2338 Lstream_delete (XLSTREAM (de_outstream));
2339 Lstream_delete (XLSTREAM (lb_outstream));
2344 /************************************************************************/
2345 /* Converting to an external encoding ("encoding") */
2346 /************************************************************************/
2348 /* An encoding stream is an output stream. When you create the
2349 stream, you specify the coding system that governs the encoding
2350 and another stream that the resulting encoded data is to be
2351 sent to, and then start sending data to it. */
/* Access the encoding-specific data of Lstream STREAM, via
   LSTREAM_TYPE_DATA for the `encoding' stream type; the result is used
   throughout this file as a `struct encoding_stream *'.  */
2353 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
/* Per-stream state of an encoding stream: the governing coding system,
   the stream at the other end, buffered output (RUNOFF), and the
   encoder state that has to survive between calls to mule_encode().  */
2355 struct encoding_stream
2357 /* Coding system that governs the conversion. */
2358 Lisp_Coding_System *codesys;
2360 /* Stream that we read the unencoded data from (when reading) or
2361 write the encoded data to (when writing); see encoding_reader()
and encoding_writer(). */
2364 /* If we are reading, then we can return only a fixed amount of
2365 data, so if the conversion resulted in too much data, we store it
2366 here for retrieval the next time around. */
2367 unsigned_char_dynarr *runoff;
2369 /* FLAGS holds flags indicating the current state of the encoding.
2370 Some of these flags are dependent on the coding system. */
2373 /* CH holds a partially built-up character. Since we only deal
2374 with one- and two-byte characters at the moment, we only use
2375 this to store the first byte of a two-byte character.
NOTE(review): text_encode_generic() accumulates six bits per input
byte into CH, and encode_coding_big5() unpacks two bytes from it
(ch >> 8 and ch & 0xFF), so the "first byte only" statement above
looks out of date. */
2378 /* Additional information used by the ISO2022 encoder. */
2381 /* CHARSET holds the character sets currently assigned to the G0
2382 through G3 registers. It is initialized from the array
2383 INITIAL_CHARSET in CODESYS. */
2384 Lisp_Object charset[4];
2386 /* Which registers are currently invoked into the left (GL) and
2387 right (GR) halves of the 8-bit encoding space? */
2388 int register_left, register_right;
2390 /* Whether we need to explicitly designate the charset in the
2391 G? register before using it. It is initialized from the
2392 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2393 unsigned char force_charset_on_output[4];
2395 /* Other state variables that need to be preserved across
2397 Lisp_Object current_charset;
2399 int current_char_boundary;
2402 void (*encode_char) (struct encoding_stream *str, Emchar c,
2403 unsigned_char_dynarr *dst, unsigned int *flags);
2404 void (*finish) (struct encoding_stream *str,
2405 unsigned_char_dynarr *dst, unsigned int *flags);
2407 /* Additional information (the state of the running CCL program)
2408 used by the CCL encoder. */
2409 struct ccl_program ccl;
/* Forward declarations of the methods of the `encoding' Lstream type,
   followed by the definition of the type itself.  The per-stream data
   block is sized as a struct encoding_stream.  */
2413 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2414 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2416 static int encoding_rewinder (Lstream *stream);
2417 static int encoding_seekable_p (Lstream *stream);
2418 static int encoding_flusher (Lstream *stream);
2419 static int encoding_closer (Lstream *stream);
2421 static Lisp_Object encoding_marker (Lisp_Object stream,
2422 void (*markobj) (Lisp_Object));
2424 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2425 sizeof (struct encoding_stream));
/* Mark method for encoding streams.  The only object reachable from
   the stream that needs attention here is the stream at the other end:
   it is wrapped as a Lisp object and, if its implementation provides a
   marker method, that method is called with MARKOBJ and its result is
   returned.  */
2428 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2430 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2431 Lisp_Object str_obj;
2433 /* We do not need to mark the coding systems or charsets stored
2434 within the stream because they are stored in a global list
2435 and automatically marked. */
2437 XSETLSTREAM (str_obj, str);
2439 if (str->imp->marker)
2440 return (str->imp->marker) (str_obj, markobj);
2445 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2446 so we read data from the other end, encode it, and store it into DATA.
Returns the number of bytes stored into DATA; when nothing was stored,
returns -1 if an error occurred and 0 otherwise. */
2449 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2451 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2452 unsigned char *orig_data = data;
2454 int error_occurred = 0;
2456 /* We need to interface to mule_encode(), which expects to take some
2457 amount of data and store the result into a Dynarr. We have
2458 mule_encode() store into str->runoff, and take data from there
2461 /* We loop until we have enough data, reading chunks from the other
2462 end and encoding it. */
2465 /* Take data from the runoff if we can. Make sure to take at
2466 most SIZE bytes, and delete the data from the runoff. */
2467 if (Dynarr_length (str->runoff) > 0)
2469 int chunk = min ((int) size, Dynarr_length (str->runoff));
2470 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2471 Dynarr_delete_many (str->runoff, 0, chunk);
2477 break; /* No more room for data */
2479 if (str->flags & CODING_STATE_END)
2480 /* This means that on the previous iteration, we hit the EOF on
2481 the other end. We loop once more so that mule_encode() can
2482 output any final stuff it may be holding, or any "go back
2483 to a sane state" escape sequences. (This latter makes sense
2484 during encoding.) */
2487 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
2488 left of storage in it, so it's OK to read directly into it.
2489 (We'll be overwriting above, after we've encoded it into the
2491 read_size = Lstream_read (str->other_end, data, size);
2498 /* There might be some more end data produced in the translation.
2499 See the comment above. */
2500 str->flags |= CODING_STATE_END;
2501 mule_encode (stream, data, str->runoff, read_size);
/* Nothing was delivered: distinguish an error from a plain "no data".  */
2504 if (data == orig_data)
2505 return error_occurred ? -1 : 0;
2507 return data - orig_data;
/* Writer method for encoding streams.  The SIZE bytes at DATA are
   encoded into str->runoff by mule_encode(); the whole runoff is then
   offered to the other end with Lstream_write(), and the part that was
   accepted is removed from the runoff.  */
2511 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2513 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2516 /* Encode all our data into the runoff, and then attempt to write
2517 it all out to the other end. Remove whatever chunk we succeeded
2519 mule_encode (stream, data, str->runoff, size);
2520 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2521 Dynarr_length (str->runoff));
2523 Dynarr_delete_many (str->runoff, 0, retval);
2524 /* Do NOT return retval. The return value indicates how much
2525 of the incoming data was written, not how many bytes were
/* Put the encoding stream data STR into its initial state for the
   coding system in str->codesys: install the per-character
   encode_char/finish hooks for the coding system types that use them,
   set up type-specific state (ISO2022 charsets and registers, the CCL
   program), and clear the character-boundary counter, FLAGS and CH.  */
2531 reset_encoding_stream (struct encoding_stream *str)
2534 switch (CODING_SYSTEM_TYPE (str->codesys))
2536 case CODESYS_ISO2022:
2540 str->encode_char = &char_encode_iso2022;
2541 str->finish = &char_finish_iso2022;
/* Load the four G-register slots from the coding system.  */
2542 for (i = 0; i < 4; i++)
2544 str->iso2022.charset[i] =
2545 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2546 str->iso2022.force_charset_on_output[i] =
2547 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
/* Start with register 0 invoked on the left and register 1 on the
   right, and no current charset.  */
2549 str->iso2022.register_left = 0;
2550 str->iso2022.register_right = 1;
2551 str->iso2022.current_charset = Qnil;
2552 str->iso2022.current_half = 0;
2556 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2559 str->encode_char = &char_encode_utf8;
2560 str->finish = &char_finish_utf8;
2563 str->encode_char = &char_encode_ucs4;
2564 str->finish = &char_finish_ucs4;
2566 case CODESYS_SHIFT_JIS:
2567 str->encode_char = &char_encode_shift_jis;
2568 str->finish = &char_finish_shift_jis;
/* State common to all coding system types.  */
2574 str->iso2022.current_char_boundary = 0;
2575 str->flags = str->ch = 0;
/* Rewinder method: reset the encoder state, discard any buffered
   runoff, and rewind the stream at the other end, returning the result
   of that rewind.  */
2579 encoding_rewinder (Lstream *stream)
2581 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2582 reset_encoding_stream (str);
2583 Dynarr_reset (str->runoff);
2584 return Lstream_rewind (str->other_end);
/* Seekable-p method: an encoding stream is seekable exactly when the
   stream at its other end is.  */
2588 encoding_seekable_p (Lstream *stream)
2590 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2591 return Lstream_seekable_p (str->other_end);
/* Flusher method: flush the stream at the other end and return the
   result.  Nothing is done here about data still held in
   str->runoff.  */
2595 encoding_flusher (Lstream *stream)
2597 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2598 return Lstream_flush (str->other_end);
/* Closer method.  For a stream opened for writing, CODING_STATE_END is
   set and the writer is invoked once more with no data, so that
   mule_encode() runs with the end flag set and can emit whatever the
   encoder produces at end of input.  The runoff is then freed and the
   stream at the other end is closed; its result is returned.  */
2602 encoding_closer (Lstream *stream)
2604 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2605 if (stream->flags & LSTREAM_FL_WRITE)
2607 str->flags |= CODING_STATE_END;
2608 encoding_writer (stream, 0, 0);
2610 Dynarr_free (str->runoff);
2611 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system currently governing
   encoding stream STREAM.  */
2615 encoding_stream_coding_system (Lstream *stream)
2617 Lisp_Object coding_system;
2618 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2620 XSETCODING_SYSTEM (coding_system, str->codesys);
2621 return coding_system;
/* Switch encoding stream LSTR over to coding system CODESYS and reset
   the encoder state with reset_encoding_stream().
   NOTE(review): the store of CS into the stream data is expected
   between the declarations and the reset -- confirm.  */
2625 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2627 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2628 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2630 reset_encoding_stream (str);
/* Common constructor for encoding streams.  Creates a new Lstream of
   type lstream_encoding in the given mode, gives it an empty runoff
   array, attaches STREAM as its other end, installs CODESYS (which
   also resets the encoder state), and wraps the new stream as a Lisp
   object.  */
2634 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2637 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2638 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2642 str->runoff = Dynarr_new (unsigned_char);
2643 str->other_end = stream;
2644 set_encoding_stream_coding_system (lstr, codesys);
2645 XSETLSTREAM (obj, lstr);
/* Create an encoding stream opened for reading ("r") whose other end
   is STREAM and whose conversion is governed by CODESYS.  */
2650 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2652 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream opened for writing ("w") whose other end
   is STREAM and whose conversion is governed by CODESYS.  */
2656 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2658 return make_encoding_stream_1 (stream, codesys, "w");
2661 /* Convert N bytes of internally-formatted data stored in SRC to an
2662 external format, according to the encoding stream ENCODING.
2663 Store the encoded data into DST.
The work is dispatched on the type of the stream's coding system;
types without a dedicated case go through text_encode_generic(), which
uses the encode_char/finish hooks installed by
reset_encoding_stream(). */
2666 mule_encode (Lstream *encoding, CONST unsigned char *src,
2667 unsigned_char_dynarr *dst, unsigned int n)
2669 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2671 switch (CODING_SYSTEM_TYPE (str->codesys))
2674 case CODESYS_INTERNAL:
/* Internal format: copy the bytes through unchanged.  */
2675 Dynarr_add_many (dst, src, n);
2678 case CODESYS_AUTODETECT:
2679 /* If we got this far and still haven't decided on the coding
2680 system, then do no conversion. */
2681 case CODESYS_NO_CONVERSION:
2682 encode_coding_no_conversion (encoding, src, dst, n);
2686 encode_coding_big5 (encoding, src, dst, n);
/* Tell the CCL program whether this is the final block of input.  */
2689 str->ccl.last_block = str->flags & CODING_STATE_END;
2690 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2694 text_encode_generic (encoding, src, dst, n);
/* Lisp primitive `encode-coding-region'.  Mirror image of
   `decode-coding-region': the text of the region is read in chunks and
   written to an encoding stream for CODING-SYSTEM, which feeds a
   decode-as-binary stream that writes into the same buffer at the
   start of the region.  Each chunk of original text is deleted after
   it has been sent through.  Returns RETLEN as a Lisp integer.  */
2698 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2699 Encode the text between START and END using CODING-SYSTEM.
2700 This will, for example, convert Japanese characters into stuff such as
2701 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2702 text. BUFFER defaults to the current buffer if unspecified.
2704 (start, end, coding_system, buffer))
2707 struct buffer *buf = decode_buffer (buffer, 0);
2708 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2709 Lstream *istr, *ostr;
2710 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2712 get_buffer_range_char (buf, start, end, &b, &e, 0);
2714 barf_if_buffer_read_only (buf, b, e);
2716 coding_system = Fget_coding_system (coding_system);
2717 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2718 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2719 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2720 Fget_coding_system (Qbinary));
2721 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2723 istr = XLSTREAM (instream);
2724 ostr = XLSTREAM (outstream);
2725 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2726 /* The chain of streams looks like this:
2728 [BUFFER] <----- send through
2729 ------> [ENCODE AS SPECIFIED]
2730 ------> [DECODE AS BINARY]
2735 char tempbuf[1024]; /* some random amount */
2736 Bufpos newpos, even_newer_pos;
2737 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2738 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
/* The input stream's start position is sampled before the read
   (oldpos), after the read (newpos) and again after the write
   (even_newer_pos).  The range deleted below starts at
   even_newer_pos - (newpos - oldpos), i.e. it is computed relative to
   the position as it stands after the write.  */
2742 newpos = lisp_buffer_stream_startpos (istr);
2743 Lstream_write (ostr, tempbuf, size_in_bytes);
2744 even_newer_pos = lisp_buffer_stream_startpos (istr);
2745 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
/* The value computed here is the distance from B to the input stream's
   start position.  */
2751 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2752 Lstream_close (istr);
2753 Lstream_close (ostr);
2755 Lstream_delete (istr);
2756 Lstream_delete (ostr);
2757 Lstream_delete (XLSTREAM (de_outstream));
2758 Lstream_delete (XLSTREAM (lb_outstream));
2759 return make_int (retlen);
/* Generic encoder used by mule_encode() for the coding systems that
   install encode_char/finish hooks in reset_encoding_stream().
   Characters are reassembled from the N bytes at SRC; a partially
   built character is carried in str->ch and the reassembly state in
   str->iso2022.current_char_boundary between calls.  Each completed
   character is handed to (*str->encode_char).  When the stream has
   been flagged CODING_STATE_END and no character is pending,
   (*str->finish) is called.
   NOTE(review): char_boundary appears to count the continuation bytes
   still expected (0 = at a character boundary), with six payload bits
   taken from each continuation byte -- confirm against the lead-byte
   handling.  */
2766 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2767 unsigned_char_dynarr *dst, unsigned int n)
2770 unsigned char char_boundary;
2771 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2772 unsigned int flags = str->flags;
2773 Emchar ch = str->ch;
2775 char_boundary = str->iso2022.current_char_boundary;
2781 if (char_boundary == 0)
2809 (*str->encode_char) (str, c, dst, &flags);
2811 else if (char_boundary == 1)
/* Last continuation byte: the character is complete.  */
2813 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
/* Otherwise keep accumulating.  */
2819 ch = (ch << 6) | (c & 0x3f);
/* Only finish when no character is half-built.  */
2824 if ((char_boundary == 0) && (flags & CODING_STATE_END))
2826 (*str->finish) (str, dst, &flags);
2831 str->iso2022.current_char_boundary = char_boundary;
2835 /************************************************************************/
2836 /* Shift-JIS methods */
2837 /************************************************************************/
2839 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2840 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2841 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2842 encoded by "position-code + 0x80". A character of JISX0208
2843 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
2844 position-codes are divided and shifted so that they fit in the range
2847 --- CODE RANGE of Shift-JIS ---
2848 (character set) (range)
2850 JISX0201-Kana 0xA1 .. 0xDF
2851 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2852 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2853 -------------------------------
/* Byte classification for Shift-JIS.  These are macros and evaluate
   their argument more than once, so the argument must be free of side
   effects.  */
2857 /* Is this the first byte of a Shift-JIS two-byte char? */
2859 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2860 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2862 /* Is this the second byte of a Shift-JIS two-byte char? */
2864 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2865 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a single-byte JISX0201 katakana character?  (Used by
   decode_coding_sjis() to select the katakana charset.)  */
2867 #define BYTE_SJIS_KATAKANA_P(c) \
2868 ((c) >= 0xA1 && (c) <= 0xDF)
/* Detection routine for Shift-JIS.  Scans the bytes at SRC, keeping
   track in st->shift_jis.in_second_byte of whether the next byte is
   the trail byte of a two-byte character.  ESC, SI and SO are tested
   for specially.  If nothing rules Shift-JIS out, the result is
   CODING_CATEGORY_SHIFT_JIS_MASK.
   NOTE(review): the values returned on the rejecting branches are
   presumably 0 -- confirm.  */
2871 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2879 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2881 if (st->shift_jis.in_second_byte)
2883 st->shift_jis.in_second_byte = 0;
/* A byte in 0x80..0x9F or at or above 0xE0 starts a two-byte
   character.  */
2887 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2888 st->shift_jis.in_second_byte = 1;
2890 return CODING_CATEGORY_SHIFT_JIS_MASK;
2893 /* Convert Shift-JIS data to internal format.
Takes the same arguments as mule_decode().  A pending lead byte is
carried in CH (loaded from str->ch) between calls.  A valid trail byte
completes a JISX0208 character via DECODE_SJIS; an invalid one causes
both bytes to be emitted with DECODE_ADD_BINARY_CHAR.  Single bytes in
the katakana range become JISX0201 katakana characters.
NOTE(review): two output forms are visible for each character class
(DECODE_ADD_UCS_CHAR and leading-byte sequences); presumably these are
alternative build configurations -- confirm. */
2896 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2897 unsigned_char_dynarr *dst, unsigned int n)
2900 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2901 unsigned int flags = str->flags;
2902 unsigned int ch = str->ch;
2903 eol_type_t eol_type = str->eol_type;
2911 /* Previous character was first byte of Shift-JIS Kanji char. */
2912 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2914 unsigned char e1, e2;
2916 DECODE_SJIS (ch, c, e1, e2);
2918 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2922 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2923 Dynarr_add (dst, e1);
2924 Dynarr_add (dst, e2);
/* Not a valid trail byte: pass both bytes through as binary.  */
2929 DECODE_ADD_BINARY_CHAR (ch, dst);
2930 DECODE_ADD_BINARY_CHAR (c, dst);
2936 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2937 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2939 else if (BYTE_SJIS_KATAKANA_P (c))
2942 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2945 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2946 Dynarr_add (dst, c);
2951 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
2955 DECODE_ADD_BINARY_CHAR (c, dst);
/* Presumably the jump target used by DECODE_HANDLE_EOL_TYPE -- confirm
   against that macro.  */
2957 label_continue_loop:;
2960 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2966 /* Convert internal character representation to Shift_JIS.
This is the encode_char hook installed for CODESYS_SHIFT_JIS by
reset_encoding_stream(): it appends the Shift-JIS bytes for the single
character CH to DST.  The end-of-line branch emits CR and/or the
character itself according to the coding system's EOL type.  JISX0201
katakana is written as one byte with the high bit set, JISX0208 as the
two bytes produced by ENCODE_SJIS, and '?' is the fallback output. */
2969 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
2970 unsigned_char_dynarr *dst, unsigned int *flags)
2972 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* CR is written for every EOL type except LF and autodetect; the
   character itself is written for every EOL type except CR.  */
2976 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2977 Dynarr_add (dst, '\r');
2978 if (eol_type != EOL_CR)
2979 Dynarr_add (dst, ch);
2983 Lisp_Object charset;
2984 unsigned int c1, c2, s1, s2;
/* Look the character up in latin-jisx0201.  */
2986 Lisp_Object value = charset_code_point (Vcharset_latin_jisx0201, ch);
2987 Lisp_Object ret = Fcar (value);
2991 charset = Vcharset_latin_jisx0201;
2997 BREAKUP_CHAR (ch, charset, c1, c2);
2999 if (EQ(charset, Vcharset_katakana_jisx0201))
3001 Dynarr_add (dst, c1 | 0x80);
3005 Dynarr_add (dst, c1);
3007 else if (EQ(charset, Vcharset_japanese_jisx0208))
3009 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3010 Dynarr_add (dst, s1);
3011 Dynarr_add (dst, s2);
3014 Dynarr_add (dst, '?');
/* The finish hook installed for CODESYS_SHIFT_JIS by
   reset_encoding_stream().  text_encode_generic() calls it once the
   stream is flagged CODING_STATE_END and no character is pending.  */
3019 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3020 unsigned int *flags)
/* Lisp primitive `decode-shift-jis-char'.  The car and cdr of CODE are
   checked with CHECK_INT and narrowed to unsigned bytes.  If they form
   a valid Shift-JIS lead/trail pair they are converted with
   DECODE_SJIS and returned as a Vcharset_japanese_jisx0208 character,
   with the high bit of both position codes masked off.  */
3024 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3025 Decode a JISX0208 character of Shift-JIS coding-system.
3026 CODE is the character code in Shift-JIS as a cons of type bytes.
3027 Return the corresponding character.
3031 unsigned char c1, c2, s1, s2;
3034 CHECK_INT (XCAR (code));
3035 CHECK_INT (XCDR (code));
3036 s1 = XINT (XCAR (code));
3037 s2 = XINT (XCDR (code));
3038 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3039 BYTE_SJIS_TWO_BYTE_2_P (s2))
3041 DECODE_SJIS (s1, s2, c1, c2);
3042 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3043 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-shift-jis-char'.  CH is broken up with
   BREAKUP_CHAR; when its charset is Vcharset_japanese_jisx0208 the
   position codes (with their high bits set) are converted with
   ENCODE_SJIS and returned as the cons (S1 . S2).  */
3049 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3050 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3051 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3055 Lisp_Object charset;
3058 CHECK_CHAR_COERCE_INT (ch);
3059 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3060 if (EQ (charset, Vcharset_japanese_jisx0208))
3062 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3063 return Fcons (make_int (s1), make_int (s2));
3070 /************************************************************************/
3072 /************************************************************************/
3074 /* BIG5 is a coding system encoding two character sets: ASCII and
3075 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3076 character set and is encoded in two bytes.
3078 --- CODE RANGE of BIG5 ---
3079 (character set) (range)
3081 Big5 (1st byte) 0xA1 .. 0xFE
3082 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3083 --------------------------
3085 Since the number of characters in Big5 is larger than maximum
3086 characters in Emacs' charset (96x96), it can't be handled as one
3087 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3088 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3089 contains frequently used characters and the latter contains less
3090 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char?  (Like the other
   byte-class macros here, this evaluates its argument more than
   once.)  */
3092 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3093 ((c) >= 0xA1 && (c) <= 0xFE)
3095 /* Is this the second byte of a Big5 two-byte char? */
3097 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3098 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3100 /* Number of Big5 characters which have the same code in 1st byte.
This is 94 + 63 = 157: the count of valid second bytes in
0xA1 .. 0xFE plus those in 0x40 .. 0x7E. */
3102 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3104 /* Code conversion macros. These are macros because they are used in
3105 inner loops during code conversion.
3107 Note that temporary variables in macros introduce the classic
3108 dynamic-scoping problems with variable names. We use capital-
3109 lettered variables in the assumption that XEmacs does not use
3110 capital letters in variables except in a very formalized way
3113 /* Convert Big5 code (b1, b2) into its internal string representation
3116 /* There is a much simpler way to split the Big5 charset into two.
3117 For the moment I'm going to leave the algorithm as-is because it
3118 claims to separate out the most-used characters into a single
3119 charset, which perhaps will lead to optimizations in various
3122 The way the algorithm works is something like this:
3124 Big5 can be viewed as a 94x157 charset, where the row is
3125 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3126 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3127 the split between low and high column numbers is apparently
3128 meaningless; ascending rows produce less and less frequent chars.
3129 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3130 the first charset, and the upper half (0xC9 .. 0xFE) to the
3131 second. To do the conversion, we convert the character into
3132 a single number where 0 .. 156 is the first row, 157 .. 313
3133 is the second, etc. That way, the characters are ordered by
3134 decreasing frequency. Then we just chop the space in two
3135 and coerce the result into a 94x94 space.
/* DECODE_BIG5 (b1, b2, lb, c1, c2): from the Big5 byte pair (B1, B2),
   compute a linear index (row (B1 - 0xA1) times BIG5_SAME_ROW plus the
   column, where second bytes below 0x7F are offset by 0x40 and the
   others by 0x62), pick LB as LEADING_BYTE_CHINESE_BIG5_1 or
   LEADING_BYTE_CHINESE_BIG5_2 (rebasing the index for the latter by
   the rows 0xA1 .. 0xC8), and split the index into the two position
   codes C1 and C2 in a 94-wide space based at 0xA1.  LB, C1 and C2
   must be lvalues.  */
3138 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3140 int B1 = b1, B2 = b2; \
3142 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3146 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3150 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3151 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3153 c1 = I / (0xFF - 0xA1) + 0xA1; \
3154 c2 = I % (0xFF - 0xA1) + 0xA1; \
3157 /* Convert the internal string representation of a Big5 character
3158 (lb, c1, c2) into Big5 code (b1, b2).
This is the inverse of DECODE_BIG5: the position codes are combined
into a linear index in a 94-wide space, the index is advanced by the
rows 0xA1 .. 0xC8 when LB is LEADING_BYTE_CHINESE_BIG5_2, and the result
is split by BIG5_SAME_ROW into the row byte B1 and the column byte B2.
Columns below 0x3F map to 0x40 .. 0x7E, the rest to 0xA1 .. 0xFE.
LB is used unparenthesized in a comparison, so pass a simple
expression.  B1 and B2 must be lvalues. */
3160 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3162 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3164 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3166 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3168 b1 = I / BIG5_SAME_ROW + 0xA1; \
3169 b2 = I % BIG5_SAME_ROW; \
3170 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Detection routine for Big5.  Scans the bytes at SRC, keeping track
   in st->big5.in_second_byte of whether the next byte is the trail
   byte of a two-byte character.  ESC, SI, SO and bytes in
   0x80 .. 0xA0 are tested for specially, and a trail byte below 0x40
   or in 0x80 .. 0xA0 is tested for as well.  If nothing rules Big5
   out, the result is CODING_CATEGORY_BIG5_MASK.
   NOTE(review): the values returned on the rejecting branches are
   presumably 0 -- confirm.  */
3174 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3182 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3183 (c >= 0x80 && c <= 0xA0))
3185 if (st->big5.in_second_byte)
3187 st->big5.in_second_byte = 0;
3188 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3192 st->big5.in_second_byte = 1;
3194 return CODING_CATEGORY_BIG5_MASK;
3197 /* Convert Big5 data to internal format.
Takes the same arguments as mule_decode().  A pending lead byte is
carried in CH (loaded from str->ch) between calls.  A valid trail byte
completes the character, which is written to DST as the three bytes
produced by DECODE_BIG5 (leading byte and two position codes); an
invalid trail byte causes both bytes to be emitted with
DECODE_ADD_BINARY_CHAR. */
3200 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3201 unsigned_char_dynarr *dst, unsigned int n)
3204 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3205 unsigned int flags = str->flags;
3206 unsigned int ch = str->ch;
3207 eol_type_t eol_type = str->eol_type;
3214 /* Previous character was first byte of Big5 char. */
3215 if (BYTE_BIG5_TWO_BYTE_2_P (c))
/* b1 receives the leading byte; b2 and b3 the position codes.  */
3217 unsigned char b1, b2, b3;
3218 DECODE_BIG5 (ch, c, b1, b2, b3);
3219 Dynarr_add (dst, b1);
3220 Dynarr_add (dst, b2);
3221 Dynarr_add (dst, b3);
/* Not a valid trail byte: pass both bytes through as binary.  */
3225 DECODE_ADD_BINARY_CHAR (ch, dst);
3226 DECODE_ADD_BINARY_CHAR (c, dst);
3232 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3233 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3236 DECODE_ADD_BINARY_CHAR (c, dst);
/* Presumably the jump target used by DECODE_HANDLE_EOL_TYPE -- confirm
   against that macro.  */
3238 label_continue_loop:;
3241 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3247 /* Convert internally-formatted data to Big5.
Takes N bytes at SRC and appends the Big5 form to DST.  ASCII bytes
are copied through and the end-of-line branch writes CR and/or LF
according to the coding system's EOL type.  For the two Big5 charsets
the leading byte and first position byte are accumulated in CH (loaded
from str->ch), and the character is written when the second position
byte arrives.  Characters with any other leading byte are ignored. */
3250 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3251 unsigned_char_dynarr *dst, unsigned int n)
3255 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3256 unsigned int flags = str->flags;
3257 unsigned int ch = str->ch;
3258 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* CR is written for every EOL type except LF and autodetect; LF is
   written for every EOL type except CR.  */
3265 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3266 Dynarr_add (dst, '\r');
3267 if (eol_type != EOL_CR)
3268 Dynarr_add (dst, '\n');
3270 else if (BYTE_ASCII_P (c))
3273 Dynarr_add (dst, c);
3275 else if (BUFBYTE_LEADING_BYTE_P (c))
3277 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3278 c == LEADING_BYTE_CHINESE_BIG5_2)
3280 /* A recognized leading byte. */
3282 continue; /* not done with this character. */
3284 /* otherwise just ignore this character. */
3286 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3287 ch == LEADING_BYTE_CHINESE_BIG5_2)
3289 /* Previous char was a recognized leading byte. */
3291 continue; /* not done with this character. */
3295 /* Encountering second byte of a Big5 character. */
3296 unsigned char b1, b2;
/* CH holds the leading byte in its high part and the first position
   byte in its low eight bits.  */
3298 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3299 Dynarr_add (dst, b1);
3300 Dynarr_add (dst, b2);
/* Lisp primitive `decode-big5-char'.  The car and cdr of CODE are
   checked with CHECK_INT and narrowed to unsigned bytes.  If they form
   a valid Big5 lead/trail pair, DECODE_BIG5 yields a leading byte and
   two position codes; the charset is looked up from the leading byte
   and the character is built with the high bit of both position codes
   masked off.  */
3312 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3313 Decode a Big5 character CODE of BIG5 coding-system.
3314 CODE is the character code in BIG5, a cons of two integers.
3315 Return the corresponding character.
3319 unsigned char c1, c2, b1, b2;
3322 CHECK_INT (XCAR (code));
3323 CHECK_INT (XCDR (code));
3324 b1 = XINT (XCAR (code));
3325 b2 = XINT (XCDR (code));
3326 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3327 BYTE_BIG5_TWO_BYTE_2_P (b2))
3329 Charset_ID leading_byte;
3330 Lisp_Object charset;
3331 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3332 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3333 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive `encode-big5-char'.  CH is broken up with
   BREAKUP_CHAR; when its charset is one of the two Big5 charsets, the
   charset's leading byte and the position codes (with their high bits
   set) are converted with ENCODE_BIG5 and returned as the cons
   (B1 . B2).  */
3339 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3340 Encode the Big5 character CH to BIG5 coding-system.
3341 Return the corresponding character code in Big5.
3345 Lisp_Object charset;
3348 CHECK_CHAR_COERCE_INT (ch);
3349 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3350 if (EQ (charset, Vcharset_chinese_big5_1) ||
3351 EQ (charset, Vcharset_chinese_big5_2))
3353 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3355 return Fcons (make_int (b1), make_int (b2));
3362 /************************************************************************/
3364 /************************************************************************/
/* Detection routine for UCS-4.  The position within the current
   character is tracked in st->ucs4.in_byte, which the switch below
   dispatches on and which is reset to 0 at one point.  If nothing
   rules UCS-4 out, the result is CODING_CATEGORY_UCS4_MASK.
   NOTE(review): the per-position tests are not described here --
   confirm what each case rejects.  */
3367 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3373 switch (st->ucs4.in_byte)
3382 st->ucs4.in_byte = 0;
3388 return CODING_CATEGORY_UCS4_MASK;
3392 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3393 unsigned_char_dynarr *dst, unsigned int n)
3395 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3396 unsigned int flags = str->flags;
3397 unsigned int ch = str->ch;
3398 unsigned char counter = str->counter;
3402 unsigned char c = *src++;
3410 DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3415 ch = ( ch << 8 ) | c;
3419 if (counter & CODING_STATE_END)
3420 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3424 str->counter = counter;
/* The encode_char hook installed for UCS-4 by reset_encoding_stream().
   Appends the character CH to DST as four bytes, most significant byte
   first.  No end-of-line conversion is done here.  */
3428 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3429 unsigned_char_dynarr *dst, unsigned int *flags)
3431 Dynarr_add (dst, ch >> 24);
3432 Dynarr_add (dst, ch >> 16);
3433 Dynarr_add (dst, ch >> 8);
3434 Dynarr_add (dst, ch );
/* The finish hook installed for UCS-4 by reset_encoding_stream().
   text_encode_generic() calls it once the stream is flagged
   CODING_STATE_END and no character is pending.  */
3438 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3439 unsigned int *flags)
3444 /************************************************************************/
3446 /************************************************************************/
/* Detection routine for UTF-8.  st->utf8.in_byte holds the number of
   continuation bytes still expected: it is set to between 1 and 5
   depending on the lead byte, and while it is nonzero each byte must
   have the form 10xxxxxx ((c & 0xc0) == 0x80).  ESC, SI and SO are
   tested for specially.  If nothing rules UTF-8 out, the result is
   CODING_CATEGORY_UTF8_MASK.
   NOTE(review): the values returned on the rejecting branches are
   presumably 0 -- confirm.  */
3449 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3454 unsigned char c = *src++;
3455 switch (st->utf8.in_byte)
3458 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3461 st->utf8.in_byte = 5;
3463 st->utf8.in_byte = 4;
3465 st->utf8.in_byte = 3;
3467 st->utf8.in_byte = 2;
3469 st->utf8.in_byte = 1;
3474 if ((c & 0xc0) != 0x80)
3480 return CODING_CATEGORY_UTF8_MASK;
/* Convert UTF-8 data to internal format.  Takes the same arguments as
   mule_decode().  A partially assembled character is carried in
   str->ch and a continuation-byte counter in str->counter between
   calls.  Lead bytes are classified by the thresholds 0xf8, 0xf0,
   0xe0 and 0xc0, and each continuation byte contributes its low six
   bits.  Single-byte characters go through DECODE_HANDLE_EOL_TYPE.  An
   incomplete character is flushed with DECODE_OUTPUT_PARTIAL_CHAR only
   when the stream flags contain CODING_STATE_END.  */
3484 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3485 unsigned_char_dynarr *dst, unsigned int n)
3487 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3488 unsigned int flags = str->flags;
3489 unsigned int ch = str->ch;
3490 eol_type_t eol_type = str->eol_type;
3491 unsigned char counter = str->counter;
3495 unsigned char c = *src++;
3504 else if ( c >= 0xf8 )
3509 else if ( c >= 0xf0 )
3514 else if ( c >= 0xe0 )
3519 else if ( c >= 0xc0 )
3526 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3527 DECODE_ADD_UCS_CHAR (c, dst);
/* Final continuation byte: emit the assembled character.  */
3531 ch = ( ch << 6 ) | ( c & 0x3f );
3532 DECODE_ADD_UCS_CHAR (ch, dst);
/* Intermediate continuation byte: keep accumulating.  */
3537 ch = ( ch << 6 ) | ( c & 0x3f );
/* Presumably the jump target used by DECODE_HANDLE_EOL_TYPE -- confirm
   against that macro.  */
3540 label_continue_loop:;
3543 if (flags & CODING_STATE_END)
3544 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3548 str->counter = counter;
/* The encode_char hook installed for UTF-8 by reset_encoding_stream().
   Appends the UTF-8 form of the character CH to DST.  The end-of-line
   branch writes CR and/or the character itself according to the coding
   system's EOL type.  Otherwise the length of the sequence is chosen
   by the magnitude of CH:
       <= 0x7f        1 byte
       <= 0x7ff       2 bytes (lead 0xc0)
       <= 0xffff      3 bytes (lead 0xe0)
       <= 0x1fffff    4 bytes (lead 0xf0)
       <= 0x3ffffff   5 bytes (lead 0xf8)
       otherwise      6 bytes (lead 0xfc)
   Every continuation byte carries six bits of CH under the 0x80
   marker.  */
3552 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3553 unsigned_char_dynarr *dst, unsigned int *flags)
3555 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* CR is written for every EOL type except LF and autodetect; the
   character itself is written for every EOL type except CR.  */
3559 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3560 Dynarr_add (dst, '\r');
3561 if (eol_type != EOL_CR)
3562 Dynarr_add (dst, ch);
3564 else if (ch <= 0x7f)
3566 Dynarr_add (dst, ch);
3568 else if (ch <= 0x7ff)
3570 Dynarr_add (dst, (ch >> 6) | 0xc0);
3571 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3573 else if (ch <= 0xffff)
3575 Dynarr_add (dst, (ch >> 12) | 0xe0);
3576 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3577 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3579 else if (ch <= 0x1fffff)
3581 Dynarr_add (dst, (ch >> 18) | 0xf0);
3582 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3583 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3584 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3586 else if (ch <= 0x3ffffff)
3588 Dynarr_add (dst, (ch >> 24) | 0xf8);
3589 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3590 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3591 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3592 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3596 Dynarr_add (dst, (ch >> 30) | 0xfc);
3597 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3598 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3599 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3600 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3601 Dynarr_add (dst, (ch & 0x3f) | 0x80);
/* The finish hook installed for UTF-8 by reset_encoding_stream().
   text_encode_generic() calls it once the stream is flagged
   CODING_STATE_END and no character is pending.  */
3606 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3607 unsigned int *flags)
3612 /************************************************************************/
3613 /* ISO2022 methods */
3614 /************************************************************************/
3616 /* The following note describes the coding system ISO2022 briefly.
3617 Since the intention of this note is to help understand the
3618 functions in this file, some parts are NOT ACCURATE or OVERLY
3619 SIMPLIFIED. For thorough understanding, please refer to the
3620 original document of ISO2022.
3622 ISO2022 provides many mechanisms to encode several character sets
3623 in 7-bit and 8-bit environments. For 7-bit environments, all text
3624 is encoded using bytes less than 128. This may make the encoded
3625 text a little bit longer, but the text passes more easily through
3626 several gateways, some of which strip off MSB (Most Significant Bit).
3628 There are two kinds of character sets: control character set and
3629 graphic character set. The former contains control characters such
3630 as `newline' and `escape' to provide control functions (control
3631 functions are also provided by escape sequences). The latter
3632 contains graphic characters such as 'A' and '-'. Emacs recognizes
3633 two control character sets and many graphic character sets.
3635 Graphic character sets are classified into one of the following
3636 four classes, according to the number of bytes (DIMENSION) and
3637 number of characters in one dimension (CHARS) of the set:
3638 - DIMENSION1_CHARS94
3639 - DIMENSION1_CHARS96
3640 - DIMENSION2_CHARS94
3641 - DIMENSION2_CHARS96
3643 In addition, each character set is assigned an identification tag,
3644 unique for each set, called "final character" (denoted as <F>
3645 hereafter). The <F> of each character set is decided by ECMA(*)
3646 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3647 (0x30..0x3F are for private use only).
3649 Note (*): ECMA = European Computer Manufacturers Association
3651 Here are examples of graphic character set [NAME(<F>)]:
3652 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3653 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3654 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3655 o DIMENSION2_CHARS96 -- none for the moment
3657 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3658 C0 [0x00..0x1F] -- control character plane 0
3659 GL [0x20..0x7F] -- graphic character plane 0
3660 C1 [0x80..0x9F] -- control character plane 1
3661 GR [0xA0..0xFF] -- graphic character plane 1
3663 A control character set is directly designated and invoked to C0 or
3664 C1 by an escape sequence. The most common case is that:
3665 - ISO646's control character set is designated/invoked to C0, and
3666 - ISO6429's control character set is designated/invoked to C1,
3667 and usually these designations/invocations are omitted in encoded
3668 text. In a 7-bit environment, only C0 can be used, and a control
3669 character for C1 is encoded by an appropriate escape sequence to
3670 fit into the environment. All control characters for C1 are
3671 defined to have corresponding escape sequences.
3673 A graphic character set is at first designated to one of four
3674 graphic registers (G0 through G3), then these graphic registers are
3675 invoked to GL or GR. These designations and invocations can be
3676 done independently. The most common case is that G0 is invoked to
3677 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3678 these invocations and designations are omitted in encoded text.
3679 In a 7-bit environment, only GL can be used.
3681 When a graphic character set of CHARS94 is invoked to GL, codes
3682 0x20 and 0x7F of the GL area work as control characters SPACE and
3683 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3686 There are two ways of invocation: locking-shift and single-shift.
3687 With locking-shift, the invocation lasts until the next different
3688 invocation, whereas with single-shift, the invocation affects the
3689 following character only and doesn't affect the locking-shift
3690 state. Invocations are done by the following control characters or
3693 ----------------------------------------------------------------------
3694 abbrev function cntrl escape seq description
3695 ----------------------------------------------------------------------
3696 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3697 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3698 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3699 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3700 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3701 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3702 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3703 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3704 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3705 ----------------------------------------------------------------------
3706 (*) These are not used by any known coding system.
3708 Control characters for these functions are defined by macros
3709 ISO_CODE_XXX in `coding.h'.
3711 Designations are done by the following escape sequences:
3712 ----------------------------------------------------------------------
3713 escape sequence description
3714 ----------------------------------------------------------------------
3715 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3716 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3717 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3718 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3719 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3720 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3721 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3722 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3723 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3724 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3725 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3726 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3727 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3728 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3729 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3730 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3731 ----------------------------------------------------------------------
3733 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3734 of dimension 1, chars 94, and final character <F>, etc...
3736 Note (*): Although these designations are not allowed in ISO2022,
3737 Emacs accepts them on decoding, and produces them on encoding
3738 CHARS96 character sets in a coding system which is characterized as
3739 7-bit environment, non-locking-shift, and non-single-shift.
3741 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3742 '(' can be omitted. We refer to this as "short-form" hereafter.
3744 Now you may notice that there are a lot of ways for encoding the
3745 same multilingual text in ISO2022. Actually, there exist many
3746 coding systems such as Compound Text (used in X11's inter client
3747 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3748 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3749 localized platforms), and all of these are variants of ISO2022.
3751 In addition to the above, Emacs handles two more kinds of escape
3752 sequences: ISO6429's direction specification and Emacs' private
3753 sequence for specifying character composition.
3755 ISO6429's direction specification takes the following form:
3756 o CSI ']' -- end of the current direction
3757 o CSI '0' ']' -- end of the current direction
3758 o CSI '1' ']' -- start of left-to-right text
3759 o CSI '2' ']' -- start of right-to-left text
3760 The control character CSI (0x9B: control sequence introducer) is
3761 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3763 Character composition specification takes the following form:
3764 o ESC '0' -- start character composition
3765 o ESC '1' -- end character composition
3766 Since these are not standard escape sequences of any ISO standard,
3767 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its starting condition.

   For each of the four graphic registers, the register's charset is
   taken from CODING_SYSTEM's initial charset for that register when
   CODING_SYSTEM is non-nil; the visible alternative stores Qt.  The
   invalid-designation marker of every register is cleared.
   NOTE(review): the left-hand side of the initial-charset assignment
   and the `else' separating the two stores are not visible in this
   excerpt -- presumably both write ISO->charset[i]; confirm.

   After that the escape-parsing state is cleared, register 0 is made
   the left-half register and register 1 the right-half register, and
   all the direction/literal-output bookkeeping flags are zeroed.  When
   ENABLE_COMPOSITE_CHARS is defined and a composite-character buffer
   exists, it is emptied with Dynarr_reset. */
3770 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3774 for (i = 0; i < 4; i++)
3776 if (!NILP (coding_system))
3778 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3780 iso->charset[i] = Qt;
3781 iso->invalid_designated[i] = 0;
3783 iso->esc = ISO_ESC_NOTHING;
3784 iso->esc_bytes_index = 0;
3785 iso->register_left = 0;
3786 iso->register_right = 1;
3787 iso->switched_dir_and_no_valid_charset_yet = 0;
3788 iso->invalid_switch_dir = 0;
3789 iso->output_direction_sequence = 0;
3790 iso->output_literally = 0;
3791 #ifdef ENABLE_COMPOSITE_CHARS
3792 if (iso->composite_chars)
3793 Dynarr_reset (iso->composite_chars);
/* Predicate on a single byte C.  Its callers in this file consult it
   only when the coding system has escape-quoting enabled: the decoder
   then treats ESC followed by such a byte as a literal, and the encoder
   writes an ESC in front of such a byte.
   NOTE(review): the body is not visible in this excerpt, so the exact
   set of bytes it accepts cannot be stated here. */
3798 fit_to_be_escape_quoted (unsigned char c)
3815 /* Parse one byte of an ISO2022 escape sequence.
3816 If the result is an invalid escape sequence, return 0 and
3817 do not change anything in STR. Otherwise, if the result is
3818 an incomplete escape sequence, update ISO2022.ESC and
3819 ISO2022.ESC_BYTES and return -1. Otherwise, update
3820 all the state variables (but not ISO2022.ESC_BYTES) and
3823 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3824 or invocation of an invalid character set and treat that as
3825 an unrecognized escape sequence. */
/* Arguments, as used below:
   CODESYS is the coding system being decoded, or nil during
   coding-system detection; it is consulted for escape-quoting and for
   the input charset-conversion table.
   ISO is the decoder state that is read and updated.
   C is the byte to parse.
   FLAGS points to the CODING_STATE_* bits of the stream; the escape,
   single-shift, composite and right-to-left bits are maintained here.
   Whenever a sequence completes, FLAGS is reduced to its
   CODING_STATE_ISO2022_LOCK bits before any new bit is added, so that
   the locked state survives and the transient state is dropped. */
3828 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3829 unsigned char c, unsigned int *flags,
3830 int check_invalid_charsets)
3832 /* (1) If we're at the end of a designation sequence, CS is the
3833 charset being designated and REG is the register to designate
3836 (2) If we're at the end of a locking-shift sequence, REG is
3837 the register to invoke and HALF (0 == left, 1 == right) is
3838 the half to invoke it into.
3840 (3) If we're at the end of a single-shift sequence, REG is
3841 the register to invoke. */
3842 Lisp_Object cs = Qnil;
3845 /* NOTE: This code does goto's all over the place.
3846 The reason for this is that we're basically implementing
3847 a state machine here, and hierarchical languages like C
3848 don't really provide a clean way of doing this. */
3850 if (! (*flags & CODING_STATE_ESCAPE))
3851 /* At beginning of escape sequence; we need to reset our
3852 escape-state variables. */
3853 iso->esc = ISO_ESC_NOTHING;
3855 iso->output_literally = 0;
3856 iso->output_direction_sequence = 0;
3860 case ISO_ESC_NOTHING:
3861 iso->esc_bytes_index = 0;
3864 case ISO_CODE_ESC: /* Start escape sequence */
3865 *flags |= CODING_STATE_ESCAPE;
3869 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3870 *flags |= CODING_STATE_ESCAPE;
3871 iso->esc = ISO_ESC_5_11;
3874 case ISO_CODE_SO: /* locking shift 1 */
3877 case ISO_CODE_SI: /* locking shift 0 */
3881 case ISO_CODE_SS2: /* single shift */
3884 case ISO_CODE_SS3: /* single shift */
3888 default: /* Other control characters */
3895 /**** single shift ****/
3897 case 'N': /* single shift 2 */
3900 case 'O': /* single shift 3 */
3904 /**** locking shift ****/
3906 case '~': /* locking shift 1 right */
3909 case 'n': /* locking shift 2 */
3912 case '}': /* locking shift 2 right */
3915 case 'o': /* locking shift 3 */
3918 case '|': /* locking shift 3 right */
3922 #ifdef ENABLE_COMPOSITE_CHARS
3923 /**** composite ****/
3926 iso->esc = ISO_ESC_START_COMPOSITE;
3927 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3928 CODING_STATE_COMPOSITE;
3932 iso->esc = ISO_ESC_END_COMPOSITE;
3933 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3934 ~CODING_STATE_COMPOSITE;
3936 #endif /* ENABLE_COMPOSITE_CHARS */
3938 /**** directionality ****/
3941 iso->esc = ISO_ESC_5_11;
3944 /**** designation ****/
3946 case '$': /* multibyte charset prefix */
3947 iso->esc = ISO_ESC_2_4;
3951 if (0x28 <= c && c <= 0x2F)
3953 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3957 /* This function is called with CODESYS equal to nil when
3958 doing coding-system detection. */
3960 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3961 && fit_to_be_escape_quoted (c))
3963 iso->esc = ISO_ESC_LITERAL;
3964 *flags &= CODING_STATE_ISO2022_LOCK;
3974 /**** directionality ****/
3976 case ISO_ESC_5_11: /* ISO6429 direction control */
3979 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3980 goto directionality;
3982 if (c == '0') iso->esc = ISO_ESC_5_11_0;
3983 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3984 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3988 case ISO_ESC_5_11_0:
3991 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3992 goto directionality;
3996 case ISO_ESC_5_11_1:
/* Start of left-to-right text: keep the locked bits that are already
   set and clear the right-to-left bit.  This has to mask the existing
   flags (as the two cases above do); a plain assignment of the mask
   would switch on every other locked bit regardless of prior state. */
3999 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4000 goto directionality;
4004 case ISO_ESC_5_11_2:
4007 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4008 goto directionality;
4013 iso->esc = ISO_ESC_DIRECTIONALITY;
4014 /* Various junk here to attempt to preserve the direction sequences
4015 literally in the text if they would otherwise be swallowed due
4016 to invalid designations that don't show up as actual charset
4017 changes in the text. */
4018 if (iso->invalid_switch_dir)
4020 /* We already inserted a direction switch literally into the
4021 text. We assume (#### this may not be right) that the
4022 next direction switch is the one going the other way,
4023 and we need to output that literally as well. */
4024 iso->output_literally = 1;
4025 iso->invalid_switch_dir = 0;
4031 /* If we are in the thrall of an invalid designation,
4032 then stick the directionality sequence literally into the
4033 output stream so it ends up in the original text again. */
4034 for (jj = 0; jj < 4; jj++)
4035 if (iso->invalid_designated[jj])
4039 iso->output_literally = 1;
4040 iso->invalid_switch_dir = 1;
4043 /* Indicate that we haven't yet seen a valid designation,
4044 so that if a switch-dir is directly followed by an
4045 invalid designation, both get inserted literally. */
4046 iso->switched_dir_and_no_valid_charset_yet = 1;
4051 /**** designation ****/
4054 if (0x28 <= c && c <= 0x2F)
4056 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4059 if (0x40 <= c && c <= 0x42)
4061 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4062 *flags & CODING_STATE_R2L ?
4063 CHARSET_RIGHT_TO_LEFT :
4064 CHARSET_LEFT_TO_RIGHT);
4074 if (c < '0' || c > '~')
4075 return 0; /* bad final byte */
4077 if (iso->esc >= ISO_ESC_2_8 &&
4078 iso->esc <= ISO_ESC_2_15)
4080 type = ((iso->esc >= ISO_ESC_2_12) ?
4081 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4082 reg = (iso->esc - ISO_ESC_2_8) & 3;
4084 else if (iso->esc >= ISO_ESC_2_4_8 &&
4085 iso->esc <= ISO_ESC_2_4_15)
4087 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4088 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4089 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4093 /* Can this ever be reached? -slb */
4097 cs = CHARSET_BY_ATTRIBUTES (type, c,
4098 *flags & CODING_STATE_R2L ?
4099 CHARSET_RIGHT_TO_LEFT :
4100 CHARSET_LEFT_TO_RIGHT);
4106 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4110 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4111 /* can't invoke something that ain't there. */
4113 iso->esc = ISO_ESC_SINGLE_SHIFT;
4114 *flags &= CODING_STATE_ISO2022_LOCK;
4116 *flags |= CODING_STATE_SS2;
4118 *flags |= CODING_STATE_SS3;
4122 if (check_invalid_charsets &&
4123 !CHARSETP (iso->charset[reg]))
4124 /* can't invoke something that ain't there. */
4127 iso->register_right = reg;
4129 iso->register_left = reg;
4130 *flags &= CODING_STATE_ISO2022_LOCK;
4131 iso->esc = ISO_ESC_LOCKING_SHIFT;
4135 if (NILP (cs) && check_invalid_charsets)
4137 iso->invalid_designated[reg] = 1;
4138 iso->charset[reg] = Vcharset_ascii;
4139 iso->esc = ISO_ESC_DESIGNATE;
4140 *flags &= CODING_STATE_ISO2022_LOCK;
4141 iso->output_literally = 1;
4142 if (iso->switched_dir_and_no_valid_charset_yet)
4144 /* We encountered a switch-direction followed by an
4145 invalid designation. Ensure that the switch-direction
4146 gets outputted; otherwise it will probably get eaten
4147 when the text is written out again. */
4148 iso->switched_dir_and_no_valid_charset_yet = 0;
4149 iso->output_direction_sequence = 1;
4150 /* And make sure that the switch-dir going the other
4151 way gets outputted, as well. */
4152 iso->invalid_switch_dir = 1;
4156 /* This function is called with CODESYS equal to nil when
4157 doing coding-system detection. */
4158 if (!NILP (codesys))
4160 charset_conversion_spec_dynarr *dyn =
4161 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4167 for (i = 0; i < Dynarr_length (dyn); i++)
4169 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4170 if (EQ (cs, spec->from_charset))
4171 cs = spec->to_charset;
4176 iso->charset[reg] = cs;
4177 iso->esc = ISO_ESC_DESIGNATE;
4178 *flags &= CODING_STATE_ISO2022_LOCK;
4179 if (iso->invalid_designated[reg])
4181 iso->invalid_designated[reg] = 0;
4182 iso->output_literally = 1;
4184 if (iso->switched_dir_and_no_valid_charset_yet)
4185 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Examine bytes from SRC and narrow down which ISO2022 coding
   categories are still possible, keeping the running state in
   ST->iso2022 so that the work can continue across calls.

   On the first call (ST->iso2022.initted is zero) the embedded decoder
   state is reset with a nil coding system, the candidate mask is set to
   all five ISO categories (ISO_7, ISO_8_DESIGNATE, ISO_8_1, ISO_8_2 and
   ISO_LOCK_SHIFT), and the counters are cleared.

   What the visible statements do while scanning:
   - ISO_7 is removed from the mask and high_byte_count is incremented
     in the high-byte handling; an odd run of high bytes with no single
     shift seen removes ISO_8_2.
   - A short list of control characters that commonly appear in text
     files is skipped without affecting the mask.
   - Escape sequences are parsed with parse_iso2022_esc, passing nil as
     the coding system and 0 for CHECK_INVALID_CHARSETS.  A completed
     designation removes ISO_8_1 and ISO_8_2; a locking shift reduces
     the mask to ISO_LOCK_SHIFT alone and stops scanning; a single shift
     removes ISO_8_DESIGNATE and sets saw_single_shift.
   NOTE(review): the loop header, several conditions and the return
   statement are not visible in this excerpt. */
4190 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4195 /* #### There are serious deficiencies in the recognition mechanism
4196 here. This needs to be much smarter if it's going to cut it.
4197 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4198 it should be detected as Latin-1.
4199 All the ISO2022 stuff in this file should be synced up with the
4200 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4201 Perhaps we should wait till R2L works in FSF Emacs? */
4203 if (!st->iso2022.initted)
4205 reset_iso2022 (Qnil, &st->iso2022.iso);
4206 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4207 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4208 CODING_CATEGORY_ISO_8_1_MASK |
4209 CODING_CATEGORY_ISO_8_2_MASK |
4210 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4211 st->iso2022.flags = 0;
4212 st->iso2022.high_byte_count = 0;
4213 st->iso2022.saw_single_shift = 0;
4214 st->iso2022.initted = 1;
4217 mask = st->iso2022.mask;
4224 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4225 st->iso2022.high_byte_count++;
4229 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4231 if (st->iso2022.high_byte_count & 1)
4232 /* odd number of high bytes; assume not iso-8-2 */
4233 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4235 st->iso2022.high_byte_count = 0;
4236 st->iso2022.saw_single_shift = 0;
4238 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4240 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4241 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4242 { /* control chars */
4245 /* Allow and ignore control characters that you might
4246 reasonably see in a text file */
4251 case 8: /* backspace */
4252 case 11: /* vertical tab */
4253 case 12: /* form feed */
4254 case 26: /* MS-DOS C-z junk */
4255 case 31: /* '^_' -- for info */
4256 goto label_continue_loop;
4263 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4266 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4267 &st->iso2022.flags, 0))
4269 switch (st->iso2022.iso.esc)
4271 case ISO_ESC_DESIGNATE:
4272 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4273 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4275 case ISO_ESC_LOCKING_SHIFT:
4276 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4277 goto ran_out_of_chars;
4278 case ISO_ESC_SINGLE_SHIFT:
4279 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4280 st->iso2022.saw_single_shift = 1;
4289 goto ran_out_of_chars;
4292 label_continue_loop:;
/* Adjust a detection MASK of ISO coding categories after scanning: when
   the seven-bit category is still a candidate, the three eight-bit
   categories (ISO_8_DESIGNATE, ISO_8_1, ISO_8_2) are removed.
   NOTE(review): the return statement is not visible in this excerpt;
   presumably the adjusted mask is returned -- confirm. */
4301 postprocess_iso2022_mask (int mask)
4303 /* #### kind of cheesy */
4304 /* If seven-bit ISO is allowed, then assume that the encoding is
4305 entirely seven-bit and turn off the eight-bit ones. */
4306 if (mask & CODING_CATEGORY_ISO_7_MASK)
4307 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4308 CODING_CATEGORY_ISO_8_1_MASK |
4309 CODING_CATEGORY_ISO_8_2_MASK);
4313 /* If FLAGS is a null pointer or specifies right-to-left motion,
4314 output a switch-dir-to-left-to-right sequence to DST.
4315 Also update FLAGS if it is not a null pointer.
4316 If INTERNAL_P is set, we are outputting in internal format and
4317 need to handle the CSI differently. */
/* The sequence written is the control sequence introducer followed by
   '0' and ']'.  The introducer takes one of three forms: ESC '[' when
   CODESYS is a seven-bit coding system; otherwise ISO_CODE_CSI, added
   through DECODE_ADD_BINARY_CHAR when INTERNAL_P is set and as a raw
   byte when it is not.  Afterwards CODING_STATE_R2L is cleared in
   *FLAGS.
   NOTE(review): the null-pointer guard in front of that final store is
   not visible in this excerpt; the header comment above implies one. */
4320 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4321 unsigned_char_dynarr *dst,
4322 unsigned int *flags,
4325 if (!flags || (*flags & CODING_STATE_R2L))
4327 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4329 Dynarr_add (dst, ISO_CODE_ESC);
4330 Dynarr_add (dst, '[');
4332 else if (internal_p)
4333 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4335 Dynarr_add (dst, ISO_CODE_CSI);
4336 Dynarr_add (dst, '0');
4337 Dynarr_add (dst, ']');
4339 *flags &= ~CODING_STATE_R2L;
4343 /* If FLAGS is a null pointer or specifies a direction different from
4344 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4345 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4346 sequence to DST. Also update FLAGS if it is not a null pointer.
4347 If INTERNAL_P is set, we are outputting in internal format and
4348 need to handle the CSI differently. */
/* Switching to left-to-right is delegated to
   restore_left_to_right_direction.  Switching to right-to-left is done
   here, and only when CODESYS does not have its NO_ISO6429 property
   set: the control sequence introducer (ESC '[' for a seven-bit coding
   system, otherwise ISO_CODE_CSI, added through DECODE_ADD_BINARY_CHAR
   when INTERNAL_P is set) is followed by '2' and ']', and
   CODING_STATE_R2L is then set in *FLAGS.
   NOTE(review): the null-pointer guard in front of that final store is
   not visible in this excerpt; the header comment above implies one. */
4351 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4352 unsigned_char_dynarr *dst, unsigned int *flags,
4355 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4356 direction == CHARSET_LEFT_TO_RIGHT)
4357 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4358 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4359 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4360 direction == CHARSET_RIGHT_TO_LEFT)
4362 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4364 Dynarr_add (dst, ISO_CODE_ESC);
4365 Dynarr_add (dst, '[');
4367 else if (internal_p)
4368 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4370 Dynarr_add (dst, ISO_CODE_CSI);
4371 Dynarr_add (dst, '2');
4372 Dynarr_add (dst, ']');
4374 *flags |= CODING_STATE_R2L;
4378 /* Convert ISO2022-format data to internal format. */
/* Bytes are read from SRC and the decoded text is appended to DST.  The
   stream state (flags, the pending byte CH, the end-of-line type and
   the ISO2022 decoder state) lives in the decoding stream attached to
   DECODING.  When ENABLE_COMPOSITE_CHARS is defined and a composite
   sequence is in progress, output is redirected into the stream's
   composite_chars buffer.

   Each input byte falls into one of three cases:

   1. The stream is inside an escape sequence: the byte is handed to
      parse_iso2022_esc and the resulting escape state is acted upon
      (start/end of a composite sequence, or a literal byte).  If the
      sequence was invalid, or was flagged for literal output, the
      collected escape bytes are copied to DST as binary characters.

   2. The byte is a C0 or C1 control character: any partial character
      and any pending single shift are dumped first, then end-of-line
      conversion is applied and the byte is offered to
      parse_iso2022_esc with charset checking enabled; a byte that is
      rejected there is added as a binary character.

   3. Otherwise the byte is a graphic character: the register is chosen
      (register 2 or 3 after a single shift, else the right-half
      register for non-ASCII bytes and the left-half register for ASCII
      bytes), the register's charset is validated, a reverse-direction
      charset is substituted when the text direction calls for it, and
      the character is written in internal format.

   At the end, a partial character is dumped if CODING_STATE_END is
   set.
   NOTE(review): the loop header and the statements that store the
   state back into the stream are not visible in this excerpt. */
4381 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4382 unsigned_char_dynarr *dst, unsigned int n)
4384 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4385 unsigned int flags = str->flags;
4386 unsigned int ch = str->ch;
4387 eol_type_t eol_type = str->eol_type;
4388 #ifdef ENABLE_COMPOSITE_CHARS
4389 unsigned_char_dynarr *real_dst = dst;
4391 Lisp_Object coding_system;
4393 XSETCODING_SYSTEM (coding_system, str->codesys);
4395 #ifdef ENABLE_COMPOSITE_CHARS
4396 if (flags & CODING_STATE_COMPOSITE)
4397 dst = str->iso2022.composite_chars;
4398 #endif /* ENABLE_COMPOSITE_CHARS */
4402 unsigned char c = *src++;
4403 if (flags & CODING_STATE_ESCAPE)
4404 { /* Within ESC sequence */
4405 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4410 switch (str->iso2022.esc)
4412 #ifdef ENABLE_COMPOSITE_CHARS
4413 case ISO_ESC_START_COMPOSITE:
4414 if (str->iso2022.composite_chars)
4415 Dynarr_reset (str->iso2022.composite_chars);
4417 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4418 dst = str->iso2022.composite_chars;
4420 case ISO_ESC_END_COMPOSITE:
4422 Bufbyte comstr[MAX_EMCHAR_LEN];
4424 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4425 Dynarr_length (dst));
4427 len = set_charptr_emchar (comstr, emch);
4428 Dynarr_add_many (dst, comstr, len);
4431 #endif /* ENABLE_COMPOSITE_CHARS */
4433 case ISO_ESC_LITERAL:
4434 DECODE_ADD_BINARY_CHAR (c, dst);
4438 /* Everything else handled already */
4443 /* Attempted error recovery. */
4444 if (str->iso2022.output_direction_sequence)
4445 ensure_correct_direction (flags & CODING_STATE_R2L ?
4446 CHARSET_RIGHT_TO_LEFT :
4447 CHARSET_LEFT_TO_RIGHT,
4448 str->codesys, dst, 0, 1);
4449 /* More error recovery. */
4450 if (!retval || str->iso2022.output_literally)
4452 /* Output the (possibly invalid) sequence */
4454 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4455 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4456 flags &= CODING_STATE_ISO2022_LOCK;
4458 n++, src--;/* Repeat the loop with the same character. */
4461 /* No sense in reprocessing the final byte of the
4462 escape sequence; it could mess things up anyway.
4464 DECODE_ADD_BINARY_CHAR (c, dst);
4469 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4470 { /* Control characters */
4472 /***** Error-handling *****/
4474 /* If we were in the middle of a character, dump out the
4475 partial character. */
4476 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4478 /* If we just saw a single-shift character, dump it out.
4479 This may dump out the wrong sort of single-shift character,
4480 but at least it will give an indication that something went
4482 if (flags & CODING_STATE_SS2)
4484 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4485 flags &= ~CODING_STATE_SS2;
4487 if (flags & CODING_STATE_SS3)
4489 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4490 flags &= ~CODING_STATE_SS3;
4493 /***** Now handle the control characters. *****/
4496 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4498 flags &= CODING_STATE_ISO2022_LOCK;
4500 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4501 DECODE_ADD_BINARY_CHAR (c, dst);
4504 { /* Graphic characters */
4505 Lisp_Object charset;
4511 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4513 /* Now determine the charset. */
4514 reg = ((flags & CODING_STATE_SS2) ? 2
4515 : (flags & CODING_STATE_SS3) ? 3
4516 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4517 : str->iso2022.register_left);
4518 charset = str->iso2022.charset[reg];
4520 /* Error checking: */
4521 if (! CHARSETP (charset)
4522 || str->iso2022.invalid_designated[reg]
4523 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4524 && XCHARSET_CHARS (charset) == 94))
4525 /* Mrmph. We are trying to invoke a register that has no
4526 or an invalid charset in it, or trying to add a character
4527 outside the range of the charset. Insert that char literally
4528 to preserve it for the output. */
4530 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4531 DECODE_ADD_BINARY_CHAR (c, dst);
4536 /* Things are probably hunky-dory. */
4538 /* Fetch reverse charset, maybe. */
4539 if (((flags & CODING_STATE_R2L) &&
4540 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4542 (!(flags & CODING_STATE_R2L) &&
4543 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4545 Lisp_Object new_charset =
4546 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4547 if (!NILP (new_charset))
4548 charset = new_charset;
4552 if (XCHARSET_DIMENSION (charset) == 1)
4554 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4556 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4561 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
4567 lb = XCHARSET_LEADING_BYTE (charset);
4568 switch (XCHARSET_REP_BYTES (charset))
4571 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4572 Dynarr_add (dst, c & 0x7F);
4575 case 2: /* one-byte official */
4576 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4577 Dynarr_add (dst, lb);
4578 Dynarr_add (dst, c | 0x80);
4581 case 3: /* one-byte private or two-byte official */
4582 if (XCHARSET_PRIVATE_P (charset))
4584 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4585 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4586 Dynarr_add (dst, lb);
4587 Dynarr_add (dst, c | 0x80);
4593 Dynarr_add (dst, lb);
4594 Dynarr_add (dst, ch | 0x80);
4595 Dynarr_add (dst, c | 0x80);
4603 default: /* two-byte private */
4606 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4607 Dynarr_add (dst, lb);
4608 Dynarr_add (dst, ch | 0x80);
4609 Dynarr_add (dst, c | 0x80);
4619 flags &= CODING_STATE_ISO2022_LOCK;
4622 label_continue_loop:;
4625 if (flags & CODING_STATE_END)
4626 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4633 /***** ISO2022 encoder *****/
4635 /* Designate CHARSET into register REG. */
/* The encoder state in STR is updated first: CHARSET is recorded as the
   contents of register REG, and the previous contents are remembered in
   OLD_CHARSET.

   Two checks precede any output.  CHARSET may not be a charset object
   at all (an initial nil or t).  And when the register is not marked
   force_charset_on_output, the old contents may already have the same
   type and final character as CHARSET.
   NOTE(review): the statements governed by those two checks are not
   visible in this excerpt; presumably both return without writing
   anything -- confirm.

   Otherwise the force flag of the register is cleared, CHARSET is
   mapped through the coding system's output_conv table (an entry whose
   from_charset matches replaces it with to_charset), and the
   designation sequence is appended to DST: ESC, then the intermediate
   byte(s) for the charset type, then the final character.  INTER94 and
   INTER96 hold the intermediate bytes for registers 0 to 3 of the
   94-character and 96-character families; the two-dimensional types
   are prefixed with '$'.  For CHARSET_TYPE_94X94 the register
   intermediate is added conditionally, and one visible term of that
   condition is that the coding system does not ask for the short
   form. */
4638 iso2022_designate (Lisp_Object charset, unsigned char reg,
4639 struct encoding_stream *str, unsigned_char_dynarr *dst)
4641 static CONST char inter94[] = "()*+";
4642 static CONST char inter96[] = ",-./";
4644 unsigned char final;
4645 Lisp_Object old_charset = str->iso2022.charset[reg];
4647 str->iso2022.charset[reg] = charset;
4648 if (!CHARSETP (charset))
4649 /* charset might be an initial nil or t. */
4651 type = XCHARSET_TYPE (charset);
4652 final = XCHARSET_FINAL (charset);
4653 if (!str->iso2022.force_charset_on_output[reg] &&
4654 CHARSETP (old_charset) &&
4655 XCHARSET_TYPE (old_charset) == type &&
4656 XCHARSET_FINAL (old_charset) == final)
4659 str->iso2022.force_charset_on_output[reg] = 0;
4662 charset_conversion_spec_dynarr *dyn =
4663 str->codesys->iso2022.output_conv;
4669 for (i = 0; i < Dynarr_length (dyn); i++)
4671 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4672 if (EQ (charset, spec->from_charset))
4673 charset = spec->to_charset;
4678 Dynarr_add (dst, ISO_CODE_ESC);
4681 case CHARSET_TYPE_94:
4682 Dynarr_add (dst, inter94[reg]);
4684 case CHARSET_TYPE_96:
4685 Dynarr_add (dst, inter96[reg]);
4687 case CHARSET_TYPE_94X94:
4688 Dynarr_add (dst, '$');
4690 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4693 Dynarr_add (dst, inter94[reg]);
4695 case CHARSET_TYPE_96X96:
4696 Dynarr_add (dst, '$');
4697 Dynarr_add (dst, inter96[reg]);
4700 Dynarr_add (dst, final);
/* Make register 0 the one invoked into the left half: if the encoder
   state in STR records any other left-half register, append
   ISO_CODE_SI to DST and record register 0. */
4704 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4706 if (str->iso2022.register_left != 0)
4708 Dynarr_add (dst, ISO_CODE_SI);
4709 str->iso2022.register_left = 0;
/* Make register 1 the one invoked into the left half: if the encoder
   state in STR records any other left-half register, append
   ISO_CODE_SO to DST and record register 1. */
4714 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4716 if (str->iso2022.register_left != 1)
4718 Dynarr_add (dst, ISO_CODE_SO);
4719 str->iso2022.register_left = 1;
/* Encode the character CH in ISO2022 form and append the bytes to DST.
   The encoder state is kept in STR->iso2022; the current charset and
   half are read from it at entry and written back at the end.  FLAGS
   carries the stream's CODING_STATE_* bits and is handed to the
   direction helpers.

   Outline of the visible statements, in order:
   - For ASCII-range input, left-to-right direction is restored and,
     for printable characters or when the coding system does not have
     NO_ASCII_CNTL set, register 0 is shifted in and ASCII is designated
     to it.  For a newline (unless NO_ASCII_EOL is set) direction and
     shift are restored and every register is re-designated to the
     coding system's initial charset.  The newline is then written
     according to the end-of-line type; other characters are written
     with a leading ESC when escape-quoting applies.
   - Characters 0x80 to 0x9f are written as they are, again with a
     leading ESC when escape-quoting applies.
   - For any other character, the four registers are searched for a
     charset (currently designated or initial) that has a code point for
     CH according to charset_code_point.  A fallback walks
     Vdefault_coded_charset_priority_list, which is modified temporarily
     and put back afterwards, and ends with ASCII.  The direction is
     then corrected, a register is chosen and designated, the register
     is invoked (SI or SO, or a single shift written as ESC 'N' / ESC
     'O' in seven-bit coding systems and as ISO_CODE_SS2 / ISO_CODE_SS3
     otherwise), and one or two bytes are written with the high bit set
     when the right half is in use.
   NOTE(review): many conditions, case labels and assignments of this
   function are not visible in this excerpt; the outline above covers
   only what the visible lines establish. */
4724 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4725 unsigned_char_dynarr *dst, unsigned int *flags)
4727 unsigned char charmask;
4728 Lisp_Coding_System* codesys = str->codesys;
4729 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4731 Lisp_Object charset = str->iso2022.current_charset;
4732 int half = str->iso2022.current_half;
4733 unsigned int byte1, byte2;
4737 restore_left_to_right_direction (codesys, dst, flags, 0);
4739 /* Make sure G0 contains ASCII */
4740 if ((ch > ' ' && ch < ISO_CODE_DEL)
4741 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4743 ensure_normal_shift (str, dst);
4744 iso2022_designate (Vcharset_ascii, 0, str, dst);
4747 /* If necessary, restore everything to the default state
4749 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4751 restore_left_to_right_direction (codesys, dst, flags, 0);
4753 ensure_normal_shift (str, dst);
4755 for (i = 0; i < 4; i++)
4757 Lisp_Object initial_charset =
4758 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4759 iso2022_designate (initial_charset, i, str, dst);
4764 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4765 Dynarr_add (dst, '\r');
4766 if (eol_type != EOL_CR)
4767 Dynarr_add (dst, ch);
4771 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4772 && fit_to_be_escape_quoted (ch))
4773 Dynarr_add (dst, ISO_CODE_ESC);
4774 Dynarr_add (dst, ch);
4777 else if ( (0x80 <= ch) && (ch <= 0x9f) )
4779 charmask = (half == 0 ? 0x00 : 0x80);
4781 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4782 && fit_to_be_escape_quoted (ch))
4783 Dynarr_add (dst, ISO_CODE_ESC);
4784 /* you asked for it ... */
4785 Dynarr_add (dst, ch);
4791 /* Now determine which register to use. */
4793 for (i = 0; i < 4; i++)
4795 Lisp_Object code_point;
4797 if ((CHARSETP (charset = str->iso2022.charset[i])
4798 && !EQ (code_point = charset_code_point (charset, ch), Qnil))
4802 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
4803 && !EQ (code_point = charset_code_point (charset, ch), Qnil)))
4805 Lisp_Object ret = Fcar (code_point);
4810 ret = Fcar (Fcdr (code_point));
4827 Lisp_Object original_default_coded_charset_priority_list
4828 = Vdefault_coded_charset_priority_list;
4830 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
4832 BREAKUP_CHAR (ch, charset, byte1, byte2);
4833 if (XCHARSET_FINAL (charset))
4835 Vdefault_coded_charset_priority_list
4836 = Fcdr (Fmemq (XCHARSET_NAME (charset),
4837 Vdefault_coded_charset_priority_list));
4839 BREAKUP_CHAR (ch, charset, byte1, byte2);
4840 if (!XCHARSET_FINAL (charset))
4842 charset = Vcharset_ascii;
4846 Vdefault_coded_charset_priority_list
4847 = original_default_coded_charset_priority_list;
4849 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4850 codesys, dst, flags, 0);
4854 if (XCHARSET_GRAPHIC (charset) != 0)
4856 if (!NILP (str->iso2022.charset[1]) &&
4857 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4858 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4860 else if (!NILP (str->iso2022.charset[2]))
4862 else if (!NILP (str->iso2022.charset[3]))
4871 iso2022_designate (charset, reg, str, dst);
4873 /* Now invoke that register. */
4877 ensure_normal_shift (str, dst);
4881 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4883 ensure_shift_out (str, dst);
4890 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4892 Dynarr_add (dst, ISO_CODE_ESC);
4893 Dynarr_add (dst, 'N');
4898 Dynarr_add (dst, ISO_CODE_SS2);
4903 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4905 Dynarr_add (dst, ISO_CODE_ESC);
4906 Dynarr_add (dst, 'O');
4911 Dynarr_add (dst, ISO_CODE_SS3);
4919 charmask = (half == 0 ? 0x00 : 0x80);
4921 switch (XCHARSET_DIMENSION (charset))
4924 Dynarr_add (dst, byte1 | charmask);
4927 Dynarr_add (dst, byte1 | charmask);
4928 Dynarr_add (dst, byte2 | charmask);
4934 str->iso2022.current_charset = charset;
4935 str->iso2022.current_half = half;
/* Bring an ISO2022 encoding stream back to a neutral state: restore
   left-to-right direction, return to the normal shift state, and then
   designate the coding system's initial charset into each of the four
   registers again.

   STR   -- encoding stream; supplies the coding system and is handed to
            the shift/designation helpers.
   DST   -- output dynarr handed to every helper called here.
   FLAGS -- passed through to restore_left_to_right_direction ().  */
4939 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4940 unsigned int *flags)
4942 Lisp_Coding_System* codesys = str->codesys;
/* Direction and shift state are reset before any register is touched. */
4945 restore_left_to_right_direction (codesys, dst, flags, 0);
4946 ensure_normal_shift (str, dst);
/* Registers are indexed 0..3; each gets the charset the coding system
   lists as its initial one. */
4947 for (i = 0; i < 4; i++)
4949 Lisp_Object initial_charset
4950 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4951 iso2022_designate (initial_charset, i, str, dst);
4956 /************************************************************************/
4957 /* No-conversion methods */
4958 /************************************************************************/
4960 /* This is used when reading in "binary" files -- i.e. files that may
4961 contain all 256 possible byte values and that are not to be
4962 interpreted as being in any particular encoding. */
/* DECODING -- the decoding Lstream; its private data carries the
               conversion state (flags, pending character, EOL type).
   SRC, N   -- input bytes and their count.
   DST      -- dynarr that receives the decoded output.  */
4964 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4965 unsigned_char_dynarr *dst, unsigned int n)
/* Load the state saved in the stream into locals. */
4968 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4969 unsigned int flags = str->flags;
4970 unsigned int ch = str->ch;
4971 eol_type_t eol_type = str->eol_type;
/* Per-byte work: end-of-line handling first, then the byte is added to
   DST through the binary-character macro. */
4977 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4978 DECODE_ADD_BINARY_CHAR (c, dst);
/* NOTE(review): presumably the jump target used by
   DECODE_HANDLE_EOL_TYPE to skip the rest of an iteration -- confirm
   against the macro definition. */
4979 label_continue_loop:;
4982 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
/* Encoding side of the no-conversion coding system: processes N bytes at
   SRC and appends the result to DST.

   ENCODING -- the encoding Lstream; its private data supplies the saved
               flags, the pending character CH and the coding system
               (from which the end-of-line type is taken).

   The body has two alternative implementations selected by the
   preprocessor; the second one is the "not UTF2000" variant.
   NOTE(review): the loop header, braces and the opening #ifdef are not
   visible in this excerpt, so the comments below describe only what the
   visible lines establish.  */
4989 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4990 unsigned_char_dynarr *dst, unsigned int n)
4993 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4994 unsigned int flags = str->flags;
4995 unsigned int ch = str->ch;
4996 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Counter kept in the stream between calls (it is written back at the
   end of the function); 0 is tested as the "start of a character" case
   below. */
4998 unsigned char char_boundary = str->iso2022.current_char_boundary;
/* At a character start the byte is classified by comparing it against
   0xf8, 0xf0, 0xe0 and 0xc0 in descending order.
   NOTE(review): this looks like UTF-8-style lead-byte classification;
   what each branch stores is not visible here -- confirm. */
5005 if (char_boundary == 0)
5011 else if ( c >= 0xf8 )
5016 else if ( c >= 0xf0 )
5021 else if ( c >= 0xe0 )
5026 else if ( c >= 0xc0 )
/* End-of-line output: a '\r' is emitted unless the EOL type is LF or
   autodetect, and the byte itself is emitted unless the EOL type is CR. */
5036 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5037 Dynarr_add (dst, '\r');
5038 if (eol_type != EOL_CR)
5039 Dynarr_add (dst, c);
5042 Dynarr_add (dst, c);
/* Trailing bytes: each one contributes its low six bits to CH.  When
   char_boundary is 1 only the low eight bits of the accumulated value
   are written to DST. */
5045 else if (char_boundary == 1)
5047 ch = ( ch << 6 ) | ( c & 0x3f );
5048 Dynarr_add (dst, ch & 0xff);
5053 ch = ( ch << 6 ) | ( c & 0x3f );
5056 #else /* not UTF2000 */
/* Same end-of-line rule as above, with a literal '\n' as the newline. */
5059 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5060 Dynarr_add (dst, '\r');
5061 if (eol_type != EOL_CR)
5062 Dynarr_add (dst, '\n');
/* ASCII bytes are copied through unchanged. */
5065 else if (BYTE_ASCII_P (c))
5068 Dynarr_add (dst, c);
/* Leading bytes: Latin-1 and Control-1 are singled out; a '~' is written
   to DST as the placeholder for an untranslatable character.
   NOTE(review): presumably the leading byte is remembered in CH for the
   tests further down -- the assignment is not visible here. */
5070 else if (BUFBYTE_LEADING_BYTE_P (c))
5073 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5074 c == LEADING_BYTE_CONTROL_1)
5077 Dynarr_add (dst, '~'); /* untranslatable character */
/* Non-leading bytes: after a Latin-1 leading byte the byte is emitted
   as is; after a Control-1 leading byte it is emitted minus 0x20. */
5081 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5082 Dynarr_add (dst, c);
5083 else if (ch == LEADING_BYTE_CONTROL_1)
5086 Dynarr_add (dst, c - 0x20);
5088 /* else it should be the second or third byte of an
5089 untranslatable character, so ignore it */
5092 #endif /* not UTF2000 */
/* Save the boundary counter so the next call can resume mid-character. */
5098 str->iso2022.current_char_boundary = char_boundary;
5103 /************************************************************************/
5104 /* Simple internal/external functions */
5105 /************************************************************************/
/* Scratch buffers shared by all calls to convert_to_external_format ()
   (conversion_out_dynarr) and convert_from_external_format ()
   (conversion_in_dynarr).  Each is created on first use and reset at the
   start of every call, and the pointer those functions return points
   into the buffer -- so a result is overwritten by the next call to the
   same function. */
5107 static Extbyte_dynarr *conversion_out_dynarr;
5108 static Bufbyte_dynarr *conversion_in_dynarr;
5110 /* Determine coding system from coding format */
/* FMT selects the kind of external data being converted.  The result is
   a coding system object; the callers in this file treat a nil result
   as "copy bytes without a coding system". */
5112 /* #### not correct for all values of `fmt'! */
5114 external_data_format_to_coding_system (enum external_data_format fmt)
/* File names and terminal data share one branch, driven by
   `file-name-coding-system'.
   NOTE(review): FORMAT_TERMINAL is resolved through
   Vfile_name_coding_system rather than Vterminal_coding_system --
   confirm that this is intended. */
5118 case FORMAT_FILENAME:
5119 case FORMAT_TERMINAL:
/* nil and `binary' are special-cased; what is returned for them is not
   visible in this excerpt. */
5120 if (EQ (Vfile_name_coding_system, Qnil) ||
5121 EQ (Vfile_name_coding_system, Qbinary))
5124 return Fget_coding_system (Vfile_name_coding_system);
/* Other visible path: the `ctext' coding system. */
5127 return Fget_coding_system (Qctext);
/* Convert LEN bytes of internal text at PTR to the external format FMT.

   Returns a pointer to the converted, zero-terminated data and stores
   its length in *LEN_OUT; the length does not count the terminating
   zero, because it is taken before the zero is appended.

   The returned pointer points into the static conversion_out_dynarr,
   which is reset at the start of every call: the result is only valid
   until the next call, and callers must copy it if they need to keep it. */
5135 convert_to_external_format (CONST Bufbyte *ptr,
5138 enum external_data_format fmt)
5140 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the shared output buffer on first use, then empty it. */
5142 if (!conversion_out_dynarr)
5143 conversion_out_dynarr = Dynarr_new (Extbyte);
5145 Dynarr_reset (conversion_out_dynarr);
/* No coding system: each character is reduced to a single output byte
   by hand.
   NOTE(review): both variants below read *(ptr+1) after inspecting only
   *ptr; no check against `end' is visible here, so a leading byte in the
   last position would read one byte past the input -- confirm that
   callers always pass complete characters. */
5147 if (NILP (coding_system))
5149 CONST Bufbyte *end = ptr + len;
5155 (*ptr < 0xc0) ? *ptr :
5156 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5159 (BYTE_ASCII_P (*ptr)) ? *ptr :
5160 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5161 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5164 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5168 #ifdef ERROR_CHECK_BUFPOS
5169 assert (ptr == end);
/* With a coding system: run the text through an encoding stream whose
   sink is a dynarr output stream over conversion_out_dynarr. */
5174 Lisp_Object instream, outstream, da_outstream;
5175 Lstream *istr, *ostr;
5176 struct gcpro gcpro1, gcpro2, gcpro3;
5177 char tempbuf[1024]; /* some random amount */
5179 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5180 da_outstream = make_dynarr_output_stream
5181 ((unsigned_char_dynarr *) conversion_out_dynarr);
5183 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5184 istr = XLSTREAM (instream);
5185 ostr = XLSTREAM (outstream);
/* The three stream objects are protected from GC while they are in use. */
5186 GCPRO3 (instream, outstream, da_outstream);
/* Pump the input through in chunks of at most sizeof (tempbuf) bytes. */
5189 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5192 Lstream_write (ostr, tempbuf, size_in_bytes);
/* Close both ends, then delete all three Lstreams explicitly. */
5194 Lstream_close (istr);
5195 Lstream_close (ostr);
5197 Lstream_delete (istr);
5198 Lstream_delete (ostr);
5199 Lstream_delete (XLSTREAM (da_outstream));
/* Record the length first so the terminating zero is not counted. */
5202 *len_out = Dynarr_length (conversion_out_dynarr);
5203 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5204 return Dynarr_atp (conversion_out_dynarr, 0);
/* Convert LEN bytes of external data at PTR, in external format FMT, to
   internal text.

   Returns a pointer to the converted, zero-terminated data and stores
   its length in *LEN_OUT; the length does not count the terminating
   zero, because it is taken before the zero is appended.

   The returned pointer points into the static conversion_in_dynarr,
   which is reset at the start of every call: the result is only valid
   until the next call, and callers must copy it if they need to keep it. */
5208 convert_from_external_format (CONST Extbyte *ptr,
5211 enum external_data_format fmt)
5213 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
/* Create the shared input buffer on first use, then empty it. */
5215 if (!conversion_in_dynarr)
5216 conversion_in_dynarr = Dynarr_new (Bufbyte);
5218 Dynarr_reset (conversion_in_dynarr);
/* No coding system: every external byte is added through the
   binary-character macro, one byte at a time. */
5220 if (NILP (coding_system))
5222 CONST Extbyte *end = ptr + len;
5223 for (; ptr < end; ptr++)
5226 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
/* With a coding system: run the data through a decoding stream whose
   sink is a dynarr output stream over conversion_in_dynarr. */
5231 Lisp_Object instream, outstream, da_outstream;
5232 Lstream *istr, *ostr;
5233 struct gcpro gcpro1, gcpro2, gcpro3;
5234 char tempbuf[1024]; /* some random amount */
5236 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5237 da_outstream = make_dynarr_output_stream
5238 ((unsigned_char_dynarr *) conversion_in_dynarr);
5240 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5241 istr = XLSTREAM (instream);
5242 ostr = XLSTREAM (outstream);
/* The three stream objects are protected from GC while they are in use. */
5243 GCPRO3 (instream, outstream, da_outstream);
/* Pump the input through in chunks of at most sizeof (tempbuf) bytes. */
5246 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5249 Lstream_write (ostr, tempbuf, size_in_bytes);
/* Close both ends, then delete all three Lstreams explicitly. */
5251 Lstream_close (istr);
5252 Lstream_close (ostr);
5254 Lstream_delete (istr);
5255 Lstream_delete (ostr);
5256 Lstream_delete (XLSTREAM (da_outstream));
/* Record the length first so the terminating zero is not counted. */
5259 *len_out = Dynarr_length (conversion_in_dynarr);
5260 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5261 return Dynarr_atp (conversion_in_dynarr, 0);
5265 /************************************************************************/
5266 /* Initialization */
5267 /************************************************************************/
/* Register everything this file exports by name: the symbols held in
   the Q* variables, the `coding-system-error' condition, the Lisp
   primitives (DEFSUBR) and the symbols naming each coding category. */
5270 syms_of_file_coding (void)
5272 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
/* The error is defined with Qio_error as its parent. */
5273 deferror (&Qcoding_system_error, "coding-system-error",
5274 "Coding-system error", Qio_error);
/* Lisp primitives on coding system objects. */
5276 DEFSUBR (Fcoding_system_p);
5277 DEFSUBR (Ffind_coding_system);
5278 DEFSUBR (Fget_coding_system);
5279 DEFSUBR (Fcoding_system_list);
5280 DEFSUBR (Fcoding_system_name);
5281 DEFSUBR (Fmake_coding_system);
5282 DEFSUBR (Fcopy_coding_system);
5283 DEFSUBR (Fdefine_coding_system_alias);
5284 DEFSUBR (Fsubsidiary_coding_system);
5286 DEFSUBR (Fcoding_system_type);
5287 DEFSUBR (Fcoding_system_doc_string);
5289 DEFSUBR (Fcoding_system_charset);
5291 DEFSUBR (Fcoding_system_property);
/* Primitives on coding categories and their priority order. */
5293 DEFSUBR (Fcoding_category_list);
5294 DEFSUBR (Fset_coding_priority_list);
5295 DEFSUBR (Fcoding_priority_list);
5296 DEFSUBR (Fset_coding_category_system);
5297 DEFSUBR (Fcoding_category_system);
/* Region-level detection, decoding and encoding. */
5299 DEFSUBR (Fdetect_coding_region);
5300 DEFSUBR (Fdecode_coding_region);
5301 DEFSUBR (Fencode_coding_region);
/* Single-character Shift-JIS and Big5 conversions. */
5303 DEFSUBR (Fdecode_shift_jis_char);
5304 DEFSUBR (Fencode_shift_jis_char);
5305 DEFSUBR (Fdecode_big5_char);
5306 DEFSUBR (Fencode_big5_char);
/* Predicate symbol and the symbols naming coding system types. */
5308 defsymbol (&Qcoding_system_p, "coding-system-p");
5309 defsymbol (&Qno_conversion, "no-conversion");
5310 defsymbol (&Qraw_text, "raw-text");
5312 defsymbol (&Qbig5, "big5");
5313 defsymbol (&Qshift_jis, "shift-jis");
5314 defsymbol (&Qucs4, "ucs-4");
5315 defsymbol (&Qutf8, "utf-8");
5316 defsymbol (&Qccl, "ccl");
5317 defsymbol (&Qiso2022, "iso2022");
/* Property names accepted for every coding system type (they are the
   ones registered with CODESYS_PROP_ALL_OK elsewhere in this file). */
5319 defsymbol (&Qmnemonic, "mnemonic");
5320 defsymbol (&Qeol_type, "eol-type");
5321 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5322 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
/* End-of-line type values and the matching subsidiary property names. */
5324 defsymbol (&Qcr, "cr");
5325 defsymbol (&Qlf, "lf");
5326 defsymbol (&Qcrlf, "crlf");
5327 defsymbol (&Qeol_cr, "eol-cr");
5328 defsymbol (&Qeol_lf, "eol-lf");
5329 defsymbol (&Qeol_crlf, "eol-crlf");
/* ISO2022-specific property names. */
5331 defsymbol (&Qcharset_g0, "charset-g0");
5332 defsymbol (&Qcharset_g1, "charset-g1");
5333 defsymbol (&Qcharset_g2, "charset-g2");
5334 defsymbol (&Qcharset_g3, "charset-g3");
5335 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5336 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5337 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5338 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5339 defsymbol (&Qno_iso6429, "no-iso6429");
5340 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5341 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5343 defsymbol (&Qshort, "short");
5344 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5345 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5346 defsymbol (&Qseven, "seven");
5347 defsymbol (&Qlock_shift, "lock-shift");
5348 defsymbol (&Qescape_quoted, "escape-quoted");
/* CCL-specific property names. */
5350 defsymbol (&Qencode, "encode");
5351 defsymbol (&Qdecode, "decode");
5354 defsymbol (&Qctext, "ctext");
/* One symbol per coding category, stored in coding_category_symbol[]
   at the category's index.  The symbol-name argument of each call sits
   on a line that is not visible in this excerpt. */
5355 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5357 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5359 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5361 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5363 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5365 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5367 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5369 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5371 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5374 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare which Lstream methods the two stream types of this file
   implement.  The `decoding' and `encoding' types get the same seven
   methods: reader, writer, rewinder, seekable_p, flusher, closer and
   marker. */
5379 lstream_type_create_file_coding (void)
/* Methods of the decoding stream type. */
5381 LSTREAM_HAS_METHOD (decoding, reader);
5382 LSTREAM_HAS_METHOD (decoding, writer);
5383 LSTREAM_HAS_METHOD (decoding, rewinder);
5384 LSTREAM_HAS_METHOD (decoding, seekable_p);
5385 LSTREAM_HAS_METHOD (decoding, flusher);
5386 LSTREAM_HAS_METHOD (decoding, closer);
5387 LSTREAM_HAS_METHOD (decoding, marker);
/* Methods of the encoding stream type. */
5389 LSTREAM_HAS_METHOD (encoding, reader);
5390 LSTREAM_HAS_METHOD (encoding, writer);
5391 LSTREAM_HAS_METHOD (encoding, rewinder);
5392 LSTREAM_HAS_METHOD (encoding, seekable_p);
5393 LSTREAM_HAS_METHOD (encoding, flusher);
5394 LSTREAM_HAS_METHOD (encoding, closer);
5395 LSTREAM_HAS_METHOD (encoding, marker);
/* Define the Lisp-visible variables of this file and give them their
   initial values.  Before that, every coding category is reset to "no
   coding system" with the identity priority order, and the
   `file-coding' feature is provided.  All the coding-system variables
   start out as nil; `enable-multibyte-characters' starts out as 1. */
5399 vars_of_file_coding (void)
5403 /* Initialize to something reasonable ... */
5404 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5406 coding_category_system[i] = Qnil;
5407 coding_category_by_priority[i] = i;
5410 Fprovide (intern ("file-coding"));
5412 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5413 Coding system used for TTY keyboard input.
5414 Not used under a windowing system.
5416 Vkeyboard_coding_system = Qnil;
5418 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5419 Coding system used for TTY display output.
5420 Not used under a windowing system.
5422 Vterminal_coding_system = Qnil;
5424 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5425 Overriding coding system used when reading a file or process.
5426 You should *bind* this, not set it. If this is non-nil, it specifies
5427 the coding system that will be used when a file or process is read
5428 in, and overrides `buffer-file-coding-system-for-read',
5429 `insert-file-contents-pre-hook', etc. Use those variables instead of
5430 this one for permanent changes to the environment.
5432 Vcoding_system_for_read = Qnil;
5434 DEFVAR_LISP ("coding-system-for-write",
5435 &Vcoding_system_for_write /*
5436 Overriding coding system used when writing a file or process.
5437 You should *bind* this, not set it. If this is non-nil, it specifies
5438 the coding system that will be used when a file or process is written
5439 to, and overrides `buffer-file-coding-system',
5440 `write-region-pre-hook', etc. Use those variables instead of this one
5441 for permanent changes to the environment.
5443 Vcoding_system_for_write = Qnil;
5445 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5446 Coding system used to convert pathnames when accessing files.
5448 Vfile_name_coding_system = Qnil;
5450 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5451 Non-nil means the buffer contents are regarded as multi-byte form
5452 of characters, not a binary code. This affects the display, file I/O,
5453 and behaviors of various editing commands.
5455 Setting this to nil does not do anything.
5457 enable_multibyte_characters = 1;
5461 complex_vars_of_file_coding (void)
5463 staticpro (&Vcoding_system_hash_table);
5464 Vcoding_system_hash_table =
5465 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5467 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5469 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5471 struct codesys_prop csp; \
5473 csp.prop_type = (Prop_Type); \
5474 Dynarr_add (the_codesys_prop_dynarr, csp); \
5477 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5478 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5479 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5480 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5481 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5482 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5483 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5485 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5486 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5487 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5488 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5489 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5490 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5491 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5492 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5493 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5494 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5495 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5496 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5497 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5498 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5499 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5500 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5501 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5503 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5504 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5506 /* Need to create this here or we're really screwed. */
5508 (Qraw_text, Qno_conversion,
5509 build_string ("Raw text, which means it converts only line-break-codes."),
5510 list2 (Qmnemonic, build_string ("Raw")));
5513 (Qbinary, Qno_conversion,
5514 build_string ("Binary, which means it does not convert anything."),
5515 list4 (Qeol_type, Qlf,
5516 Qmnemonic, build_string ("Binary")));
5521 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5522 list2 (Qmnemonic, build_string ("UTF8")));
5525 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5527 /* Need this for bootstrapping */
5528 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5529 Fget_coding_system (Qraw_text);
5532 coding_category_system[CODING_CATEGORY_UTF8]
5533 = Fget_coding_system (Qutf8);