1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
50 /* Coding system currently associated with each coding category. */
51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
53 /* Table of all coding categories in decreasing order of priority.
54 This describes a permutation of the possible coding categories. */
55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
57 Lisp_Object Qcoding_system_p;
59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
60 /* Qinternal in general.c */
62 Lisp_Object Qmnemonic, Qeol_type;
63 Lisp_Object Qcr, Qcrlf, Qlf;
64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
65 Lisp_Object Qpost_read_conversion;
66 Lisp_Object Qpre_write_conversion;
69 Lisp_Object Qucs4, Qutf8;
70 Lisp_Object Qbig5, Qshift_jis;
71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
74 Lisp_Object Qno_iso6429;
75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
76 Lisp_Object Qctext, Qescape_quoted;
77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
79 Lisp_Object Qencode, Qdecode;
81 Lisp_Object Vcoding_system_hash_table;
83 int enable_multibyte_characters;
86 /* Additional information used by the ISO2022 decoder and detector. */
87 struct iso2022_decoder
89 /* CHARSET holds the character sets currently assigned to the G0
90 through G3 variables. It is initialized from the array
91 INITIAL_CHARSET in CODESYS. */
92 Lisp_Object charset[4];
94 /* Which registers are currently invoked into the left (GL) and
95 right (GR) halves of the 8-bit encoding space? */
96 int register_left, register_right;
98 /* ISO_ESC holds a value indicating part of an escape sequence
99 that has already been seen. */
100 enum iso_esc_flag esc;
102 /* This records the bytes we've seen so far in an escape sequence,
103 in case the sequence is invalid (we spit out the bytes unchanged). */
104 unsigned char esc_bytes[8];
106 /* Index for next byte to store in ISO escape sequence. */
109 #ifdef ENABLE_COMPOSITE_CHARS
110 /* Stuff seen so far when composing a string. */
111 unsigned_char_dynarr *composite_chars;
114 /* If we saw an invalid designation sequence for a particular
115 register, we flag it here and switch to ASCII. The next time we
116 see a valid designation for this register, we turn off the flag
117 and do the designation normally, but pretend the sequence was
118 invalid. The effect of all this is that (most of the time) the
119 escape sequences for both the switch to the unknown charset, and
120 the switch back to the known charset, get inserted literally into
121 the buffer and saved out as such. The hope is that we can
122 preserve the escape sequences so that the resulting written out
123 file makes sense. If we don't do any of this, the designation
124 to the invalid charset will be preserved but that switch back
125 to the known charset will probably get eaten because it was
126 the same charset that was already present in the register. */
127 unsigned char invalid_designated[4];
129 /* We try to do similar things as above for direction-switching
130 sequences. If we encountered a direction switch while an
131 invalid designation was present, or an invalid designation
132 just after a direction switch (i.e. no valid designation
133 encountered yet), we insert the direction-switch escape
134 sequence literally into the output stream, and later on
135 insert the corresponding direction-restoring escape sequence
137 unsigned int switched_dir_and_no_valid_charset_yet :1;
138 unsigned int invalid_switch_dir :1;
140 /* Tells the decoder to output the escape sequence literally
141 even though it was valid. Used in the games we play to
142 avoid lossage when we encounter invalid designations. */
143 unsigned int output_literally :1;
144 /* We encountered a direction switch followed by an invalid
145 designation. We didn't output the direction switch
146 literally because we didn't know about the invalid designation;
147 but we have to do so now. */
148 unsigned int output_direction_sequence :1;
151 EXFUN (Fcopy_coding_system, 2);
153 struct detection_state;
156 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
157 unsigned_char_dynarr *dst, unsigned int n);
159 static int detect_coding_sjis (struct detection_state *st,
160 CONST unsigned char *src,
162 static void decode_coding_sjis (Lstream *decoding,
163 CONST unsigned char *src,
164 unsigned_char_dynarr *dst,
166 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
167 unsigned_char_dynarr *dst, unsigned int *flags);
168 void char_finish_shift_jis (struct encoding_stream *str,
169 unsigned_char_dynarr *dst, unsigned int *flags);
171 static int detect_coding_big5 (struct detection_state *st,
172 CONST unsigned char *src,
174 static void decode_coding_big5 (Lstream *decoding,
175 CONST unsigned char *src,
176 unsigned_char_dynarr *dst, unsigned int n);
177 static void encode_coding_big5 (Lstream *encoding,
178 CONST unsigned char *src,
179 unsigned_char_dynarr *dst, unsigned int n);
180 static int detect_coding_ucs4 (struct detection_state *st,
181 CONST unsigned char *src,
183 static void decode_coding_ucs4 (Lstream *decoding,
184 CONST unsigned char *src,
185 unsigned_char_dynarr *dst, unsigned int n);
186 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
187 unsigned_char_dynarr *dst, unsigned int *flags);
188 void char_finish_ucs4 (struct encoding_stream *str,
189 unsigned_char_dynarr *dst, unsigned int *flags);
191 static int detect_coding_utf8 (struct detection_state *st,
192 CONST unsigned char *src,
194 static void decode_coding_utf8 (Lstream *decoding,
195 CONST unsigned char *src,
196 unsigned_char_dynarr *dst, unsigned int n);
197 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
198 unsigned_char_dynarr *dst, unsigned int *flags);
199 void char_finish_utf8 (struct encoding_stream *str,
200 unsigned_char_dynarr *dst, unsigned int *flags);
202 static int postprocess_iso2022_mask (int mask);
203 static void reset_iso2022 (Lisp_Object coding_system,
204 struct iso2022_decoder *iso);
205 static int detect_coding_iso2022 (struct detection_state *st,
206 CONST unsigned char *src,
208 static void decode_coding_iso2022 (Lstream *decoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst, unsigned int n);
211 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
212 unsigned_char_dynarr *dst, unsigned int *flags);
213 void char_finish_iso2022 (struct encoding_stream *str,
214 unsigned_char_dynarr *dst, unsigned int *flags);
216 static void decode_coding_no_conversion (Lstream *decoding,
217 CONST unsigned char *src,
218 unsigned_char_dynarr *dst,
220 static void encode_coding_no_conversion (Lstream *encoding,
221 CONST unsigned char *src,
222 unsigned_char_dynarr *dst,
224 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
225 unsigned_char_dynarr *dst, unsigned int n);
226 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
227 unsigned_char_dynarr *dst, unsigned int n);
229 typedef struct codesys_prop codesys_prop;
238 Dynarr_declare (codesys_prop);
239 } codesys_prop_dynarr;
241 codesys_prop_dynarr *the_codesys_prop_dynarr;
243 enum codesys_prop_enum
246 CODESYS_PROP_ISO2022,
251 /************************************************************************/
252 /* Coding system functions */
253 /************************************************************************/
255 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
256 static void print_coding_system (Lisp_Object, Lisp_Object, int);
257 static void finalize_coding_system (void *header, int for_disksave);
260 static const struct lrecord_description ccs_description_1[] = {
261 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
265 static const struct struct_description ccs_description = {
266 sizeof(charset_conversion_spec),
270 static const struct lrecord_description ccsd_description_1[] = {
271 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
275 static const struct struct_description ccsd_description = {
276 sizeof(charset_conversion_spec_dynarr),
281 static const struct lrecord_description coding_system_description[] = {
282 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
283 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
284 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
286 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
287 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
288 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
289 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
294 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
295 mark_coding_system, print_coding_system,
296 finalize_coding_system,
297 0, 0, coding_system_description,
298 struct Lisp_Coding_System);
301 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
303 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
305 markobj (CODING_SYSTEM_NAME (codesys));
306 markobj (CODING_SYSTEM_DOC_STRING (codesys));
307 markobj (CODING_SYSTEM_MNEMONIC (codesys));
308 markobj (CODING_SYSTEM_EOL_LF (codesys));
309 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
310 markobj (CODING_SYSTEM_EOL_CR (codesys));
312 switch (CODING_SYSTEM_TYPE (codesys))
316 case CODESYS_ISO2022:
317 for (i = 0; i < 4; i++)
318 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
319 if (codesys->iso2022.input_conv)
321 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
323 struct charset_conversion_spec *ccs =
324 Dynarr_atp (codesys->iso2022.input_conv, i);
325 markobj (ccs->from_charset);
326 markobj (ccs->to_charset);
329 if (codesys->iso2022.output_conv)
331 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
333 struct charset_conversion_spec *ccs =
334 Dynarr_atp (codesys->iso2022.output_conv, i);
335 markobj (ccs->from_charset);
336 markobj (ccs->to_charset);
342 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
343 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
350 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
351 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
355 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
358 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
360 error ("printing unreadable object #<coding_system 0x%x>",
363 write_c_string ("#<coding_system ", printcharfun);
364 print_internal (c->name, printcharfun, 1);
365 write_c_string (">", printcharfun);
369 finalize_coding_system (void *header, int for_disksave)
371 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
372 /* Since coding systems never go away, this function is not
373 necessary. But it would be necessary if we changed things
374 so that coding systems could go away. */
375 if (!for_disksave) /* see comment in lstream.c */
377 switch (CODING_SYSTEM_TYPE (c))
380 case CODESYS_ISO2022:
381 if (c->iso2022.input_conv)
383 Dynarr_free (c->iso2022.input_conv);
384 c->iso2022.input_conv = 0;
386 if (c->iso2022.output_conv)
388 Dynarr_free (c->iso2022.output_conv);
389 c->iso2022.output_conv = 0;
400 symbol_to_eol_type (Lisp_Object symbol)
402 CHECK_SYMBOL (symbol);
403 if (NILP (symbol)) return EOL_AUTODETECT;
404 if (EQ (symbol, Qlf)) return EOL_LF;
405 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
406 if (EQ (symbol, Qcr)) return EOL_CR;
408 signal_simple_error ("Unrecognized eol type", symbol);
409 return EOL_AUTODETECT; /* not reached */
413 eol_type_to_symbol (enum eol_type type)
418 case EOL_LF: return Qlf;
419 case EOL_CRLF: return Qcrlf;
420 case EOL_CR: return Qcr;
421 case EOL_AUTODETECT: return Qnil;
426 setup_eol_coding_systems (Lisp_Coding_System *codesys)
428 Lisp_Object codesys_obj;
429 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
430 char *codesys_name = (char *) alloca (len + 7);
432 char *codesys_mnemonic=0;
434 Lisp_Object codesys_name_sym, sub_codesys_obj;
438 XSETCODING_SYSTEM (codesys_obj, codesys);
440 memcpy (codesys_name,
441 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
443 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
445 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
446 codesys_mnemonic = (char *) alloca (mlen + 7);
447 memcpy (codesys_mnemonic,
448 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
451 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
452 strcpy (codesys_name + len, "-" op_sys); \
454 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
455 codesys_name_sym = intern (codesys_name); \
456 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
457 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
459 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
460 build_string (codesys_mnemonic); \
461 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
464 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
465 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
466 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
469 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
470 Return t if OBJECT is a coding system.
471 A coding system is an object that defines how text containing multiple
472 character sets is encoded into a stream of (typically 8-bit) bytes.
473 The coding system is used to decode the stream into a series of
474 characters (which may be from multiple charsets) when the text is read
475 from a file or process, and is used to encode the text back into the
476 same format when it is written out to a file or process.
478 For example, many ISO2022-compliant coding systems (such as Compound
479 Text, which is used for inter-client data under the X Window System)
480 use escape sequences to switch between different charsets -- Japanese
481 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
482 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
483 `make-coding-system' for more information.
485 Coding systems are normally identified using a symbol, and the
486 symbol is accepted in place of the actual coding system object whenever
487 a coding system is called for. (This is similar to how faces work.)
491 return CODING_SYSTEMP (object) ? Qt : Qnil;
494 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
495 Retrieve the coding system of the given name.
497 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
498 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
499 If there is no such coding system, nil is returned. Otherwise the
500 associated coding system object is returned.
502 (coding_system_or_name))
504 if (CODING_SYSTEMP (coding_system_or_name))
505 return coding_system_or_name;
507 if (NILP (coding_system_or_name))
508 coding_system_or_name = Qbinary;
510 CHECK_SYMBOL (coding_system_or_name);
512 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
515 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
516 Retrieve the coding system of the given name.
517 Same as `find-coding-system' except that if there is no such
518 coding system, an error is signaled instead of returning nil.
522 Lisp_Object coding_system = Ffind_coding_system (name);
524 if (NILP (coding_system))
525 signal_simple_error ("No such coding system", name);
526 return coding_system;
529 /* We store the coding systems in hash tables with the names as the key and the
530 actual coding system object as the value. Occasionally we need to use them
531 in a list format. These routines provide us with that. */
532 struct coding_system_list_closure
534 Lisp_Object *coding_system_list;
538 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
539 void *coding_system_list_closure)
541 /* This function can GC */
542 struct coding_system_list_closure *cscl =
543 (struct coding_system_list_closure *) coding_system_list_closure;
544 Lisp_Object *coding_system_list = cscl->coding_system_list;
546 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
547 *coding_system_list);
551 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
552 Return a list of the names of all defined coding systems.
556 Lisp_Object coding_system_list = Qnil;
558 struct coding_system_list_closure coding_system_list_closure;
560 GCPRO1 (coding_system_list);
561 coding_system_list_closure.coding_system_list = &coding_system_list;
562 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
563 &coding_system_list_closure);
566 return coding_system_list;
569 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
570 Return the name of the given coding system.
574 coding_system = Fget_coding_system (coding_system);
575 return XCODING_SYSTEM_NAME (coding_system);
578 static Lisp_Coding_System *
579 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
581 Lisp_Coding_System *codesys =
582 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
584 zero_lcrecord (codesys);
585 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
586 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
587 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
588 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
589 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
590 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
591 CODING_SYSTEM_TYPE (codesys) = type;
592 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
594 if (type == CODESYS_ISO2022)
597 for (i = 0; i < 4; i++)
598 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
600 else if (type == CODESYS_CCL)
602 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
603 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
606 CODING_SYSTEM_NAME (codesys) = name;
612 /* Given a list of charset conversion specs as specified in a Lisp
613 program, parse it into STORE_HERE. */
616 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
617 Lisp_Object spec_list)
621 EXTERNAL_LIST_LOOP (rest, spec_list)
623 Lisp_Object car = XCAR (rest);
624 Lisp_Object from, to;
625 struct charset_conversion_spec spec;
627 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
628 signal_simple_error ("Invalid charset conversion spec", car);
629 from = Fget_charset (XCAR (car));
630 to = Fget_charset (XCAR (XCDR (car)));
631 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
632 signal_simple_error_2
633 ("Attempted conversion between different charset types",
635 spec.from_charset = from;
636 spec.to_charset = to;
638 Dynarr_add (store_here, spec);
642 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
643 specs, return the equivalent as the Lisp programmer would see it.
645 If LOAD_HERE is 0, return Qnil. */
648 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
655 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
657 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
658 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
661 return Fnreverse (result);
666 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
667 Register symbol NAME as a coding system.
669 TYPE describes the conversion method used and should be one of
672 Automatic conversion. XEmacs attempts to detect the coding system
675 No conversion. Use this for binary files and such. On output,
676 graphic characters that are not in ASCII or Latin-1 will be
677 replaced by a ?. (For a no-conversion-encoded buffer, these
678 characters will only be present if you explicitly insert them.)
680 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
682 ISO 10646 UCS-4 encoding.
684 ISO 10646 UTF-8 encoding.
686 Any ISO2022-compliant encoding. Among other things, this includes
687 JIS (the Japanese encoding commonly used for e-mail), EUC (the
688 standard Unix encoding for Japanese and other languages), and
689 Compound Text (the encoding used in X11). You can specify more
690 specific information about the conversion with the FLAGS argument.
692 Big5 (the encoding commonly used for Taiwanese).
694 The conversion is performed using a user-written pseudo-code
695 program. CCL (Code Conversion Language) is the name of this
698 Write out or read in the raw contents of the memory representing
699 the buffer's text. This is primarily useful for debugging
700 purposes, and is only enabled when XEmacs has been compiled with
701 DEBUG_XEMACS defined (via the --debug configure option).
702 WARNING: Reading in a file using 'internal conversion can result
703 in an internal inconsistency in the memory representing a
704 buffer's text, which will produce unpredictable results and may
705 cause XEmacs to crash. Under normal circumstances you should
706 never use 'internal conversion.
708 DOC-STRING is a string describing the coding system.
710 PROPS is a property list, describing the specific nature of the
711 character set. Recognized properties are:
714 String to be displayed in the modeline when this coding system is
718 End-of-line conversion to be used. It should be one of
721 Automatically detect the end-of-line type (LF, CRLF,
722 or CR). Also generate subsidiary coding systems named
723 `NAME-unix', `NAME-dos', and `NAME-mac', that are
724 identical to this coding system but have an EOL-TYPE
725 value of 'lf, 'crlf, and 'cr, respectively.
727 The end of a line is marked externally using ASCII LF.
728 Since this is also the way that XEmacs represents an
729 end-of-line internally, specifying this option results
730 in no end-of-line conversion. This is the standard
731 format for Unix text files.
733 The end of a line is marked externally using ASCII
734 CRLF. This is the standard format for MS-DOS text
737 The end of a line is marked externally using ASCII CR.
738 This is the standard format for Macintosh text files.
740 Automatically detect the end-of-line type but do not
741 generate subsidiary coding systems. (This value is
742 converted to nil when stored internally, and
743 `coding-system-property' will return nil.)
745 'post-read-conversion
746 Function called after a file has been read in, to perform the
747 decoding. Called with two arguments, BEG and END, denoting
748 a region of the current buffer to be decoded.
750 'pre-write-conversion
751 Function called before a file is written out, to perform the
752 encoding. Called with two arguments, BEG and END, denoting
753 a region of the current buffer to be encoded.
756 The following additional properties are recognized if TYPE is 'iso2022:
762 The character set initially designated to the G0 - G3 registers.
763 The value should be one of
765 -- A charset object (designate that character set)
766 -- nil (do not ever use this register)
767 -- t (no character set is initially designated to
768 the register, but may be later on; this automatically
769 sets the corresponding `force-g*-on-output' property)
775 If non-nil, send an explicit designation sequence on output before
776 using the specified register.
779 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
780 "ESC $ B" on output in place of the full designation sequences
781 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
784 If non-nil, don't designate ASCII to G0 at each end of line on output.
785 Setting this to non-nil also suppresses other state-resetting that
786 normally happens at the end of a line.
789 If non-nil, don't designate ASCII to G0 before control chars on output.
792 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
796 If non-nil, use locking-shift (SO/SI) instead of single-shift
797 or designation by escape sequence.
800 If non-nil, don't use ISO6429's direction specification.
803 If non-nil, literal control characters that are the same as
804 the beginning of a recognized ISO2022 or ISO6429 escape sequence
805 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
806 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
807 so that they can be properly distinguished from an escape sequence.
808 (Note that doing this results in a non-portable encoding.) This
809 encoding flag is used for byte-compiled files. Note that ESC
810 is a good choice for a quoting character because there are no
811 escape sequences whose second byte is a character from the Control-0
812 or Control-1 character sets; this is explicitly disallowed by the
815 'input-charset-conversion
816 A list of conversion specifications, specifying conversion of
817 characters in one charset to another when decoding is performed.
818 Each specification is a list of two elements: the source charset,
819 and the destination charset.
821 'output-charset-conversion
822 A list of conversion specifications, specifying conversion of
823 characters in one charset to another when encoding is performed.
824 The form of each specification is the same as for
825 'input-charset-conversion.
828 The following additional properties are recognized (and required)
832 CCL program used for decoding (converting to internal format).
835 CCL program used for encoding (converting to external format).
837 (name, type, doc_string, props))
839 Lisp_Coding_System *codesys;
840 Lisp_Object rest, key, value;
841 enum coding_system_type ty;
842 int need_to_setup_eol_systems = 1;
844 /* Convert type to constant */
845 if (NILP (type) || EQ (type, Qundecided))
846 { ty = CODESYS_AUTODETECT; }
848 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
849 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
850 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
851 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
852 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
853 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
855 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
857 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
860 signal_simple_error ("Invalid coding system type", type);
864 codesys = allocate_coding_system (ty, name);
866 if (NILP (doc_string))
867 doc_string = build_string ("");
869 CHECK_STRING (doc_string);
870 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
873 if (ty == CODESYS_NO_CONVERSION)
874 codesys->fixed.size = 1;
876 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
878 if (EQ (key, Qmnemonic))
881 CHECK_STRING (value);
882 CODING_SYSTEM_MNEMONIC (codesys) = value;
885 else if (EQ (key, Qeol_type))
887 need_to_setup_eol_systems = NILP (value);
890 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
893 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
894 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
896 else if (ty == CODESYS_ISO2022)
898 #define FROB_INITIAL_CHARSET(charset_num) \
899 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
900 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
902 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
903 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
904 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
905 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
907 #define FROB_FORCE_CHARSET(charset_num) \
908 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
910 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
911 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
912 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
913 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
915 #define FROB_BOOLEAN_PROPERTY(prop) \
916 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
918 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
919 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
920 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
921 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
922 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
923 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
924 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
926 else if (EQ (key, Qinput_charset_conversion))
928 codesys->iso2022.input_conv =
929 Dynarr_new (charset_conversion_spec);
930 parse_charset_conversion_specs (codesys->iso2022.input_conv,
933 else if (EQ (key, Qoutput_charset_conversion))
935 codesys->iso2022.output_conv =
936 Dynarr_new (charset_conversion_spec);
937 parse_charset_conversion_specs (codesys->iso2022.output_conv,
941 signal_simple_error ("Unrecognized property", key);
943 else if (EQ (type, Qccl))
945 if (EQ (key, Qdecode))
947 CHECK_VECTOR (value);
948 CODING_SYSTEM_CCL_DECODE (codesys) = value;
950 else if (EQ (key, Qencode))
952 CHECK_VECTOR (value);
953 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
956 signal_simple_error ("Unrecognized property", key);
960 signal_simple_error ("Unrecognized property", key);
963 if (need_to_setup_eol_systems)
964 setup_eol_coding_systems (codesys);
967 Lisp_Object codesys_obj;
968 XSETCODING_SYSTEM (codesys_obj, codesys);
969 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
974 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
975 Copy OLD-CODING-SYSTEM to NEW-NAME.
976 If NEW-NAME does not name an existing coding system, a new one will
979 (old_coding_system, new_name))
981 Lisp_Object new_coding_system;
982 old_coding_system = Fget_coding_system (old_coding_system);
983 new_coding_system = Ffind_coding_system (new_name);
984 if (NILP (new_coding_system))
986 XSETCODING_SYSTEM (new_coding_system,
987 allocate_coding_system
988 (XCODING_SYSTEM_TYPE (old_coding_system),
990 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
994 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
995 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
996 memcpy (((char *) to ) + sizeof (to->header),
997 ((char *) from) + sizeof (from->header),
998 sizeof (*from) - sizeof (from->header));
1001 return new_coding_system;
1004 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1005 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1007 (alias, coding_system))
1009 CHECK_SYMBOL (alias);
1010 if (!NILP (Ffind_coding_system (alias)))
1011 signal_simple_error ("Symbol already names a coding system", alias);
1012 coding_system = Fget_coding_system (coding_system);
1013 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1015 /* Set up aliases for subsidiaries. */
1016 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1019 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1020 #define FROB(type, name) \
1022 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1023 if (!NILP (subsidiary)) \
1024 Fdefine_coding_system_alias \
1025 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1028 FROB (CRLF, "-dos");
1032 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1033 but it doesn't look intentional, so I'd rather return something
1034 meaningful or nothing at all. */
1039 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1041 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1042 Lisp_Object new_coding_system;
1044 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1045 return coding_system;
1049 case EOL_AUTODETECT: return coding_system;
1050 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1051 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1052 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1056 return NILP (new_coding_system) ? coding_system : new_coding_system;
1059 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1060 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1062 (coding_system, eol_type))
1064 coding_system = Fget_coding_system (coding_system);
1066 return subsidiary_coding_system (coding_system,
1067 symbol_to_eol_type (eol_type));
1071 /************************************************************************/
1072 /* Coding system accessors */
1073 /************************************************************************/
1075 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1076 Return the doc string for CODING-SYSTEM.
1080 coding_system = Fget_coding_system (coding_system);
1081 return XCODING_SYSTEM_DOC_STRING (coding_system);
1084 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1085 Return the type of CODING-SYSTEM.
1089 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1092 case CODESYS_AUTODETECT: return Qundecided;
1094 case CODESYS_SHIFT_JIS: return Qshift_jis;
1095 case CODESYS_ISO2022: return Qiso2022;
1096 case CODESYS_BIG5: return Qbig5;
1097 case CODESYS_UCS4: return Qucs4;
1098 case CODESYS_UTF8: return Qutf8;
1099 case CODESYS_CCL: return Qccl;
1101 case CODESYS_NO_CONVERSION: return Qno_conversion;
1103 case CODESYS_INTERNAL: return Qinternal;
1110 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1113 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1115 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1118 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1119 Return initial charset of CODING-SYSTEM designated to GNUM.
1122 (coding_system, gnum))
1124 coding_system = Fget_coding_system (coding_system);
1127 return coding_system_charset (coding_system, XINT (gnum));
1131 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1132 Return the PROP property of CODING-SYSTEM.
1134 (coding_system, prop))
1137 enum coding_system_type type;
1139 coding_system = Fget_coding_system (coding_system);
1140 CHECK_SYMBOL (prop);
1141 type = XCODING_SYSTEM_TYPE (coding_system);
1143 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1144 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1147 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1149 case CODESYS_PROP_ALL_OK:
1152 case CODESYS_PROP_ISO2022:
1153 if (type != CODESYS_ISO2022)
1155 ("Property only valid in ISO2022 coding systems",
1159 case CODESYS_PROP_CCL:
1160 if (type != CODESYS_CCL)
1162 ("Property only valid in CCL coding systems",
1172 signal_simple_error ("Unrecognized property", prop);
1174 if (EQ (prop, Qname))
1175 return XCODING_SYSTEM_NAME (coding_system);
1176 else if (EQ (prop, Qtype))
1177 return Fcoding_system_type (coding_system);
1178 else if (EQ (prop, Qdoc_string))
1179 return XCODING_SYSTEM_DOC_STRING (coding_system);
1180 else if (EQ (prop, Qmnemonic))
1181 return XCODING_SYSTEM_MNEMONIC (coding_system);
1182 else if (EQ (prop, Qeol_type))
1183 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1184 else if (EQ (prop, Qeol_lf))
1185 return XCODING_SYSTEM_EOL_LF (coding_system);
1186 else if (EQ (prop, Qeol_crlf))
1187 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1188 else if (EQ (prop, Qeol_cr))
1189 return XCODING_SYSTEM_EOL_CR (coding_system);
1190 else if (EQ (prop, Qpost_read_conversion))
1191 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1192 else if (EQ (prop, Qpre_write_conversion))
1193 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1195 else if (type == CODESYS_ISO2022)
1197 if (EQ (prop, Qcharset_g0))
1198 return coding_system_charset (coding_system, 0);
1199 else if (EQ (prop, Qcharset_g1))
1200 return coding_system_charset (coding_system, 1);
1201 else if (EQ (prop, Qcharset_g2))
1202 return coding_system_charset (coding_system, 2);
1203 else if (EQ (prop, Qcharset_g3))
1204 return coding_system_charset (coding_system, 3);
1206 #define FORCE_CHARSET(charset_num) \
1207 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1208 (coding_system, charset_num) ? Qt : Qnil)
1210 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1211 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1212 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1213 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1215 #define LISP_BOOLEAN(prop) \
1216 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1218 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1219 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1220 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1221 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1222 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1223 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1224 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1226 else if (EQ (prop, Qinput_charset_conversion))
1228 unparse_charset_conversion_specs
1229 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1230 else if (EQ (prop, Qoutput_charset_conversion))
1232 unparse_charset_conversion_specs
1233 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1237 else if (type == CODESYS_CCL)
1239 if (EQ (prop, Qdecode))
1240 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1241 else if (EQ (prop, Qencode))
1242 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1250 return Qnil; /* not reached */
1254 /************************************************************************/
1255 /* Coding category functions */
1256 /************************************************************************/
1259 decode_coding_category (Lisp_Object symbol)
1263 CHECK_SYMBOL (symbol);
1264 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1265 if (EQ (coding_category_symbol[i], symbol))
1268 signal_simple_error ("Unrecognized coding category", symbol);
1269 return 0; /* not reached */
1272 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1273 Return a list of all recognized coding categories.
1278 Lisp_Object list = Qnil;
1280 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1281 list = Fcons (coding_category_symbol[i], list);
1285 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1286 Change the priority order of the coding categories.
1287 LIST should be list of coding categories, in descending order of
1288 priority. Unspecified coding categories will be lower in priority
1289 than all specified ones, in the same relative order they were in
1294 int category_to_priority[CODING_CATEGORY_LAST + 1];
1298 /* First generate a list that maps coding categories to priorities. */
1300 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1301 category_to_priority[i] = -1;
1303 /* Highest priority comes from the specified list. */
1305 EXTERNAL_LIST_LOOP (rest, list)
1307 int cat = decode_coding_category (XCAR (rest));
1309 if (category_to_priority[cat] >= 0)
1310 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1311 category_to_priority[cat] = i++;
1314 /* Now go through the existing categories by priority to retrieve
1315 the categories not yet specified and preserve their priority
1317 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1319 int cat = coding_category_by_priority[j];
1320 if (category_to_priority[cat] < 0)
1321 category_to_priority[cat] = i++;
1324 /* Now we need to construct the inverse of the mapping we just
1327 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1328 coding_category_by_priority[category_to_priority[i]] = i;
1330 /* Phew! That was confusing. */
1334 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1335 Return a list of coding categories in descending order of priority.
1340 Lisp_Object list = Qnil;
1342 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1343 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1348 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1349 Change the coding system associated with a coding category.
1351 (coding_category, coding_system))
1353 int cat = decode_coding_category (coding_category);
1355 coding_system = Fget_coding_system (coding_system);
1356 coding_category_system[cat] = coding_system;
1360 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1361 Return the coding system associated with a coding category.
1365 int cat = decode_coding_category (coding_category);
1366 Lisp_Object sys = coding_category_system[cat];
1369 return XCODING_SYSTEM_NAME (sys);
1374 /************************************************************************/
1375 /* Detecting the encoding of data */
1376 /************************************************************************/
1378 struct detection_state
1380 enum eol_type eol_type;
1416 struct iso2022_decoder iso;
1418 int high_byte_count;
1419 unsigned int saw_single_shift:1;
1432 acceptable_control_char_p (int c)
1436 /* Allow and ignore control characters that you might
1437 reasonably see in a text file */
1442 case 8: /* backspace */
1443 case 11: /* vertical tab */
1444 case 12: /* form feed */
1445 case 26: /* MS-DOS C-z junk */
1446 case 31: /* '^_' -- for info */
1454 mask_has_at_most_one_bit_p (int mask)
1456 /* Perhaps the only thing useful you learn from intensive Microsoft
1457 technical interviews */
1458 return (mask & (mask - 1)) == 0;
1461 static enum eol_type
1462 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1471 st->eol.just_saw_cr = 1;
1476 if (st->eol.just_saw_cr)
1478 else if (st->eol.seen_anything)
1481 else if (st->eol.just_saw_cr)
1483 st->eol.just_saw_cr = 0;
1485 st->eol.seen_anything = 1;
1488 return EOL_AUTODETECT;
1491 /* Attempt to determine the encoding and EOL type of the given text.
1492 Before calling this function for the first type, you must initialize
1493 st->eol_type as appropriate and initialize st->mask to ~0.
1495 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1498 st->mask holds the determined coding category mask, or ~0 if only
1499 ASCII has been seen so far.
1503 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1504 is present in st->mask
1505 1 == definitive answers are here for both st->eol_type and st->mask
1509 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1510 unsigned int n, int just_do_eol)
1514 if (st->eol_type == EOL_AUTODETECT)
1515 st->eol_type = detect_eol_type (st, src, n);
1518 return st->eol_type != EOL_AUTODETECT;
1520 if (!st->seen_non_ascii)
1522 for (; n; n--, src++)
1525 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1527 st->seen_non_ascii = 1;
1529 st->shift_jis.mask = ~0;
1533 st->iso2022.mask = ~0;
1543 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1544 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1545 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1546 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1547 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1548 st->big5.mask = detect_coding_big5 (st, src, n);
1549 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1550 st->utf8.mask = detect_coding_utf8 (st, src, n);
1551 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1552 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1555 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1556 | st->utf8.mask | st->ucs4.mask;
1559 int retval = mask_has_at_most_one_bit_p (st->mask);
1560 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1561 return retval && st->eol_type != EOL_AUTODETECT;
1566 coding_system_from_mask (int mask)
1570 /* If the file was entirely or basically ASCII, use the
1571 default value of `buffer-file-coding-system'. */
1572 Lisp_Object retval =
1573 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1576 retval = Ffind_coding_system (retval);
1580 (Qbad_variable, Qwarning,
1581 "Invalid `default-buffer-file-coding-system', set to nil");
1582 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1586 retval = Fget_coding_system (Qraw_text);
1594 mask = postprocess_iso2022_mask (mask);
1596 /* Look through the coding categories by priority and find
1597 the first one that is allowed. */
1598 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1600 cat = coding_category_by_priority[i];
1601 if ((mask & (1 << cat)) &&
1602 !NILP (coding_category_system[cat]))
1606 return coding_category_system[cat];
1608 return Fget_coding_system (Qraw_text);
1612 /* Given a seekable read stream and potential coding system and EOL type
1613 as specified, do any autodetection that is called for. If the
1614 coding system and/or EOL type are not autodetect, they will be left
1615 alone; but this function will never return an autodetect coding system
1618 This function does not automatically fetch subsidiary coding systems;
1619 that should be unnecessary with the explicit eol-type argument. */
1622 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1623 enum eol_type *eol_type_in_out)
1625 struct detection_state decst;
1627 if (*eol_type_in_out == EOL_AUTODETECT)
1628 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1631 decst.eol_type = *eol_type_in_out;
1634 /* If autodetection is called for, do it now. */
1635 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1636 *eol_type_in_out == EOL_AUTODETECT)
1638 unsigned char random_buffer[4096];
1640 Lisp_Object coding_system = Qnil;
1642 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1645 unsigned char *cp = random_buffer;
1647 while (cp < random_buffer + nread)
1649 if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1650 (*cp++ == 'o') && (cp < random_buffer + nread) &&
1651 (*cp++ == 'd') && (cp < random_buffer + nread) &&
1652 (*cp++ == 'i') && (cp < random_buffer + nread) &&
1653 (*cp++ == 'n') && (cp < random_buffer + nread) &&
1654 (*cp++ == 'g') && (cp < random_buffer + nread) &&
1655 (*cp++ == ':') && (cp < random_buffer + nread))
1657 unsigned char coding_system_name[4096 - 6];
1658 unsigned char *np = coding_system_name;
1660 while ( (cp < random_buffer + nread)
1661 && ((*cp == ' ') || (*cp == '\t')) )
1665 while ( (cp < random_buffer + nread) &&
1666 (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1672 = Ffind_coding_system (intern (coding_system_name));
1676 if (EQ(coding_system, Qnil))
1678 if (detect_coding_type (&decst, random_buffer, nread,
1679 XCODING_SYSTEM_TYPE (*codesys_in_out)
1680 != CODESYS_AUTODETECT))
1682 nread = Lstream_read (stream,
1683 random_buffer, sizeof (random_buffer));
1688 *eol_type_in_out = decst.eol_type;
1689 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1691 if (EQ(coding_system, Qnil))
1692 *codesys_in_out = coding_system_from_mask (decst.mask);
1694 *codesys_in_out = coding_system;
1697 /* If we absolutely can't determine the EOL type, just assume LF. */
1698 if (*eol_type_in_out == EOL_AUTODETECT)
1699 *eol_type_in_out = EOL_LF;
1701 Lstream_rewind (stream);
1704 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1705 Detect coding system of the text in the region between START and END.
1706 Returned a list of possible coding systems ordered by priority.
1707 If only ASCII characters are found, it returns 'undecided or one of
1708 its subsidiary coding systems according to a detected end-of-line
1709 type. Optional arg BUFFER defaults to the current buffer.
1711 (start, end, buffer))
1713 Lisp_Object val = Qnil;
1714 struct buffer *buf = decode_buffer (buffer, 0);
1716 Lisp_Object instream, lb_instream;
1717 Lstream *istr, *lb_istr;
1718 struct detection_state decst;
1719 struct gcpro gcpro1, gcpro2;
1721 get_buffer_range_char (buf, start, end, &b, &e, 0);
1722 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1723 lb_istr = XLSTREAM (lb_instream);
1724 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1725 istr = XLSTREAM (instream);
1726 GCPRO2 (instream, lb_instream);
1728 decst.eol_type = EOL_AUTODETECT;
1732 unsigned char random_buffer[4096];
1733 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1737 if (detect_coding_type (&decst, random_buffer, nread, 0))
1741 if (decst.mask == ~0)
1742 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1750 decst.mask = postprocess_iso2022_mask (decst.mask);
1752 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1754 int sys = coding_category_by_priority[i];
1755 if (decst.mask & (1 << sys))
1757 Lisp_Object codesys = coding_category_system[sys];
1758 if (!NILP (codesys))
1759 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1760 val = Fcons (codesys, val);
1764 Lstream_close (istr);
1766 Lstream_delete (istr);
1767 Lstream_delete (lb_istr);
1772 /************************************************************************/
1773 /* Converting to internal Mule format ("decoding") */
1774 /************************************************************************/
1776 /* A decoding stream is a stream used for decoding text (i.e.
1777 converting from some external format to internal format).
1778 The decoding-stream object keeps track of the actual coding
1779 stream, the stream that is at the other end, and data that
1780 needs to be persistent across the lifetime of the stream. */
1782 /* Handle the EOL stuff related to just-read-in character C.
1783 EOL_TYPE is the EOL type of the coding stream.
1784 FLAGS is the current value of FLAGS in the coding stream, and may
1785 be modified by this macro. (The macro only looks at the
1786 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1787 bytes are to be written. You need to also define a local goto
1788 label "label_continue_loop" that is at the end of the main
1789 character-reading loop.
1791 If C is a CR character, then this macro handles it entirely and
1792 jumps to label_continue_loop. Otherwise, this macro does not add
1793 anything to DST, and continues normally. You should continue
1794 processing C normally after this macro. */
1796 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1800 if (eol_type == EOL_CR) \
1801 Dynarr_add (dst, '\n'); \
1802 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1803 Dynarr_add (dst, c); \
1805 flags |= CODING_STATE_CR; \
1806 goto label_continue_loop; \
1808 else if (flags & CODING_STATE_CR) \
1809 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1811 Dynarr_add (dst, '\r'); \
1812 flags &= ~CODING_STATE_CR; \
1816 /* C should be a binary character in the range 0 - 255; convert
1817 to internal format and add to Dynarr DST. */
1820 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1822 if (BYTE_ASCII_P (c)) \
1823 Dynarr_add (dst, c); \
1826 Dynarr_add (dst, (c >> 6) | 0xc0); \
1827 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1832 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1836 Dynarr_add (dst, c);
1838 else if ( c <= 0x7ff )
1840 Dynarr_add (dst, (c >> 6) | 0xc0);
1841 Dynarr_add (dst, (c & 0x3f) | 0x80);
1843 else if ( c <= 0xffff )
1845 Dynarr_add (dst, (c >> 12) | 0xe0);
1846 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1847 Dynarr_add (dst, (c & 0x3f) | 0x80);
1849 else if ( c <= 0x1fffff )
1851 Dynarr_add (dst, (c >> 18) | 0xf0);
1852 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1853 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1854 Dynarr_add (dst, (c & 0x3f) | 0x80);
1856 else if ( c <= 0x3ffffff )
1858 Dynarr_add (dst, (c >> 24) | 0xf8);
1859 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1860 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1861 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1862 Dynarr_add (dst, (c & 0x3f) | 0x80);
1866 Dynarr_add (dst, (c >> 30) | 0xfc);
1867 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1868 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1869 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1870 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1871 Dynarr_add (dst, (c & 0x3f) | 0x80);
1875 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1877 if (BYTE_ASCII_P (c)) \
1878 Dynarr_add (dst, c); \
1879 else if (BYTE_C1_P (c)) \
1881 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1882 Dynarr_add (dst, c + 0x20); \
1886 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1887 Dynarr_add (dst, c); \
1892 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1896 DECODE_ADD_BINARY_CHAR (ch, dst); \
1901 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1903 if (flags & CODING_STATE_END) \
1905 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1906 if (flags & CODING_STATE_CR) \
1907 Dynarr_add (dst, '\r'); \
1911 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1913 struct decoding_stream
1915 /* Coding system that governs the conversion. */
1916 Lisp_Coding_System *codesys;
1918 /* Stream that we read the encoded data from or
1919 write the decoded data to. */
1922 /* If we are reading, then we can return only a fixed amount of
1923 data, so if the conversion resulted in too much data, we store it
1924 here for retrieval the next time around. */
1925 unsigned_char_dynarr *runoff;
1927 /* FLAGS holds flags indicating the current state of the decoding.
1928 Some of these flags are dependent on the coding system. */
1931 /* CH holds a partially built-up character. Since we only deal
1932 with one- and two-byte characters at the moment, we only use
1933 this to store the first byte of a two-byte character. */
1936 /* EOL_TYPE specifies the type of end-of-line conversion that
1937 currently applies. We need to keep this separate from the
1938 EOL type stored in CODESYS because the latter might indicate
1939 automatic EOL-type detection while the former will always
1940 indicate a particular EOL type. */
1941 enum eol_type eol_type;
1943 /* Additional ISO2022 information. We define the structure above
1944 because it's also needed by the detection routines. */
1945 struct iso2022_decoder iso2022;
1947 /* Additional information (the state of the running CCL program)
1948 used by the CCL decoder. */
1949 struct ccl_program ccl;
1951 /* counter for UTF-8 or UCS-4 */
1952 unsigned char counter;
1954 struct detection_state decst;
1957 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1958 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1959 static int decoding_rewinder (Lstream *stream);
1960 static int decoding_seekable_p (Lstream *stream);
1961 static int decoding_flusher (Lstream *stream);
1962 static int decoding_closer (Lstream *stream);
1964 static Lisp_Object decoding_marker (Lisp_Object stream,
1965 void (*markobj) (Lisp_Object));
1967 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1968 sizeof (struct decoding_stream));
1971 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1973 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1974 Lisp_Object str_obj;
1976 /* We do not need to mark the coding systems or charsets stored
1977 within the stream because they are stored in a global list
1978 and automatically marked. */
1980 XSETLSTREAM (str_obj, str);
1982 if (str->imp->marker)
1983 return (str->imp->marker) (str_obj, markobj);
1988 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1989 so we read data from the other end, decode it, and store it into DATA. */
1992 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1994 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1995 unsigned char *orig_data = data;
1997 int error_occurred = 0;
1999 /* We need to interface to mule_decode(), which expects to take some
2000 amount of data and store the result into a Dynarr. We have
2001 mule_decode() store into str->runoff, and take data from there
2004 /* We loop until we have enough data, reading chunks from the other
2005 end and decoding it. */
2008 /* Take data from the runoff if we can. Make sure to take at
2009 most SIZE bytes, and delete the data from the runoff. */
2010 if (Dynarr_length (str->runoff) > 0)
2012 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2013 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2014 Dynarr_delete_many (str->runoff, 0, chunk);
2020 break; /* No more room for data */
2022 if (str->flags & CODING_STATE_END)
2023 /* This means that on the previous iteration, we hit the EOF on
2024 the other end. We loop once more so that mule_decode() can
2025 output any final stuff it may be holding, or any "go back
2026 to a sane state" escape sequences. (This latter makes sense
2027 during encoding.) */
2030 /* Exhausted the runoff, so get some more. DATA has at least
2031 SIZE bytes left of storage in it, so it's OK to read directly
2032 into it. (We'll be overwriting above, after we've decoded it
2033 into the runoff.) */
2034 read_size = Lstream_read (str->other_end, data, size);
2041 /* There might be some more end data produced in the translation.
2042 See the comment above. */
2043 str->flags |= CODING_STATE_END;
2044 mule_decode (stream, data, str->runoff, read_size);
2047 if (data - orig_data == 0)
2048 return error_occurred ? -1 : 0;
2050 return data - orig_data;
2054 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2056 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2059 /* Decode all our data into the runoff, and then attempt to write
2060 it all out to the other end. Remove whatever chunk we succeeded
2062 mule_decode (stream, data, str->runoff, size);
2063 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2064 Dynarr_length (str->runoff));
2066 Dynarr_delete_many (str->runoff, 0, retval);
2067 /* Do NOT return retval. The return value indicates how much
2068 of the incoming data was written, not how many bytes were
2074 reset_decoding_stream (struct decoding_stream *str)
2077 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2079 Lisp_Object coding_system;
2080 XSETCODING_SYSTEM (coding_system, str->codesys);
2081 reset_iso2022 (coding_system, &str->iso2022);
2083 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2085 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2089 str->flags = str->ch = 0;
2093 decoding_rewinder (Lstream *stream)
2095 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2096 reset_decoding_stream (str);
2097 Dynarr_reset (str->runoff);
2098 return Lstream_rewind (str->other_end);
2102 decoding_seekable_p (Lstream *stream)
2104 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2105 return Lstream_seekable_p (str->other_end);
2109 decoding_flusher (Lstream *stream)
2111 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2112 return Lstream_flush (str->other_end);
2116 decoding_closer (Lstream *stream)
2118 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2119 if (stream->flags & LSTREAM_FL_WRITE)
2121 str->flags |= CODING_STATE_END;
2122 decoding_writer (stream, 0, 0);
2124 Dynarr_free (str->runoff);
2126 #ifdef ENABLE_COMPOSITE_CHARS
2127 if (str->iso2022.composite_chars)
2128 Dynarr_free (str->iso2022.composite_chars);
2131 return Lstream_close (str->other_end);
2135 decoding_stream_coding_system (Lstream *stream)
2137 Lisp_Object coding_system;
2138 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2140 XSETCODING_SYSTEM (coding_system, str->codesys);
2141 return subsidiary_coding_system (coding_system, str->eol_type);
2145 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2147 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2148 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2150 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2151 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2152 reset_decoding_stream (str);
2155 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2156 stream for writing, no automatic code detection will be performed.
2157 The reason for this is that automatic code detection requires a
2158 seekable input. Things will also fail if you open a decoding
2159 stream for reading using a non-fully-specified coding system and
2160 a non-seekable input stream. */
2163 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2166 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2167 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2171 str->other_end = stream;
2172 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2173 str->eol_type = EOL_AUTODETECT;
2174 if (!strcmp (mode, "r")
2175 && Lstream_seekable_p (stream))
2176 /* We can determine the coding system now. */
2177 determine_real_coding_system (stream, &codesys, &str->eol_type);
2178 set_decoding_stream_coding_system (lstr, codesys);
2179 str->decst.eol_type = str->eol_type;
2180 str->decst.mask = ~0;
2181 XSETLSTREAM (obj, lstr);
2186 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2188 return make_decoding_stream_1 (stream, codesys, "r");
2192 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2194 return make_decoding_stream_1 (stream, codesys, "w");
2197 /* Note: the decode_coding_* functions all take the same
2198 arguments as mule_decode(), which is to say some SRC data of
2199 size N, which is to be stored into dynamic array DST.
2200 DECODING is the stream within which the decoding is
2201 taking place, but no data is actually read from or
2202 written to that stream; that is handled in decoding_reader()
2203 or decoding_writer(). This allows the same functions to
2204 be used for both reading and writing. */
2207 mule_decode (Lstream *decoding, CONST unsigned char *src,
2208 unsigned_char_dynarr *dst, unsigned int n)
2210 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2212 /* If necessary, do encoding-detection now. We do this when
2213 we're a writing stream or a non-seekable reading stream,
2214 meaning that we can't just process the whole input,
2215 rewind, and start over. */
2217 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2218 str->eol_type == EOL_AUTODETECT)
2220 Lisp_Object codesys;
2222 XSETCODING_SYSTEM (codesys, str->codesys);
2223 detect_coding_type (&str->decst, src, n,
2224 CODING_SYSTEM_TYPE (str->codesys) !=
2225 CODESYS_AUTODETECT);
2226 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2227 str->decst.mask != ~0)
2228 /* #### This is cheesy. What we really ought to do is
2229 buffer up a certain amount of data so as to get a
2230 less random result. */
2231 codesys = coding_system_from_mask (str->decst.mask);
2232 str->eol_type = str->decst.eol_type;
2233 if (XCODING_SYSTEM (codesys) != str->codesys)
2235 /* Preserve the CODING_STATE_END flag in case it was set.
2236 If we erase it, bad things might happen. */
2237 int was_end = str->flags & CODING_STATE_END;
2238 set_decoding_stream_coding_system (decoding, codesys);
2240 str->flags |= CODING_STATE_END;
2244 switch (CODING_SYSTEM_TYPE (str->codesys))
2247 case CODESYS_INTERNAL:
2248 Dynarr_add_many (dst, src, n);
2251 case CODESYS_AUTODETECT:
2252 /* If we got this far and still haven't decided on the coding
2253 system, then do no conversion. */
2254 case CODESYS_NO_CONVERSION:
2255 decode_coding_no_conversion (decoding, src, dst, n);
2258 case CODESYS_SHIFT_JIS:
2259 decode_coding_sjis (decoding, src, dst, n);
2262 decode_coding_big5 (decoding, src, dst, n);
2265 decode_coding_ucs4 (decoding, src, dst, n);
2268 decode_coding_utf8 (decoding, src, dst, n);
2271 str->ccl.last_block = str->flags & CODING_STATE_END;
2272 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2274 case CODESYS_ISO2022:
2275 decode_coding_iso2022 (decoding, src, dst, n);
2283 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2284 Decode the text between START and END which is encoded in CODING-SYSTEM.
2285 This is useful if you've read in encoded text from a file without decoding
2286 it (e.g. you read in a JIS-formatted file but used the `binary' or
2287 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2288 Return length of decoded text.
2289 BUFFER defaults to the current buffer if unspecified.
2291 (start, end, coding_system, buffer))
2294 struct buffer *buf = decode_buffer (buffer, 0);
2295 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2296 Lstream *istr, *ostr;
2297 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2299 get_buffer_range_char (buf, start, end, &b, &e, 0);
2301 barf_if_buffer_read_only (buf, b, e);
2303 coding_system = Fget_coding_system (coding_system);
2304 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2305 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2306 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2308 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2309 Fget_coding_system (Qbinary));
2310 istr = XLSTREAM (instream);
2311 ostr = XLSTREAM (outstream);
2312 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2314 /* The chain of streams looks like this:
2316 [BUFFER] <----- send through
2317 ------> [ENCODE AS BINARY]
2318 ------> [DECODE AS SPECIFIED]
2324 char tempbuf[1024]; /* some random amount */
2325 Bufpos newpos, even_newer_pos;
2326 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2327 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2331 newpos = lisp_buffer_stream_startpos (istr);
2332 Lstream_write (ostr, tempbuf, size_in_bytes);
2333 even_newer_pos = lisp_buffer_stream_startpos (istr);
2334 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2337 Lstream_close (istr);
2338 Lstream_close (ostr);
2340 Lstream_delete (istr);
2341 Lstream_delete (ostr);
2342 Lstream_delete (XLSTREAM (de_outstream));
2343 Lstream_delete (XLSTREAM (lb_outstream));
2348 /************************************************************************/
2349 /* Converting to an external encoding ("encoding") */
2350 /************************************************************************/
2352 /* An encoding stream is an output stream. When you create the
2353 stream, you specify the coding system that governs the encoding
2354 and another stream that the resulting encoded data is to be
2355 sent to, and then start sending data to it. */
2357 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2359 struct encoding_stream
2361 /* Coding system that governs the conversion. */
2362 Lisp_Coding_System *codesys;
2364 /* Stream that we read the encoded data from or
2365 write the decoded data to. */
2368 /* If we are reading, then we can return only a fixed amount of
2369 data, so if the conversion resulted in too much data, we store it
2370 here for retrieval the next time around. */
2371 unsigned_char_dynarr *runoff;
2373 /* FLAGS holds flags indicating the current state of the encoding.
2374 Some of these flags are dependent on the coding system. */
2377 /* CH holds a partially built-up character. Since we only deal
2378 with one- and two-byte characters at the moment, we only use
2379 this to store the first byte of a two-byte character. */
2382 /* Additional information used by the ISO2022 encoder. */
2385 /* CHARSET holds the character sets currently assigned to the G0
2386 through G3 registers. It is initialized from the array
2387 INITIAL_CHARSET in CODESYS. */
2388 Lisp_Object charset[4];
2390 /* Which registers are currently invoked into the left (GL) and
2391 right (GR) halves of the 8-bit encoding space? */
2392 int register_left, register_right;
2394 /* Whether we need to explicitly designate the charset in the
2395 G? register before using it. It is initialized from the
2396 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2397 unsigned char force_charset_on_output[4];
2399 /* Other state variables that need to be preserved across
2401 Lisp_Object current_charset;
2403 int current_char_boundary;
2406 void (*encode_char) (struct encoding_stream *str, Emchar c,
2407 unsigned_char_dynarr *dst, unsigned int *flags);
2408 void (*finish) (struct encoding_stream *str,
2409 unsigned_char_dynarr *dst, unsigned int *flags);
2411 /* Additional information (the state of the running CCL program)
2412 used by the CCL encoder. */
2413 struct ccl_program ccl;
2417 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2418 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2420 static int encoding_rewinder (Lstream *stream);
2421 static int encoding_seekable_p (Lstream *stream);
2422 static int encoding_flusher (Lstream *stream);
2423 static int encoding_closer (Lstream *stream);
2425 static Lisp_Object encoding_marker (Lisp_Object stream,
2426 void (*markobj) (Lisp_Object));
2428 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2429 sizeof (struct encoding_stream));
2432 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2434 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2435 Lisp_Object str_obj;
2437 /* We do not need to mark the coding systems or charsets stored
2438 within the stream because they are stored in a global list
2439 and automatically marked. */
2441 XSETLSTREAM (str_obj, str);
2443 if (str->imp->marker)
2444 return (str->imp->marker) (str_obj, markobj);
2449 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2450 so we read data from the other end, encode it, and store it into DATA. */
2453 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2455 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2456 unsigned char *orig_data = data;
2458 int error_occurred = 0;
2460 /* We need to interface to mule_encode(), which expects to take some
2461 amount of data and store the result into a Dynarr. We have
2462 mule_encode() store into str->runoff, and take data from there
2465 /* We loop until we have enough data, reading chunks from the other
2466 end and encoding it. */
2469 /* Take data from the runoff if we can. Make sure to take at
2470 most SIZE bytes, and delete the data from the runoff. */
2471 if (Dynarr_length (str->runoff) > 0)
2473 int chunk = min ((int) size, Dynarr_length (str->runoff));
2474 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2475 Dynarr_delete_many (str->runoff, 0, chunk);
2481 break; /* No more room for data */
2483 if (str->flags & CODING_STATE_END)
2484 /* This means that on the previous iteration, we hit the EOF on
2485 the other end. We loop once more so that mule_encode() can
2486 output any final stuff it may be holding, or any "go back
2487 to a sane state" escape sequences. (This latter makes sense
2488 during encoding.) */
2491 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2492 left of storage in it, so it's OK to read directly into it.
2493 (We'll be overwriting above, after we've encoded it into the
2495 read_size = Lstream_read (str->other_end, data, size);
2502 /* There might be some more end data produced in the translation.
2503 See the comment above. */
2504 str->flags |= CODING_STATE_END;
2505 mule_encode (stream, data, str->runoff, read_size);
2508 if (data == orig_data)
2509 return error_occurred ? -1 : 0;
2511 return data - orig_data;
2515 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2517 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2520 /* Encode all our data into the runoff, and then attempt to write
2521 it all out to the other end. Remove whatever chunk we succeeded
2523 mule_encode (stream, data, str->runoff, size);
2524 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2525 Dynarr_length (str->runoff));
2527 Dynarr_delete_many (str->runoff, 0, retval);
2528 /* Do NOT return retval. The return value indicates how much
2529 of the incoming data was written, not how many bytes were
2535 reset_encoding_stream (struct encoding_stream *str)
2538 switch (CODING_SYSTEM_TYPE (str->codesys))
2540 case CODESYS_ISO2022:
2544 str->encode_char = &char_encode_iso2022;
2545 str->finish = &char_finish_iso2022;
2546 for (i = 0; i < 4; i++)
2548 str->iso2022.charset[i] =
2549 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2550 str->iso2022.force_charset_on_output[i] =
2551 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2553 str->iso2022.register_left = 0;
2554 str->iso2022.register_right = 1;
2555 str->iso2022.current_charset = Qnil;
2556 str->iso2022.current_half = 0;
2560 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2563 str->encode_char = &char_encode_utf8;
2564 str->finish = &char_finish_utf8;
2566 str->encode_char = &char_encode_ucs4;
2567 str->finish = &char_finish_ucs4;
2568 case CODESYS_SHIFT_JIS:
2569 str->encode_char = &char_encode_shift_jis;
2570 str->finish = &char_finish_shift_jis;
2575 str->iso2022.current_char_boundary = 0;
2576 str->flags = str->ch = 0;
2580 encoding_rewinder (Lstream *stream)
2582 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2583 reset_encoding_stream (str);
2584 Dynarr_reset (str->runoff);
2585 return Lstream_rewind (str->other_end);
2589 encoding_seekable_p (Lstream *stream)
2591 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2592 return Lstream_seekable_p (str->other_end);
2596 encoding_flusher (Lstream *stream)
2598 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2599 return Lstream_flush (str->other_end);
2603 encoding_closer (Lstream *stream)
2605 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2606 if (stream->flags & LSTREAM_FL_WRITE)
2608 str->flags |= CODING_STATE_END;
2609 encoding_writer (stream, 0, 0);
2611 Dynarr_free (str->runoff);
2612 return Lstream_close (str->other_end);
2616 encoding_stream_coding_system (Lstream *stream)
2618 Lisp_Object coding_system;
2619 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2621 XSETCODING_SYSTEM (coding_system, str->codesys);
2622 return coding_system;
2626 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2628 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2629 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2631 reset_encoding_stream (str);
2635 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2638 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2639 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2643 str->runoff = Dynarr_new (unsigned_char);
2644 str->other_end = stream;
2645 set_encoding_stream_coding_system (lstr, codesys);
2646 XSETLSTREAM (obj, lstr);
2651 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2653 return make_encoding_stream_1 (stream, codesys, "r");
2657 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2659 return make_encoding_stream_1 (stream, codesys, "w");
2662 /* Convert N bytes of internally-formatted data stored in SRC to an
2663 external format, according to the encoding stream ENCODING.
2664 Store the encoded data into DST. */
2667 mule_encode (Lstream *encoding, CONST unsigned char *src,
2668 unsigned_char_dynarr *dst, unsigned int n)
2670 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2672 switch (CODING_SYSTEM_TYPE (str->codesys))
2675 case CODESYS_INTERNAL:
2676 Dynarr_add_many (dst, src, n);
2679 case CODESYS_AUTODETECT:
2680 /* If we got this far and still haven't decided on the coding
2681 system, then do no conversion. */
2682 case CODESYS_NO_CONVERSION:
2683 encode_coding_no_conversion (encoding, src, dst, n);
2687 encode_coding_big5 (encoding, src, dst, n);
2690 str->ccl.last_block = str->flags & CODING_STATE_END;
2691 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2695 text_encode_generic (encoding, src, dst, n);
2699 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2700 Encode the text between START and END using CODING-SYSTEM.
2701 This will, for example, convert Japanese characters into stuff such as
2702 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2703 text. BUFFER defaults to the current buffer if unspecified.
2705 (start, end, coding_system, buffer))
2708 struct buffer *buf = decode_buffer (buffer, 0);
2709 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2710 Lstream *istr, *ostr;
2711 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2713 get_buffer_range_char (buf, start, end, &b, &e, 0);
2715 barf_if_buffer_read_only (buf, b, e);
2717 coding_system = Fget_coding_system (coding_system);
2718 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2719 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2720 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2721 Fget_coding_system (Qbinary));
2722 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2724 istr = XLSTREAM (instream);
2725 ostr = XLSTREAM (outstream);
2726 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2727 /* The chain of streams looks like this:
2729 [BUFFER] <----- send through
2730 ------> [ENCODE AS SPECIFIED]
2731 ------> [DECODE AS BINARY]
2736 char tempbuf[1024]; /* some random amount */
2737 Bufpos newpos, even_newer_pos;
2738 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2739 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2743 newpos = lisp_buffer_stream_startpos (istr);
2744 Lstream_write (ostr, tempbuf, size_in_bytes);
2745 even_newer_pos = lisp_buffer_stream_startpos (istr);
2746 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2752 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2753 Lstream_close (istr);
2754 Lstream_close (ostr);
2756 Lstream_delete (istr);
2757 Lstream_delete (ostr);
2758 Lstream_delete (XLSTREAM (de_outstream));
2759 Lstream_delete (XLSTREAM (lb_outstream));
2760 return make_int (retlen);
2767 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2768 unsigned_char_dynarr *dst, unsigned int n)
2771 unsigned char char_boundary;
2772 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2773 unsigned int flags = str->flags;
2774 Emchar ch = str->ch;
2776 char_boundary = str->iso2022.current_char_boundary;
2782 if (char_boundary == 0)
2810 (*str->encode_char) (str, c, dst, &flags);
2812 else if (char_boundary == 1)
2814 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2820 ch = (ch << 6) | (c & 0x3f);
2825 if ((char_boundary == 0) && (flags & CODING_STATE_END))
2827 (*str->finish) (str, dst, &flags);
2832 str->iso2022.current_char_boundary = char_boundary;
2836 /************************************************************************/
2837 /* Shift-JIS methods */
2838 /************************************************************************/
2840 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2841 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2842 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2843 encoded by "position-code + 0x80". A character of JISX0208
2844 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2845 position-codes are divided and shifted so that it fit in the range
2848 --- CODE RANGE of Shift-JIS ---
2849 (character set) (range)
2851 JISX0201-Kana 0xA0 .. 0xDF
2852 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2853 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2854 -------------------------------
2858 /* Is this the first byte of a Shift-JIS two-byte char? */
2860 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2861 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2863 /* Is this the second byte of a Shift-JIS two-byte char? */
2865 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2866 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2868 #define BYTE_SJIS_KATAKANA_P(c) \
2869 ((c) >= 0xA1 && (c) <= 0xDF)
2872 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2880 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2882 if (st->shift_jis.in_second_byte)
2884 st->shift_jis.in_second_byte = 0;
2888 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2889 st->shift_jis.in_second_byte = 1;
2891 return CODING_CATEGORY_SHIFT_JIS_MASK;
2894 /* Convert Shift-JIS data to internal format. */
2897 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2898 unsigned_char_dynarr *dst, unsigned int n)
2901 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2902 unsigned int flags = str->flags;
2903 unsigned int ch = str->ch;
2904 eol_type_t eol_type = str->eol_type;
2912 /* Previous character was first byte of Shift-JIS Kanji char. */
2913 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2915 unsigned char e1, e2;
2917 DECODE_SJIS (ch, c, e1, e2);
2919 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2923 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2924 Dynarr_add (dst, e1);
2925 Dynarr_add (dst, e2);
2930 DECODE_ADD_BINARY_CHAR (ch, dst);
2931 DECODE_ADD_BINARY_CHAR (c, dst);
2937 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2938 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2940 else if (BYTE_SJIS_KATAKANA_P (c))
2943 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2946 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2947 Dynarr_add (dst, c);
2952 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
2956 DECODE_ADD_BINARY_CHAR (c, dst);
2958 label_continue_loop:;
2961 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2967 /* Convert internal character representation to Shift_JIS. */
2970 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
2971 unsigned_char_dynarr *dst, unsigned int *flags)
2973 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2977 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2978 Dynarr_add (dst, '\r');
2979 if (eol_type != EOL_CR)
2980 Dynarr_add (dst, ch);
2982 else if (ch <= 0x7f)
2983 Dynarr_add (dst, ch);
2984 else if (ch == 0xA5)
2985 Dynarr_add (dst, 0x5C);
2986 else if (ch == 0X203E)
2987 Dynarr_add (dst, 0x7E);
2990 Lisp_Object charset;
2991 unsigned int c1, c2, s1, s2;
2993 BREAKUP_CHAR (ch, charset, c1, c2);
2994 if (EQ(charset, Vcharset_katakana_jisx0201))
2996 Dynarr_add (dst, c1 | 0x80);
2998 else if (EQ(charset, Vcharset_japanese_jisx0208))
3000 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3001 Dynarr_add (dst, s1);
3002 Dynarr_add (dst, s2);
3008 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3009 unsigned int *flags)
3013 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3014 Decode a JISX0208 character of Shift-JIS coding-system.
3015 CODE is the character code in Shift-JIS as a cons of type bytes.
3016 Return the corresponding character.
3020 unsigned char c1, c2, s1, s2;
3023 CHECK_INT (XCAR (code));
3024 CHECK_INT (XCDR (code));
3025 s1 = XINT (XCAR (code));
3026 s2 = XINT (XCDR (code));
3027 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3028 BYTE_SJIS_TWO_BYTE_2_P (s2))
3030 DECODE_SJIS (s1, s2, c1, c2);
3031 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3032 c1 & 0x7F, c2 & 0x7F));
3038 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3039 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3040 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3044 Lisp_Object charset;
3047 CHECK_CHAR_COERCE_INT (ch);
3048 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3049 if (EQ (charset, Vcharset_japanese_jisx0208))
3051 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3052 return Fcons (make_int (s1), make_int (s2));
3059 /************************************************************************/
3061 /************************************************************************/
3063 /* BIG5 is a coding system encoding two character sets: ASCII and
3064 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3065 character set and is encoded in two-byte.
3067 --- CODE RANGE of BIG5 ---
3068 (character set) (range)
3070 Big5 (1st byte) 0xA1 .. 0xFE
3071 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3072 --------------------------
3074 Since the number of characters in Big5 is larger than maximum
3075 characters in Emacs' charset (96x96), it can't be handled as one
3076 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3077 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3078 contains frequently used characters and the latter contains less
3079 frequently used characters. */
3081 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3082 ((c) >= 0xA1 && (c) <= 0xFE)
3084 /* Is this the second byte of a Shift-JIS two-byte char? */
3086 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3087 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3089 /* Number of Big5 characters which have the same code in 1st byte. */
3091 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3093 /* Code conversion macros. These are macros because they are used in
3094 inner loops during code conversion.
3096 Note that temporary variables in macros introduce the classic
3097 dynamic-scoping problems with variable names. We use capital-
3098 lettered variables in the assumption that XEmacs does not use
3099 capital letters in variables except in a very formalized way
3102 /* Convert Big5 code (b1, b2) into its internal string representation
3105 /* There is a much simpler way to split the Big5 charset into two.
3106 For the moment I'm going to leave the algorithm as-is because it
3107 claims to separate out the most-used characters into a single
3108 charset, which perhaps will lead to optimizations in various
3111 The way the algorithm works is something like this:
3113 Big5 can be viewed as a 94x157 charset, where the row is
3114 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3115 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3116 the split between low and high column numbers is apparently
3117 meaningless; ascending rows produce less and less frequent chars.
3118 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3119 the first charset, and the upper half (0xC9 .. 0xFE) to the
3120 second. To do the conversion, we convert the character into
3121 a single number where 0 .. 156 is the first row, 157 .. 313
3122 is the second, etc. That way, the characters are ordered by
3123 decreasing frequency. Then we just chop the space in two
3124 and coerce the result into a 94x94 space.
3127 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3129 int B1 = b1, B2 = b2; \
3131 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3135 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3139 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3140 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3142 c1 = I / (0xFF - 0xA1) + 0xA1; \
3143 c2 = I % (0xFF - 0xA1) + 0xA1; \
3146 /* Convert the internal string representation of a Big5 character
3147 (lb, c1, c2) into Big5 code (b1, b2). */
3149 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3151 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3153 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3155 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3157 b1 = I / BIG5_SAME_ROW + 0xA1; \
3158 b2 = I % BIG5_SAME_ROW; \
3159 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3163 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3171 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3172 (c >= 0x80 && c <= 0xA0))
3174 if (st->big5.in_second_byte)
3176 st->big5.in_second_byte = 0;
3177 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3181 st->big5.in_second_byte = 1;
3183 return CODING_CATEGORY_BIG5_MASK;
3186 /* Convert Big5 data to internal format. */
3189 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3190 unsigned_char_dynarr *dst, unsigned int n)
3193 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3194 unsigned int flags = str->flags;
3195 unsigned int ch = str->ch;
3196 eol_type_t eol_type = str->eol_type;
3203 /* Previous character was first byte of Big5 char. */
3204 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3206 unsigned char b1, b2, b3;
3207 DECODE_BIG5 (ch, c, b1, b2, b3);
3208 Dynarr_add (dst, b1);
3209 Dynarr_add (dst, b2);
3210 Dynarr_add (dst, b3);
3214 DECODE_ADD_BINARY_CHAR (ch, dst);
3215 DECODE_ADD_BINARY_CHAR (c, dst);
3221 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3222 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3225 DECODE_ADD_BINARY_CHAR (c, dst);
3227 label_continue_loop:;
3230 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3236 /* Convert internally-formatted data to Big5. */
3239 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3240 unsigned_char_dynarr *dst, unsigned int n)
3244 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3245 unsigned int flags = str->flags;
3246 unsigned int ch = str->ch;
3247 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3254 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3255 Dynarr_add (dst, '\r');
3256 if (eol_type != EOL_CR)
3257 Dynarr_add (dst, '\n');
3259 else if (BYTE_ASCII_P (c))
3262 Dynarr_add (dst, c);
3264 else if (BUFBYTE_LEADING_BYTE_P (c))
3266 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3267 c == LEADING_BYTE_CHINESE_BIG5_2)
3269 /* A recognized leading byte. */
3271 continue; /* not done with this character. */
3273 /* otherwise just ignore this character. */
3275 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3276 ch == LEADING_BYTE_CHINESE_BIG5_2)
3278 /* Previous char was a recognized leading byte. */
3280 continue; /* not done with this character. */
3284 /* Encountering second byte of a Big5 character. */
3285 unsigned char b1, b2;
3287 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3288 Dynarr_add (dst, b1);
3289 Dynarr_add (dst, b2);
3301 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3302 Decode a Big5 character CODE of BIG5 coding-system.
3303 CODE is the character code in BIG5, a cons of two integers.
3304 Return the corresponding character.
3308 unsigned char c1, c2, b1, b2;
3311 CHECK_INT (XCAR (code));
3312 CHECK_INT (XCDR (code));
3313 b1 = XINT (XCAR (code));
3314 b2 = XINT (XCDR (code));
3315 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3316 BYTE_BIG5_TWO_BYTE_2_P (b2))
3318 Charset_ID leading_byte;
3319 Lisp_Object charset;
3320 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3321 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3322 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3328 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3329 Encode the Big5 character CH to BIG5 coding-system.
3330 Return the corresponding character code in Big5.
3334 Lisp_Object charset;
3337 CHECK_CHAR_COERCE_INT (ch);
3338 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3339 if (EQ (charset, Vcharset_chinese_big5_1) ||
3340 EQ (charset, Vcharset_chinese_big5_2))
3342 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3344 return Fcons (make_int (b1), make_int (b2));
3351 /************************************************************************/
3353 /************************************************************************/
3356 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3362 switch (st->ucs4.in_byte)
3371 st->ucs4.in_byte = 0;
3377 return CODING_CATEGORY_UCS4_MASK;
3381 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3382 unsigned_char_dynarr *dst, unsigned int n)
3384 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3385 unsigned int flags = str->flags;
3386 unsigned int ch = str->ch;
3387 unsigned char counter = str->counter;
3391 unsigned char c = *src++;
3399 DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3404 ch = ( ch << 8 ) | c;
3408 if (counter & CODING_STATE_END)
3409 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3413 str->counter = counter;
3417 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3418 unsigned_char_dynarr *dst, unsigned int *flags)
3420 Dynarr_add (dst, ch >> 24);
3421 Dynarr_add (dst, (ch >> 16) & 255);
3422 Dynarr_add (dst, (ch >> 8) & 255);
3423 Dynarr_add (dst, ch & 255);
3427 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3428 unsigned int *flags)
3433 /************************************************************************/
3435 /************************************************************************/
3438 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3443 unsigned char c = *src++;
3444 switch (st->utf8.in_byte)
3447 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3450 st->utf8.in_byte = 5;
3452 st->utf8.in_byte = 4;
3454 st->utf8.in_byte = 3;
3456 st->utf8.in_byte = 2;
3458 st->utf8.in_byte = 1;
3463 if ((c & 0xc0) != 0x80)
3469 return CODING_CATEGORY_UTF8_MASK;
3473 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3474 unsigned_char_dynarr *dst, unsigned int n)
3476 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3477 unsigned int flags = str->flags;
3478 unsigned int ch = str->ch;
3479 eol_type_t eol_type = str->eol_type;
3480 unsigned char counter = str->counter;
3484 unsigned char c = *src++;
3493 else if ( c >= 0xf8 )
3498 else if ( c >= 0xf0 )
3503 else if ( c >= 0xe0 )
3508 else if ( c >= 0xc0 )
3515 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3516 DECODE_ADD_UCS_CHAR (c, dst);
3520 ch = ( ch << 6 ) | ( c & 0x3f );
3521 DECODE_ADD_UCS_CHAR (ch, dst);
3526 ch = ( ch << 6 ) | ( c & 0x3f );
3529 label_continue_loop:;
3532 if (flags & CODING_STATE_END)
3533 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3537 str->counter = counter;
3541 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3542 unsigned_char_dynarr *dst, unsigned int *flags)
3544 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3548 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3549 Dynarr_add (dst, '\r');
3550 if (eol_type != EOL_CR)
3551 Dynarr_add (dst, ch);
3553 else if (ch <= 0x7f)
3555 Dynarr_add (dst, ch);
3557 else if (ch <= 0x7ff)
3559 Dynarr_add (dst, (ch >> 6) | 0xc0);
3560 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3562 else if (ch <= 0xffff)
3564 Dynarr_add (dst, (ch >> 12) | 0xe0);
3565 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3566 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3568 else if (ch <= 0x1fffff)
3570 Dynarr_add (dst, (ch >> 18) | 0xf0);
3571 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3572 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3573 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3575 else if (ch <= 0x3ffffff)
3577 Dynarr_add (dst, (ch >> 24) | 0xf8);
3578 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3579 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3580 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3581 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3585 Dynarr_add (dst, (ch >> 30) | 0xfc);
3586 Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3587 Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3588 Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3589 Dynarr_add (dst, ((ch >> 6) & 0x3f) | 0x80);
3590 Dynarr_add (dst, (ch & 0x3f) | 0x80);
3595 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3596 unsigned int *flags)
3601 /************************************************************************/
3602 /* ISO2022 methods */
3603 /************************************************************************/
3605 /* The following note describes the coding system ISO2022 briefly.
3606 Since the intention of this note is to help understand the
3607 functions in this file, some parts are NOT ACCURATE or OVERLY
3608 SIMPLIFIED. For thorough understanding, please refer to the
3609 original document of ISO2022.
3611 ISO2022 provides many mechanisms to encode several character sets
3612 in 7-bit and 8-bit environments. For 7-bit environments, all text
3613 is encoded using bytes less than 128. This may make the encoded
3614 text a little bit longer, but the text passes more easily through
3615 several gateways, some of which strip off MSB (Most Signigant Bit).
3617 There are two kinds of character sets: control character set and
3618 graphic character set. The former contains control characters such
3619 as `newline' and `escape' to provide control functions (control
3620 functions are also provided by escape sequences). The latter
3621 contains graphic characters such as 'A' and '-'. Emacs recognizes
3622 two control character sets and many graphic character sets.
3624 Graphic character sets are classified into one of the following
3625 four classes, according to the number of bytes (DIMENSION) and
3626 number of characters in one dimension (CHARS) of the set:
3627 - DIMENSION1_CHARS94
3628 - DIMENSION1_CHARS96
3629 - DIMENSION2_CHARS94
3630 - DIMENSION2_CHARS96
3632 In addition, each character set is assigned an identification tag,
3633 unique for each set, called "final character" (denoted as <F>
3634 hereafter). The <F> of each character set is decided by ECMA(*)
3635 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3636 (0x30..0x3F are for private use only).
3638 Note (*): ECMA = European Computer Manufacturers Association
3640 Here are examples of graphic character set [NAME(<F>)]:
3641 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3642 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3643 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3644 o DIMENSION2_CHARS96 -- none for the moment
3646 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3647 C0 [0x00..0x1F] -- control character plane 0
3648 GL [0x20..0x7F] -- graphic character plane 0
3649 C1 [0x80..0x9F] -- control character plane 1
3650 GR [0xA0..0xFF] -- graphic character plane 1
3652 A control character set is directly designated and invoked to C0 or
3653 C1 by an escape sequence. The most common case is that:
3654 - ISO646's control character set is designated/invoked to C0, and
3655 - ISO6429's control character set is designated/invoked to C1,
3656 and usually these designations/invocations are omitted in encoded
3657 text. In a 7-bit environment, only C0 can be used, and a control
3658 character for C1 is encoded by an appropriate escape sequence to
3659 fit into the environment. All control characters for C1 are
3660 defined to have corresponding escape sequences.
3662 A graphic character set is at first designated to one of four
3663 graphic registers (G0 through G3), then these graphic registers are
3664 invoked to GL or GR. These designations and invocations can be
3665 done independently. The most common case is that G0 is invoked to
3666 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3667 these invocations and designations are omitted in encoded text.
3668 In a 7-bit environment, only GL can be used.
3670 When a graphic character set of CHARS94 is invoked to GL, codes
3671 0x20 and 0x7F of the GL area work as control characters SPACE and
3672 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3675 There are two ways of invocation: locking-shift and single-shift.
3676 With locking-shift, the invocation lasts until the next different
3677 invocation, whereas with single-shift, the invocation affects the
3678 following character only and doesn't affect the locking-shift
3679 state. Invocations are done by the following control characters or
3682 ----------------------------------------------------------------------
3683 abbrev function cntrl escape seq description
3684 ----------------------------------------------------------------------
3685 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3686 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3687 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3688 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3689 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3690 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3691 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3692 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3693 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3694 ----------------------------------------------------------------------
3695 (*) These are not used by any known coding system.
3697 Control characters for these functions are defined by macros
3698 ISO_CODE_XXX in `coding.h'.
3700 Designations are done by the following escape sequences:
3701 ----------------------------------------------------------------------
3702 escape sequence description
3703 ----------------------------------------------------------------------
3704 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3705 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3706 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3707 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3708 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3709 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3710 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3711 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3712 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3713 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3714 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3715 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3716 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3717 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3718 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3719 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3720 ----------------------------------------------------------------------
3722 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3723 of dimension 1, chars 94, and final character <F>, etc...
3725 Note (*): Although these designations are not allowed in ISO2022,
3726 Emacs accepts them on decoding, and produces them on encoding
3727 CHARS96 character sets in a coding system which is characterized as
3728 7-bit environment, non-locking-shift, and non-single-shift.
3730 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3731 '(' can be omitted. We refer to this as "short-form" hereafter.
3733 Now you may notice that there are a lot of ways for encoding the
3734 same multilingual text in ISO2022. Actually, there exist many
3735 coding systems such as Compound Text (used in X11's inter client
3736 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3737 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3738 localized platforms), and all of these are variants of ISO2022.
3740 In addition to the above, Emacs handles two more kinds of escape
3741 sequences: ISO6429's direction specification and Emacs' private
3742 sequence for specifying character composition.
3744 ISO6429's direction specification takes the following form:
3745 o CSI ']' -- end of the current direction
3746 o CSI '0' ']' -- end of the current direction
3747 o CSI '1' ']' -- start of left-to-right text
3748 o CSI '2' ']' -- start of right-to-left text
3749 The control character CSI (0x9B: control sequence introducer) is
3750 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3752 Character composition specification takes the following form:
3753 o ESC '0' -- start character composition
3754 o ESC '1' -- end character composition
3755 Since these are not standard escape sequences of any ISO standard,
3756 their use with these meanings is restricted to Emacs only. */
3759 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3763 for (i = 0; i < 4; i++)
3765 if (!NILP (coding_system))
3767 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3769 iso->charset[i] = Qt;
3770 iso->invalid_designated[i] = 0;
3772 iso->esc = ISO_ESC_NOTHING;
3773 iso->esc_bytes_index = 0;
3774 iso->register_left = 0;
3775 iso->register_right = 1;
3776 iso->switched_dir_and_no_valid_charset_yet = 0;
3777 iso->invalid_switch_dir = 0;
3778 iso->output_direction_sequence = 0;
3779 iso->output_literally = 0;
3780 #ifdef ENABLE_COMPOSITE_CHARS
3781 if (iso->composite_chars)
3782 Dynarr_reset (iso->composite_chars);
3787 fit_to_be_escape_quoted (unsigned char c)
3804 /* Parse one byte of an ISO2022 escape sequence.
3805 If the result is an invalid escape sequence, return 0 and
3806 do not change anything in STR. Otherwise, if the result is
3807 an incomplete escape sequence, update ISO2022.ESC and
3808 ISO2022.ESC_BYTES and return -1. Otherwise, update
3809 all the state variables (but not ISO2022.ESC_BYTES) and
3812 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3813 or invocation of an invalid character set and treat that as
3814 an unrecognized escape sequence. */
3817 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3818 unsigned char c, unsigned int *flags,
3819 int check_invalid_charsets)
3821 /* (1) If we're at the end of a designation sequence, CS is the
3822 charset being designated and REG is the register to designate
3825 (2) If we're at the end of a locking-shift sequence, REG is
3826 the register to invoke and HALF (0 == left, 1 == right) is
3827 the half to invoke it into.
3829 (3) If we're at the end of a single-shift sequence, REG is
3830 the register to invoke. */
3831 Lisp_Object cs = Qnil;
3834 /* NOTE: This code does goto's all over the fucking place.
3835 The reason for this is that we're basically implementing
3836 a state machine here, and hierarchical languages like C
3837 don't really provide a clean way of doing this. */
3839 if (! (*flags & CODING_STATE_ESCAPE))
3840 /* At beginning of escape sequence; we need to reset our
3841 escape-state variables. */
3842 iso->esc = ISO_ESC_NOTHING;
3844 iso->output_literally = 0;
3845 iso->output_direction_sequence = 0;
3849 case ISO_ESC_NOTHING:
3850 iso->esc_bytes_index = 0;
3853 case ISO_CODE_ESC: /* Start escape sequence */
3854 *flags |= CODING_STATE_ESCAPE;
3858 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3859 *flags |= CODING_STATE_ESCAPE;
3860 iso->esc = ISO_ESC_5_11;
3863 case ISO_CODE_SO: /* locking shift 1 */
3866 case ISO_CODE_SI: /* locking shift 0 */
3870 case ISO_CODE_SS2: /* single shift */
3873 case ISO_CODE_SS3: /* single shift */
3877 default: /* Other control characters */
3884 /**** single shift ****/
3886 case 'N': /* single shift 2 */
3889 case 'O': /* single shift 3 */
3893 /**** locking shift ****/
3895 case '~': /* locking shift 1 right */
3898 case 'n': /* locking shift 2 */
3901 case '}': /* locking shift 2 right */
3904 case 'o': /* locking shift 3 */
3907 case '|': /* locking shift 3 right */
3911 #ifdef ENABLE_COMPOSITE_CHARS
3912 /**** composite ****/
3915 iso->esc = ISO_ESC_START_COMPOSITE;
3916 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3917 CODING_STATE_COMPOSITE;
3921 iso->esc = ISO_ESC_END_COMPOSITE;
3922 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3923 ~CODING_STATE_COMPOSITE;
3925 #endif /* ENABLE_COMPOSITE_CHARS */
3927 /**** directionality ****/
3930 iso->esc = ISO_ESC_5_11;
3933 /**** designation ****/
3935 case '$': /* multibyte charset prefix */
3936 iso->esc = ISO_ESC_2_4;
3940 if (0x28 <= c && c <= 0x2F)
3942 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3946 /* This function is called with CODESYS equal to nil when
3947 doing coding-system detection. */
3949 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3950 && fit_to_be_escape_quoted (c))
3952 iso->esc = ISO_ESC_LITERAL;
3953 *flags &= CODING_STATE_ISO2022_LOCK;
3963 /**** directionality ****/
3965 case ISO_ESC_5_11: /* ISO6429 direction control */
3968 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3969 goto directionality;
3971 if (c == '0') iso->esc = ISO_ESC_5_11_0;
3972 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3973 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3977 case ISO_ESC_5_11_0:
3980 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3981 goto directionality;
3985 case ISO_ESC_5_11_1:
3988 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3989 goto directionality;
3993 case ISO_ESC_5_11_2:
3996 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
3997 goto directionality;
4002 iso->esc = ISO_ESC_DIRECTIONALITY;
4003 /* Various junk here to attempt to preserve the direction sequences
4004 literally in the text if they would otherwise be swallowed due
4005 to invalid designations that don't show up as actual charset
4006 changes in the text. */
4007 if (iso->invalid_switch_dir)
4009 /* We already inserted a direction switch literally into the
4010 text. We assume (#### this may not be right) that the
4011 next direction switch is the one going the other way,
4012 and we need to output that literally as well. */
4013 iso->output_literally = 1;
4014 iso->invalid_switch_dir = 0;
4020 /* If we are in the thrall of an invalid designation,
4021 then stick the directionality sequence literally into the
4022 output stream so it ends up in the original text again. */
4023 for (jj = 0; jj < 4; jj++)
4024 if (iso->invalid_designated[jj])
4028 iso->output_literally = 1;
4029 iso->invalid_switch_dir = 1;
4032 /* Indicate that we haven't yet seen a valid designation,
4033 so that if a switch-dir is directly followed by an
4034 invalid designation, both get inserted literally. */
4035 iso->switched_dir_and_no_valid_charset_yet = 1;
4040 /**** designation ****/
4043 if (0x28 <= c && c <= 0x2F)
4045 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4048 if (0x40 <= c && c <= 0x42)
4050 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4051 *flags & CODING_STATE_R2L ?
4052 CHARSET_RIGHT_TO_LEFT :
4053 CHARSET_LEFT_TO_RIGHT);
4063 if (c < '0' || c > '~')
4064 return 0; /* bad final byte */
4066 if (iso->esc >= ISO_ESC_2_8 &&
4067 iso->esc <= ISO_ESC_2_15)
4069 type = ((iso->esc >= ISO_ESC_2_12) ?
4070 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4071 reg = (iso->esc - ISO_ESC_2_8) & 3;
4073 else if (iso->esc >= ISO_ESC_2_4_8 &&
4074 iso->esc <= ISO_ESC_2_4_15)
4076 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4077 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4078 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4082 /* Can this ever be reached? -slb */
4086 cs = CHARSET_BY_ATTRIBUTES (type, c,
4087 *flags & CODING_STATE_R2L ?
4088 CHARSET_RIGHT_TO_LEFT :
4089 CHARSET_LEFT_TO_RIGHT);
4095 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4099 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4100 /* can't invoke something that ain't there. */
4102 iso->esc = ISO_ESC_SINGLE_SHIFT;
4103 *flags &= CODING_STATE_ISO2022_LOCK;
4105 *flags |= CODING_STATE_SS2;
4107 *flags |= CODING_STATE_SS3;
4111 if (check_invalid_charsets &&
4112 !CHARSETP (iso->charset[reg]))
4113 /* can't invoke something that ain't there. */
4116 iso->register_right = reg;
4118 iso->register_left = reg;
4119 *flags &= CODING_STATE_ISO2022_LOCK;
4120 iso->esc = ISO_ESC_LOCKING_SHIFT;
4124 if (NILP (cs) && check_invalid_charsets)
4126 iso->invalid_designated[reg] = 1;
4127 iso->charset[reg] = Vcharset_ascii;
4128 iso->esc = ISO_ESC_DESIGNATE;
4129 *flags &= CODING_STATE_ISO2022_LOCK;
4130 iso->output_literally = 1;
4131 if (iso->switched_dir_and_no_valid_charset_yet)
4133 /* We encountered a switch-direction followed by an
4134 invalid designation. Ensure that the switch-direction
4135 gets outputted; otherwise it will probably get eaten
4136 when the text is written out again. */
4137 iso->switched_dir_and_no_valid_charset_yet = 0;
4138 iso->output_direction_sequence = 1;
4139 /* And make sure that the switch-dir going the other
4140 way gets outputted, as well. */
4141 iso->invalid_switch_dir = 1;
4145 /* This function is called with CODESYS equal to nil when
4146 doing coding-system detection. */
4147 if (!NILP (codesys))
4149 charset_conversion_spec_dynarr *dyn =
4150 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4156 for (i = 0; i < Dynarr_length (dyn); i++)
4158 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4159 if (EQ (cs, spec->from_charset))
4160 cs = spec->to_charset;
4165 iso->charset[reg] = cs;
4166 iso->esc = ISO_ESC_DESIGNATE;
4167 *flags &= CODING_STATE_ISO2022_LOCK;
4168 if (iso->invalid_designated[reg])
4170 iso->invalid_designated[reg] = 0;
4171 iso->output_literally = 1;
4173 if (iso->switched_dir_and_no_valid_charset_yet)
4174 iso->switched_dir_and_no_valid_charset_yet = 0;
4179 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4184 /* #### There are serious deficiencies in the recognition mechanism
4185 here. This needs to be much smarter if it's going to cut it.
4186 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4187 it should be detected as Latin-1.
4188 All the ISO2022 stuff in this file should be synced up with the
4189 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4190 Perhaps we should wait till R2L works in FSF Emacs? */
4192 if (!st->iso2022.initted)
4194 reset_iso2022 (Qnil, &st->iso2022.iso);
4195 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4196 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4197 CODING_CATEGORY_ISO_8_1_MASK |
4198 CODING_CATEGORY_ISO_8_2_MASK |
4199 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4200 st->iso2022.flags = 0;
4201 st->iso2022.high_byte_count = 0;
4202 st->iso2022.saw_single_shift = 0;
4203 st->iso2022.initted = 1;
4206 mask = st->iso2022.mask;
4213 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4214 st->iso2022.high_byte_count++;
4218 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4220 if (st->iso2022.high_byte_count & 1)
4221 /* odd number of high bytes; assume not iso-8-2 */
4222 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4224 st->iso2022.high_byte_count = 0;
4225 st->iso2022.saw_single_shift = 0;
4227 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4229 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4230 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4231 { /* control chars */
4234 /* Allow and ignore control characters that you might
4235 reasonably see in a text file */
4240 case 8: /* backspace */
4241 case 11: /* vertical tab */
4242 case 12: /* form feed */
4243 case 26: /* MS-DOS C-z junk */
4244 case 31: /* '^_' -- for info */
4245 goto label_continue_loop;
4252 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4255 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4256 &st->iso2022.flags, 0))
4258 switch (st->iso2022.iso.esc)
4260 case ISO_ESC_DESIGNATE:
4261 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4262 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4264 case ISO_ESC_LOCKING_SHIFT:
4265 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4266 goto ran_out_of_chars;
4267 case ISO_ESC_SINGLE_SHIFT:
4268 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4269 st->iso2022.saw_single_shift = 1;
4278 goto ran_out_of_chars;
4281 label_continue_loop:;
4290 postprocess_iso2022_mask (int mask)
4292 /* #### kind of cheesy */
4293 /* If seven-bit ISO is allowed, then assume that the encoding is
4294 entirely seven-bit and turn off the eight-bit ones. */
4295 if (mask & CODING_CATEGORY_ISO_7_MASK)
4296 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4297 CODING_CATEGORY_ISO_8_1_MASK |
4298 CODING_CATEGORY_ISO_8_2_MASK);
4302 /* If FLAGS is a null pointer or specifies right-to-left motion,
4303 output a switch-dir-to-left-to-right sequence to DST.
4304 Also update FLAGS if it is not a null pointer.
4305 If INTERNAL_P is set, we are outputting in internal format and
4306 need to handle the CSI differently. */
4309 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4310 unsigned_char_dynarr *dst,
4311 unsigned int *flags,
4314 if (!flags || (*flags & CODING_STATE_R2L))
4316 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4318 Dynarr_add (dst, ISO_CODE_ESC);
4319 Dynarr_add (dst, '[');
4321 else if (internal_p)
4322 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4324 Dynarr_add (dst, ISO_CODE_CSI);
4325 Dynarr_add (dst, '0');
4326 Dynarr_add (dst, ']');
4328 *flags &= ~CODING_STATE_R2L;
4332 /* If FLAGS is a null pointer or specifies a direction different from
4333 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4334 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4335 sequence to DST. Also update FLAGS if it is not a null pointer.
4336 If INTERNAL_P is set, we are outputting in internal format and
4337 need to handle the CSI differently. */
4340 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4341 unsigned_char_dynarr *dst, unsigned int *flags,
4344 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4345 direction == CHARSET_LEFT_TO_RIGHT)
4346 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4347 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4348 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4349 direction == CHARSET_RIGHT_TO_LEFT)
4351 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4353 Dynarr_add (dst, ISO_CODE_ESC);
4354 Dynarr_add (dst, '[');
4356 else if (internal_p)
4357 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4359 Dynarr_add (dst, ISO_CODE_CSI);
4360 Dynarr_add (dst, '2');
4361 Dynarr_add (dst, ']');
4363 *flags |= CODING_STATE_R2L;
4367 /* Convert ISO2022-format data to internal format. */
4370 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4371 unsigned_char_dynarr *dst, unsigned int n)
4373 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4374 unsigned int flags = str->flags;
4375 unsigned int ch = str->ch;
4376 eol_type_t eol_type = str->eol_type;
4377 #ifdef ENABLE_COMPOSITE_CHARS
4378 unsigned_char_dynarr *real_dst = dst;
4380 Lisp_Object coding_system;
4382 XSETCODING_SYSTEM (coding_system, str->codesys);
4384 #ifdef ENABLE_COMPOSITE_CHARS
4385 if (flags & CODING_STATE_COMPOSITE)
4386 dst = str->iso2022.composite_chars;
4387 #endif /* ENABLE_COMPOSITE_CHARS */
4391 unsigned char c = *src++;
4392 if (flags & CODING_STATE_ESCAPE)
4393 { /* Within ESC sequence */
4394 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4399 switch (str->iso2022.esc)
4401 #ifdef ENABLE_COMPOSITE_CHARS
4402 case ISO_ESC_START_COMPOSITE:
4403 if (str->iso2022.composite_chars)
4404 Dynarr_reset (str->iso2022.composite_chars);
4406 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4407 dst = str->iso2022.composite_chars;
4409 case ISO_ESC_END_COMPOSITE:
4411 Bufbyte comstr[MAX_EMCHAR_LEN];
4413 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4414 Dynarr_length (dst));
4416 len = set_charptr_emchar (comstr, emch);
4417 Dynarr_add_many (dst, comstr, len);
4420 #endif /* ENABLE_COMPOSITE_CHARS */
4422 case ISO_ESC_LITERAL:
4423 DECODE_ADD_BINARY_CHAR (c, dst);
4427 /* Everything else handled already */
4432 /* Attempted error recovery. */
4433 if (str->iso2022.output_direction_sequence)
4434 ensure_correct_direction (flags & CODING_STATE_R2L ?
4435 CHARSET_RIGHT_TO_LEFT :
4436 CHARSET_LEFT_TO_RIGHT,
4437 str->codesys, dst, 0, 1);
4438 /* More error recovery. */
4439 if (!retval || str->iso2022.output_literally)
4441 /* Output the (possibly invalid) sequence */
4443 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4444 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4445 flags &= CODING_STATE_ISO2022_LOCK;
4447 n++, src--;/* Repeat the loop with the same character. */
4450 /* No sense in reprocessing the final byte of the
4451 escape sequence; it could mess things up anyway.
4453 DECODE_ADD_BINARY_CHAR (c, dst);
4458 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4459 { /* Control characters */
4461 /***** Error-handling *****/
4463 /* If we were in the middle of a character, dump out the
4464 partial character. */
4465 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4467 /* If we just saw a single-shift character, dump it out.
4468 This may dump out the wrong sort of single-shift character,
4469 but least it will give an indication that something went
4471 if (flags & CODING_STATE_SS2)
4473 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4474 flags &= ~CODING_STATE_SS2;
4476 if (flags & CODING_STATE_SS3)
4478 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4479 flags &= ~CODING_STATE_SS3;
4482 /***** Now handle the control characters. *****/
4485 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4487 flags &= CODING_STATE_ISO2022_LOCK;
4489 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4490 DECODE_ADD_BINARY_CHAR (c, dst);
4493 { /* Graphic characters */
4494 Lisp_Object charset;
4500 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4502 /* Now determine the charset. */
4503 reg = ((flags & CODING_STATE_SS2) ? 2
4504 : (flags & CODING_STATE_SS3) ? 3
4505 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4506 : str->iso2022.register_left);
4507 charset = str->iso2022.charset[reg];
4509 /* Error checking: */
4510 if (! CHARSETP (charset)
4511 || str->iso2022.invalid_designated[reg]
4512 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4513 && XCHARSET_CHARS (charset) == 94))
4514 /* Mrmph. We are trying to invoke a register that has no
4515 or an invalid charset in it, or trying to add a character
4516 outside the range of the charset. Insert that char literally
4517 to preserve it for the output. */
4519 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4520 DECODE_ADD_BINARY_CHAR (c, dst);
4525 /* Things are probably hunky-dorey. */
4527 /* Fetch reverse charset, maybe. */
4528 if (((flags & CODING_STATE_R2L) &&
4529 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4531 (!(flags & CODING_STATE_R2L) &&
4532 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4534 Lisp_Object new_charset =
4535 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4536 if (!NILP (new_charset))
4537 charset = new_charset;
4541 if (XCHARSET_DIMENSION (charset) == 1)
4543 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4545 (MAKE_CHAR (charset, c & 0x7F, 0), dst);
4550 (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
4556 lb = XCHARSET_LEADING_BYTE (charset);
4557 switch (XCHARSET_REP_BYTES (charset))
4560 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4561 Dynarr_add (dst, c & 0x7F);
4564 case 2: /* one-byte official */
4565 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4566 Dynarr_add (dst, lb);
4567 Dynarr_add (dst, c | 0x80);
4570 case 3: /* one-byte private or two-byte official */
4571 if (XCHARSET_PRIVATE_P (charset))
4573 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4574 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4575 Dynarr_add (dst, lb);
4576 Dynarr_add (dst, c | 0x80);
4582 Dynarr_add (dst, lb);
4583 Dynarr_add (dst, ch | 0x80);
4584 Dynarr_add (dst, c | 0x80);
4592 default: /* two-byte private */
4595 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4596 Dynarr_add (dst, lb);
4597 Dynarr_add (dst, ch | 0x80);
4598 Dynarr_add (dst, c | 0x80);
4608 flags &= CODING_STATE_ISO2022_LOCK;
4611 label_continue_loop:;
4614 if (flags & CODING_STATE_END)
4615 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4622 /***** ISO2022 encoder *****/
4624 /* Designate CHARSET into register REG. */
4627 iso2022_designate (Lisp_Object charset, unsigned char reg,
4628 struct encoding_stream *str, unsigned_char_dynarr *dst)
4630 static CONST char inter94[] = "()*+";
4631 static CONST char inter96[] = ",-./";
4633 unsigned char final;
4634 Lisp_Object old_charset = str->iso2022.charset[reg];
4636 str->iso2022.charset[reg] = charset;
4637 if (!CHARSETP (charset))
4638 /* charset might be an initial nil or t. */
4640 type = XCHARSET_TYPE (charset);
4641 final = XCHARSET_FINAL (charset);
4642 if (!str->iso2022.force_charset_on_output[reg] &&
4643 CHARSETP (old_charset) &&
4644 XCHARSET_TYPE (old_charset) == type &&
4645 XCHARSET_FINAL (old_charset) == final)
4648 str->iso2022.force_charset_on_output[reg] = 0;
4651 charset_conversion_spec_dynarr *dyn =
4652 str->codesys->iso2022.output_conv;
4658 for (i = 0; i < Dynarr_length (dyn); i++)
4660 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4661 if (EQ (charset, spec->from_charset))
4662 charset = spec->to_charset;
4667 Dynarr_add (dst, ISO_CODE_ESC);
4670 case CHARSET_TYPE_94:
4671 Dynarr_add (dst, inter94[reg]);
4673 case CHARSET_TYPE_96:
4674 Dynarr_add (dst, inter96[reg]);
4676 case CHARSET_TYPE_94X94:
4677 Dynarr_add (dst, '$');
4679 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4682 Dynarr_add (dst, inter94[reg]);
4684 case CHARSET_TYPE_96X96:
4685 Dynarr_add (dst, '$');
4686 Dynarr_add (dst, inter96[reg]);
4689 Dynarr_add (dst, final);
4693 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4695 if (str->iso2022.register_left != 0)
4697 Dynarr_add (dst, ISO_CODE_SI);
4698 str->iso2022.register_left = 0;
4703 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4705 if (str->iso2022.register_left != 1)
4707 Dynarr_add (dst, ISO_CODE_SO);
4708 str->iso2022.register_left = 1;
4713 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4714 unsigned_char_dynarr *dst, unsigned int *flags)
4716 unsigned char charmask;
4717 Lisp_Coding_System* codesys = str->codesys;
4718 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4720 Lisp_Object charset = str->iso2022.current_charset;
4721 int half = str->iso2022.current_half;
4722 unsigned int byte1, byte2;
4726 restore_left_to_right_direction (codesys, dst, flags, 0);
4728 /* Make sure G0 contains ASCII */
4729 if ((ch > ' ' && ch < ISO_CODE_DEL)
4730 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4732 ensure_normal_shift (str, dst);
4733 iso2022_designate (Vcharset_ascii, 0, str, dst);
4736 /* If necessary, restore everything to the default state
4738 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4740 restore_left_to_right_direction (codesys, dst, flags, 0);
4742 ensure_normal_shift (str, dst);
4744 for (i = 0; i < 4; i++)
4746 Lisp_Object initial_charset =
4747 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4748 iso2022_designate (initial_charset, i, str, dst);
4753 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4754 Dynarr_add (dst, '\r');
4755 if (eol_type != EOL_CR)
4756 Dynarr_add (dst, ch);
4760 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4761 && fit_to_be_escape_quoted (ch))
4762 Dynarr_add (dst, ISO_CODE_ESC);
4763 Dynarr_add (dst, ch);
4766 else if ( (0x80 <= ch) && (ch <= 0x9f) )
4768 charmask = (half == 0 ? 0x00 : 0x80);
4770 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4771 && fit_to_be_escape_quoted (ch))
4772 Dynarr_add (dst, ISO_CODE_ESC);
4773 /* you asked for it ... */
4774 Dynarr_add (dst, ch);
4780 BREAKUP_CHAR (ch, charset, byte1, byte2);
4781 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4782 codesys, dst, flags, 0);
4784 /* Now determine which register to use. */
4786 for (i = 0; i < 4; i++)
4788 if (EQ (charset, str->iso2022.charset[i]) ||
4790 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4799 if (XCHARSET_GRAPHIC (charset) != 0)
4801 if (!NILP (str->iso2022.charset[1]) &&
4802 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
4803 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4805 else if (!NILP (str->iso2022.charset[2]))
4807 else if (!NILP (str->iso2022.charset[3]))
4816 iso2022_designate (charset, reg, str, dst);
4818 /* Now invoke that register. */
4822 ensure_normal_shift (str, dst);
4826 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4828 ensure_shift_out (str, dst);
4835 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4837 Dynarr_add (dst, ISO_CODE_ESC);
4838 Dynarr_add (dst, 'N');
4843 Dynarr_add (dst, ISO_CODE_SS2);
4848 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4850 Dynarr_add (dst, ISO_CODE_ESC);
4851 Dynarr_add (dst, 'O');
4856 Dynarr_add (dst, ISO_CODE_SS3);
4864 charmask = (half == 0 ? 0x00 : 0x80);
4866 switch (XCHARSET_DIMENSION (charset))
4869 Dynarr_add (dst, byte1 | charmask);
4872 Dynarr_add (dst, byte1 | charmask);
4873 Dynarr_add (dst, byte2 | charmask);
4879 str->iso2022.current_charset = charset;
4880 str->iso2022.current_half = half;
4884 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4885 unsigned int *flags)
4887 Lisp_Coding_System* codesys = str->codesys;
4890 restore_left_to_right_direction (codesys, dst, flags, 0);
4891 ensure_normal_shift (str, dst);
4892 for (i = 0; i < 4; i++)
4894 Lisp_Object initial_charset
4895 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4896 iso2022_designate (initial_charset, i, str, dst);
4901 /************************************************************************/
4902 /* No-conversion methods */
4903 /************************************************************************/
4905 /* This is used when reading in "binary" files -- i.e. files that may
4906 contain all 256 possible byte values and that are not to be
4907 interpreted as being in any particular decoding. */
4909 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4910 unsigned_char_dynarr *dst, unsigned int n)
4913 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4914 unsigned int flags = str->flags;
4915 unsigned int ch = str->ch;
4916 eol_type_t eol_type = str->eol_type;
4922 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4923 DECODE_ADD_BINARY_CHAR (c, dst);
4924 label_continue_loop:;
4927 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4934 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4935 unsigned_char_dynarr *dst, unsigned int n)
4938 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4939 unsigned int flags = str->flags;
4940 unsigned int ch = str->ch;
4941 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4943 unsigned char char_boundary = str->iso2022.current_char_boundary;
4950 switch (char_boundary)
4958 else if ( c >= 0xf8 )
4963 else if ( c >= 0xf0 )
4968 else if ( c >= 0xe0 )
4973 else if ( c >= 0xc0 )
4984 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4985 Dynarr_add (dst, '\r');
4986 if (eol_type != EOL_CR)
4987 Dynarr_add (dst, c);
4990 Dynarr_add (dst, c);
4995 ch = ( ch << 6 ) | ( c & 0x3f );
4996 switch ( str->codesys->fixed.size )
4999 Dynarr_add (dst, ch & 0xff);
5002 Dynarr_add (dst, (ch >> 8) & 0xff);
5003 Dynarr_add (dst, ch & 0xff);
5006 Dynarr_add (dst, (ch >> 16) & 0xff);
5007 Dynarr_add (dst, (ch >> 8) & 0xff);
5008 Dynarr_add (dst, ch & 0xff);
5011 Dynarr_add (dst, (ch >> 24) & 0xff);
5012 Dynarr_add (dst, (ch >> 16) & 0xff);
5013 Dynarr_add (dst, (ch >> 8) & 0xff);
5014 Dynarr_add (dst, ch & 0xff);
5017 fprintf(stderr, "It seems %d bytes stream.\n",
5018 str->codesys->fixed.size);
5024 ch = ( ch << 6 ) | ( c & 0x3f );
5027 #else /* not UTF2000 */
5030 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5031 Dynarr_add (dst, '\r');
5032 if (eol_type != EOL_CR)
5033 Dynarr_add (dst, '\n');
5036 else if (BYTE_ASCII_P (c))
5039 Dynarr_add (dst, c);
5041 else if (BUFBYTE_LEADING_BYTE_P (c))
5044 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5045 c == LEADING_BYTE_CONTROL_1)
5048 Dynarr_add (dst, '~'); /* untranslatable character */
5052 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5053 Dynarr_add (dst, c);
5054 else if (ch == LEADING_BYTE_CONTROL_1)
5057 Dynarr_add (dst, c - 0x20);
5059 /* else it should be the second or third byte of an
5060 untranslatable character, so ignore it */
5063 #endif /* not UTF2000 */
5069 str->iso2022.current_char_boundary = char_boundary;
5074 /************************************************************************/
5075 /* Simple internal/external functions */
5076 /************************************************************************/
5078 static Extbyte_dynarr *conversion_out_dynarr;
5079 static Bufbyte_dynarr *conversion_in_dynarr;
5081 /* Determine coding system from coding format */
5083 /* #### not correct for all values of `fmt'! */
5085 external_data_format_to_coding_system (enum external_data_format fmt)
5089 case FORMAT_FILENAME:
5090 case FORMAT_TERMINAL:
5091 if (EQ (Vfile_name_coding_system, Qnil) ||
5092 EQ (Vfile_name_coding_system, Qbinary))
5095 return Fget_coding_system (Vfile_name_coding_system);
5098 return Fget_coding_system (Qctext);
5106 convert_to_external_format (CONST Bufbyte *ptr,
5109 enum external_data_format fmt)
5111 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5113 if (!conversion_out_dynarr)
5114 conversion_out_dynarr = Dynarr_new (Extbyte);
5116 Dynarr_reset (conversion_out_dynarr);
5118 if (NILP (coding_system))
5120 CONST Bufbyte *end = ptr + len;
5126 (*ptr < 0xc0) ? *ptr :
5127 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5130 (BYTE_ASCII_P (*ptr)) ? *ptr :
5131 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5132 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5135 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5139 #ifdef ERROR_CHECK_BUFPOS
5140 assert (ptr == end);
5145 Lisp_Object instream, outstream, da_outstream;
5146 Lstream *istr, *ostr;
5147 struct gcpro gcpro1, gcpro2, gcpro3;
5148 char tempbuf[1024]; /* some random amount */
5150 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5151 da_outstream = make_dynarr_output_stream
5152 ((unsigned_char_dynarr *) conversion_out_dynarr);
5154 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5155 istr = XLSTREAM (instream);
5156 ostr = XLSTREAM (outstream);
5157 GCPRO3 (instream, outstream, da_outstream);
5160 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5163 Lstream_write (ostr, tempbuf, size_in_bytes);
5165 Lstream_close (istr);
5166 Lstream_close (ostr);
5168 Lstream_delete (istr);
5169 Lstream_delete (ostr);
5170 Lstream_delete (XLSTREAM (da_outstream));
5173 *len_out = Dynarr_length (conversion_out_dynarr);
5174 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5175 return Dynarr_atp (conversion_out_dynarr, 0);
5179 convert_from_external_format (CONST Extbyte *ptr,
5182 enum external_data_format fmt)
5184 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5186 if (!conversion_in_dynarr)
5187 conversion_in_dynarr = Dynarr_new (Bufbyte);
5189 Dynarr_reset (conversion_in_dynarr);
5191 if (NILP (coding_system))
5193 CONST Extbyte *end = ptr + len;
5194 for (; ptr < end; ptr++)
5197 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5202 Lisp_Object instream, outstream, da_outstream;
5203 Lstream *istr, *ostr;
5204 struct gcpro gcpro1, gcpro2, gcpro3;
5205 char tempbuf[1024]; /* some random amount */
5207 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5208 da_outstream = make_dynarr_output_stream
5209 ((unsigned_char_dynarr *) conversion_in_dynarr);
5211 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5212 istr = XLSTREAM (instream);
5213 ostr = XLSTREAM (outstream);
5214 GCPRO3 (instream, outstream, da_outstream);
5217 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5220 Lstream_write (ostr, tempbuf, size_in_bytes);
5222 Lstream_close (istr);
5223 Lstream_close (ostr);
5225 Lstream_delete (istr);
5226 Lstream_delete (ostr);
5227 Lstream_delete (XLSTREAM (da_outstream));
5230 *len_out = Dynarr_length (conversion_in_dynarr);
5231 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5232 return Dynarr_atp (conversion_in_dynarr, 0);
5236 /************************************************************************/
5237 /* Initialization */
5238 /************************************************************************/
5241 syms_of_file_coding (void)
5243 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5244 deferror (&Qcoding_system_error, "coding-system-error",
5245 "Coding-system error", Qio_error);
5247 DEFSUBR (Fcoding_system_p);
5248 DEFSUBR (Ffind_coding_system);
5249 DEFSUBR (Fget_coding_system);
5250 DEFSUBR (Fcoding_system_list);
5251 DEFSUBR (Fcoding_system_name);
5252 DEFSUBR (Fmake_coding_system);
5253 DEFSUBR (Fcopy_coding_system);
5254 DEFSUBR (Fdefine_coding_system_alias);
5255 DEFSUBR (Fsubsidiary_coding_system);
5257 DEFSUBR (Fcoding_system_type);
5258 DEFSUBR (Fcoding_system_doc_string);
5260 DEFSUBR (Fcoding_system_charset);
5262 DEFSUBR (Fcoding_system_property);
5264 DEFSUBR (Fcoding_category_list);
5265 DEFSUBR (Fset_coding_priority_list);
5266 DEFSUBR (Fcoding_priority_list);
5267 DEFSUBR (Fset_coding_category_system);
5268 DEFSUBR (Fcoding_category_system);
5270 DEFSUBR (Fdetect_coding_region);
5271 DEFSUBR (Fdecode_coding_region);
5272 DEFSUBR (Fencode_coding_region);
5274 DEFSUBR (Fdecode_shift_jis_char);
5275 DEFSUBR (Fencode_shift_jis_char);
5276 DEFSUBR (Fdecode_big5_char);
5277 DEFSUBR (Fencode_big5_char);
5279 defsymbol (&Qcoding_system_p, "coding-system-p");
5280 defsymbol (&Qno_conversion, "no-conversion");
5281 defsymbol (&Qraw_text, "raw-text");
5283 defsymbol (&Qbig5, "big5");
5284 defsymbol (&Qshift_jis, "shift-jis");
5285 defsymbol (&Qucs4, "ucs-4");
5286 defsymbol (&Qutf8, "utf-8");
5287 defsymbol (&Qccl, "ccl");
5288 defsymbol (&Qiso2022, "iso2022");
5290 defsymbol (&Qmnemonic, "mnemonic");
5291 defsymbol (&Qeol_type, "eol-type");
5292 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5293 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5295 defsymbol (&Qcr, "cr");
5296 defsymbol (&Qlf, "lf");
5297 defsymbol (&Qcrlf, "crlf");
5298 defsymbol (&Qeol_cr, "eol-cr");
5299 defsymbol (&Qeol_lf, "eol-lf");
5300 defsymbol (&Qeol_crlf, "eol-crlf");
5302 defsymbol (&Qcharset_g0, "charset-g0");
5303 defsymbol (&Qcharset_g1, "charset-g1");
5304 defsymbol (&Qcharset_g2, "charset-g2");
5305 defsymbol (&Qcharset_g3, "charset-g3");
5306 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5307 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5308 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5309 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5310 defsymbol (&Qno_iso6429, "no-iso6429");
5311 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5312 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5314 defsymbol (&Qshort, "short");
5315 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5316 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5317 defsymbol (&Qseven, "seven");
5318 defsymbol (&Qlock_shift, "lock-shift");
5319 defsymbol (&Qescape_quoted, "escape-quoted");
5321 defsymbol (&Qencode, "encode");
5322 defsymbol (&Qdecode, "decode");
5325 defsymbol (&Qctext, "ctext");
5326 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5328 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5330 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5332 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5334 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5336 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5338 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5340 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5342 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5345 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5350 lstream_type_create_file_coding (void)
5352 LSTREAM_HAS_METHOD (decoding, reader);
5353 LSTREAM_HAS_METHOD (decoding, writer);
5354 LSTREAM_HAS_METHOD (decoding, rewinder);
5355 LSTREAM_HAS_METHOD (decoding, seekable_p);
5356 LSTREAM_HAS_METHOD (decoding, flusher);
5357 LSTREAM_HAS_METHOD (decoding, closer);
5358 LSTREAM_HAS_METHOD (decoding, marker);
5360 LSTREAM_HAS_METHOD (encoding, reader);
5361 LSTREAM_HAS_METHOD (encoding, writer);
5362 LSTREAM_HAS_METHOD (encoding, rewinder);
5363 LSTREAM_HAS_METHOD (encoding, seekable_p);
5364 LSTREAM_HAS_METHOD (encoding, flusher);
5365 LSTREAM_HAS_METHOD (encoding, closer);
5366 LSTREAM_HAS_METHOD (encoding, marker);
5370 vars_of_file_coding (void)
5374 /* Initialize to something reasonable ... */
5375 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5377 coding_category_system[i] = Qnil;
5378 coding_category_by_priority[i] = i;
5381 Fprovide (intern ("file-coding"));
5383 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5384 Coding system used for TTY keyboard input.
5385 Not used under a windowing system.
5387 Vkeyboard_coding_system = Qnil;
5389 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5390 Coding system used for TTY display output.
5391 Not used under a windowing system.
5393 Vterminal_coding_system = Qnil;
5395 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5396 Overriding coding system used when writing a file or process.
5397 You should *bind* this, not set it. If this is non-nil, it specifies
5398 the coding system that will be used when a file or process is read
5399 in, and overrides `buffer-file-coding-system-for-read',
5400 `insert-file-contents-pre-hook', etc. Use those variables instead of
5401 this one for permanent changes to the environment.
5403 Vcoding_system_for_read = Qnil;
5405 DEFVAR_LISP ("coding-system-for-write",
5406 &Vcoding_system_for_write /*
5407 Overriding coding system used when writing a file or process.
5408 You should *bind* this, not set it. If this is non-nil, it specifies
5409 the coding system that will be used when a file or process is wrote
5410 in, and overrides `buffer-file-coding-system',
5411 `write-region-pre-hook', etc. Use those variables instead of this one
5412 for permanent changes to the environment.
5414 Vcoding_system_for_write = Qnil;
5416 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5417 Coding system used to convert pathnames when accessing files.
5419 Vfile_name_coding_system = Qnil;
5421 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5422 Non-nil means the buffer contents are regarded as multi-byte form
5423 of characters, not a binary code. This affects the display, file I/O,
5424 and behaviors of various editing commands.
5426 Setting this to nil does not do anything.
5428 enable_multibyte_characters = 1;
5432 complex_vars_of_file_coding (void)
5434 staticpro (&Vcoding_system_hash_table);
5435 Vcoding_system_hash_table =
5436 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5438 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5440 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5442 struct codesys_prop csp; \
5444 csp.prop_type = (Prop_Type); \
5445 Dynarr_add (the_codesys_prop_dynarr, csp); \
5448 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5449 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5450 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5451 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5452 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5453 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5454 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5456 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5457 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5458 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5459 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5460 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5461 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5462 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5463 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5464 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5465 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5466 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5467 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5468 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5469 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5470 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5471 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5472 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5474 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5475 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5477 /* Need to create this here or we're really screwed. */
5479 (Qraw_text, Qno_conversion,
5480 build_string ("Raw text, which means it converts only line-break-codes."),
5481 list2 (Qmnemonic, build_string ("Raw")));
5484 (Qbinary, Qno_conversion,
5485 build_string ("Binary, which means it does not convert anything."),
5486 list4 (Qeol_type, Qlf,
5487 Qmnemonic, build_string ("Binary")));
5492 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5493 list2 (Qmnemonic, build_string ("UTF8")));
5496 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5498 /* Need this for bootstrapping */
5499 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5500 Fget_coding_system (Qraw_text);
5503 coding_category_system[CODING_CATEGORY_UTF8]
5504 = Fget_coding_system (Qutf8);