1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs UTF-2000. */
40 #include "file-coding.h"
42 Lisp_Object Qcoding_system_error;
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
65 #if defined(MULE) && !defined(UTF2000)
66 Lisp_Object ucs_to_mule_table[65536];
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
90 Lisp_Object Qmnemonic, Qeol_type;
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
97 Lisp_Object Qucs4, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
108 Lisp_Object Qutf_8_mcs;
109 Lisp_Object Qdisable_composition;
110 Lisp_Object Quse_entity_reference;
111 Lisp_Object Qd, Qx, QX;
113 Lisp_Object Qencode, Qdecode;
115 Lisp_Object Vcoding_system_hash_table;
117 int enable_multibyte_characters;
120 /* Additional information used by the ISO2022 decoder and detector. */
121 struct iso2022_decoder
123 /* CHARSET holds the character sets currently assigned to the G0
124 through G3 variables. It is initialized from the array
125 INITIAL_CHARSET in CODESYS. */
126 Lisp_Object charset[4];
128 /* Which registers are currently invoked into the left (GL) and
129 right (GR) halves of the 8-bit encoding space? */
130 int register_left, register_right;
132 /* ISO_ESC holds a value indicating part of an escape sequence
133 that has already been seen. */
134 enum iso_esc_flag esc;
136 /* This records the bytes we've seen so far in an escape sequence,
137 in case the sequence is invalid (we spit out the bytes unchanged). */
138 unsigned char esc_bytes[8];
140 /* Index for next byte to store in ISO escape sequence. */
143 #ifdef ENABLE_COMPOSITE_CHARS
144 /* Stuff seen so far when composing a string. */
145 unsigned_char_dynarr *composite_chars;
148 /* If we saw an invalid designation sequence for a particular
149 register, we flag it here and switch to ASCII. The next time we
150 see a valid designation for this register, we turn off the flag
151 and do the designation normally, but pretend the sequence was
152 invalid. The effect of all this is that (most of the time) the
153 escape sequences for both the switch to the unknown charset, and
154 the switch back to the known charset, get inserted literally into
155 the buffer and saved out as such. The hope is that we can
156 preserve the escape sequences so that the resulting written out
157 file makes sense. If we don't do any of this, the designation
158 to the invalid charset will be preserved but that switch back
159 to the known charset will probably get eaten because it was
160 the same charset that was already present in the register. */
161 unsigned char invalid_designated[4];
163 /* We try to do similar things as above for direction-switching
164 sequences. If we encountered a direction switch while an
165 invalid designation was present, or an invalid designation
166 just after a direction switch (i.e. no valid designation
167 encountered yet), we insert the direction-switch escape
168 sequence literally into the output stream, and later on
169 insert the corresponding direction-restoring escape sequence
171 unsigned int switched_dir_and_no_valid_charset_yet :1;
172 unsigned int invalid_switch_dir :1;
174 /* Tells the decoder to output the escape sequence literally
175 even though it was valid. Used in the games we play to
176 avoid lossage when we encounter invalid designations. */
177 unsigned int output_literally :1;
178 /* We encountered a direction switch followed by an invalid
179 designation. We didn't output the direction switch
180 literally because we didn't know about the invalid designation;
181 but we have to do so now. */
182 unsigned int output_direction_sequence :1;
185 EXFUN (Fcopy_coding_system, 2);
187 struct detection_state;
190 text_encode_generic (Lstream *encoding, const Bufbyte *src,
191 unsigned_char_dynarr *dst, size_t n);
193 static int detect_coding_sjis (struct detection_state *st,
194 const Extbyte *src, size_t n);
195 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
196 unsigned_char_dynarr *dst, size_t n);
197 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
198 unsigned_char_dynarr *dst, unsigned int *flags);
199 void char_finish_shift_jis (struct encoding_stream *str,
200 unsigned_char_dynarr *dst, unsigned int *flags);
202 static int detect_coding_big5 (struct detection_state *st,
203 const Extbyte *src, size_t n);
204 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
205 unsigned_char_dynarr *dst, size_t n);
206 void char_encode_big5 (struct encoding_stream *str, Emchar c,
207 unsigned_char_dynarr *dst, unsigned int *flags);
208 void char_finish_big5 (struct encoding_stream *str,
209 unsigned_char_dynarr *dst, unsigned int *flags);
211 static int detect_coding_ucs4 (struct detection_state *st,
212 const Extbyte *src, size_t n);
213 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
214 unsigned_char_dynarr *dst, size_t n);
215 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
216 unsigned_char_dynarr *dst, unsigned int *flags);
217 void char_finish_ucs4 (struct encoding_stream *str,
218 unsigned_char_dynarr *dst, unsigned int *flags);
220 static int detect_coding_utf8 (struct detection_state *st,
221 const Extbyte *src, size_t n);
222 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
223 unsigned_char_dynarr *dst, size_t n);
224 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
225 unsigned_char_dynarr *dst, unsigned int *flags);
226 void char_finish_utf8 (struct encoding_stream *str,
227 unsigned_char_dynarr *dst, unsigned int *flags);
229 static int postprocess_iso2022_mask (int mask);
230 static void reset_iso2022 (Lisp_Object coding_system,
231 struct iso2022_decoder *iso);
232 static int detect_coding_iso2022 (struct detection_state *st,
233 const Extbyte *src, size_t n);
234 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
235 unsigned_char_dynarr *dst, size_t n);
236 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
237 unsigned_char_dynarr *dst, unsigned int *flags);
238 void char_finish_iso2022 (struct encoding_stream *str,
239 unsigned_char_dynarr *dst, unsigned int *flags);
241 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
242 unsigned_char_dynarr *dst, size_t n);
243 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
244 unsigned_char_dynarr *dst, size_t n);
245 static void mule_decode (Lstream *decoding, const Extbyte *src,
246 unsigned_char_dynarr *dst, size_t n);
247 static void mule_encode (Lstream *encoding, const Bufbyte *src,
248 unsigned_char_dynarr *dst, size_t n);
250 typedef struct codesys_prop codesys_prop;
259 Dynarr_declare (codesys_prop);
260 } codesys_prop_dynarr;
262 static const struct lrecord_description codesys_prop_description_1[] = {
263 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
267 static const struct struct_description codesys_prop_description = {
268 sizeof (codesys_prop),
269 codesys_prop_description_1
272 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
273 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
277 static const struct struct_description codesys_prop_dynarr_description = {
278 sizeof (codesys_prop_dynarr),
279 codesys_prop_dynarr_description_1
282 codesys_prop_dynarr *the_codesys_prop_dynarr;
284 enum codesys_prop_enum
287 CODESYS_PROP_ISO2022,
292 /************************************************************************/
293 /* Coding system functions */
294 /************************************************************************/
296 static Lisp_Object mark_coding_system (Lisp_Object);
297 static void print_coding_system (Lisp_Object, Lisp_Object, int);
298 static void finalize_coding_system (void *header, int for_disksave);
301 static const struct lrecord_description ccs_description_1[] = {
302 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
303 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
307 static const struct struct_description ccs_description = {
308 sizeof (charset_conversion_spec),
312 static const struct lrecord_description ccsd_description_1[] = {
313 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
317 static const struct struct_description ccsd_description = {
318 sizeof (charset_conversion_spec_dynarr),
323 static const struct lrecord_description coding_system_description[] = {
324 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
325 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
326 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
327 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
330 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
331 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
333 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
334 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
335 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
345 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
346 mark_coding_system, print_coding_system,
347 finalize_coding_system,
348 0, 0, coding_system_description,
352 mark_coding_system (Lisp_Object obj)
354 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
356 mark_object (CODING_SYSTEM_NAME (codesys));
357 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
358 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
359 mark_object (CODING_SYSTEM_EOL_LF (codesys));
360 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
361 mark_object (CODING_SYSTEM_EOL_CR (codesys));
363 switch (CODING_SYSTEM_TYPE (codesys))
367 case CODESYS_ISO2022:
368 for (i = 0; i < 4; i++)
369 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
370 if (codesys->iso2022.input_conv)
372 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
374 struct charset_conversion_spec *ccs =
375 Dynarr_atp (codesys->iso2022.input_conv, i);
376 mark_object (ccs->from_charset);
377 mark_object (ccs->to_charset);
380 if (codesys->iso2022.output_conv)
382 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
384 struct charset_conversion_spec *ccs =
385 Dynarr_atp (codesys->iso2022.output_conv, i);
386 mark_object (ccs->from_charset);
387 mark_object (ccs->to_charset);
394 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
395 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
400 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
401 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
408 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
410 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
412 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
416 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
419 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
421 error ("printing unreadable object #<coding_system 0x%x>",
424 write_c_string ("#<coding_system ", printcharfun);
425 print_internal (c->name, printcharfun, 1);
426 write_c_string (">", printcharfun);
430 finalize_coding_system (void *header, int for_disksave)
432 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
433 /* Since coding systems never go away, this function is not
434 necessary. But it would be necessary if we changed things
435 so that coding systems could go away. */
436 if (!for_disksave) /* see comment in lstream.c */
438 switch (CODING_SYSTEM_TYPE (c))
441 case CODESYS_ISO2022:
442 if (c->iso2022.input_conv)
444 Dynarr_free (c->iso2022.input_conv);
445 c->iso2022.input_conv = 0;
447 if (c->iso2022.output_conv)
449 Dynarr_free (c->iso2022.output_conv);
450 c->iso2022.output_conv = 0;
461 symbol_to_eol_type (Lisp_Object symbol)
463 CHECK_SYMBOL (symbol);
464 if (NILP (symbol)) return EOL_AUTODETECT;
465 if (EQ (symbol, Qlf)) return EOL_LF;
466 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
467 if (EQ (symbol, Qcr)) return EOL_CR;
469 signal_simple_error ("Unrecognized eol type", symbol);
470 return EOL_AUTODETECT; /* not reached */
474 eol_type_to_symbol (eol_type_t type)
479 case EOL_LF: return Qlf;
480 case EOL_CRLF: return Qcrlf;
481 case EOL_CR: return Qcr;
482 case EOL_AUTODETECT: return Qnil;
487 setup_eol_coding_systems (Lisp_Coding_System *codesys)
489 Lisp_Object codesys_obj;
490 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
491 char *codesys_name = (char *) alloca (len + 7);
493 char *codesys_mnemonic=0;
495 Lisp_Object codesys_name_sym, sub_codesys_obj;
499 XSETCODING_SYSTEM (codesys_obj, codesys);
501 memcpy (codesys_name,
502 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
504 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
506 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
507 codesys_mnemonic = (char *) alloca (mlen + 7);
508 memcpy (codesys_mnemonic,
509 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
512 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
513 strcpy (codesys_name + len, "-" op_sys); \
515 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
516 codesys_name_sym = intern (codesys_name); \
517 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
518 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
520 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
521 build_string (codesys_mnemonic); \
522 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
525 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
526 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
527 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
530 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
531 Return t if OBJECT is a coding system.
532 A coding system is an object that defines how text containing multiple
533 character sets is encoded into a stream of (typically 8-bit) bytes.
534 The coding system is used to decode the stream into a series of
535 characters (which may be from multiple charsets) when the text is read
536 from a file or process, and is used to encode the text back into the
537 same format when it is written out to a file or process.
539 For example, many ISO2022-compliant coding systems (such as Compound
540 Text, which is used for inter-client data under the X Window System)
541 use escape sequences to switch between different charsets -- Japanese
542 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
543 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
544 `make-coding-system' for more information.
546 Coding systems are normally identified using a symbol, and the
547 symbol is accepted in place of the actual coding system object whenever
548 a coding system is called for. (This is similar to how faces work.)
552 return CODING_SYSTEMP (object) ? Qt : Qnil;
555 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
556 Retrieve the coding system of the given name.
558 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
559 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
560 If there is no such coding system, nil is returned. Otherwise the
561 associated coding system object is returned.
563 (coding_system_or_name))
565 if (NILP (coding_system_or_name))
566 coding_system_or_name = Qbinary;
567 else if (CODING_SYSTEMP (coding_system_or_name))
568 return coding_system_or_name;
570 CHECK_SYMBOL (coding_system_or_name);
574 coding_system_or_name =
575 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
577 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
578 return coding_system_or_name;
582 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
583 Retrieve the coding system of the given name.
584 Same as `find-coding-system' except that if there is no such
585 coding system, an error is signaled instead of returning nil.
589 Lisp_Object coding_system = Ffind_coding_system (name);
591 if (NILP (coding_system))
592 signal_simple_error ("No such coding system", name);
593 return coding_system;
596 /* We store the coding systems in hash tables with the names as the key and the
597 actual coding system object as the value. Occasionally we need to use them
598 in a list format. These routines provide us with that. */
599 struct coding_system_list_closure
601 Lisp_Object *coding_system_list;
605 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
606 void *coding_system_list_closure)
608 /* This function can GC */
609 struct coding_system_list_closure *cscl =
610 (struct coding_system_list_closure *) coding_system_list_closure;
611 Lisp_Object *coding_system_list = cscl->coding_system_list;
613 *coding_system_list = Fcons (key, *coding_system_list);
617 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
618 Return a list of the names of all defined coding systems.
622 Lisp_Object coding_system_list = Qnil;
624 struct coding_system_list_closure coding_system_list_closure;
626 GCPRO1 (coding_system_list);
627 coding_system_list_closure.coding_system_list = &coding_system_list;
628 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
629 &coding_system_list_closure);
632 return coding_system_list;
635 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
636 Return the name of the given coding system.
640 coding_system = Fget_coding_system (coding_system);
641 return XCODING_SYSTEM_NAME (coding_system);
644 static Lisp_Coding_System *
645 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
647 Lisp_Coding_System *codesys =
648 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
650 zero_lcrecord (codesys);
651 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
652 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
653 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
654 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
655 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
656 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
657 CODING_SYSTEM_TYPE (codesys) = type;
658 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
661 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
663 if (type == CODESYS_ISO2022)
666 for (i = 0; i < 4; i++)
667 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
670 if (type == CODESYS_BIG5)
672 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
674 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
675 = Vcharset_chinese_big5;
676 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
678 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
682 else if (type == CODESYS_CCL)
684 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
685 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
688 CODING_SYSTEM_NAME (codesys) = name;
694 /* Given a list of charset conversion specs as specified in a Lisp
695 program, parse it into STORE_HERE. */
698 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
699 Lisp_Object spec_list)
703 EXTERNAL_LIST_LOOP (rest, spec_list)
705 Lisp_Object car = XCAR (rest);
706 Lisp_Object from, to;
707 struct charset_conversion_spec spec;
709 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
710 signal_simple_error ("Invalid charset conversion spec", car);
711 from = Fget_charset (XCAR (car));
712 to = Fget_charset (XCAR (XCDR (car)));
713 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
714 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
715 signal_simple_error_2
716 ("Attempted conversion between different charset types",
718 spec.from_charset = from;
719 spec.to_charset = to;
721 Dynarr_add (store_here, spec);
725 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
726 specs, return the equivalent as the Lisp programmer would see it.
728 If LOAD_HERE is 0, return Qnil. */
731 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
738 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
740 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
741 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
744 return Fnreverse (result);
749 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
750 Register symbol NAME as a coding system.
752 TYPE describes the conversion method used and should be one of
755 Automatic conversion. XEmacs attempts to detect the coding system
758 No conversion. Use this for binary files and such. On output,
759 graphic characters that are not in ASCII or Latin-1 will be
760 replaced by a ?. (For a no-conversion-encoded buffer, these
761 characters will only be present if you explicitly insert them.)
763 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
765 ISO 10646 UCS-4 encoding.
767 ISO 10646 UTF-8 encoding.
769 Any ISO2022-compliant encoding. Among other things, this includes
770 JIS (the Japanese encoding commonly used for e-mail), EUC (the
771 standard Unix encoding for Japanese and other languages), and
772 Compound Text (the encoding used in X11). You can specify more
773 specific information about the conversion with the PROPS argument.
775 Big5 (the encoding commonly used for Taiwanese).
777 The conversion is performed using a user-written pseudo-code
778 program. CCL (Code Conversion Language) is the name of this
781 Write out or read in the raw contents of the memory representing
782 the buffer's text. This is primarily useful for debugging
783 purposes, and is only enabled when XEmacs has been compiled with
784 DEBUG_XEMACS defined (via the --debug configure option).
785 WARNING: Reading in a file using 'internal conversion can result
786 in an internal inconsistency in the memory representing a
787 buffer's text, which will produce unpredictable results and may
788 cause XEmacs to crash. Under normal circumstances you should
789 never use 'internal conversion.
791 DOC-STRING is a string describing the coding system.
793 PROPS is a property list, describing the specific nature of the
794 character set. Recognized properties are:
797 String to be displayed in the modeline when this coding system is
801 End-of-line conversion to be used. It should be one of
804 Automatically detect the end-of-line type (LF, CRLF,
805 or CR). Also generate subsidiary coding systems named
806 `NAME-unix', `NAME-dos', and `NAME-mac', that are
807 identical to this coding system but have an EOL-TYPE
808 value of 'lf, 'crlf, and 'cr, respectively.
810 The end of a line is marked externally using ASCII LF.
811 Since this is also the way that XEmacs represents an
812 end-of-line internally, specifying this option results
813 in no end-of-line conversion. This is the standard
814 format for Unix text files.
816 The end of a line is marked externally using ASCII
817 CRLF. This is the standard format for MS-DOS text
820 The end of a line is marked externally using ASCII CR.
821 This is the standard format for Macintosh text files.
823 Automatically detect the end-of-line type but do not
824 generate subsidiary coding systems. (This value is
825 converted to nil when stored internally, and
826 `coding-system-property' will return nil.)
829 If non-nil, composition/decomposition for combining characters
832 'use-entity-reference
833 If non-nil, SGML style entity-reference is used for non-system-characters.
835 'post-read-conversion
836 Function called after a file has been read in, to perform the
837 decoding. Called with two arguments, START and END, denoting
838 a region of the current buffer to be decoded.
840 'pre-write-conversion
841 Function called before a file is written out, to perform the
842 encoding. Called with two arguments, START and END, denoting
843 a region of the current buffer to be encoded.
846 The following additional properties are recognized if TYPE is 'iso2022:
852 The character set initially designated to the G0 - G3 registers.
853 The value should be one of
855 -- A charset object (designate that character set)
856 -- nil (do not ever use this register)
857 -- t (no character set is initially designated to
858 the register, but may be later on; this automatically
859 sets the corresponding `force-g*-on-output' property)
865 If non-nil, send an explicit designation sequence on output before
866 using the specified register.
869 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
870 "ESC $ B" on output in place of the full designation sequences
871 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
874 If non-nil, don't designate ASCII to G0 at each end of line on output.
875 Setting this to non-nil also suppresses other state-resetting that
876 normally happens at the end of a line.
879 If non-nil, don't designate ASCII to G0 before control chars on output.
882 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
886 If non-nil, use locking-shift (SO/SI) instead of single-shift
887 or designation by escape sequence.
890 If non-nil, don't use ISO6429's direction specification.
893 If non-nil, literal control characters that are the same as
894 the beginning of a recognized ISO2022 or ISO6429 escape sequence
895 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
896 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
897 so that they can be properly distinguished from an escape sequence.
898 (Note that doing this results in a non-portable encoding.) This
899 encoding flag is used for byte-compiled files. Note that ESC
900 is a good choice for a quoting character because there are no
901 escape sequences whose second byte is a character from the Control-0
902 or Control-1 character sets; this is explicitly disallowed by the
905 'input-charset-conversion
906 A list of conversion specifications, specifying conversion of
907 characters in one charset to another when decoding is performed.
908 Each specification is a list of two elements: the source charset,
909 and the destination charset.
911 'output-charset-conversion
912 A list of conversion specifications, specifying conversion of
913 characters in one charset to another when encoding is performed.
914 The form of each specification is the same as for
915 'input-charset-conversion.
918 The following additional properties are recognized (and required)
922 CCL program used for decoding (converting to internal format).
925 CCL program used for encoding (converting to external format).
927 (name, type, doc_string, props))
929 Lisp_Coding_System *codesys;
930 enum coding_system_type ty;
931 int need_to_setup_eol_systems = 1;
933 /* Convert type to constant */
934 if (NILP (type) || EQ (type, Qundecided))
935 { ty = CODESYS_AUTODETECT; }
937 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
938 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
939 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
940 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
941 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
942 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
944 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
946 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
949 signal_simple_error ("Invalid coding system type", type);
953 codesys = allocate_coding_system (ty, name);
955 if (NILP (doc_string))
956 doc_string = build_string ("");
958 CHECK_STRING (doc_string);
959 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
962 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
964 if (EQ (key, Qmnemonic))
967 CHECK_STRING (value);
968 CODING_SYSTEM_MNEMONIC (codesys) = value;
971 else if (EQ (key, Qeol_type))
973 need_to_setup_eol_systems = NILP (value);
976 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
979 else if (EQ (key, Qpost_read_conversion))
980 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
981 else if (EQ (key, Qpre_write_conversion))
982 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
984 else if (EQ (key, Qdisable_composition))
985 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
986 else if (EQ (key, Quse_entity_reference))
987 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
990 else if (ty == CODESYS_ISO2022)
992 #define FROB_INITIAL_CHARSET(charset_num) \
993 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
994 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
996 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
997 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
998 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
999 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1001 #define FROB_FORCE_CHARSET(charset_num) \
1002 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1004 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1005 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1006 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1007 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1009 #define FROB_BOOLEAN_PROPERTY(prop) \
1010 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1012 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1013 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1014 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1015 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1016 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1017 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1018 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1020 else if (EQ (key, Qinput_charset_conversion))
1022 codesys->iso2022.input_conv =
1023 Dynarr_new (charset_conversion_spec);
1024 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1027 else if (EQ (key, Qoutput_charset_conversion))
1029 codesys->iso2022.output_conv =
1030 Dynarr_new (charset_conversion_spec);
1031 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1035 signal_simple_error ("Unrecognized property", key);
1038 else if (ty == CODESYS_BIG5)
1040 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1041 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1043 signal_simple_error ("Unrecognized property", key);
1046 else if (EQ (type, Qccl))
1049 struct ccl_program test_ccl;
1052 /* Check key first. */
1053 if (EQ (key, Qdecode))
1054 suffix = "-ccl-decode";
1055 else if (EQ (key, Qencode))
1056 suffix = "-ccl-encode";
1058 signal_simple_error ("Unrecognized property", key);
1060 /* If value is vector, register it as a ccl program
1061 associated with an newly created symbol for
1062 backward compatibility. */
1063 if (VECTORP (value))
1065 sym = Fintern (concat2 (Fsymbol_name (name),
1066 build_string (suffix)),
1068 Fregister_ccl_program (sym, value);
1072 CHECK_SYMBOL (value);
1075 /* check if the given ccl programs are valid. */
1076 if (setup_ccl_program (&test_ccl, sym) < 0)
1077 signal_simple_error ("Invalid CCL program", value);
1079 if (EQ (key, Qdecode))
1080 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1081 else if (EQ (key, Qencode))
1082 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1087 signal_simple_error ("Unrecognized property", key);
1091 if (need_to_setup_eol_systems)
1092 setup_eol_coding_systems (codesys);
1095 Lisp_Object codesys_obj;
1096 XSETCODING_SYSTEM (codesys_obj, codesys);
1097 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1102 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1103 Copy OLD-CODING-SYSTEM to NEW-NAME.
1104 If NEW-NAME does not name an existing coding system, a new one will
1107 (old_coding_system, new_name))
1109 Lisp_Object new_coding_system;
1110 old_coding_system = Fget_coding_system (old_coding_system);
1111 new_coding_system = Ffind_coding_system (new_name);
1112 if (NILP (new_coding_system))
1114 XSETCODING_SYSTEM (new_coding_system,
1115 allocate_coding_system
1116 (XCODING_SYSTEM_TYPE (old_coding_system),
1118 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1122 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1123 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1124 memcpy (((char *) to ) + sizeof (to->header),
1125 ((char *) from) + sizeof (from->header),
1126 sizeof (*from) - sizeof (from->header));
1127 to->name = new_name;
1129 return new_coding_system;
1132 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1133 Return t if OBJECT names a coding system, and is not a coding system alias.
1137 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1141 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1142 Return t if OBJECT is a coding system alias.
1143 All coding system aliases are created by `define-coding-system-alias'.
1147 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1151 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1152 Return the coding-system symbol for which symbol ALIAS is an alias.
1156 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1157 if (SYMBOLP (aliasee))
1160 signal_simple_error ("Symbol is not a coding system alias", alias);
1161 return Qnil; /* To keep the compiler happy */
1165 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1167 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1171 /* A maphash function, for removing dangling coding system aliases. */
1173 dangling_coding_system_alias_p (Lisp_Object alias,
1174 Lisp_Object aliasee,
1175 void *dangling_aliases)
1177 if (SYMBOLP (aliasee)
1178 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1180 (*(int *) dangling_aliases)++;
1187 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1188 Define symbol ALIAS as an alias for coding system ALIASEE.
1190 You can use this function to redefine an alias that has already been defined,
1191 but you cannot redefine a name which is the canonical name for a coding system.
1192 \(a canonical name of a coding system is what is returned when you call
1193 `coding-system-name' on a coding system).
1195 ALIASEE itself can be an alias, which allows you to define nested aliases.
1197 You are forbidden, however, from creating alias loops or `dangling' aliases.
1198 These will be detected, and an error will be signaled if you attempt to do so.
1200 If ALIASEE is nil, then ALIAS will simply be undefined.
1202 See also `coding-system-alias-p', `coding-system-aliasee',
1203 and `coding-system-canonical-name-p'.
1207 Lisp_Object real_coding_system, probe;
1209 CHECK_SYMBOL (alias);
1211 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1213 ("Symbol is the canonical name of a coding system and cannot be redefined",
1218 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1219 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1220 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1222 Fremhash (alias, Vcoding_system_hash_table);
1224 /* Undefine subsidiary aliases,
1225 presumably created by a previous call to this function */
1226 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1227 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1228 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1230 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1231 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1232 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1235 /* Undefine dangling coding system aliases. */
1237 int dangling_aliases;
1240 dangling_aliases = 0;
1241 elisp_map_remhash (dangling_coding_system_alias_p,
1242 Vcoding_system_hash_table,
1244 } while (dangling_aliases > 0);
1250 if (CODING_SYSTEMP (aliasee))
1251 aliasee = XCODING_SYSTEM_NAME (aliasee);
1253 /* Checks that aliasee names a coding-system */
1254 real_coding_system = Fget_coding_system (aliasee);
1256 /* Check for coding system alias loops */
1257 if (EQ (alias, aliasee))
1258 alias_loop: signal_simple_error_2
1259 ("Attempt to create a coding system alias loop", alias, aliasee);
1261 for (probe = aliasee;
1263 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1265 if (EQ (probe, alias))
1269 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1271 /* Set up aliases for subsidiaries.
1272 #### There must be a better way to handle subsidiary coding systems. */
1274 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1276 for (i = 0; i < countof (suffixes); i++)
1278 Lisp_Object alias_subsidiary =
1279 append_suffix_to_symbol (alias, suffixes[i]);
1280 Lisp_Object aliasee_subsidiary =
1281 append_suffix_to_symbol (aliasee, suffixes[i]);
1283 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1284 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1287 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1288 but it doesn't look intentional, so I'd rather return something
1289 meaningful or nothing at all. */
1294 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1296 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1297 Lisp_Object new_coding_system;
1299 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1300 return coding_system;
1304 case EOL_AUTODETECT: return coding_system;
1305 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1306 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1307 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1308 default: abort (); return Qnil;
1311 return NILP (new_coding_system) ? coding_system : new_coding_system;
1314 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1315 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1317 (coding_system, eol_type))
1319 coding_system = Fget_coding_system (coding_system);
1321 return subsidiary_coding_system (coding_system,
1322 symbol_to_eol_type (eol_type));
1326 /************************************************************************/
1327 /* Coding system accessors */
1328 /************************************************************************/
1330 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1331 Return the doc string for CODING-SYSTEM.
1335 coding_system = Fget_coding_system (coding_system);
1336 return XCODING_SYSTEM_DOC_STRING (coding_system);
1339 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1340 Return the type of CODING-SYSTEM.
1344 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1347 case CODESYS_AUTODETECT: return Qundecided;
1349 case CODESYS_SHIFT_JIS: return Qshift_jis;
1350 case CODESYS_ISO2022: return Qiso2022;
1351 case CODESYS_BIG5: return Qbig5;
1352 case CODESYS_UCS4: return Qucs4;
1353 case CODESYS_UTF8: return Qutf8;
1354 case CODESYS_CCL: return Qccl;
1356 case CODESYS_NO_CONVERSION: return Qno_conversion;
1358 case CODESYS_INTERNAL: return Qinternal;
1365 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1368 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1370 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1373 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1374 Return initial charset of CODING-SYSTEM designated to GNUM.
1377 (coding_system, gnum))
1379 coding_system = Fget_coding_system (coding_system);
1382 return coding_system_charset (coding_system, XINT (gnum));
1386 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1387 Return the PROP property of CODING-SYSTEM.
1389 (coding_system, prop))
1392 enum coding_system_type type;
1394 coding_system = Fget_coding_system (coding_system);
1395 CHECK_SYMBOL (prop);
1396 type = XCODING_SYSTEM_TYPE (coding_system);
1398 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1399 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1402 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1404 case CODESYS_PROP_ALL_OK:
1407 case CODESYS_PROP_ISO2022:
1408 if (type != CODESYS_ISO2022)
1410 ("Property only valid in ISO2022 coding systems",
1414 case CODESYS_PROP_CCL:
1415 if (type != CODESYS_CCL)
1417 ("Property only valid in CCL coding systems",
1427 signal_simple_error ("Unrecognized property", prop);
1429 if (EQ (prop, Qname))
1430 return XCODING_SYSTEM_NAME (coding_system);
1431 else if (EQ (prop, Qtype))
1432 return Fcoding_system_type (coding_system);
1433 else if (EQ (prop, Qdoc_string))
1434 return XCODING_SYSTEM_DOC_STRING (coding_system);
1435 else if (EQ (prop, Qmnemonic))
1436 return XCODING_SYSTEM_MNEMONIC (coding_system);
1437 else if (EQ (prop, Qeol_type))
1438 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1439 else if (EQ (prop, Qeol_lf))
1440 return XCODING_SYSTEM_EOL_LF (coding_system);
1441 else if (EQ (prop, Qeol_crlf))
1442 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1443 else if (EQ (prop, Qeol_cr))
1444 return XCODING_SYSTEM_EOL_CR (coding_system);
1445 else if (EQ (prop, Qpost_read_conversion))
1446 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1447 else if (EQ (prop, Qpre_write_conversion))
1448 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1451 else if (EQ (prop, Qdisable_composition))
1452 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1453 else if (EQ (prop, Quse_entity_reference))
1454 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1456 else if (type == CODESYS_ISO2022)
1458 if (EQ (prop, Qcharset_g0))
1459 return coding_system_charset (coding_system, 0);
1460 else if (EQ (prop, Qcharset_g1))
1461 return coding_system_charset (coding_system, 1);
1462 else if (EQ (prop, Qcharset_g2))
1463 return coding_system_charset (coding_system, 2);
1464 else if (EQ (prop, Qcharset_g3))
1465 return coding_system_charset (coding_system, 3);
1467 #define FORCE_CHARSET(charset_num) \
1468 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1469 (coding_system, charset_num) ? Qt : Qnil)
1471 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1472 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1473 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1474 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1476 #define LISP_BOOLEAN(prop) \
1477 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1479 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1480 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1481 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1482 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1483 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1484 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1485 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1487 else if (EQ (prop, Qinput_charset_conversion))
1489 unparse_charset_conversion_specs
1490 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1491 else if (EQ (prop, Qoutput_charset_conversion))
1493 unparse_charset_conversion_specs
1494 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1498 else if (type == CODESYS_CCL)
1500 if (EQ (prop, Qdecode))
1501 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1502 else if (EQ (prop, Qencode))
1503 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1511 return Qnil; /* not reached */
1515 /************************************************************************/
1516 /* Coding category functions */
1517 /************************************************************************/
1520 decode_coding_category (Lisp_Object symbol)
1524 CHECK_SYMBOL (symbol);
1525 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1526 if (EQ (coding_category_symbol[i], symbol))
1529 signal_simple_error ("Unrecognized coding category", symbol);
1530 return 0; /* not reached */
1533 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1534 Return a list of all recognized coding categories.
1539 Lisp_Object list = Qnil;
1541 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1542 list = Fcons (coding_category_symbol[i], list);
1546 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1547 Change the priority order of the coding categories.
1548 LIST should be list of coding categories, in descending order of
1549 priority. Unspecified coding categories will be lower in priority
1550 than all specified ones, in the same relative order they were in
1555 int category_to_priority[CODING_CATEGORY_LAST];
1559 /* First generate a list that maps coding categories to priorities. */
1561 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1562 category_to_priority[i] = -1;
1564 /* Highest priority comes from the specified list. */
1566 EXTERNAL_LIST_LOOP (rest, list)
1568 int cat = decode_coding_category (XCAR (rest));
1570 if (category_to_priority[cat] >= 0)
1571 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1572 category_to_priority[cat] = i++;
1575 /* Now go through the existing categories by priority to retrieve
1576 the categories not yet specified and preserve their priority
1578 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1580 int cat = fcd->coding_category_by_priority[j];
1581 if (category_to_priority[cat] < 0)
1582 category_to_priority[cat] = i++;
1585 /* Now we need to construct the inverse of the mapping we just
1588 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1589 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1591 /* Phew! That was confusing. */
1595 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1596 Return a list of coding categories in descending order of priority.
1601 Lisp_Object list = Qnil;
1603 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1604 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1609 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1610 Change the coding system associated with a coding category.
1612 (coding_category, coding_system))
1614 int cat = decode_coding_category (coding_category);
1616 coding_system = Fget_coding_system (coding_system);
1617 fcd->coding_category_system[cat] = coding_system;
1621 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1622 Return the coding system associated with a coding category.
1626 int cat = decode_coding_category (coding_category);
1627 Lisp_Object sys = fcd->coding_category_system[cat];
1630 return XCODING_SYSTEM_NAME (sys);
1635 /************************************************************************/
1636 /* Detecting the encoding of data */
1637 /************************************************************************/
1639 struct detection_state
1641 eol_type_t eol_type;
1677 struct iso2022_decoder iso;
1679 int high_byte_count;
1680 unsigned int saw_single_shift:1;
1693 acceptable_control_char_p (int c)
1697 /* Allow and ignore control characters that you might
1698 reasonably see in a text file */
1703 case 8: /* backspace */
1704 case 11: /* vertical tab */
1705 case 12: /* form feed */
1706 case 26: /* MS-DOS C-z junk */
1707 case 31: /* '^_' -- for info */
1715 mask_has_at_most_one_bit_p (int mask)
1717 /* Perhaps the only thing useful you learn from intensive Microsoft
1718 technical interviews */
1719 return (mask & (mask - 1)) == 0;
1723 detect_eol_type (struct detection_state *st, const Extbyte *src,
1728 unsigned char c = *(unsigned char *)src++;
1731 if (st->eol.just_saw_cr)
1733 else if (st->eol.seen_anything)
1736 else if (st->eol.just_saw_cr)
1739 st->eol.just_saw_cr = 1;
1741 st->eol.just_saw_cr = 0;
1742 st->eol.seen_anything = 1;
1745 return EOL_AUTODETECT;
1748 /* Attempt to determine the encoding and EOL type of the given text.
1749 Before calling this function for the first type, you must initialize
1750 st->eol_type as appropriate and initialize st->mask to ~0.
1752 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1755 st->mask holds the determined coding category mask, or ~0 if only
1756 ASCII has been seen so far.
1760 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1761 is present in st->mask
1762 1 == definitive answers are here for both st->eol_type and st->mask
1766 detect_coding_type (struct detection_state *st, const Extbyte *src,
1767 size_t n, int just_do_eol)
1769 if (st->eol_type == EOL_AUTODETECT)
1770 st->eol_type = detect_eol_type (st, src, n);
1773 return st->eol_type != EOL_AUTODETECT;
1775 if (!st->seen_non_ascii)
1777 for (; n; n--, src++)
1779 unsigned char c = *(unsigned char *) src;
1780 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1782 st->seen_non_ascii = 1;
1784 st->shift_jis.mask = ~0;
1788 st->iso2022.mask = ~0;
1798 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1799 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1800 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1801 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1802 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1803 st->big5.mask = detect_coding_big5 (st, src, n);
1804 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1805 st->utf8.mask = detect_coding_utf8 (st, src, n);
1806 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1807 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1810 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1811 | st->utf8.mask | st->ucs4.mask;
1814 int retval = mask_has_at_most_one_bit_p (st->mask);
1815 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1816 return retval && st->eol_type != EOL_AUTODETECT;
1821 coding_system_from_mask (int mask)
1825 /* If the file was entirely or basically ASCII, use the
1826 default value of `buffer-file-coding-system'. */
1827 Lisp_Object retval =
1828 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1831 retval = Ffind_coding_system (retval);
1835 (Qbad_variable, Qwarning,
1836 "Invalid `default-buffer-file-coding-system', set to nil");
1837 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1841 retval = Fget_coding_system (Qraw_text);
1849 mask = postprocess_iso2022_mask (mask);
1851 /* Look through the coding categories by priority and find
1852 the first one that is allowed. */
1853 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1855 cat = fcd->coding_category_by_priority[i];
1856 if ((mask & (1 << cat)) &&
1857 !NILP (fcd->coding_category_system[cat]))
1861 return fcd->coding_category_system[cat];
1863 return Fget_coding_system (Qraw_text);
1867 /* Given a seekable read stream and potential coding system and EOL type
1868 as specified, do any autodetection that is called for. If the
1869 coding system and/or EOL type are not `autodetect', they will be left
1870 alone; but this function will never return an autodetect coding system
1873 This function does not automatically fetch subsidiary coding systems;
1874 that should be unnecessary with the explicit eol-type argument. */
1876 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1879 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1880 eol_type_t *eol_type_in_out)
1882 struct detection_state decst;
1884 if (*eol_type_in_out == EOL_AUTODETECT)
1885 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1888 decst.eol_type = *eol_type_in_out;
1891 /* If autodetection is called for, do it now. */
1892 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1893 || *eol_type_in_out == EOL_AUTODETECT)
1896 Lisp_Object coding_system = Qnil;
1898 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1901 /* Look for initial "-*-"; mode line prefix */
1903 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1908 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1910 Extbyte *local_vars_beg = p + 3;
1911 /* Look for final "-*-"; mode line suffix */
1912 for (p = local_vars_beg,
1913 scan_end = buf + nread - LENGTH ("-*-");
1918 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1920 Extbyte *suffix = p;
1921 /* Look for "coding:" */
1922 for (p = local_vars_beg,
1923 scan_end = suffix - LENGTH ("coding:?");
1926 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1927 && (p == local_vars_beg
1928 || (*(p-1) == ' ' ||
1934 p += LENGTH ("coding:");
1935 while (*p == ' ' || *p == '\t') p++;
1937 /* Get coding system name */
1938 save = *suffix; *suffix = '\0';
1939 /* Characters valid in a MIME charset name (rfc 1521),
1940 and in a Lisp symbol name. */
1941 n = strspn ( (char *) p,
1942 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1943 "abcdefghijklmnopqrstuvwxyz"
1949 save = p[n]; p[n] = '\0';
1951 Ffind_coding_system (intern ((char *) p));
1961 if (NILP (coding_system))
1964 if (detect_coding_type (&decst, buf, nread,
1965 XCODING_SYSTEM_TYPE (*codesys_in_out)
1966 != CODESYS_AUTODETECT))
1968 nread = Lstream_read (stream, buf, sizeof (buf));
1974 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1975 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1978 if (detect_coding_type (&decst, buf, nread, 1))
1980 nread = Lstream_read (stream, buf, sizeof (buf));
1986 *eol_type_in_out = decst.eol_type;
1987 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1989 if (NILP (coding_system))
1990 *codesys_in_out = coding_system_from_mask (decst.mask);
1992 *codesys_in_out = coding_system;
1996 /* If we absolutely can't determine the EOL type, just assume LF. */
1997 if (*eol_type_in_out == EOL_AUTODETECT)
1998 *eol_type_in_out = EOL_LF;
2000 Lstream_rewind (stream);
2003 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2004 Detect coding system of the text in the region between START and END.
2005 Return a list of possible coding systems ordered by priority.
2006 If only ASCII characters are found, return 'undecided or one of
2007 its subsidiary coding systems according to a detected end-of-line
2008 type. Optional arg BUFFER defaults to the current buffer.
2010 (start, end, buffer))
2012 Lisp_Object val = Qnil;
2013 struct buffer *buf = decode_buffer (buffer, 0);
2015 Lisp_Object instream, lb_instream;
2016 Lstream *istr, *lb_istr;
2017 struct detection_state decst;
2018 struct gcpro gcpro1, gcpro2;
2020 get_buffer_range_char (buf, start, end, &b, &e, 0);
2021 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2022 lb_istr = XLSTREAM (lb_instream);
2023 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2024 istr = XLSTREAM (instream);
2025 GCPRO2 (instream, lb_instream);
2027 decst.eol_type = EOL_AUTODETECT;
2031 Extbyte random_buffer[4096];
2032 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2036 if (detect_coding_type (&decst, random_buffer, nread, 0))
2040 if (decst.mask == ~0)
2041 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2049 decst.mask = postprocess_iso2022_mask (decst.mask);
2051 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2053 int sys = fcd->coding_category_by_priority[i];
2054 if (decst.mask & (1 << sys))
2056 Lisp_Object codesys = fcd->coding_category_system[sys];
2057 if (!NILP (codesys))
2058 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2059 val = Fcons (codesys, val);
2063 Lstream_close (istr);
2065 Lstream_delete (istr);
2066 Lstream_delete (lb_istr);
2071 /************************************************************************/
2072 /* Converting to internal Mule format ("decoding") */
2073 /************************************************************************/
2075 /* A decoding stream is a stream used for decoding text (i.e.
2076 converting from some external format to internal format).
2077 The decoding-stream object keeps track of the actual coding
2078 stream, the stream that is at the other end, and data that
2079 needs to be persistent across the lifetime of the stream. */
2081 /* Handle the EOL stuff related to just-read-in character C.
2082 EOL_TYPE is the EOL type of the coding stream.
2083 FLAGS is the current value of FLAGS in the coding stream, and may
2084 be modified by this macro. (The macro only looks at the
2085 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2086 bytes are to be written. You need to also define a local goto
2087 label "label_continue_loop" that is at the end of the main
2088 character-reading loop.
2090 If C is a CR character, then this macro handles it entirely and
2091 jumps to label_continue_loop. Otherwise, this macro does not add
2092 anything to DST, and continues normally. You should continue
2093 processing C normally after this macro. */
2095 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2099 if (eol_type == EOL_CR) \
2100 Dynarr_add (dst, '\n'); \
2101 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2102 Dynarr_add (dst, c); \
2104 flags |= CODING_STATE_CR; \
2105 goto label_continue_loop; \
2107 else if (flags & CODING_STATE_CR) \
2108 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2110 Dynarr_add (dst, '\r'); \
2111 flags &= ~CODING_STATE_CR; \
2115 /* C should be a binary character in the range 0 - 255; convert
2116 to internal format and add to Dynarr DST. */
2119 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2121 if (BYTE_ASCII_P (c)) \
2122 Dynarr_add (dst, c); \
2125 Dynarr_add (dst, (c >> 6) | 0xc0); \
2126 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2130 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2132 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2136 Dynarr_add (dst, c);
2138 else if ( c <= 0x7ff )
2140 Dynarr_add (dst, (c >> 6) | 0xc0);
2141 Dynarr_add (dst, (c & 0x3f) | 0x80);
2143 else if ( c <= 0xffff )
2145 Dynarr_add (dst, (c >> 12) | 0xe0);
2146 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2147 Dynarr_add (dst, (c & 0x3f) | 0x80);
2149 else if ( c <= 0x1fffff )
2151 Dynarr_add (dst, (c >> 18) | 0xf0);
2152 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2153 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2154 Dynarr_add (dst, (c & 0x3f) | 0x80);
2156 else if ( c <= 0x3ffffff )
2158 Dynarr_add (dst, (c >> 24) | 0xf8);
2159 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2160 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2161 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2162 Dynarr_add (dst, (c & 0x3f) | 0x80);
2166 Dynarr_add (dst, (c >> 30) | 0xfc);
2167 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2168 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2169 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2170 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2171 Dynarr_add (dst, (c & 0x3f) | 0x80);
2175 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2177 if (BYTE_ASCII_P (c)) \
2178 Dynarr_add (dst, c); \
2179 else if (BYTE_C1_P (c)) \
2181 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2182 Dynarr_add (dst, c + 0x20); \
2186 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2187 Dynarr_add (dst, c); \
2192 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2196 DECODE_ADD_BINARY_CHAR (ch, dst); \
2201 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2203 if (flags & CODING_STATE_END) \
2205 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2206 if (flags & CODING_STATE_CR) \
2207 Dynarr_add (dst, '\r'); \
2211 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2213 struct decoding_stream
2215 /* Coding system that governs the conversion. */
2216 Lisp_Coding_System *codesys;
2218 /* Stream that we read the encoded data from or
2219 write the decoded data to. */
2222 /* If we are reading, then we can return only a fixed amount of
2223 data, so if the conversion resulted in too much data, we store it
2224 here for retrieval the next time around. */
2225 unsigned_char_dynarr *runoff;
2227 /* FLAGS holds flags indicating the current state of the decoding.
2228 Some of these flags are dependent on the coding system. */
2231 /* CPOS holds a partially built-up code-point of character. */
2234 /* EOL_TYPE specifies the type of end-of-line conversion that
2235 currently applies. We need to keep this separate from the
2236 EOL type stored in CODESYS because the latter might indicate
2237 automatic EOL-type detection while the former will always
2238 indicate a particular EOL type. */
2239 eol_type_t eol_type;
2241 /* Additional ISO2022 information. We define the structure above
2242 because it's also needed by the detection routines. */
2243 struct iso2022_decoder iso2022;
2245 /* Additional information (the state of the running CCL program)
2246 used by the CCL decoder. */
2247 struct ccl_program ccl;
2249 /* counter for UTF-8 or UCS-4 */
2250 unsigned char counter;
2253 unsigned char er_counter;
2254 unsigned char er_buf[16];
2256 unsigned combined_char_count;
2257 Emchar combined_chars[16];
2258 Lisp_Object combining_table;
2260 struct detection_state decst;
2264 extern Lisp_Object Qcomposition;
2267 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
2269 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
2271 if ( str->er_counter > 0)
2273 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2274 str->er_counter = 0;
2278 void decode_add_er_char (struct decoding_stream *str, Emchar character,
2279 unsigned_char_dynarr* dst);
2281 decode_add_er_char (struct decoding_stream *str, Emchar c,
2282 unsigned_char_dynarr* dst)
2284 if (str->er_counter == 0)
2286 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
2289 str->er_buf[0] = '&';
2293 DECODE_ADD_UCS_CHAR (c, dst);
2297 Lisp_Object string = make_string (str->er_buf,
2299 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
2306 while (!NILP (rest))
2310 if (NILP (ccs = Ffind_charset (ccs)))
2325 pat = concat3 (build_string ("^&"),
2326 pat, build_string ("\\([0-9]+\\)$"));
2329 else if (EQ (ret, Qx))
2331 pat = concat3 (build_string ("^&"),
2332 pat, build_string ("\\([0-9a-f]+\\)$"));
2335 else if (EQ (ret, QX))
2337 pat = concat3 (build_string ("^&"),
2338 pat, build_string ("\\([0-9A-F]+\\)$"));
2344 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
2347 = XINT (Fstring_to_number
2348 (Fsubstring (string,
2349 Fmatch_beginning (make_int (1)),
2350 Fmatch_end (make_int (1))),
2353 DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst);
2358 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
2359 string, Qnil, Qnil)))
2362 = XINT (Fstring_to_number
2363 (Fsubstring (string,
2364 Fmatch_beginning (make_int (1)),
2365 Fmatch_end (make_int (1))),
2368 DECODE_ADD_UCS_CHAR (code, dst);
2372 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2373 Dynarr_add (dst, ';');
2376 str->er_counter = 0;
2378 else if ( (str->er_counter >= 16) || (c >= 0x7F) )
2380 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2381 str->er_counter = 0;
2382 DECODE_ADD_UCS_CHAR (c, dst);
2385 str->er_buf[str->er_counter++] = c;
2389 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2391 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2395 for (i = 0; i < str->combined_char_count; i++)
2396 decode_add_er_char (str, str->combined_chars[i], dst);
2397 str->combined_char_count = 0;
2398 str->combining_table = Qnil;
2401 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
2402 unsigned_char_dynarr* dst);
2404 COMPOSE_ADD_CHAR (struct decoding_stream *str,
2405 Emchar character, unsigned_char_dynarr* dst)
2407 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2408 decode_add_er_char (str, character, dst);
2409 else if (!CONSP (str->combining_table))
2412 = Fget_char_attribute (make_char (character), Qcomposition, Qnil);
2415 decode_add_er_char (str, character, dst);
2418 str->combined_chars[0] = character;
2419 str->combined_char_count = 1;
2420 str->combining_table = ret;
2425 Lisp_Object ret = Fcdr (Fassq (make_char (character), str->combining_table));
2429 Emchar char2 = XCHARVAL (ret);
2430 ret = Fget_char_attribute (make_char (character), Qcomposition, Qnil);
2433 decode_add_er_char (str, character, dst);
2434 str->combined_char_count = 0;
2435 str->combining_table = Qnil;
2439 str->combined_chars[0] = char2;
2440 str->combined_char_count = 1;
2441 str->combining_table = ret;
2446 COMPOSE_FLUSH_CHARS (str, dst);
2447 decode_add_er_char (str, character, dst);
2451 #else /* not UTF2000 */
2452 #define COMPOSE_FLUSH_CHARS(str, dst)
2453 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2454 #endif /* UTF2000 */
2456 static ssize_t decoding_reader (Lstream *stream,
2457 unsigned char *data, size_t size);
2458 static ssize_t decoding_writer (Lstream *stream,
2459 const unsigned char *data, size_t size);
2460 static int decoding_rewinder (Lstream *stream);
2461 static int decoding_seekable_p (Lstream *stream);
2462 static int decoding_flusher (Lstream *stream);
2463 static int decoding_closer (Lstream *stream);
2465 static Lisp_Object decoding_marker (Lisp_Object stream);
2467 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2468 sizeof (struct decoding_stream));
2471 decoding_marker (Lisp_Object stream)
2473 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2474 Lisp_Object str_obj;
2476 /* We do not need to mark the coding systems or charsets stored
2477 within the stream because they are stored in a global list
2478 and automatically marked. */
2480 XSETLSTREAM (str_obj, str);
2481 mark_object (str_obj);
2482 if (str->imp->marker)
2483 return (str->imp->marker) (str_obj);
2488 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2489 so we read data from the other end, decode it, and store it into DATA. */
2492 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2494 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2495 unsigned char *orig_data = data;
2497 int error_occurred = 0;
2499 /* We need to interface to mule_decode(), which expects to take some
2500 amount of data and store the result into a Dynarr. We have
2501 mule_decode() store into str->runoff, and take data from there
2504 /* We loop until we have enough data, reading chunks from the other
2505 end and decoding it. */
2508 /* Take data from the runoff if we can. Make sure to take at
2509 most SIZE bytes, and delete the data from the runoff. */
2510 if (Dynarr_length (str->runoff) > 0)
2512 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2513 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2514 Dynarr_delete_many (str->runoff, 0, chunk);
2520 break; /* No more room for data */
2522 if (str->flags & CODING_STATE_END)
2523 /* This means that on the previous iteration, we hit the EOF on
2524 the other end. We loop once more so that mule_decode() can
2525 output any final stuff it may be holding, or any "go back
2526 to a sane state" escape sequences. (This latter makes sense
2527 during encoding.) */
2530 /* Exhausted the runoff, so get some more. DATA has at least
2531 SIZE bytes left of storage in it, so it's OK to read directly
2532 into it. (We'll be overwriting above, after we've decoded it
2533 into the runoff.) */
2534 read_size = Lstream_read (str->other_end, data, size);
2541 /* There might be some more end data produced in the translation.
2542 See the comment above. */
2543 str->flags |= CODING_STATE_END;
2544 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2547 if (data - orig_data == 0)
2548 return error_occurred ? -1 : 0;
2550 return data - orig_data;
2554 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2556 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2559 /* Decode all our data into the runoff, and then attempt to write
2560 it all out to the other end. Remove whatever chunk we succeeded
2562 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2563 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2564 Dynarr_length (str->runoff));
2566 Dynarr_delete_many (str->runoff, 0, retval);
2567 /* Do NOT return retval. The return value indicates how much
2568 of the incoming data was written, not how many bytes were
2574 reset_decoding_stream (struct decoding_stream *str)
2577 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2579 Lisp_Object coding_system;
2580 XSETCODING_SYSTEM (coding_system, str->codesys);
2581 reset_iso2022 (coding_system, &str->iso2022);
2583 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2585 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2590 str->er_counter = 0;
2591 str->combined_char_count = 0;
2592 str->combining_table = Qnil;
2594 str->flags = str->cpos = 0;
2598 decoding_rewinder (Lstream *stream)
2600 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2601 reset_decoding_stream (str);
2602 Dynarr_reset (str->runoff);
2603 return Lstream_rewind (str->other_end);
2607 decoding_seekable_p (Lstream *stream)
2609 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2610 return Lstream_seekable_p (str->other_end);
2614 decoding_flusher (Lstream *stream)
2616 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2617 return Lstream_flush (str->other_end);
2621 decoding_closer (Lstream *stream)
2623 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2624 if (stream->flags & LSTREAM_FL_WRITE)
2626 str->flags |= CODING_STATE_END;
2627 decoding_writer (stream, 0, 0);
2629 Dynarr_free (str->runoff);
2631 #ifdef ENABLE_COMPOSITE_CHARS
2632 if (str->iso2022.composite_chars)
2633 Dynarr_free (str->iso2022.composite_chars);
2636 return Lstream_close (str->other_end);
2640 decoding_stream_coding_system (Lstream *stream)
2642 Lisp_Object coding_system;
2643 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2645 XSETCODING_SYSTEM (coding_system, str->codesys);
2646 return subsidiary_coding_system (coding_system, str->eol_type);
2650 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2652 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2653 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2655 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2656 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2657 reset_decoding_stream (str);
2660 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2661 stream for writing, no automatic code detection will be performed.
2662 The reason for this is that automatic code detection requires a
2663 seekable input. Things will also fail if you open a decoding
2664 stream for reading using a non-fully-specified coding system and
2665 a non-seekable input stream. */
2668 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2671 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2672 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2676 str->other_end = stream;
2677 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2678 str->eol_type = EOL_AUTODETECT;
2679 if (!strcmp (mode, "r")
2680 && Lstream_seekable_p (stream))
2681 /* We can determine the coding system now. */
2682 determine_real_coding_system (stream, &codesys, &str->eol_type);
2683 set_decoding_stream_coding_system (lstr, codesys);
2684 str->decst.eol_type = str->eol_type;
2685 str->decst.mask = ~0;
2686 XSETLSTREAM (obj, lstr);
2691 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2693 return make_decoding_stream_1 (stream, codesys, "r");
2697 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2699 return make_decoding_stream_1 (stream, codesys, "w");
2702 /* Note: the decode_coding_* functions all take the same
2703 arguments as mule_decode(), which is to say some SRC data of
2704 size N, which is to be stored into dynamic array DST.
2705 DECODING is the stream within which the decoding is
2706 taking place, but no data is actually read from or
2707 written to that stream; that is handled in decoding_reader()
2708 or decoding_writer(). This allows the same functions to
2709 be used for both reading and writing. */
2712 mule_decode (Lstream *decoding, const Extbyte *src,
2713 unsigned_char_dynarr *dst, size_t n)
2715 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2717 /* If necessary, do encoding-detection now. We do this when
2718 we're a writing stream or a non-seekable reading stream,
2719 meaning that we can't just process the whole input,
2720 rewind, and start over. */
2722 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2723 str->eol_type == EOL_AUTODETECT)
2725 Lisp_Object codesys;
2727 XSETCODING_SYSTEM (codesys, str->codesys);
2728 detect_coding_type (&str->decst, src, n,
2729 CODING_SYSTEM_TYPE (str->codesys) !=
2730 CODESYS_AUTODETECT);
2731 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2732 str->decst.mask != ~0)
2733 /* #### This is cheesy. What we really ought to do is
2734 buffer up a certain amount of data so as to get a
2735 less random result. */
2736 codesys = coding_system_from_mask (str->decst.mask);
2737 str->eol_type = str->decst.eol_type;
2738 if (XCODING_SYSTEM (codesys) != str->codesys)
2740 /* Preserve the CODING_STATE_END flag in case it was set.
2741 If we erase it, bad things might happen. */
2742 int was_end = str->flags & CODING_STATE_END;
2743 set_decoding_stream_coding_system (decoding, codesys);
2745 str->flags |= CODING_STATE_END;
2749 switch (CODING_SYSTEM_TYPE (str->codesys))
2752 case CODESYS_INTERNAL:
2753 Dynarr_add_many (dst, src, n);
2756 case CODESYS_AUTODETECT:
2757 /* If we got this far and still haven't decided on the coding
2758 system, then do no conversion. */
2759 case CODESYS_NO_CONVERSION:
2760 decode_coding_no_conversion (decoding, src, dst, n);
2763 case CODESYS_SHIFT_JIS:
2764 decode_coding_sjis (decoding, src, dst, n);
2767 decode_coding_big5 (decoding, src, dst, n);
2770 decode_coding_ucs4 (decoding, src, dst, n);
2773 decode_coding_utf8 (decoding, src, dst, n);
2776 str->ccl.last_block = str->flags & CODING_STATE_END;
2777 /* When applying ccl program to stream, MUST NOT set NULL
2779 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2780 dst, n, 0, CCL_MODE_DECODING);
2782 case CODESYS_ISO2022:
2783 decode_coding_iso2022 (decoding, src, dst, n);
2791 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2792 Decode the text between START and END which is encoded in CODING-SYSTEM.
2793 This is useful if you've read in encoded text from a file without decoding
2794 it (e.g. you read in a JIS-formatted file but used the `binary' or
2795 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2796 Return length of decoded text.
2797 BUFFER defaults to the current buffer if unspecified.
2799 (start, end, coding_system, buffer))
2802 struct buffer *buf = decode_buffer (buffer, 0);
2803 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2804 Lstream *istr, *ostr;
2805 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2807 get_buffer_range_char (buf, start, end, &b, &e, 0);
2809 barf_if_buffer_read_only (buf, b, e);
2811 coding_system = Fget_coding_system (coding_system);
2812 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2813 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2814 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2816 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2817 Fget_coding_system (Qbinary));
2818 istr = XLSTREAM (instream);
2819 ostr = XLSTREAM (outstream);
2820 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2822 /* The chain of streams looks like this:
2824 [BUFFER] <----- send through
2825 ------> [ENCODE AS BINARY]
2826 ------> [DECODE AS SPECIFIED]
2832 char tempbuf[1024]; /* some random amount */
2833 Bufpos newpos, even_newer_pos;
2834 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2835 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2839 newpos = lisp_buffer_stream_startpos (istr);
2840 Lstream_write (ostr, tempbuf, size_in_bytes);
2841 even_newer_pos = lisp_buffer_stream_startpos (istr);
2842 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2845 Lstream_close (istr);
2846 Lstream_close (ostr);
2848 Lstream_delete (istr);
2849 Lstream_delete (ostr);
2850 Lstream_delete (XLSTREAM (de_outstream));
2851 Lstream_delete (XLSTREAM (lb_outstream));
2856 /************************************************************************/
2857 /* Converting to an external encoding ("encoding") */
2858 /************************************************************************/
2860 /* An encoding stream is an output stream. When you create the
2861 stream, you specify the coding system that governs the encoding
2862 and another stream that the resulting encoded data is to be
2863 sent to, and then start sending data to it. */
2865 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2867 struct encoding_stream
2869 /* Coding system that governs the conversion. */
2870 Lisp_Coding_System *codesys;
2872 /* Stream that we read the encoded data from or
2873 write the decoded data to. */
2876 /* If we are reading, then we can return only a fixed amount of
2877 data, so if the conversion resulted in too much data, we store it
2878 here for retrieval the next time around. */
2879 unsigned_char_dynarr *runoff;
2881 /* FLAGS holds flags indicating the current state of the encoding.
2882 Some of these flags are dependent on the coding system. */
2885 /* CH holds a partially built-up character. Since we only deal
2886 with one- and two-byte characters at the moment, we only use
2887 this to store the first byte of a two-byte character. */
2890 /* Additional information used by the ISO2022 encoder. */
2893 /* CHARSET holds the character sets currently assigned to the G0
2894 through G3 registers. It is initialized from the array
2895 INITIAL_CHARSET in CODESYS. */
2896 Lisp_Object charset[4];
2898 /* Which registers are currently invoked into the left (GL) and
2899 right (GR) halves of the 8-bit encoding space? */
2900 int register_left, register_right;
2902 /* Whether we need to explicitly designate the charset in the
2903 G? register before using it. It is initialized from the
2904 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2905 unsigned char force_charset_on_output[4];
2907 /* Other state variables that need to be preserved across
2909 Lisp_Object current_charset;
2911 int current_char_boundary;
2914 void (*encode_char) (struct encoding_stream *str, Emchar c,
2915 unsigned_char_dynarr *dst, unsigned int *flags);
2916 void (*finish) (struct encoding_stream *str,
2917 unsigned_char_dynarr *dst, unsigned int *flags);
2919 /* Additional information (the state of the running CCL program)
2920 used by the CCL encoder. */
2921 struct ccl_program ccl;
2925 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2926 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2928 static int encoding_rewinder (Lstream *stream);
2929 static int encoding_seekable_p (Lstream *stream);
2930 static int encoding_flusher (Lstream *stream);
2931 static int encoding_closer (Lstream *stream);
2933 static Lisp_Object encoding_marker (Lisp_Object stream);
2935 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2936 sizeof (struct encoding_stream));
2939 encoding_marker (Lisp_Object stream)
2941 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2942 Lisp_Object str_obj;
2944 /* We do not need to mark the coding systems or charsets stored
2945 within the stream because they are stored in a global list
2946 and automatically marked. */
2948 XSETLSTREAM (str_obj, str);
2949 mark_object (str_obj);
2950 if (str->imp->marker)
2951 return (str->imp->marker) (str_obj);
2956 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2957 so we read data from the other end, encode it, and store it into DATA. */
2960 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2962 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2963 unsigned char *orig_data = data;
2965 int error_occurred = 0;
2967 /* We need to interface to mule_encode(), which expects to take some
2968 amount of data and store the result into a Dynarr. We have
2969 mule_encode() store into str->runoff, and take data from there
2972 /* We loop until we have enough data, reading chunks from the other
2973 end and encoding it. */
2976 /* Take data from the runoff if we can. Make sure to take at
2977 most SIZE bytes, and delete the data from the runoff. */
2978 if (Dynarr_length (str->runoff) > 0)
2980 int chunk = min ((int) size, Dynarr_length (str->runoff));
2981 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2982 Dynarr_delete_many (str->runoff, 0, chunk);
2988 break; /* No more room for data */
2990 if (str->flags & CODING_STATE_END)
2991 /* This means that on the previous iteration, we hit the EOF on
2992 the other end. We loop once more so that mule_encode() can
2993 output any final stuff it may be holding, or any "go back
2994 to a sane state" escape sequences. (This latter makes sense
2995 during encoding.) */
2998 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2999 left of storage in it, so it's OK to read directly into it.
3000 (We'll be overwriting above, after we've encoded it into the
3002 read_size = Lstream_read (str->other_end, data, size);
3009 /* There might be some more end data produced in the translation.
3010 See the comment above. */
3011 str->flags |= CODING_STATE_END;
3012 mule_encode (stream, data, str->runoff, read_size);
3015 if (data == orig_data)
3016 return error_occurred ? -1 : 0;
3018 return data - orig_data;
3022 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
3024 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3027 /* Encode all our data into the runoff, and then attempt to write
3028 it all out to the other end. Remove whatever chunk we succeeded
3030 mule_encode (stream, data, str->runoff, size);
3031 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
3032 Dynarr_length (str->runoff));
3034 Dynarr_delete_many (str->runoff, 0, retval);
3035 /* Do NOT return retval. The return value indicates how much
3036 of the incoming data was written, not how many bytes were
3042 reset_encoding_stream (struct encoding_stream *str)
3045 switch (CODING_SYSTEM_TYPE (str->codesys))
3047 case CODESYS_ISO2022:
3051 str->encode_char = &char_encode_iso2022;
3052 str->finish = &char_finish_iso2022;
3053 for (i = 0; i < 4; i++)
3055 str->iso2022.charset[i] =
3056 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
3057 str->iso2022.force_charset_on_output[i] =
3058 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
3060 str->iso2022.register_left = 0;
3061 str->iso2022.register_right = 1;
3062 str->iso2022.current_charset = Qnil;
3063 str->iso2022.current_half = 0;
3067 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
3070 str->encode_char = &char_encode_utf8;
3071 str->finish = &char_finish_utf8;
3074 str->encode_char = &char_encode_ucs4;
3075 str->finish = &char_finish_ucs4;
3077 case CODESYS_SHIFT_JIS:
3078 str->encode_char = &char_encode_shift_jis;
3079 str->finish = &char_finish_shift_jis;
3082 str->encode_char = &char_encode_big5;
3083 str->finish = &char_finish_big5;
3089 str->iso2022.current_char_boundary = 0;
3090 str->flags = str->ch = 0;
3094 encoding_rewinder (Lstream *stream)
3096 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3097 reset_encoding_stream (str);
3098 Dynarr_reset (str->runoff);
3099 return Lstream_rewind (str->other_end);
3103 encoding_seekable_p (Lstream *stream)
3105 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3106 return Lstream_seekable_p (str->other_end);
3110 encoding_flusher (Lstream *stream)
3112 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3113 return Lstream_flush (str->other_end);
3117 encoding_closer (Lstream *stream)
3119 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3120 if (stream->flags & LSTREAM_FL_WRITE)
3122 str->flags |= CODING_STATE_END;
3123 encoding_writer (stream, 0, 0);
3125 Dynarr_free (str->runoff);
3126 return Lstream_close (str->other_end);
3130 encoding_stream_coding_system (Lstream *stream)
3132 Lisp_Object coding_system;
3133 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3135 XSETCODING_SYSTEM (coding_system, str->codesys);
3136 return coding_system;
3140 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3142 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3143 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3145 reset_encoding_stream (str);
3149 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3152 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3153 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3157 str->runoff = Dynarr_new (unsigned_char);
3158 str->other_end = stream;
3159 set_encoding_stream_coding_system (lstr, codesys);
3160 XSETLSTREAM (obj, lstr);
3165 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3167 return make_encoding_stream_1 (stream, codesys, "r");
3171 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3173 return make_encoding_stream_1 (stream, codesys, "w");
3176 /* Convert N bytes of internally-formatted data stored in SRC to an
3177 external format, according to the encoding stream ENCODING.
3178 Store the encoded data into DST. */
3181 mule_encode (Lstream *encoding, const Bufbyte *src,
3182 unsigned_char_dynarr *dst, size_t n)
3184 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3186 switch (CODING_SYSTEM_TYPE (str->codesys))
3189 case CODESYS_INTERNAL:
3190 Dynarr_add_many (dst, src, n);
3193 case CODESYS_AUTODETECT:
3194 /* If we got this far and still haven't decided on the coding
3195 system, then do no conversion. */
3196 case CODESYS_NO_CONVERSION:
3197 encode_coding_no_conversion (encoding, src, dst, n);
3201 str->ccl.last_block = str->flags & CODING_STATE_END;
3202 /* When applying ccl program to stream, MUST NOT set NULL
3204 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3205 dst, n, 0, CCL_MODE_ENCODING);
3209 text_encode_generic (encoding, src, dst, n);
3213 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3214 Encode the text between START and END using CODING-SYSTEM.
3215 This will, for example, convert Japanese characters into stuff such as
3216 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3217 text. BUFFER defaults to the current buffer if unspecified.
3219 (start, end, coding_system, buffer))
3222 struct buffer *buf = decode_buffer (buffer, 0);
3223 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3224 Lstream *istr, *ostr;
3225 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3227 get_buffer_range_char (buf, start, end, &b, &e, 0);
3229 barf_if_buffer_read_only (buf, b, e);
3231 coding_system = Fget_coding_system (coding_system);
3232 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3233 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3234 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3235 Fget_coding_system (Qbinary));
3236 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3238 istr = XLSTREAM (instream);
3239 ostr = XLSTREAM (outstream);
3240 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3241 /* The chain of streams looks like this:
3243 [BUFFER] <----- send through
3244 ------> [ENCODE AS SPECIFIED]
3245 ------> [DECODE AS BINARY]
3250 char tempbuf[1024]; /* some random amount */
3251 Bufpos newpos, even_newer_pos;
3252 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3253 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3257 newpos = lisp_buffer_stream_startpos (istr);
3258 Lstream_write (ostr, tempbuf, size_in_bytes);
3259 even_newer_pos = lisp_buffer_stream_startpos (istr);
3260 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3266 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3267 Lstream_close (istr);
3268 Lstream_close (ostr);
3270 Lstream_delete (istr);
3271 Lstream_delete (ostr);
3272 Lstream_delete (XLSTREAM (de_outstream));
3273 Lstream_delete (XLSTREAM (lb_outstream));
3274 return make_int (retlen);
3281 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3282 unsigned_char_dynarr *dst, size_t n)
3285 unsigned char char_boundary;
3286 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3287 unsigned int flags = str->flags;
3288 Emchar ch = str->ch;
3290 char_boundary = str->iso2022.current_char_boundary;
3296 if (char_boundary == 0)
3324 (*str->encode_char) (str, c, dst, &flags);
3326 else if (char_boundary == 1)
3328 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3334 ch = (ch << 6) | (c & 0x3f);
3339 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3341 (*str->finish) (str, dst, &flags);
3346 str->iso2022.current_char_boundary = char_boundary;
3350 /************************************************************************/
3351 /* Shift-JIS methods */
3352 /************************************************************************/
3354 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3355 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3356 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3357 encoded by "position-code + 0x80". A character of JISX0208
3358 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3359 position-codes are divided and shifted so that it fit in the range
3362 --- CODE RANGE of Shift-JIS ---
3363 (character set) (range)
3365 JISX0201-Kana 0xA0 .. 0xDF
3366 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3367 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3368 -------------------------------
3372 /* Is this the first byte of a Shift-JIS two-byte char? */
3374 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3375 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3377 /* Is this the second byte of a Shift-JIS two-byte char? */
3379 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3380 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3382 #define BYTE_SJIS_KATAKANA_P(c) \
3383 ((c) >= 0xA1 && (c) <= 0xDF)
3386 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3390 unsigned char c = *(unsigned char *)src++;
3391 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3393 if (st->shift_jis.in_second_byte)
3395 st->shift_jis.in_second_byte = 0;
3399 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3400 st->shift_jis.in_second_byte = 1;
3402 return CODING_CATEGORY_SHIFT_JIS_MASK;
3405 /* Convert Shift-JIS data to internal format. */
3408 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3409 unsigned_char_dynarr *dst, size_t n)
3411 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3412 unsigned int flags = str->flags;
3413 unsigned int cpos = str->cpos;
3414 eol_type_t eol_type = str->eol_type;
3418 unsigned char c = *(unsigned char *)src++;
3422 /* Previous character was first byte of Shift-JIS Kanji char. */
3423 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3425 unsigned char e1, e2;
3427 DECODE_SJIS (cpos, c, e1, e2);
3429 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3433 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3434 Dynarr_add (dst, e1);
3435 Dynarr_add (dst, e2);
3440 DECODE_ADD_BINARY_CHAR (cpos, dst);
3441 DECODE_ADD_BINARY_CHAR (c, dst);
3447 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3448 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3450 else if (BYTE_SJIS_KATAKANA_P (c))
3453 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3456 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3457 Dynarr_add (dst, c);
3462 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3466 DECODE_ADD_BINARY_CHAR (c, dst);
3468 label_continue_loop:;
3471 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3477 /* Convert internal character representation to Shift_JIS. */
3480 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3481 unsigned_char_dynarr *dst, unsigned int *flags)
3483 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3487 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3488 Dynarr_add (dst, '\r');
3489 if (eol_type != EOL_CR)
3490 Dynarr_add (dst, ch);
3494 unsigned int s1, s2;
3496 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3498 if (code_point >= 0)
3499 Dynarr_add (dst, code_point);
3500 else if ((code_point
3501 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3504 ENCODE_SJIS ((code_point >> 8) | 0x80,
3505 (code_point & 0xFF) | 0x80, s1, s2);
3506 Dynarr_add (dst, s1);
3507 Dynarr_add (dst, s2);
3509 else if ((code_point
3510 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3512 Dynarr_add (dst, code_point | 0x80);
3513 else if ((code_point
3514 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3517 ENCODE_SJIS ((code_point >> 8) | 0x80,
3518 (code_point & 0xFF) | 0x80, s1, s2);
3519 Dynarr_add (dst, s1);
3520 Dynarr_add (dst, s2);
3522 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3524 Dynarr_add (dst, code_point);
3526 Dynarr_add (dst, '?');
3528 Lisp_Object charset;
3529 unsigned int c1, c2;
3531 BREAKUP_CHAR (ch, charset, c1, c2);
3533 if (EQ(charset, Vcharset_katakana_jisx0201))
3535 Dynarr_add (dst, c1 | 0x80);
3539 Dynarr_add (dst, c1);
3541 else if (EQ(charset, Vcharset_japanese_jisx0208))
3543 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3544 Dynarr_add (dst, s1);
3545 Dynarr_add (dst, s2);
3548 Dynarr_add (dst, '?');
3554 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3555 unsigned int *flags)
3559 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3560 Decode a JISX0208 character of Shift-JIS coding-system.
3561 CODE is the character code in Shift-JIS as a cons of type bytes.
3562 Return the corresponding character.
3566 unsigned char c1, c2, s1, s2;
3569 CHECK_INT (XCAR (code));
3570 CHECK_INT (XCDR (code));
3571 s1 = XINT (XCAR (code));
3572 s2 = XINT (XCDR (code));
3573 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3574 BYTE_SJIS_TWO_BYTE_2_P (s2))
3576 DECODE_SJIS (s1, s2, c1, c2);
3577 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3578 c1 & 0x7F, c2 & 0x7F));
3584 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3585 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3586 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3590 Lisp_Object charset;
3593 CHECK_CHAR_COERCE_INT (character);
3594 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3595 if (EQ (charset, Vcharset_japanese_jisx0208))
3597 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3598 return Fcons (make_int (s1), make_int (s2));
3605 /************************************************************************/
3607 /************************************************************************/
3609 /* BIG5 is a coding system encoding two character sets: ASCII and
3610 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3611 character set and is encoded in two-byte.
3613 --- CODE RANGE of BIG5 ---
3614 (character set) (range)
3616 Big5 (1st byte) 0xA1 .. 0xFE
3617 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3618 --------------------------
3620 Since the number of characters in Big5 is larger than maximum
3621 characters in Emacs' charset (96x96), it can't be handled as one
3622 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3623 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3624 contains frequently used characters and the latter contains less
3625 frequently used characters. */
3628 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3629 ((c) >= 0x81 && (c) <= 0xFE)
3631 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3632 ((c) >= 0xA1 && (c) <= 0xFE)
3635 /* Is this the second byte of a Shift-JIS two-byte char? */
3637 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3638 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3640 /* Number of Big5 characters which have the same code in 1st byte. */
3642 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3644 /* Code conversion macros. These are macros because they are used in
3645 inner loops during code conversion.
3647 Note that temporary variables in macros introduce the classic
3648 dynamic-scoping problems with variable names. We use capital-
3649 lettered variables in the assumption that XEmacs does not use
3650 capital letters in variables except in a very formalized way
3653 /* Convert Big5 code (b1, b2) into its internal string representation
3656 /* There is a much simpler way to split the Big5 charset into two.
3657 For the moment I'm going to leave the algorithm as-is because it
3658 claims to separate out the most-used characters into a single
3659 charset, which perhaps will lead to optimizations in various
3662 The way the algorithm works is something like this:
3664 Big5 can be viewed as a 94x157 charset, where the row is
3665 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3666 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3667 the split between low and high column numbers is apparently
3668 meaningless; ascending rows produce less and less frequent chars.
3669 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3670 the first charset, and the upper half (0xC9 .. 0xFE) to the
3671 second. To do the conversion, we convert the character into
3672 a single number where 0 .. 156 is the first row, 157 .. 313
3673 is the second, etc. That way, the characters are ordered by
3674 decreasing frequency. Then we just chop the space in two
3675 and coerce the result into a 94x94 space.
3678 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3680 int B1 = b1, B2 = b2; \
3682 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3686 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3690 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3691 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3693 c1 = I / (0xFF - 0xA1) + 0xA1; \
3694 c2 = I % (0xFF - 0xA1) + 0xA1; \
3697 /* Convert the internal string representation of a Big5 character
3698 (lb, c1, c2) into Big5 code (b1, b2). */
3700 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3702 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3704 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3706 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3708 b1 = I / BIG5_SAME_ROW + 0xA1; \
3709 b2 = I % BIG5_SAME_ROW; \
3710 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3714 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3718 unsigned char c = *(unsigned char *)src++;
3719 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3721 || (c >= 0x80 && c <= 0xA0)
3725 if (st->big5.in_second_byte)
3727 st->big5.in_second_byte = 0;
3728 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3738 st->big5.in_second_byte = 1;
3740 return CODING_CATEGORY_BIG5_MASK;
3743 /* Convert Big5 data to internal format. */
3746 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3747 unsigned_char_dynarr *dst, size_t n)
3749 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3750 unsigned int flags = str->flags;
3751 unsigned int cpos = str->cpos;
3752 eol_type_t eol_type = str->eol_type;
3755 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3756 (decoding)->codesys, 1);
3761 unsigned char c = *(unsigned char *)src++;
3764 /* Previous character was first byte of Big5 char. */
3765 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3768 int code_point = (cpos << 8) | c;
3769 Emchar char_id = decode_defined_char (ccs, code_point);
3772 char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
3773 DECODE_ADD_UCS_CHAR (char_id, dst);
3775 unsigned char b1, b2, b3;
3776 DECODE_BIG5 (cpos, c, b1, b2, b3);
3777 Dynarr_add (dst, b1);
3778 Dynarr_add (dst, b2);
3779 Dynarr_add (dst, b3);
3784 DECODE_ADD_BINARY_CHAR (cpos, dst);
3785 DECODE_ADD_BINARY_CHAR (c, dst);
3791 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3792 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3795 DECODE_ADD_BINARY_CHAR (c, dst);
3797 label_continue_loop:;
3800 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3806 /* Convert internally-formatted data to Big5. */
3809 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3810 unsigned_char_dynarr *dst, unsigned int *flags)
3812 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3816 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3817 Dynarr_add (dst, '\r');
3818 if (eol_type != EOL_CR)
3819 Dynarr_add (dst, ch);
3826 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
3828 if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3829 Dynarr_add (dst, code_point);
3830 else if ((code_point = charset_code_point (ccs, ch)) >= 0)
3832 Dynarr_add (dst, code_point >> 8);
3833 Dynarr_add (dst, code_point & 0xFF);
3835 else if ((code_point
3836 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3838 Dynarr_add (dst, code_point >> 8);
3839 Dynarr_add (dst, code_point & 0xFF);
3841 else if ((code_point
3842 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3845 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3846 + ((code_point & 0xFF) - 33);
3847 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3848 unsigned char b2 = I % BIG5_SAME_ROW;
3850 b2 += b2 < 0x3F ? 0x40 : 0x62;
3851 Dynarr_add (dst, b1);
3852 Dynarr_add (dst, b2);
3854 else if ((code_point
3855 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3858 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3859 + ((code_point & 0xFF) - 33);
3860 unsigned char b1, b2;
3862 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3863 b1 = I / BIG5_SAME_ROW + 0xA1;
3864 b2 = I % BIG5_SAME_ROW;
3865 b2 += b2 < 0x3F ? 0x40 : 0x62;
3866 Dynarr_add (dst, b1);
3867 Dynarr_add (dst, b2);
3870 Dynarr_add (dst, '?');
3877 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3878 unsigned int *flags)
3883 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3884 Decode a Big5 character CODE of BIG5 coding-system.
3885 CODE is the character code in BIG5, a cons of two integers.
3886 Return the corresponding character.
3890 unsigned char c1, c2, b1, b2;
3893 CHECK_INT (XCAR (code));
3894 CHECK_INT (XCDR (code));
3895 b1 = XINT (XCAR (code));
3896 b2 = XINT (XCDR (code));
3897 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3898 BYTE_BIG5_TWO_BYTE_2_P (b2))
3900 Charset_ID leading_byte;
3901 Lisp_Object charset;
3902 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3903 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3904 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3910 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3911 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3912 Return the corresponding character code in Big5.
3916 Lisp_Object charset;
3919 CHECK_CHAR_COERCE_INT (character);
3920 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3921 if (EQ (charset, Vcharset_chinese_big5_1) ||
3922 EQ (charset, Vcharset_chinese_big5_2))
3924 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3926 return Fcons (make_int (b1), make_int (b2));
3933 /************************************************************************/
3935 /************************************************************************/
3938 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3942 unsigned char c = *(unsigned char *)src++;
3943 switch (st->ucs4.in_byte)
3952 st->ucs4.in_byte = 0;
3958 return CODING_CATEGORY_UCS4_MASK;
3962 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3963 unsigned_char_dynarr *dst, size_t n)
3965 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3966 unsigned int flags = str->flags;
3967 unsigned int cpos = str->cpos;
3968 unsigned char counter = str->counter;
3972 unsigned char c = *(unsigned char *)src++;
3980 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3985 cpos = ( cpos << 8 ) | c;
3989 if (counter & CODING_STATE_END)
3990 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3994 str->counter = counter;
3998 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3999 unsigned_char_dynarr *dst, unsigned int *flags)
4001 Dynarr_add (dst, ch >> 24);
4002 Dynarr_add (dst, ch >> 16);
4003 Dynarr_add (dst, ch >> 8);
4004 Dynarr_add (dst, ch );
4008 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4009 unsigned int *flags)
4014 /************************************************************************/
4016 /************************************************************************/
4019 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
4023 unsigned char c = *(unsigned char *)src++;
4024 switch (st->utf8.in_byte)
4027 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4030 st->utf8.in_byte = 5;
4032 st->utf8.in_byte = 4;
4034 st->utf8.in_byte = 3;
4036 st->utf8.in_byte = 2;
4038 st->utf8.in_byte = 1;
4043 if ((c & 0xc0) != 0x80)
4049 return CODING_CATEGORY_UTF8_MASK;
4053 decode_output_utf8_partial_char (unsigned char counter,
4055 unsigned_char_dynarr *dst)
4058 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4059 else if (counter == 4)
4061 if (cpos < (1 << 6))
4062 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4065 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4066 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4069 else if (counter == 3)
4071 if (cpos < (1 << 6))
4072 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4073 else if (cpos < (1 << 12))
4075 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4076 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4080 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4081 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4082 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4085 else if (counter == 2)
4087 if (cpos < (1 << 6))
4088 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4089 else if (cpos < (1 << 12))
4091 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4092 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4094 else if (cpos < (1 << 18))
4096 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4097 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4098 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4102 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4103 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4104 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4105 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4110 if (cpos < (1 << 6))
4111 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4112 else if (cpos < (1 << 12))
4114 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4115 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4117 else if (cpos < (1 << 18))
4119 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4120 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4121 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4123 else if (cpos < (1 << 24))
4125 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4126 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4127 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4128 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4132 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4133 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4134 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4135 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4136 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4142 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4143 unsigned_char_dynarr *dst, size_t n)
4145 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4146 unsigned int flags = str->flags;
4147 unsigned int cpos = str->cpos;
4148 eol_type_t eol_type = str->eol_type;
4149 unsigned char counter = str->counter;
4153 unsigned char c = *(unsigned char *)src++;
4158 COMPOSE_FLUSH_CHARS (str, dst);
4159 decode_flush_er_chars (str, dst);
4160 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4161 DECODE_ADD_UCS_CHAR (c, dst);
4163 else if ( c < 0xC0 )
4164 /* decode_add_er_char (str, c, dst); */
4165 COMPOSE_ADD_CHAR (str, c, dst);
4168 /* decode_flush_er_chars (str, dst); */
4174 else if ( c < 0xF0 )
4179 else if ( c < 0xF8 )
4184 else if ( c < 0xFC )
4196 else if ( (c & 0xC0) == 0x80 )
4198 cpos = ( cpos << 6 ) | ( c & 0x3f );
4201 /* DECODE_ADD_UCS_CHAR (cpos, dst); */
4202 COMPOSE_ADD_CHAR (str, cpos, dst);
4211 COMPOSE_FLUSH_CHARS (str, dst);
4212 decode_flush_er_chars (str, dst);
4213 decode_output_utf8_partial_char (counter, cpos, dst);
4214 DECODE_ADD_BINARY_CHAR (c, dst);
4218 label_continue_loop:;
4221 if (flags & CODING_STATE_END)
4223 COMPOSE_FLUSH_CHARS (str, dst);
4224 decode_flush_er_chars (str, dst);
4227 decode_output_utf8_partial_char (counter, cpos, dst);
4234 str->counter = counter;
4238 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4239 unsigned_char_dynarr *dst, unsigned int *flags)
4241 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4245 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4246 Dynarr_add (dst, '\r');
4247 if (eol_type != EOL_CR)
4248 Dynarr_add (dst, ch);
4250 else if (ch <= 0x7f)
4252 Dynarr_add (dst, ch);
4256 int code_point = charset_code_point (Vcharset_ucs, ch);
4258 if ( (code_point < 0) || (code_point > 0x10FFFF) )
4260 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4262 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
4266 int format_columns, idx;
4267 char buf[16], format[16];
4269 while (!NILP (rest))
4273 if (!NILP (ccs = Ffind_charset (ccs)))
4276 = charset_code_point (ccs, ch)) >= 0 )
4281 && ((idx =XSTRING_LENGTH (ret)) <= 6))
4283 strncpy (format, XSTRING_DATA (ret), idx);
4292 format [idx++] = '%';
4293 format_columns = XINT (ret);
4294 if ( (2 <= format_columns)
4295 && (format_columns <= 8) )
4297 format [idx++] = '0';
4298 format [idx++] = '0' + format_columns;
4305 format [idx++] = 'd';
4306 else if (EQ (ret, Qx))
4307 format [idx++] = 'x';
4308 else if (EQ (ret, QX))
4309 format [idx++] = 'X';
4314 sprintf (buf, format, code_point);
4315 Dynarr_add (dst, '&');
4316 Dynarr_add_many (dst, buf, strlen (buf));
4317 Dynarr_add (dst, ';');
4323 sprintf (buf, "&MCS-%08X;", ch);
4324 Dynarr_add_many (dst, buf, strlen (buf));
4330 if (code_point <= 0x7ff)
4332 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4333 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4335 else if (code_point <= 0xffff)
4337 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4338 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4339 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4341 else if (code_point <= 0x1fffff)
4343 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4344 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4345 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4346 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4348 else if (code_point <= 0x3ffffff)
4350 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4351 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4352 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4353 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4354 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4358 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4359 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4360 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4361 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4362 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4363 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4369 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4370 unsigned int *flags)
4375 /************************************************************************/
4376 /* ISO2022 methods */
4377 /************************************************************************/
4379 /* The following note describes the coding system ISO2022 briefly.
4380 Since the intention of this note is to help understand the
4381 functions in this file, some parts are NOT ACCURATE or OVERLY
4382 SIMPLIFIED. For thorough understanding, please refer to the
4383 original document of ISO2022.
4385 ISO2022 provides many mechanisms to encode several character sets
4386 in 7-bit and 8-bit environments. For 7-bit environments, all text
4387 is encoded using bytes less than 128. This may make the encoded
4388 text a little bit longer, but the text passes more easily through
4389 several gateways, some of which strip off MSB (Most Signigant Bit).
4391 There are two kinds of character sets: control character set and
4392 graphic character set. The former contains control characters such
4393 as `newline' and `escape' to provide control functions (control
4394 functions are also provided by escape sequences). The latter
4395 contains graphic characters such as 'A' and '-'. Emacs recognizes
4396 two control character sets and many graphic character sets.
4398 Graphic character sets are classified into one of the following
4399 four classes, according to the number of bytes (DIMENSION) and
4400 number of characters in one dimension (CHARS) of the set:
4401 - DIMENSION1_CHARS94
4402 - DIMENSION1_CHARS96
4403 - DIMENSION2_CHARS94
4404 - DIMENSION2_CHARS96
4406 In addition, each character set is assigned an identification tag,
4407 unique for each set, called "final character" (denoted as <F>
4408 hereafter). The <F> of each character set is decided by ECMA(*)
4409 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4410 (0x30..0x3F are for private use only).
4412 Note (*): ECMA = European Computer Manufacturers Association
4414 Here are examples of graphic character set [NAME(<F>)]:
4415 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4416 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4417 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4418 o DIMENSION2_CHARS96 -- none for the moment
4420 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4421 C0 [0x00..0x1F] -- control character plane 0
4422 GL [0x20..0x7F] -- graphic character plane 0
4423 C1 [0x80..0x9F] -- control character plane 1
4424 GR [0xA0..0xFF] -- graphic character plane 1
4426 A control character set is directly designated and invoked to C0 or
4427 C1 by an escape sequence. The most common case is that:
4428 - ISO646's control character set is designated/invoked to C0, and
4429 - ISO6429's control character set is designated/invoked to C1,
4430 and usually these designations/invocations are omitted in encoded
4431 text. In a 7-bit environment, only C0 can be used, and a control
4432 character for C1 is encoded by an appropriate escape sequence to
4433 fit into the environment. All control characters for C1 are
4434 defined to have corresponding escape sequences.
4436 A graphic character set is at first designated to one of four
4437 graphic registers (G0 through G3), then these graphic registers are
4438 invoked to GL or GR. These designations and invocations can be
4439 done independently. The most common case is that G0 is invoked to
4440 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4441 these invocations and designations are omitted in encoded text.
4442 In a 7-bit environment, only GL can be used.
4444 When a graphic character set of CHARS94 is invoked to GL, codes
4445 0x20 and 0x7F of the GL area work as control characters SPACE and
4446 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4449 There are two ways of invocation: locking-shift and single-shift.
4450 With locking-shift, the invocation lasts until the next different
4451 invocation, whereas with single-shift, the invocation affects the
4452 following character only and doesn't affect the locking-shift
4453 state. Invocations are done by the following control characters or
4456 ----------------------------------------------------------------------
4457 abbrev function cntrl escape seq description
4458 ----------------------------------------------------------------------
4459 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4460 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4461 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4462 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4463 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4464 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4465 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4466 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4467 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4468 ----------------------------------------------------------------------
4469 (*) These are not used by any known coding system.
4471 Control characters for these functions are defined by macros
4472 ISO_CODE_XXX in `coding.h'.
4474 Designations are done by the following escape sequences:
4475 ----------------------------------------------------------------------
4476 escape sequence description
4477 ----------------------------------------------------------------------
4478 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4479 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4480 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4481 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4482 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4483 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4484 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4485 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4486 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4487 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4488 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4489 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4490 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4491 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4492 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4493 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4494 ----------------------------------------------------------------------
4496 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4497 of dimension 1, chars 94, and final character <F>, etc...
4499 Note (*): Although these designations are not allowed in ISO2022,
4500 Emacs accepts them on decoding, and produces them on encoding
4501 CHARS96 character sets in a coding system which is characterized as
4502 7-bit environment, non-locking-shift, and non-single-shift.
4504 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4505 '(' can be omitted. We refer to this as "short-form" hereafter.
4507 Now you may notice that there are a lot of ways for encoding the
4508 same multilingual text in ISO2022. Actually, there exist many
4509 coding systems such as Compound Text (used in X11's inter client
4510 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4511 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4512 localized platforms), and all of these are variants of ISO2022.
4514 In addition to the above, Emacs handles two more kinds of escape
4515 sequences: ISO6429's direction specification and Emacs' private
4516 sequence for specifying character composition.
4518 ISO6429's direction specification takes the following form:
4519 o CSI ']' -- end of the current direction
4520 o CSI '0' ']' -- end of the current direction
4521 o CSI '1' ']' -- start of left-to-right text
4522 o CSI '2' ']' -- start of right-to-left text
4523 The control character CSI (0x9B: control sequence introducer) is
4524 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4526 Character composition specification takes the following form:
4527 o ESC '0' -- start character composition
4528 o ESC '1' -- end character composition
4529 Since these are not standard escape sequences of any ISO standard,
4530 their use with these meanings is restricted to Emacs only. */
4533 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4537 for (i = 0; i < 4; i++)
4539 if (!NILP (coding_system))
4541 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4543 iso->charset[i] = Qt;
4544 iso->invalid_designated[i] = 0;
4546 iso->esc = ISO_ESC_NOTHING;
4547 iso->esc_bytes_index = 0;
4548 iso->register_left = 0;
4549 iso->register_right = 1;
4550 iso->switched_dir_and_no_valid_charset_yet = 0;
4551 iso->invalid_switch_dir = 0;
4552 iso->output_direction_sequence = 0;
4553 iso->output_literally = 0;
4554 #ifdef ENABLE_COMPOSITE_CHARS
4555 if (iso->composite_chars)
4556 Dynarr_reset (iso->composite_chars);
4561 fit_to_be_escape_quoted (unsigned char c)
4578 /* Parse one byte of an ISO2022 escape sequence.
4579 If the result is an invalid escape sequence, return 0 and
4580 do not change anything in STR. Otherwise, if the result is
4581 an incomplete escape sequence, update ISO2022.ESC and
4582 ISO2022.ESC_BYTES and return -1. Otherwise, update
4583 all the state variables (but not ISO2022.ESC_BYTES) and
4586 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4587 or invocation of an invalid character set and treat that as
4588 an unrecognized escape sequence. */
4591 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4592 unsigned char c, unsigned int *flags,
4593 int check_invalid_charsets)
4595 /* (1) If we're at the end of a designation sequence, CS is the
4596 charset being designated and REG is the register to designate
4599 (2) If we're at the end of a locking-shift sequence, REG is
4600 the register to invoke and HALF (0 == left, 1 == right) is
4601 the half to invoke it into.
4603 (3) If we're at the end of a single-shift sequence, REG is
4604 the register to invoke. */
4605 Lisp_Object cs = Qnil;
4608 /* NOTE: This code does goto's all over the fucking place.
4609 The reason for this is that we're basically implementing
4610 a state machine here, and hierarchical languages like C
4611 don't really provide a clean way of doing this. */
4613 if (! (*flags & CODING_STATE_ESCAPE))
4614 /* At beginning of escape sequence; we need to reset our
4615 escape-state variables. */
4616 iso->esc = ISO_ESC_NOTHING;
4618 iso->output_literally = 0;
4619 iso->output_direction_sequence = 0;
4623 case ISO_ESC_NOTHING:
4624 iso->esc_bytes_index = 0;
4627 case ISO_CODE_ESC: /* Start escape sequence */
4628 *flags |= CODING_STATE_ESCAPE;
4632 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4633 *flags |= CODING_STATE_ESCAPE;
4634 iso->esc = ISO_ESC_5_11;
4637 case ISO_CODE_SO: /* locking shift 1 */
4640 case ISO_CODE_SI: /* locking shift 0 */
4644 case ISO_CODE_SS2: /* single shift */
4647 case ISO_CODE_SS3: /* single shift */
4651 default: /* Other control characters */
4658 /**** single shift ****/
4660 case 'N': /* single shift 2 */
4663 case 'O': /* single shift 3 */
4667 /**** locking shift ****/
4669 case '~': /* locking shift 1 right */
4672 case 'n': /* locking shift 2 */
4675 case '}': /* locking shift 2 right */
4678 case 'o': /* locking shift 3 */
4681 case '|': /* locking shift 3 right */
4685 #ifdef ENABLE_COMPOSITE_CHARS
4686 /**** composite ****/
4689 iso->esc = ISO_ESC_START_COMPOSITE;
4690 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4691 CODING_STATE_COMPOSITE;
4695 iso->esc = ISO_ESC_END_COMPOSITE;
4696 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4697 ~CODING_STATE_COMPOSITE;
4699 #endif /* ENABLE_COMPOSITE_CHARS */
4701 /**** directionality ****/
4704 iso->esc = ISO_ESC_5_11;
4707 /**** designation ****/
4709 case '$': /* multibyte charset prefix */
4710 iso->esc = ISO_ESC_2_4;
4714 if (0x28 <= c && c <= 0x2F)
4716 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4720 /* This function is called with CODESYS equal to nil when
4721 doing coding-system detection. */
4723 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4724 && fit_to_be_escape_quoted (c))
4726 iso->esc = ISO_ESC_LITERAL;
4727 *flags &= CODING_STATE_ISO2022_LOCK;
4737 /**** directionality ****/
4739 case ISO_ESC_5_11: /* ISO6429 direction control */
4742 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4743 goto directionality;
4745 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4746 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4747 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4751 case ISO_ESC_5_11_0:
4754 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4755 goto directionality;
4759 case ISO_ESC_5_11_1:
4762 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4763 goto directionality;
4767 case ISO_ESC_5_11_2:
4770 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4771 goto directionality;
4776 iso->esc = ISO_ESC_DIRECTIONALITY;
4777 /* Various junk here to attempt to preserve the direction sequences
4778 literally in the text if they would otherwise be swallowed due
4779 to invalid designations that don't show up as actual charset
4780 changes in the text. */
4781 if (iso->invalid_switch_dir)
4783 /* We already inserted a direction switch literally into the
4784 text. We assume (#### this may not be right) that the
4785 next direction switch is the one going the other way,
4786 and we need to output that literally as well. */
4787 iso->output_literally = 1;
4788 iso->invalid_switch_dir = 0;
4794 /* If we are in the thrall of an invalid designation,
4795 then stick the directionality sequence literally into the
4796 output stream so it ends up in the original text again. */
4797 for (jj = 0; jj < 4; jj++)
4798 if (iso->invalid_designated[jj])
4802 iso->output_literally = 1;
4803 iso->invalid_switch_dir = 1;
4806 /* Indicate that we haven't yet seen a valid designation,
4807 so that if a switch-dir is directly followed by an
4808 invalid designation, both get inserted literally. */
4809 iso->switched_dir_and_no_valid_charset_yet = 1;
4814 /**** designation ****/
4817 if (0x28 <= c && c <= 0x2F)
4819 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4822 if (0x40 <= c && c <= 0x42)
4825 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4826 *flags & CODING_STATE_R2L ?
4827 CHARSET_RIGHT_TO_LEFT :
4828 CHARSET_LEFT_TO_RIGHT);
4839 if (c < '0' || c > '~')
4840 return 0; /* bad final byte */
4842 if (iso->esc >= ISO_ESC_2_8 &&
4843 iso->esc <= ISO_ESC_2_15)
4845 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4846 single = 1; /* single-byte */
4847 reg = (iso->esc - ISO_ESC_2_8) & 3;
4849 else if (iso->esc >= ISO_ESC_2_4_8 &&
4850 iso->esc <= ISO_ESC_2_4_15)
4852 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4853 single = -1; /* multi-byte */
4854 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4858 /* Can this ever be reached? -slb */
4862 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4863 *flags & CODING_STATE_R2L ?
4864 CHARSET_RIGHT_TO_LEFT :
4865 CHARSET_LEFT_TO_RIGHT);
4871 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4875 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4876 /* can't invoke something that ain't there. */
4878 iso->esc = ISO_ESC_SINGLE_SHIFT;
4879 *flags &= CODING_STATE_ISO2022_LOCK;
4881 *flags |= CODING_STATE_SS2;
4883 *flags |= CODING_STATE_SS3;
4887 if (check_invalid_charsets &&
4888 !CHARSETP (iso->charset[reg]))
4889 /* can't invoke something that ain't there. */
4892 iso->register_right = reg;
4894 iso->register_left = reg;
4895 *flags &= CODING_STATE_ISO2022_LOCK;
4896 iso->esc = ISO_ESC_LOCKING_SHIFT;
4900 if (NILP (cs) && check_invalid_charsets)
4902 iso->invalid_designated[reg] = 1;
4903 iso->charset[reg] = Vcharset_ascii;
4904 iso->esc = ISO_ESC_DESIGNATE;
4905 *flags &= CODING_STATE_ISO2022_LOCK;
4906 iso->output_literally = 1;
4907 if (iso->switched_dir_and_no_valid_charset_yet)
4909 /* We encountered a switch-direction followed by an
4910 invalid designation. Ensure that the switch-direction
4911 gets outputted; otherwise it will probably get eaten
4912 when the text is written out again. */
4913 iso->switched_dir_and_no_valid_charset_yet = 0;
4914 iso->output_direction_sequence = 1;
4915 /* And make sure that the switch-dir going the other
4916 way gets outputted, as well. */
4917 iso->invalid_switch_dir = 1;
4921 /* This function is called with CODESYS equal to nil when
4922 doing coding-system detection. */
4923 if (!NILP (codesys))
4925 charset_conversion_spec_dynarr *dyn =
4926 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4932 for (i = 0; i < Dynarr_length (dyn); i++)
4934 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4935 if (EQ (cs, spec->from_charset))
4936 cs = spec->to_charset;
4941 iso->charset[reg] = cs;
4942 iso->esc = ISO_ESC_DESIGNATE;
4943 *flags &= CODING_STATE_ISO2022_LOCK;
4944 if (iso->invalid_designated[reg])
4946 iso->invalid_designated[reg] = 0;
4947 iso->output_literally = 1;
4949 if (iso->switched_dir_and_no_valid_charset_yet)
4950 iso->switched_dir_and_no_valid_charset_yet = 0;
4955 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4959 /* #### There are serious deficiencies in the recognition mechanism
4960 here. This needs to be much smarter if it's going to cut it.
4961 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4962 it should be detected as Latin-1.
4963 All the ISO2022 stuff in this file should be synced up with the
4964 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4965 Perhaps we should wait till R2L works in FSF Emacs? */
4967 if (!st->iso2022.initted)
4969 reset_iso2022 (Qnil, &st->iso2022.iso);
4970 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4971 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4972 CODING_CATEGORY_ISO_8_1_MASK |
4973 CODING_CATEGORY_ISO_8_2_MASK |
4974 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4975 st->iso2022.flags = 0;
4976 st->iso2022.high_byte_count = 0;
4977 st->iso2022.saw_single_shift = 0;
4978 st->iso2022.initted = 1;
4981 mask = st->iso2022.mask;
4985 unsigned char c = *(unsigned char *)src++;
4988 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4989 st->iso2022.high_byte_count++;
4993 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4995 if (st->iso2022.high_byte_count & 1)
4996 /* odd number of high bytes; assume not iso-8-2 */
4997 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4999 st->iso2022.high_byte_count = 0;
5000 st->iso2022.saw_single_shift = 0;
5002 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5004 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5005 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5006 { /* control chars */
5009 /* Allow and ignore control characters that you might
5010 reasonably see in a text file */
5015 case 8: /* backspace */
5016 case 11: /* vertical tab */
5017 case 12: /* form feed */
5018 case 26: /* MS-DOS C-z junk */
5019 case 31: /* '^_' -- for info */
5020 goto label_continue_loop;
5027 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5030 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5031 &st->iso2022.flags, 0))
5033 switch (st->iso2022.iso.esc)
5035 case ISO_ESC_DESIGNATE:
5036 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5037 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5039 case ISO_ESC_LOCKING_SHIFT:
5040 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5041 goto ran_out_of_chars;
5042 case ISO_ESC_SINGLE_SHIFT:
5043 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5044 st->iso2022.saw_single_shift = 1;
5053 goto ran_out_of_chars;
5056 label_continue_loop:;
5065 postprocess_iso2022_mask (int mask)
5067 /* #### kind of cheesy */
5068 /* If seven-bit ISO is allowed, then assume that the encoding is
5069 entirely seven-bit and turn off the eight-bit ones. */
5070 if (mask & CODING_CATEGORY_ISO_7_MASK)
5071 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5072 CODING_CATEGORY_ISO_8_1_MASK |
5073 CODING_CATEGORY_ISO_8_2_MASK);
5077 /* If FLAGS is a null pointer or specifies right-to-left motion,
5078 output a switch-dir-to-left-to-right sequence to DST.
5079 Also update FLAGS if it is not a null pointer.
5080 If INTERNAL_P is set, we are outputting in internal format and
5081 need to handle the CSI differently. */
5084 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5085 unsigned_char_dynarr *dst,
5086 unsigned int *flags,
5089 if (!flags || (*flags & CODING_STATE_R2L))
5091 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5093 Dynarr_add (dst, ISO_CODE_ESC);
5094 Dynarr_add (dst, '[');
5096 else if (internal_p)
5097 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5099 Dynarr_add (dst, ISO_CODE_CSI);
5100 Dynarr_add (dst, '0');
5101 Dynarr_add (dst, ']');
5103 *flags &= ~CODING_STATE_R2L;
5107 /* If FLAGS is a null pointer or specifies a direction different from
5108 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5109 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5110 sequence to DST. Also update FLAGS if it is not a null pointer.
5111 If INTERNAL_P is set, we are outputting in internal format and
5112 need to handle the CSI differently. */
5115 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5116 unsigned_char_dynarr *dst, unsigned int *flags,
5119 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5120 direction == CHARSET_LEFT_TO_RIGHT)
5121 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5122 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5123 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5124 direction == CHARSET_RIGHT_TO_LEFT)
5126 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5128 Dynarr_add (dst, ISO_CODE_ESC);
5129 Dynarr_add (dst, '[');
5131 else if (internal_p)
5132 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5134 Dynarr_add (dst, ISO_CODE_CSI);
5135 Dynarr_add (dst, '2');
5136 Dynarr_add (dst, ']');
5138 *flags |= CODING_STATE_R2L;
5142 /* Convert ISO2022-format data to internal format. */
5145 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5146 unsigned_char_dynarr *dst, size_t n)
5148 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5149 unsigned int flags = str->flags;
5150 unsigned int cpos = str->cpos;
5151 unsigned char counter = str->counter;
5152 eol_type_t eol_type = str->eol_type;
5153 #ifdef ENABLE_COMPOSITE_CHARS
5154 unsigned_char_dynarr *real_dst = dst;
5156 Lisp_Object coding_system;
5158 XSETCODING_SYSTEM (coding_system, str->codesys);
5160 #ifdef ENABLE_COMPOSITE_CHARS
5161 if (flags & CODING_STATE_COMPOSITE)
5162 dst = str->iso2022.composite_chars;
5163 #endif /* ENABLE_COMPOSITE_CHARS */
5167 unsigned char c = *(unsigned char *)src++;
5168 if (flags & CODING_STATE_ESCAPE)
5169 { /* Within ESC sequence */
5170 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5175 switch (str->iso2022.esc)
5177 #ifdef ENABLE_COMPOSITE_CHARS
5178 case ISO_ESC_START_COMPOSITE:
5179 if (str->iso2022.composite_chars)
5180 Dynarr_reset (str->iso2022.composite_chars);
5182 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5183 dst = str->iso2022.composite_chars;
5185 case ISO_ESC_END_COMPOSITE:
5187 Bufbyte comstr[MAX_EMCHAR_LEN];
5189 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5190 Dynarr_length (dst));
5192 len = set_charptr_emchar (comstr, emch);
5193 Dynarr_add_many (dst, comstr, len);
5196 #endif /* ENABLE_COMPOSITE_CHARS */
5198 case ISO_ESC_LITERAL:
5199 COMPOSE_FLUSH_CHARS (str, dst);
5200 decode_flush_er_chars (str, dst);
5201 DECODE_ADD_BINARY_CHAR (c, dst);
5205 /* Everything else handled already */
5210 /* Attempted error recovery. */
5211 if (str->iso2022.output_direction_sequence)
5212 ensure_correct_direction (flags & CODING_STATE_R2L ?
5213 CHARSET_RIGHT_TO_LEFT :
5214 CHARSET_LEFT_TO_RIGHT,
5215 str->codesys, dst, 0, 1);
5216 /* More error recovery. */
5217 if (!retval || str->iso2022.output_literally)
5219 /* Output the (possibly invalid) sequence */
5221 COMPOSE_FLUSH_CHARS (str, dst);
5222 decode_flush_er_chars (str, dst);
5223 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5224 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5225 flags &= CODING_STATE_ISO2022_LOCK;
5227 n++, src--;/* Repeat the loop with the same character. */
5230 /* No sense in reprocessing the final byte of the
5231 escape sequence; it could mess things up anyway.
5233 COMPOSE_FLUSH_CHARS (str, dst);
5234 decode_flush_er_chars (str, dst);
5235 DECODE_ADD_BINARY_CHAR (c, dst);
5241 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5242 { /* Control characters */
5244 /***** Error-handling *****/
5246 /* If we were in the middle of a character, dump out the
5247 partial character. */
5250 COMPOSE_FLUSH_CHARS (str, dst);
5251 decode_flush_er_chars (str, dst);
5255 DECODE_ADD_BINARY_CHAR
5256 ((unsigned char)(cpos >> (counter * 8)), dst);
5261 /* If we just saw a single-shift character, dump it out.
5262 This may dump out the wrong sort of single-shift character,
5263 but least it will give an indication that something went
5265 if (flags & CODING_STATE_SS2)
5267 COMPOSE_FLUSH_CHARS (str, dst);
5268 decode_flush_er_chars (str, dst);
5269 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5270 flags &= ~CODING_STATE_SS2;
5272 if (flags & CODING_STATE_SS3)
5274 COMPOSE_FLUSH_CHARS (str, dst);
5275 decode_flush_er_chars (str, dst);
5276 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5277 flags &= ~CODING_STATE_SS3;
5280 /***** Now handle the control characters. *****/
5286 COMPOSE_FLUSH_CHARS (str, dst);
5287 decode_flush_er_chars (str, dst);
5288 if (eol_type == EOL_CR)
5289 Dynarr_add (dst, '\n');
5290 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5291 Dynarr_add (dst, c);
5293 flags |= CODING_STATE_CR;
5294 goto label_continue_loop;
5296 else if (flags & CODING_STATE_CR)
5297 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5299 Dynarr_add (dst, '\r');
5300 flags &= ~CODING_STATE_CR;
5303 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5306 flags &= CODING_STATE_ISO2022_LOCK;
5308 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5310 COMPOSE_FLUSH_CHARS (str, dst);
5311 decode_flush_er_chars (str, dst);
5312 DECODE_ADD_BINARY_CHAR (c, dst);
5316 { /* Graphic characters */
5317 Lisp_Object charset;
5326 COMPOSE_FLUSH_CHARS (str, dst);
5327 decode_flush_er_chars (str, dst);
5328 if (eol_type == EOL_CR)
5329 Dynarr_add (dst, '\n');
5330 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5331 Dynarr_add (dst, c);
5333 flags |= CODING_STATE_CR;
5334 goto label_continue_loop;
5336 else if (flags & CODING_STATE_CR)
5337 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5339 Dynarr_add (dst, '\r');
5340 flags &= ~CODING_STATE_CR;
5343 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5346 /* Now determine the charset. */
5347 reg = ((flags & CODING_STATE_SS2) ? 2
5348 : (flags & CODING_STATE_SS3) ? 3
5349 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5350 : str->iso2022.register_left);
5351 charset = str->iso2022.charset[reg];
5353 /* Error checking: */
5354 if (! CHARSETP (charset)
5355 || str->iso2022.invalid_designated[reg]
5356 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5357 && XCHARSET_CHARS (charset) == 94))
5358 /* Mrmph. We are trying to invoke a register that has no
5359 or an invalid charset in it, or trying to add a character
5360 outside the range of the charset. Insert that char literally
5361 to preserve it for the output. */
5363 COMPOSE_FLUSH_CHARS (str, dst);
5364 decode_flush_er_chars (str, dst);
5368 DECODE_ADD_BINARY_CHAR
5369 ((unsigned char)(cpos >> (counter * 8)), dst);
5372 DECODE_ADD_BINARY_CHAR (c, dst);
5377 /* Things are probably hunky-dorey. */
5379 /* Fetch reverse charset, maybe. */
5380 if (((flags & CODING_STATE_R2L) &&
5381 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5383 (!(flags & CODING_STATE_R2L) &&
5384 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5386 Lisp_Object new_charset =
5387 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5388 if (!NILP (new_charset))
5389 charset = new_charset;
5394 if (XCHARSET_DIMENSION (charset) == counter)
5396 COMPOSE_ADD_CHAR (str,
5397 DECODE_CHAR (charset,
5398 ((cpos & 0x7F7F7F) << 8)
5405 cpos = (cpos << 8) | c;
5407 lb = XCHARSET_LEADING_BYTE (charset);
5408 switch (XCHARSET_REP_BYTES (charset))
5411 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5412 Dynarr_add (dst, c & 0x7F);
5415 case 2: /* one-byte official */
5416 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5417 Dynarr_add (dst, lb);
5418 Dynarr_add (dst, c | 0x80);
5421 case 3: /* one-byte private or two-byte official */
5422 if (XCHARSET_PRIVATE_P (charset))
5424 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5425 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5426 Dynarr_add (dst, lb);
5427 Dynarr_add (dst, c | 0x80);
5433 Dynarr_add (dst, lb);
5434 Dynarr_add (dst, ch | 0x80);
5435 Dynarr_add (dst, c | 0x80);
5443 default: /* two-byte private */
5446 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5447 Dynarr_add (dst, lb);
5448 Dynarr_add (dst, ch | 0x80);
5449 Dynarr_add (dst, c | 0x80);
5459 flags &= CODING_STATE_ISO2022_LOCK;
5462 label_continue_loop:;
5465 if (flags & CODING_STATE_END)
5467 COMPOSE_FLUSH_CHARS (str, dst);
5468 decode_flush_er_chars (str, dst);
5469 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5473 str->counter = counter;
5477 /***** ISO2022 encoder *****/
5479 /* Designate CHARSET into register REG. */
5482 iso2022_designate (Lisp_Object charset, unsigned char reg,
5483 struct encoding_stream *str, unsigned_char_dynarr *dst)
5485 static const char inter94[] = "()*+";
5486 static const char inter96[] = ",-./";
5487 unsigned short chars;
5488 unsigned char dimension;
5489 unsigned char final;
5490 Lisp_Object old_charset = str->iso2022.charset[reg];
5492 str->iso2022.charset[reg] = charset;
5493 if (!CHARSETP (charset))
5494 /* charset might be an initial nil or t. */
5496 chars = XCHARSET_CHARS (charset);
5497 dimension = XCHARSET_DIMENSION (charset);
5498 final = XCHARSET_FINAL (charset);
5499 if (!str->iso2022.force_charset_on_output[reg] &&
5500 CHARSETP (old_charset) &&
5501 XCHARSET_CHARS (old_charset) == chars &&
5502 XCHARSET_DIMENSION (old_charset) == dimension &&
5503 XCHARSET_FINAL (old_charset) == final)
5506 str->iso2022.force_charset_on_output[reg] = 0;
5509 charset_conversion_spec_dynarr *dyn =
5510 str->codesys->iso2022.output_conv;
5516 for (i = 0; i < Dynarr_length (dyn); i++)
5518 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5519 if (EQ (charset, spec->from_charset))
5520 charset = spec->to_charset;
5525 Dynarr_add (dst, ISO_CODE_ESC);
5530 Dynarr_add (dst, inter94[reg]);
5533 Dynarr_add (dst, '$');
5535 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5538 Dynarr_add (dst, inter94[reg]);
5543 Dynarr_add (dst, inter96[reg]);
5546 Dynarr_add (dst, '$');
5547 Dynarr_add (dst, inter96[reg]);
5551 Dynarr_add (dst, final);
5555 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5557 if (str->iso2022.register_left != 0)
5559 Dynarr_add (dst, ISO_CODE_SI);
5560 str->iso2022.register_left = 0;
5565 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5567 if (str->iso2022.register_left != 1)
5569 Dynarr_add (dst, ISO_CODE_SO);
5570 str->iso2022.register_left = 1;
5575 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5576 unsigned_char_dynarr *dst, unsigned int *flags)
5578 unsigned char charmask;
5579 Lisp_Coding_System* codesys = str->codesys;
5580 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5582 Lisp_Object charset = str->iso2022.current_charset;
5583 int half = str->iso2022.current_half;
5584 int code_point = -1;
5588 restore_left_to_right_direction (codesys, dst, flags, 0);
5590 /* Make sure G0 contains ASCII */
5591 if ((ch > ' ' && ch < ISO_CODE_DEL)
5592 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5594 ensure_normal_shift (str, dst);
5595 iso2022_designate (Vcharset_ascii, 0, str, dst);
5598 /* If necessary, restore everything to the default state
5600 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5602 restore_left_to_right_direction (codesys, dst, flags, 0);
5604 ensure_normal_shift (str, dst);
5606 for (i = 0; i < 4; i++)
5608 Lisp_Object initial_charset =
5609 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5610 iso2022_designate (initial_charset, i, str, dst);
5615 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5616 Dynarr_add (dst, '\r');
5617 if (eol_type != EOL_CR)
5618 Dynarr_add (dst, ch);
5622 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5623 && fit_to_be_escape_quoted (ch))
5624 Dynarr_add (dst, ISO_CODE_ESC);
5625 Dynarr_add (dst, ch);
5628 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5630 charmask = (half == 0 ? 0x00 : 0x80);
5632 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5633 && fit_to_be_escape_quoted (ch))
5634 Dynarr_add (dst, ISO_CODE_ESC);
5635 /* you asked for it ... */
5636 Dynarr_add (dst, ch);
5642 /* Now determine which register to use. */
5644 for (i = 0; i < 4; i++)
5646 if ((CHARSETP (charset = str->iso2022.charset[i])
5647 && ((code_point = charset_code_point (charset, ch)) >= 0))
5651 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5652 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5660 Lisp_Object original_default_coded_charset_priority_list
5661 = Vdefault_coded_charset_priority_list;
5663 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5665 code_point = ENCODE_CHAR (ch, charset);
5666 if (XCHARSET_FINAL (charset))
5668 Vdefault_coded_charset_priority_list
5669 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5670 Vdefault_coded_charset_priority_list));
5672 code_point = ENCODE_CHAR (ch, charset);
5673 if (!XCHARSET_FINAL (charset))
5675 charset = Vcharset_ascii;
5679 Vdefault_coded_charset_priority_list
5680 = original_default_coded_charset_priority_list;
5682 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5683 codesys, dst, flags, 0);
5687 if (XCHARSET_GRAPHIC (charset) != 0)
5689 if (!NILP (str->iso2022.charset[1]) &&
5690 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5691 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5693 else if (!NILP (str->iso2022.charset[2]))
5695 else if (!NILP (str->iso2022.charset[3]))
5704 iso2022_designate (charset, reg, str, dst);
5706 /* Now invoke that register. */
5710 ensure_normal_shift (str, dst);
5714 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5716 ensure_shift_out (str, dst);
5723 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5725 Dynarr_add (dst, ISO_CODE_ESC);
5726 Dynarr_add (dst, 'N');
5731 Dynarr_add (dst, ISO_CODE_SS2);
5736 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5738 Dynarr_add (dst, ISO_CODE_ESC);
5739 Dynarr_add (dst, 'O');
5744 Dynarr_add (dst, ISO_CODE_SS3);
5752 charmask = (half == 0 ? 0x00 : 0x80);
5754 switch (XCHARSET_DIMENSION (charset))
5757 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5760 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5761 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5764 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5765 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5766 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5769 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5770 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5771 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5772 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5778 str->iso2022.current_charset = charset;
5779 str->iso2022.current_half = half;
5783 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5784 unsigned int *flags)
5786 Lisp_Coding_System* codesys = str->codesys;
5789 restore_left_to_right_direction (codesys, dst, flags, 0);
5790 ensure_normal_shift (str, dst);
5791 for (i = 0; i < 4; i++)
5793 Lisp_Object initial_charset
5794 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5795 iso2022_designate (initial_charset, i, str, dst);
5800 /************************************************************************/
5801 /* No-conversion methods */
5802 /************************************************************************/
5804 /* This is used when reading in "binary" files -- i.e. files that may
5805 contain all 256 possible byte values and that are not to be
5806 interpreted as being in any particular decoding. */
5808 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5809 unsigned_char_dynarr *dst, size_t n)
5811 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5812 unsigned int flags = str->flags;
5813 unsigned int cpos = str->cpos;
5814 eol_type_t eol_type = str->eol_type;
5818 unsigned char c = *(unsigned char *)src++;
5820 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5821 DECODE_ADD_BINARY_CHAR (c, dst);
5822 label_continue_loop:;
5825 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5832 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5833 unsigned_char_dynarr *dst, size_t n)
5836 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5837 unsigned int flags = str->flags;
5838 unsigned int ch = str->ch;
5839 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5841 unsigned char char_boundary = str->iso2022.current_char_boundary;
5848 if (char_boundary == 0)
5854 else if ( c >= 0xf8 )
5859 else if ( c >= 0xf0 )
5864 else if ( c >= 0xe0 )
5869 else if ( c >= 0xc0 )
5879 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5880 Dynarr_add (dst, '\r');
5881 if (eol_type != EOL_CR)
5882 Dynarr_add (dst, c);
5885 Dynarr_add (dst, c);
5888 else if (char_boundary == 1)
5890 ch = ( ch << 6 ) | ( c & 0x3f );
5891 Dynarr_add (dst, ch & 0xff);
5896 ch = ( ch << 6 ) | ( c & 0x3f );
5899 #else /* not UTF2000 */
5902 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5903 Dynarr_add (dst, '\r');
5904 if (eol_type != EOL_CR)
5905 Dynarr_add (dst, '\n');
5908 else if (BYTE_ASCII_P (c))
5911 Dynarr_add (dst, c);
5913 else if (BUFBYTE_LEADING_BYTE_P (c))
5916 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5917 c == LEADING_BYTE_CONTROL_1)
5920 Dynarr_add (dst, '~'); /* untranslatable character */
5924 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5925 Dynarr_add (dst, c);
5926 else if (ch == LEADING_BYTE_CONTROL_1)
5929 Dynarr_add (dst, c - 0x20);
5931 /* else it should be the second or third byte of an
5932 untranslatable character, so ignore it */
5935 #endif /* not UTF2000 */
5941 str->iso2022.current_char_boundary = char_boundary;
5947 /************************************************************************/
5948 /* Initialization */
5949 /************************************************************************/
5952 syms_of_file_coding (void)
5954 INIT_LRECORD_IMPLEMENTATION (coding_system);
5956 deferror (&Qcoding_system_error, "coding-system-error",
5957 "Coding-system error", Qio_error);
5959 DEFSUBR (Fcoding_system_p);
5960 DEFSUBR (Ffind_coding_system);
5961 DEFSUBR (Fget_coding_system);
5962 DEFSUBR (Fcoding_system_list);
5963 DEFSUBR (Fcoding_system_name);
5964 DEFSUBR (Fmake_coding_system);
5965 DEFSUBR (Fcopy_coding_system);
5966 DEFSUBR (Fcoding_system_canonical_name_p);
5967 DEFSUBR (Fcoding_system_alias_p);
5968 DEFSUBR (Fcoding_system_aliasee);
5969 DEFSUBR (Fdefine_coding_system_alias);
5970 DEFSUBR (Fsubsidiary_coding_system);
5972 DEFSUBR (Fcoding_system_type);
5973 DEFSUBR (Fcoding_system_doc_string);
5975 DEFSUBR (Fcoding_system_charset);
5977 DEFSUBR (Fcoding_system_property);
5979 DEFSUBR (Fcoding_category_list);
5980 DEFSUBR (Fset_coding_priority_list);
5981 DEFSUBR (Fcoding_priority_list);
5982 DEFSUBR (Fset_coding_category_system);
5983 DEFSUBR (Fcoding_category_system);
5985 DEFSUBR (Fdetect_coding_region);
5986 DEFSUBR (Fdecode_coding_region);
5987 DEFSUBR (Fencode_coding_region);
5989 DEFSUBR (Fdecode_shift_jis_char);
5990 DEFSUBR (Fencode_shift_jis_char);
5991 DEFSUBR (Fdecode_big5_char);
5992 DEFSUBR (Fencode_big5_char);
5994 defsymbol (&Qcoding_systemp, "coding-system-p");
5995 defsymbol (&Qno_conversion, "no-conversion");
5996 defsymbol (&Qraw_text, "raw-text");
5998 defsymbol (&Qbig5, "big5");
5999 defsymbol (&Qshift_jis, "shift-jis");
6000 defsymbol (&Qucs4, "ucs-4");
6001 defsymbol (&Qutf8, "utf-8");
6002 defsymbol (&Qccl, "ccl");
6003 defsymbol (&Qiso2022, "iso2022");
6005 defsymbol (&Qmnemonic, "mnemonic");
6006 defsymbol (&Qeol_type, "eol-type");
6007 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6008 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6010 defsymbol (&Qcr, "cr");
6011 defsymbol (&Qlf, "lf");
6012 defsymbol (&Qcrlf, "crlf");
6013 defsymbol (&Qeol_cr, "eol-cr");
6014 defsymbol (&Qeol_lf, "eol-lf");
6015 defsymbol (&Qeol_crlf, "eol-crlf");
6017 defsymbol (&Qcharset_g0, "charset-g0");
6018 defsymbol (&Qcharset_g1, "charset-g1");
6019 defsymbol (&Qcharset_g2, "charset-g2");
6020 defsymbol (&Qcharset_g3, "charset-g3");
6021 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6022 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6023 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6024 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6025 defsymbol (&Qno_iso6429, "no-iso6429");
6026 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6027 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6029 defsymbol (&Qshort, "short");
6030 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6031 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6032 defsymbol (&Qseven, "seven");
6033 defsymbol (&Qlock_shift, "lock-shift");
6034 defsymbol (&Qescape_quoted, "escape-quoted");
6037 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6038 defsymbol (&Qdisable_composition, "disable-composition");
6039 defsymbol (&Quse_entity_reference, "use-entity-reference");
6040 defsymbol (&Qd, "d");
6041 defsymbol (&Qx, "x");
6042 defsymbol (&QX, "X");
6044 defsymbol (&Qencode, "encode");
6045 defsymbol (&Qdecode, "decode");
6048 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6050 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6052 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6054 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6056 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6058 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6060 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6062 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6064 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6067 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6072 lstream_type_create_file_coding (void)
6074 LSTREAM_HAS_METHOD (decoding, reader);
6075 LSTREAM_HAS_METHOD (decoding, writer);
6076 LSTREAM_HAS_METHOD (decoding, rewinder);
6077 LSTREAM_HAS_METHOD (decoding, seekable_p);
6078 LSTREAM_HAS_METHOD (decoding, flusher);
6079 LSTREAM_HAS_METHOD (decoding, closer);
6080 LSTREAM_HAS_METHOD (decoding, marker);
6082 LSTREAM_HAS_METHOD (encoding, reader);
6083 LSTREAM_HAS_METHOD (encoding, writer);
6084 LSTREAM_HAS_METHOD (encoding, rewinder);
6085 LSTREAM_HAS_METHOD (encoding, seekable_p);
6086 LSTREAM_HAS_METHOD (encoding, flusher);
6087 LSTREAM_HAS_METHOD (encoding, closer);
6088 LSTREAM_HAS_METHOD (encoding, marker);
6092 vars_of_file_coding (void)
6096 fcd = xnew (struct file_coding_dump);
6097 dump_add_root_struct_ptr (&fcd, &fcd_description);
6099 /* Initialize to something reasonable ... */
6100 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6102 fcd->coding_category_system[i] = Qnil;
6103 fcd->coding_category_by_priority[i] = i;
6106 Fprovide (intern ("file-coding"));
6108 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6109 Coding system used for TTY keyboard input.
6110 Not used under a windowing system.
6112 Vkeyboard_coding_system = Qnil;
6114 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6115 Coding system used for TTY display output.
6116 Not used under a windowing system.
6118 Vterminal_coding_system = Qnil;
6120 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6121 Overriding coding system used when reading from a file or process.
6122 You should bind this variable with `let', but do not set it globally.
6123 If this is non-nil, it specifies the coding system that will be used
6124 to decode input on read operations, such as from a file or process.
6125 It overrides `buffer-file-coding-system-for-read',
6126 `insert-file-contents-pre-hook', etc. Use those variables instead of
6127 this one for permanent changes to the environment. */ );
6128 Vcoding_system_for_read = Qnil;
6130 DEFVAR_LISP ("coding-system-for-write",
6131 &Vcoding_system_for_write /*
6132 Overriding coding system used when writing to a file or process.
6133 You should bind this variable with `let', but do not set it globally.
6134 If this is non-nil, it specifies the coding system that will be used
6135 to encode output for write operations, such as to a file or process.
6136 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6137 Use those variables instead of this one for permanent changes to the
6139 Vcoding_system_for_write = Qnil;
6141 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6142 Coding system used to convert pathnames when accessing files.
6144 Vfile_name_coding_system = Qnil;
6146 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6147 &Vcoded_charset_entity_reference_alist /*
6148 Alist of coded-charset vs corresponding entity-reference.
6149 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6150 CCS is coded-charset.
6151 CODE-COLUMNS is columns of code-point of entity-reference.
6152 CODE-TYPE is format type of code-point of entity-reference.
6153 `d' means decimal value and `x' means hexadecimal value.
6155 Vcoded_charset_entity_reference_alist = Qnil;
6157 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6158 Non-nil means the buffer contents are regarded as multi-byte form
6159 of characters, not a binary code. This affects the display, file I/O,
6160 and behaviors of various editing commands.
6162 Setting this to nil does not do anything.
6164 enable_multibyte_characters = 1;
6168 complex_vars_of_file_coding (void)
6170 staticpro (&Vcoding_system_hash_table);
6171 Vcoding_system_hash_table =
6172 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6174 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6175 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6177 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6179 struct codesys_prop csp; \
6181 csp.prop_type = (Prop_Type); \
6182 Dynarr_add (the_codesys_prop_dynarr, csp); \
6185 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6186 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6187 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6188 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6189 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6190 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6191 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6193 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6194 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6195 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6196 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6197 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6198 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6199 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6200 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6201 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6202 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6203 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6204 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6205 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6206 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6207 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6208 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6209 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6211 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6212 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6214 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6215 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6218 /* Need to create this here or we're really screwed. */
6220 (Qraw_text, Qno_conversion,
6221 build_string ("Raw text, which means it converts only line-break-codes."),
6222 list2 (Qmnemonic, build_string ("Raw")));
6225 (Qbinary, Qno_conversion,
6226 build_string ("Binary, which means it does not convert anything."),
6227 list4 (Qeol_type, Qlf,
6228 Qmnemonic, build_string ("Binary")));
6234 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6235 list2 (Qmnemonic, build_string ("MTF8")));
6238 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6240 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6242 Fdefine_coding_system_alias (Qterminal, Qbinary);
6243 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6245 /* Need this for bootstrapping */
6246 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6247 Fget_coding_system (Qraw_text);
6250 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6251 = Fget_coding_system (Qutf_8_mcs);
6254 #if defined(MULE) && !defined(UTF2000)
6258 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6259 fcd->ucs_to_mule_table[i] = Qnil;
6261 staticpro (&mule_to_ucs_table);
6262 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6263 #endif /* defined(MULE) && !defined(UTF2000) */