1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs UTF-2000. */
40 #include "file-coding.h"
/* Module-wide Lisp objects.  Only the declarations are visible here;
   their initialization happens elsewhere in the file. */
42 Lisp_Object Qcoding_system_error;
/* NOTE(review): presumably the Lisp-visible coding-system variables for
   keyboard, terminal, read, write and file-name handling -- confirm
   against their DEFVAR definitions. */
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
/* Groups the per-category coding-system table and the category priority
   order in one struct.  In MULE builds without UTF2000 it also holds a
   65536-entry table of Lisp objects named ucs_to_mule_table.  The
   Lisp_Object arrays are described by fcd_description_1. */
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
65 #if defined(MULE) && !defined(UTF2000)
66 Lisp_Object ucs_to_mule_table[65536];
/* Field descriptions for struct file_coding_dump: each entry names a
   Lisp_Object array member by offset together with its element count.
   coding_category_by_priority is an int array and is not listed. */
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
/* Struct-level description: size of struct file_coding_dump.
   NOTE(review): the remaining initializer fields are not visible in
   this excerpt. */
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
/* Global Lisp objects used throughout this file.  Several of the Q*
   symbols below are compared against property keys and type names in
   Fmake_coding_system. */
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
/* Coding system type symbols (see the TYPE dispatch in
   Fmake_coding_system). */
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
/* Property keys common to all coding system types, and the symbols
   Qlf, Qcrlf and Qcr accepted by symbol_to_eol_type. */
90 Lisp_Object Qmnemonic, Qeol_type;
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
/* More type symbols, followed by the property keys that
   Fmake_coding_system accepts for ISO2022 coding systems. */
97 Lisp_Object Qucs4, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
/* Property keys handled for every coding system type. */
108 Lisp_Object Qdisable_composition;
109 Lisp_Object Quse_entity_reference;
110 Lisp_Object Qd, Qx, QX;
/* Property keys naming the CCL programs of a CCL coding system. */
112 Lisp_Object Qencode, Qdecode;
/* Maps coding system names (symbols) to coding system objects; see
   Ffind_coding_system. */
114 Lisp_Object Vcoding_system_hash_table;
116 int enable_multibyte_characters;
/* Summary: this struct records the G0-G3 designations, the GL/GR
   invocations, the partially-read escape sequence, and flags tracking
   invalid designations and direction switches so that the offending
   escape sequences can be emitted literally. */
119 /* Additional information used by the ISO2022 decoder and detector. */
120 struct iso2022_decoder
122 /* CHARSET holds the character sets currently assigned to the G0
123 through G3 variables. It is initialized from the array
124 INITIAL_CHARSET in CODESYS. */
125 Lisp_Object charset[4];
127 /* Which registers are currently invoked into the left (GL) and
128 right (GR) halves of the 8-bit encoding space? */
129 int register_left, register_right;
131 /* ISO_ESC holds a value indicating part of an escape sequence
132 that has already been seen. */
133 enum iso_esc_flag esc;
135 /* This records the bytes we've seen so far in an escape sequence,
136 in case the sequence is invalid (we spit out the bytes unchanged). */
137 unsigned char esc_bytes[8];
139 /* Index for next byte to store in ISO escape sequence. */
142 #ifdef ENABLE_COMPOSITE_CHARS
143 /* Stuff seen so far when composing a string. */
144 unsigned_char_dynarr *composite_chars;
147 /* If we saw an invalid designation sequence for a particular
148 register, we flag it here and switch to ASCII. The next time we
149 see a valid designation for this register, we turn off the flag
150 and do the designation normally, but pretend the sequence was
151 invalid. The effect of all this is that (most of the time) the
152 escape sequences for both the switch to the unknown charset, and
153 the switch back to the known charset, get inserted literally into
154 the buffer and saved out as such. The hope is that we can
155 preserve the escape sequences so that the resulting written out
156 file makes sense. If we don't do any of this, the designation
157 to the invalid charset will be preserved but that switch back
158 to the known charset will probably get eaten because it was
159 the same charset that was already present in the register. */
160 unsigned char invalid_designated[4];
162 /* We try to do similar things as above for direction-switching
163 sequences. If we encountered a direction switch while an
164 invalid designation was present, or an invalid designation
165 just after a direction switch (i.e. no valid designation
166 encountered yet), we insert the direction-switch escape
167 sequence literally into the output stream, and later on
168 insert the corresponding direction-restoring escape sequence
170 unsigned int switched_dir_and_no_valid_charset_yet :1;
171 unsigned int invalid_switch_dir :1;
173 /* Tells the decoder to output the escape sequence literally
174 even though it was valid. Used in the games we play to
175 avoid lossage when we encounter invalid designations. */
176 unsigned int output_literally :1;
177 /* We encountered a direction switch followed by an invalid
178 designation. We didn't output the direction switch
179 literally because we didn't know about the invalid designation;
180 but we have to do so now. */
181 unsigned int output_direction_sequence :1;
/* Forward declarations for routines defined later in this file.  Per
   encoding there is a detect_coding_* / decode_coding_* pair that is
   static, and a char_encode_* / char_finish_* pair that has external
   linkage. */
184 EXFUN (Fcopy_coding_system, 2);
186 struct detection_state;
/* NOTE(review): the return-type line of text_encode_generic is not
   visible in this excerpt. */
189 text_encode_generic (Lstream *encoding, const Bufbyte *src,
190 unsigned_char_dynarr *dst, size_t n);
/* Shift-JIS. */
192 static int detect_coding_sjis (struct detection_state *st,
193 const Extbyte *src, size_t n);
194 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
195 unsigned_char_dynarr *dst, size_t n);
196 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
197 unsigned_char_dynarr *dst, unsigned int *flags);
198 void char_finish_shift_jis (struct encoding_stream *str,
199 unsigned_char_dynarr *dst, unsigned int *flags);
/* Big5. */
201 static int detect_coding_big5 (struct detection_state *st,
202 const Extbyte *src, size_t n);
203 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
204 unsigned_char_dynarr *dst, size_t n);
205 void char_encode_big5 (struct encoding_stream *str, Emchar c,
206 unsigned_char_dynarr *dst, unsigned int *flags);
207 void char_finish_big5 (struct encoding_stream *str,
208 unsigned_char_dynarr *dst, unsigned int *flags);
/* UCS-4. */
210 static int detect_coding_ucs4 (struct detection_state *st,
211 const Extbyte *src, size_t n);
212 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
213 unsigned_char_dynarr *dst, size_t n);
214 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
215 unsigned_char_dynarr *dst, unsigned int *flags);
216 void char_finish_ucs4 (struct encoding_stream *str,
217 unsigned_char_dynarr *dst, unsigned int *flags);
/* UTF-8. */
219 static int detect_coding_utf8 (struct detection_state *st,
220 const Extbyte *src, size_t n);
221 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
222 unsigned_char_dynarr *dst, size_t n);
223 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
224 unsigned_char_dynarr *dst, unsigned int *flags);
225 void char_finish_utf8 (struct encoding_stream *str,
226 unsigned_char_dynarr *dst, unsigned int *flags);
/* ISO2022, including helpers that operate on struct iso2022_decoder. */
228 static int postprocess_iso2022_mask (int mask);
229 static void reset_iso2022 (Lisp_Object coding_system,
230 struct iso2022_decoder *iso);
231 static int detect_coding_iso2022 (struct detection_state *st,
232 const Extbyte *src, size_t n);
233 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
234 unsigned_char_dynarr *dst, size_t n);
235 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
236 unsigned_char_dynarr *dst, unsigned int *flags);
237 void char_finish_iso2022 (struct encoding_stream *str,
238 unsigned_char_dynarr *dst, unsigned int *flags);
/* No-conversion and the generic mule_decode / mule_encode entry
   points.  All four take a stream, a source buffer, a destination
   dynarr and a byte count N. */
240 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
241 unsigned_char_dynarr *dst, size_t n);
242 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
243 unsigned_char_dynarr *dst, size_t n);
244 static void mule_decode (Lstream *decoding, const Extbyte *src,
245 unsigned_char_dynarr *dst, size_t n);
246 static void mule_encode (Lstream *encoding, const Bufbyte *src,
247 unsigned_char_dynarr *dst, size_t n);
/* codesys_prop and a dynamic array of codesys_prop elements, with the
   field descriptions for both.  NOTE(review): the body of struct
   codesys_prop is not visible in this excerpt; the description below
   shows only that it has a Lisp_Object member named sym. */
249 typedef struct codesys_prop codesys_prop;
258 Dynarr_declare (codesys_prop);
259 } codesys_prop_dynarr;
/* One Lisp_Object per element: the sym member. */
261 static const struct lrecord_description codesys_prop_description_1[] = {
262 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
266 static const struct struct_description codesys_prop_description = {
267 sizeof (codesys_prop),
268 codesys_prop_description_1
/* The dynarr itself, whose elements are described by
   codesys_prop_description. */
271 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
272 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
276 static const struct struct_description codesys_prop_dynarr_description = {
277 sizeof (codesys_prop_dynarr),
278 codesys_prop_dynarr_description_1
/* The single global instance of the property dynarr. */
281 codesys_prop_dynarr *the_codesys_prop_dynarr;
/* NOTE(review): only one enumerator of codesys_prop_enum is visible in
   this excerpt. */
283 enum codesys_prop_enum
286 CODESYS_PROP_ISO2022,
291 /************************************************************************/
292 /* Coding system functions */
293 /************************************************************************/
/* Prototypes of the mark, print and finalize methods that are passed
   to DEFINE_LRECORD_IMPLEMENTATION for the coding-system type. */
295 static Lisp_Object mark_coding_system (Lisp_Object);
296 static void print_coding_system (Lisp_Object, Lisp_Object, int);
297 static void finalize_coding_system (void *header, int for_disksave);
/* Field descriptions for a charset_conversion_spec: its two
   Lisp_Object members, from_charset and to_charset. */
300 static const struct lrecord_description ccs_description_1[] = {
301 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
302 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
306 static const struct struct_description ccs_description = {
307 sizeof (charset_conversion_spec),
/* Description of a dynarr of charset_conversion_spec elements; it is
   referenced from coding_system_description for the iso2022 input_conv
   and output_conv members. */
311 static const struct lrecord_description ccsd_description_1[] = {
312 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
316 static const struct struct_description ccsd_description = {
317 sizeof (charset_conversion_spec_dynarr),
/* Field descriptions for Lisp_Coding_System: every Lisp_Object member,
   the four-element iso2022 initial charset array, and the two
   conversion-spec dynarr pointers.  NOTE(review): any preprocessor
   conditionals around the iso2022/ccl/ccs_priority_list entries are
   not visible in this excerpt. */
322 static const struct lrecord_description coding_system_description[] = {
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
324 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
325 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
326 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
327 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
330 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
332 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
333 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
334 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
335 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
/* Defines the "coding-system" lrecord type, wiring in the mark, print
   and finalize methods and the field description table.
   NOTE(review): the final macro argument(s) are not visible in this
   excerpt. */
344 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
345 mark_coding_system, print_coding_system,
346 finalize_coding_system,
347 0, 0, coding_system_description,
/* Mark method for coding-system objects (passed to
   DEFINE_LRECORD_IMPLEMENTATION).  Marks the Lisp objects referenced by
   OBJ and returns the post-read-conversion object rather than marking
   it here.  NOTE(review): presumably the caller marks the returned
   object -- confirm against the lrecord marker convention. */
351 mark_coding_system (Lisp_Object obj)
353 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
/* Slots common to every coding system type. */
355 mark_object (CODING_SYSTEM_NAME (codesys));
356 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
357 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
358 mark_object (CODING_SYSTEM_EOL_LF (codesys));
359 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
360 mark_object (CODING_SYSTEM_EOL_CR (codesys));
/* Type-specific slots. */
362 switch (CODING_SYSTEM_TYPE (codesys))
/* ISO2022: the four initial charsets, plus both charsets of every
   input and output conversion spec when those dynarrs exist. */
366 case CODESYS_ISO2022:
367 for (i = 0; i < 4; i++)
368 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
369 if (codesys->iso2022.input_conv)
371 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
373 struct charset_conversion_spec *ccs =
374 Dynarr_atp (codesys->iso2022.input_conv, i);
375 mark_object (ccs->from_charset);
376 mark_object (ccs->to_charset);
379 if (codesys->iso2022.output_conv)
381 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
383 struct charset_conversion_spec *ccs =
384 Dynarr_atp (codesys->iso2022.output_conv, i);
385 mark_object (ccs->from_charset);
386 mark_object (ccs->to_charset);
/* NOTE(review): the case label for the next two lines is not visible
   in this excerpt; presumably CODESYS_BIG5, which also uses the
   initial-charset slots -- confirm. */
393 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
394 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
/* NOTE(review): case label not visible; presumably CODESYS_CCL. */
399 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
400 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
407 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
409 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
411 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print method for coding-system objects: writes the text
   "#<coding_system ", the coding system's name, and ">" to
   PRINTCHARFUN.  The error call reports an attempt to print an
   unreadable object.  NOTE(review): the condition guarding the error
   call and the argument for its 0x%x conversion are not visible in
   this excerpt. */
415 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
418 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
420 error ("printing unreadable object #<coding_system 0x%x>",
423 write_c_string ("#<coding_system ", printcharfun);
424 print_internal (c->name, printcharfun, 1);
425 write_c_string (">", printcharfun);
/* Finalize method for coding-system objects.  HEADER is the
   Lisp_Coding_System being finalized.  When FOR_DISKSAVE is zero and
   the type is ISO2022, frees the input and output charset-conversion
   dynarrs (if present) and clears the pointers so they cannot be freed
   twice. */
429 finalize_coding_system (void *header, int for_disksave)
431 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
432 /* Since coding systems never go away, this function is not
433 necessary. But it would be necessary if we changed things
434 so that coding systems could go away. */
435 if (!for_disksave) /* see comment in lstream.c */
437 switch (CODING_SYSTEM_TYPE (c))
440 case CODESYS_ISO2022:
441 if (c->iso2022.input_conv)
443 Dynarr_free (c->iso2022.input_conv);
444 c->iso2022.input_conv = 0;
446 if (c->iso2022.output_conv)
448 Dynarr_free (c->iso2022.output_conv);
449 c->iso2022.output_conv = 0;
/* Convert SYMBOL to an eol type:
     nil   -> EOL_AUTODETECT
     Qlf   -> EOL_LF
     Qcrlf -> EOL_CRLF
     Qcr   -> EOL_CR
   CHECK_SYMBOL rejects a non-symbol argument; any other symbol signals
   "Unrecognized eol type". */
460 symbol_to_eol_type (Lisp_Object symbol)
462 CHECK_SYMBOL (symbol);
463 if (NILP (symbol)) return EOL_AUTODETECT;
464 if (EQ (symbol, Qlf)) return EOL_LF;
465 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
466 if (EQ (symbol, Qcr)) return EOL_CR;
468 signal_simple_error ("Unrecognized eol type", symbol);
469 return EOL_AUTODETECT; /* not reached */
/* Inverse of symbol_to_eol_type: map TYPE to Qlf, Qcrlf or Qcr, and
   EOL_AUTODETECT to Qnil.  NOTE(review): the switch header and any
   default branch are not visible in this excerpt. */
473 eol_type_to_symbol (eol_type_t type)
478 case EOL_LF: return Qlf;
479 case EOL_CRLF: return Qcrlf;
480 case EOL_CR: return Qcr;
481 case EOL_AUTODETECT: return Qnil;
/* Create the three end-of-line variants of CODESYS, named NAME-unix,
   NAME-dos and NAME-mac.  Each is a copy of CODESYS made with
   Fcopy_coding_system, gets its EOL type set to EOL_LF, EOL_CRLF or
   EOL_CR respectively, and is stored in the matching EOL slot of
   CODESYS.  Each copy's mnemonic is rebuilt from the original mnemonic
   plus the suffix "", ":T" or ":t".  NOTE(review): the lines that make
   the mnemonic handling conditional are not visible in this excerpt --
   confirm that codesys_mnemonic is never used while it is still 0. */
486 setup_eol_coding_systems (Lisp_Coding_System *codesys)
488 Lisp_Object codesys_obj;
489 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* len + 7 leaves room for the longest suffix, "-unix", and its
   terminating NUL. */
490 char *codesys_name = (char *) alloca (len + 7);
492 char *codesys_mnemonic=0;
494 Lisp_Object codesys_name_sym, sub_codesys_obj;
498 XSETCODING_SYSTEM (codesys_obj, codesys);
/* Copy the base name without a terminator; the macro below appends the
   suffix, which supplies the NUL. */
500 memcpy (codesys_name,
501 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
/* Likewise copy the base mnemonic when it is a string. */
503 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
505 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
506 codesys_mnemonic = (char *) alloca (mlen + 7);
507 memcpy (codesys_mnemonic,
508 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
/* DEFINE_SUB_CODESYS (op_sys, op_sys_abbr, Type) builds one variant:
   op_sys is the name suffix, op_sys_abbr the mnemonic suffix, and Type
   both the EOL type and the suffix of the CODING_SYSTEM_ slot macro. */
511 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
512 strcpy (codesys_name + len, "-" op_sys); \
514 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
515 codesys_name_sym = intern (codesys_name); \
516 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
517 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
519 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
520 build_string (codesys_mnemonic); \
521 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
524 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
525 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
526 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
/* Lisp primitive `coding-system-p': a type predicate with exactly one
   argument.  Returns Qt when CODING_SYSTEMP holds for OBJECT and Qnil
   otherwise; it does not resolve symbols to coding systems. */
529 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
530 Return t if OBJECT is a coding system.
531 A coding system is an object that defines how text containing multiple
532 character sets is encoded into a stream of (typically 8-bit) bytes.
533 The coding system is used to decode the stream into a series of
534 characters (which may be from multiple charsets) when the text is read
535 from a file or process, and is used to encode the text back into the
536 same format when it is written out to a file or process.
538 For example, many ISO2022-compliant coding systems (such as Compound
539 Text, which is used for inter-client data under the X Window System)
540 use escape sequences to switch between different charsets -- Japanese
541 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
542 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
543 `make-coding-system' for more information.
545 Coding systems are normally identified using a symbol, and the
546 symbol is accepted in place of the actual coding system object whenever
547 a coding system is called for. (This is similar to how faces work.)
551 return CODING_SYSTEMP (object) ? Qt : Qnil;
/* Lisp primitive `find-coding-system'.  A nil argument is replaced by
   Qbinary; a coding-system object is returned unchanged; anything else
   must be a symbol (CHECK_SYMBOL) and is looked up in
   Vcoding_system_hash_table.  The lookup result is returned when it is
   a coding system or nil.  NOTE(review): what happens when the table
   yields some other value is not visible in this excerpt -- presumably
   the lookup repeats to follow aliases; confirm. */
554 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
555 Retrieve the coding system of the given name.
557 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
558 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
559 If there is no such coding system, nil is returned. Otherwise the
560 associated coding system object is returned.
562 (coding_system_or_name))
564 if (NILP (coding_system_or_name))
565 coding_system_or_name = Qbinary;
566 else if (CODING_SYSTEMP (coding_system_or_name))
567 return coding_system_or_name;
569 CHECK_SYMBOL (coding_system_or_name);
573 coding_system_or_name =
574 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
576 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
577 return coding_system_or_name;
/* Lisp primitive `get-coding-system': calls Ffind_coding_system on
   NAME and signals "No such coding system" (with NAME as the error
   datum) when that returns nil; otherwise returns the coding system
   object. */
581 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
582 Retrieve the coding system of the given name.
583 Same as `find-coding-system' except that if there is no such
584 coding system, an error is signaled instead of returning nil.
588 Lisp_Object coding_system = Ffind_coding_system (name);
590 if (NILP (coding_system))
591 signal_simple_error ("No such coding system", name);
592 return coding_system;
595 /* We store the coding systems in hash tables with the names as the key and the
596 actual coding system object as the value. Occasionally we need to use them
597 in a list format. These routines provide us with that. */
598 struct coding_system_list_closure
/* Address of the list head that add_coding_system_to_list_mapper
   conses each key onto. */
600 Lisp_Object *coding_system_list;
/* Hash-table mapping callback used by Fcoding_system_list: pushes KEY
   (a coding system name) onto the front of the list whose address is
   stored in the closure.  VALUE is not used in the visible code.
   NOTE(review): the return type and return statement are not visible
   in this excerpt. */
604 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
605 void *coding_system_list_closure)
607 /* This function can GC */
608 struct coding_system_list_closure *cscl =
609 (struct coding_system_list_closure *) coding_system_list_closure;
610 Lisp_Object *coding_system_list = cscl->coding_system_list;
/* Fcons allocates, which is why this function can GC. */
612 *coding_system_list = Fcons (key, *coding_system_list);
/* Lisp primitive `coding-system-list': builds the result by mapping
   add_coding_system_to_list_mapper over Vcoding_system_hash_table,
   starting from an empty list.  The list variable is GC-protected with
   GCPRO1 because the mapper conses.  NOTE(review): the matching
   UNGCPRO is not visible in this excerpt. */
616 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
617 Return a list of the names of all defined coding systems.
621 Lisp_Object coding_system_list = Qnil;
623 struct coding_system_list_closure coding_system_list_closure;
625 GCPRO1 (coding_system_list);
626 coding_system_list_closure.coding_system_list = &coding_system_list;
627 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
628 &coding_system_list_closure);
631 return coding_system_list;
/* Lisp primitive `coding-system-name': resolves its argument with
   Fget_coding_system, so an unknown name signals an error, then
   returns the name slot of the resulting coding system. */
634 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
635 Return the name of the given coding system.
639 coding_system = Fget_coding_system (coding_system);
640 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a coding-system lcrecord of the given TYPE and NAME.  The
   record is zeroed first, then its Lisp_Object slots are set to Qnil
   explicitly and its EOL type to EOL_AUTODETECT.  Type-specific slots
   are initialized for ISO2022, BIG5 and CCL.  NOTE(review): the return
   statement is not visible in this excerpt; the declared return type
   suggests the new record is returned. */
643 static Lisp_Coding_System *
644 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
646 Lisp_Coding_System *codesys =
647 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
/* Zeroing gives non-Lisp fields a defined value; the assignments below
   set the Lisp_Object slots to Qnil explicitly. */
649 zero_lcrecord (codesys);
650 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
651 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
652 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
653 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
654 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
655 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
656 CODING_SYSTEM_TYPE (codesys) = type;
657 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
660 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
/* ISO2022: no charset initially designated to any of G0-G3. */
662 if (type == CODESYS_ISO2022)
665 for (i = 0; i < 4; i++)
666 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
/* BIG5: preset the four initial charsets.  NOTE(review): only the
   value for slot 1 (Vcharset_chinese_big5) is visible in this excerpt;
   the values assigned to slots 0, 2 and 3 are not shown. */
669 if (type == CODESYS_BIG5)
671 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
673 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
674 = Vcharset_chinese_big5;
675 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
677 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
/* CCL: no decode or encode program yet. */
681 else if (type == CODESYS_CCL)
683 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
684 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
687 CODING_SYSTEM_NAME (codesys) = name;
693 /* Given a list of charset conversion specs as specified in a Lisp
694 program, parse it into STORE_HERE. */
/* Each element of SPEC_LIST must be a two-element list; its elements
   are resolved with Fget_charset and appended to STORE_HERE as a
   (from_charset, to_charset) pair.  Signals an error for a malformed
   element or for charsets whose XCHARSET_CHARS or XCHARSET_DIMENSION
   values differ. */
697 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
698 Lisp_Object spec_list)
702 EXTERNAL_LIST_LOOP (rest, spec_list)
704 Lisp_Object car = XCAR (rest);
705 Lisp_Object from, to;
706 struct charset_conversion_spec spec;
/* Require exactly two elements: a cons whose cdr is a cons whose cdr
   is nil. */
708 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
709 signal_simple_error ("Invalid charset conversion spec", car);
710 from = Fget_charset (XCAR (car));
711 to = Fget_charset (XCAR (XCDR (car)));
/* Source and destination charsets must agree in both properties. */
712 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
713 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
714 signal_simple_error_2
715 ("Attempted conversion between different charset types",
717 spec.from_charset = from;
718 spec.to_charset = to;
720 Dynarr_add (store_here, spec);
724 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
725 specs, return the equivalent as the Lisp programmer would see it.
727 If LOAD_HERE is 0, return Qnil. */
/* NOTE(review): the check for a null LOAD_HERE is not visible in this
   excerpt. */
730 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
/* Each spec becomes a two-element list (FROM TO).  Consing onto the
   front builds the list backwards, so Fnreverse restores the stored
   order. */
737 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
739 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
740 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
743 return Fnreverse (result);
748 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
749 Register symbol NAME as a coding system.
751 TYPE describes the conversion method used and should be one of
754 Automatic conversion. XEmacs attempts to detect the coding system
757 No conversion. Use this for binary files and such. On output,
758 graphic characters that are not in ASCII or Latin-1 will be
759 replaced by a ?. (For a no-conversion-encoded buffer, these
760 characters will only be present if you explicitly insert them.)
762 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
764 ISO 10646 UCS-4 encoding.
766 ISO 10646 UTF-8 encoding.
768 Any ISO2022-compliant encoding. Among other things, this includes
769 JIS (the Japanese encoding commonly used for e-mail), EUC (the
770 standard Unix encoding for Japanese and other languages), and
771 Compound Text (the encoding used in X11). You can specify more
772 specific information about the conversion with the PROPS argument.
774 Big5 (the encoding commonly used for Taiwanese).
776 The conversion is performed using a user-written pseudo-code
777 program. CCL (Code Conversion Language) is the name of this
780 Write out or read in the raw contents of the memory representing
781 the buffer's text. This is primarily useful for debugging
782 purposes, and is only enabled when XEmacs has been compiled with
783 DEBUG_XEMACS defined (via the --debug configure option).
784 WARNING: Reading in a file using 'internal conversion can result
785 in an internal inconsistency in the memory representing a
786 buffer's text, which will produce unpredictable results and may
787 cause XEmacs to crash. Under normal circumstances you should
788 never use 'internal conversion.
790 DOC-STRING is a string describing the coding system.
792 PROPS is a property list, describing the specific nature of the
793 character set. Recognized properties are:
796 String to be displayed in the modeline when this coding system is
800 End-of-line conversion to be used. It should be one of
803 Automatically detect the end-of-line type (LF, CRLF,
804 or CR). Also generate subsidiary coding systems named
805 `NAME-unix', `NAME-dos', and `NAME-mac', that are
806 identical to this coding system but have an EOL-TYPE
807 value of 'lf, 'crlf, and 'cr, respectively.
809 The end of a line is marked externally using ASCII LF.
810 Since this is also the way that XEmacs represents an
811 end-of-line internally, specifying this option results
812 in no end-of-line conversion. This is the standard
813 format for Unix text files.
815 The end of a line is marked externally using ASCII
816 CRLF. This is the standard format for MS-DOS text
819 The end of a line is marked externally using ASCII CR.
820 This is the standard format for Macintosh text files.
822 Automatically detect the end-of-line type but do not
823 generate subsidiary coding systems. (This value is
824 converted to nil when stored internally, and
825 `coding-system-property' will return nil.)
828 If non-nil, composition/decomposition for combining characters
831 'use-entity-reference
832 If non-nil, SGML style entity-reference is used for non-system-characters.
834 'post-read-conversion
835 Function called after a file has been read in, to perform the
836 decoding. Called with two arguments, START and END, denoting
837 a region of the current buffer to be decoded.
839 'pre-write-conversion
840 Function called before a file is written out, to perform the
841 encoding. Called with two arguments, START and END, denoting
842 a region of the current buffer to be encoded.
845 The following additional properties are recognized if TYPE is 'iso2022:
851 The character set initially designated to the G0 - G3 registers.
852 The value should be one of
854 -- A charset object (designate that character set)
855 -- nil (do not ever use this register)
856 -- t (no character set is initially designated to
857 the register, but may be later on; this automatically
858 sets the corresponding `force-g*-on-output' property)
864 If non-nil, send an explicit designation sequence on output before
865 using the specified register.
868 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
869 "ESC $ B" on output in place of the full designation sequences
870 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
873 If non-nil, don't designate ASCII to G0 at each end of line on output.
874 Setting this to non-nil also suppresses other state-resetting that
875 normally happens at the end of a line.
878 If non-nil, don't designate ASCII to G0 before control chars on output.
881 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
885 If non-nil, use locking-shift (SO/SI) instead of single-shift
886 or designation by escape sequence.
889 If non-nil, don't use ISO6429's direction specification.
892 If non-nil, literal control characters that are the same as
893 the beginning of a recognized ISO2022 or ISO6429 escape sequence
894 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
895 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
896 so that they can be properly distinguished from an escape sequence.
897 (Note that doing this results in a non-portable encoding.) This
898 encoding flag is used for byte-compiled files. Note that ESC
899 is a good choice for a quoting character because there are no
900 escape sequences whose second byte is a character from the Control-0
901 or Control-1 character sets; this is explicitly disallowed by the
904 'input-charset-conversion
905 A list of conversion specifications, specifying conversion of
906 characters in one charset to another when decoding is performed.
907 Each specification is a list of two elements: the source charset,
908 and the destination charset.
910 'output-charset-conversion
911 A list of conversion specifications, specifying conversion of
912 characters in one charset to another when encoding is performed.
913 The form of each specification is the same as for
914 'input-charset-conversion.
917 The following additional properties are recognized (and required)
921 CCL program used for decoding (converting to internal format).
924 CCL program used for encoding (converting to external format).
926 (name, type, doc_string, props))
928 Lisp_Coding_System *codesys;
929 enum coding_system_type ty;
930 int need_to_setup_eol_systems = 1;
932 /* Convert type to constant */
933 if (NILP (type) || EQ (type, Qundecided))
934 { ty = CODESYS_AUTODETECT; }
936 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
937 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
938 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
939 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
940 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
941 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
943 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
945 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
948 signal_simple_error ("Invalid coding system type", type);
952 codesys = allocate_coding_system (ty, name);
954 if (NILP (doc_string))
955 doc_string = build_string ("");
957 CHECK_STRING (doc_string);
958 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
961 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
963 if (EQ (key, Qmnemonic))
966 CHECK_STRING (value);
967 CODING_SYSTEM_MNEMONIC (codesys) = value;
970 else if (EQ (key, Qeol_type))
972 need_to_setup_eol_systems = NILP (value);
975 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
978 else if (EQ (key, Qpost_read_conversion))
979 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
980 else if (EQ (key, Qpre_write_conversion))
981 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
983 else if (EQ (key, Qdisable_composition))
984 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
985 else if (EQ (key, Quse_entity_reference))
986 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
989 else if (ty == CODESYS_ISO2022)
991 #define FROB_INITIAL_CHARSET(charset_num) \
992 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
993 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
995 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
996 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
997 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
998 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1000 #define FROB_FORCE_CHARSET(charset_num) \
1001 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1003 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1004 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1005 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1006 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1008 #define FROB_BOOLEAN_PROPERTY(prop) \
1009 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1011 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1012 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1013 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1014 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1015 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1016 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1017 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1019 else if (EQ (key, Qinput_charset_conversion))
1021 codesys->iso2022.input_conv =
1022 Dynarr_new (charset_conversion_spec);
1023 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1026 else if (EQ (key, Qoutput_charset_conversion))
1028 codesys->iso2022.output_conv =
1029 Dynarr_new (charset_conversion_spec);
1030 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1034 signal_simple_error ("Unrecognized property", key);
1037 else if (ty == CODESYS_BIG5)
1039 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1040 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1042 signal_simple_error ("Unrecognized property", key);
1045 else if (EQ (type, Qccl))
1048 struct ccl_program test_ccl;
1051 /* Check key first. */
1052 if (EQ (key, Qdecode))
1053 suffix = "-ccl-decode";
1054 else if (EQ (key, Qencode))
1055 suffix = "-ccl-encode";
1057 signal_simple_error ("Unrecognized property", key);
1059 /* If value is vector, register it as a ccl program
1060 associated with a newly created symbol for
1061 backward compatibility. */
1062 if (VECTORP (value))
1064 sym = Fintern (concat2 (Fsymbol_name (name),
1065 build_string (suffix)),
1067 Fregister_ccl_program (sym, value);
1071 CHECK_SYMBOL (value);
1074 /* check if the given ccl programs are valid. */
1075 if (setup_ccl_program (&test_ccl, sym) < 0)
1076 signal_simple_error ("Invalid CCL program", value);
1078 if (EQ (key, Qdecode))
1079 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1080 else if (EQ (key, Qencode))
1081 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1086 signal_simple_error ("Unrecognized property", key);
1090 if (need_to_setup_eol_systems)
1091 setup_eol_coding_systems (codesys);
1094 Lisp_Object codesys_obj;
1095 XSETCODING_SYSTEM (codesys_obj, codesys);
1096 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
/* Lisp primitive copy-coding-system.  OLD-CODING-SYSTEM is resolved with
   Fget_coding_system and NEW-NAME is looked up with Ffind_coding_system.
   When that lookup yields nil, a coding system object of the same type as
   the old one is allocated and entered into Vcoding_system_hash_table
   under NEW-NAME.  Everything that follows the header member is then
   copied from the old object with memcpy, the name slot is reset to
   NEW-NAME, and the new coding system object is returned.  */
1101 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1102 Copy OLD-CODING-SYSTEM to NEW-NAME.
1103 If NEW-NAME does not name an existing coding system, a new one will
1106 (old_coding_system, new_name))
1108 Lisp_Object new_coding_system;
1109 old_coding_system = Fget_coding_system (old_coding_system);
1110 new_coding_system = Ffind_coding_system (new_name);
1111 if (NILP (new_coding_system))
1113 XSETCODING_SYSTEM (new_coding_system,
1114 allocate_coding_system
1115 (XCODING_SYSTEM_TYPE (old_coding_system),
1117 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1121 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1122 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1123 memcpy (((char *) to ) + sizeof (to->header),
1124 ((char *) from) + sizeof (from->header),
1125 sizeof (*from) - sizeof (from->header));
1126 to->name = new_name;
1128 return new_coding_system;
1131 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1132 Return t if OBJECT names a coding system, and is not a coding system alias.
1136 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1140 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1141 Return t if OBJECT is a coding system alias.
1142 All coding system aliases are created by `define-coding-system-alias'.
1146 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1150 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1151 Return the coding-system symbol for which symbol ALIAS is an alias.
1155 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1156 if (SYMBOLP (aliasee))
1159 signal_simple_error ("Symbol is not a coding system alias", alias);
1160 return Qnil; /* To keep the compiler happy */
1164 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1166 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1170 /* A maphash function, for removing dangling coding system aliases. */
1172 dangling_coding_system_alias_p (Lisp_Object alias,
1173 Lisp_Object aliasee,
1174 void *dangling_aliases)
1176 if (SYMBOLP (aliasee)
1177 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1179 (*(int *) dangling_aliases)++;
/* Lisp primitive define-coding-system-alias.  Visible flow: ALIAS must be
   a symbol and must not be the canonical name of a coding system.  On the
   undefine path (per the doc string, ALIASEE nil) ALIAS is removed from
   Vcoding_system_hash_table, its -unix, -dos and -mac subsidiary aliases
   are undefined too when all three are aliases, and elisp_map_remhash is
   rerun with dangling_coding_system_alias_p until a pass counts no
   dangling aliases.  On the define path a coding system object given as
   ALIASEE is replaced by its name, ALIASEE is validated with
   Fget_coding_system, the alias chain starting at ALIASEE is followed to
   reject loops, the mapping is stored in the hash table, and a subsidiary
   alias is defined recursively for each suffix whose ALIASEE subsidiary
   exists.
   NOTE(review): the branch conditions, the loop test and the return
   statements are not present in this listing.  */
1186 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1187 Define symbol ALIAS as an alias for coding system ALIASEE.
1189 You can use this function to redefine an alias that has already been defined,
1190 but you cannot redefine a name which is the canonical name for a coding system.
1191 \(a canonical name of a coding system is what is returned when you call
1192 `coding-system-name' on a coding system).
1194 ALIASEE itself can be an alias, which allows you to define nested aliases.
1196 You are forbidden, however, from creating alias loops or `dangling' aliases.
1197 These will be detected, and an error will be signaled if you attempt to do so.
1199 If ALIASEE is nil, then ALIAS will simply be undefined.
1201 See also `coding-system-alias-p', `coding-system-aliasee',
1202 and `coding-system-canonical-name-p'.
1206 Lisp_Object real_coding_system, probe;
1208 CHECK_SYMBOL (alias);
1210 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1212 ("Symbol is the canonical name of a coding system and cannot be redefined",
1217 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1218 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1219 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1221 Fremhash (alias, Vcoding_system_hash_table);
1223 /* Undefine subsidiary aliases,
1224 presumably created by a previous call to this function */
1225 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1226 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1227 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1229 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1230 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1231 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1234 /* Undefine dangling coding system aliases. */
1236 int dangling_aliases;
1239 dangling_aliases = 0;
1240 elisp_map_remhash (dangling_coding_system_alias_p,
1241 Vcoding_system_hash_table,
1243 } while (dangling_aliases > 0);
1249 if (CODING_SYSTEMP (aliasee))
1250 aliasee = XCODING_SYSTEM_NAME (aliasee);
1252 /* Checks that aliasee names a coding-system */
1253 real_coding_system = Fget_coding_system (aliasee);
1255 /* Check for coding system alias loops */
1256 if (EQ (alias, aliasee))
1257 alias_loop: signal_simple_error_2
1258 ("Attempt to create a coding system alias loop", alias, aliasee);
1260 for (probe = aliasee;
1262 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1264 if (EQ (probe, alias))
1268 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1270 /* Set up aliases for subsidiaries.
1271 #### There must be a better way to handle subsidiary coding systems. */
1273 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1275 for (i = 0; i < countof (suffixes); i++)
1277 Lisp_Object alias_subsidiary =
1278 append_suffix_to_symbol (alias, suffixes[i]);
1279 Lisp_Object aliasee_subsidiary =
1280 append_suffix_to_symbol (aliasee, suffixes[i]);
1282 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1283 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1286 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1287 but it doesn't look intentional, so I'd rather return something
1288 meaningful or nothing at all. */
/* Return the EOL-specific variant of CODING_SYSTEM for TYPE.  A coding
   system whose own EOL type is not EOL_AUTODETECT is returned unchanged,
   and so is any coding system when TYPE is EOL_AUTODETECT.  Otherwise the
   LF, CR or CRLF subsidiary slot is consulted, falling back to
   CODING_SYSTEM itself when that slot is nil.  Any other TYPE aborts.  */
1293 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1295 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1296 Lisp_Object new_coding_system;
1298 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1299 return coding_system;
1303 case EOL_AUTODETECT: return coding_system;
1304 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1305 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1306 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1307 default: abort (); return Qnil;
1310 return NILP (new_coding_system) ? coding_system : new_coding_system;
1313 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1314 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1316 (coding_system, eol_type))
1318 coding_system = Fget_coding_system (coding_system);
1320 return subsidiary_coding_system (coding_system,
1321 symbol_to_eol_type (eol_type));
1325 /************************************************************************/
1326 /* Coding system accessors */
1327 /************************************************************************/
1329 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1330 Return the doc string for CODING-SYSTEM.
1334 coding_system = Fget_coding_system (coding_system);
1335 return XCODING_SYSTEM_DOC_STRING (coding_system);
/* Lisp primitive coding-system-type.  The coding system found by
   Fget_coding_system is switched on its CODESYS_* type and the matching
   type symbol is returned.
   NOTE(review): some case labels and any preprocessor conditionals around
   them are not present in this listing.  */
1338 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1339 Return the type of CODING-SYSTEM.
1343 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1346 case CODESYS_AUTODETECT: return Qundecided;
1348 case CODESYS_SHIFT_JIS: return Qshift_jis;
1349 case CODESYS_ISO2022: return Qiso2022;
1350 case CODESYS_BIG5: return Qbig5;
1351 case CODESYS_UCS4: return Qucs4;
1352 case CODESYS_UTF8: return Qutf8;
1353 case CODESYS_CCL: return Qccl;
1355 case CODESYS_NO_CONVERSION: return Qno_conversion;
1357 case CODESYS_INTERNAL: return Qinternal;
1364 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1367 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1369 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
/* Lisp primitive coding-system-charset.  CODING-SYSTEM is resolved with
   Fget_coding_system and the result of coding_system_charset for the
   integer value of GNUM is returned.
   NOTE(review): no type or range check of GNUM is visible in this
   listing - confirm that one precedes the use of XINT.  */
1372 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1373 Return initial charset of CODING-SYSTEM designated to GNUM.
1376 (coding_system, gnum))
1378 coding_system = Fget_coding_system (coding_system);
1381 return coding_system_charset (coding_system, XINT (gnum));
/* Lisp primitive coding-system-property.  The visible code first scans
   the_codesys_prop_dynarr for PROP.  Depending on the prop_type of the
   matching entry it rejects ISO2022-only properties on coding systems
   that are not CODESYS_ISO2022 and CCL-only properties on coding systems
   that are not CODESYS_CCL.  An unrecognized-property error is raised
   after the scan (its guarding condition is not present in this
   listing).  The second half dispatches on PROP and returns the
   corresponding slot: the generic slots first, then the ISO2022 slots
   (flags are converted to t or nil, and charset conversion tables go
   through unparse_charset_conversion_specs), then the CCL decode and
   encode programs.  */
1385 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1386 Return the PROP property of CODING-SYSTEM.
1388 (coding_system, prop))
1391 enum coding_system_type type;
1393 coding_system = Fget_coding_system (coding_system);
1394 CHECK_SYMBOL (prop);
1395 type = XCODING_SYSTEM_TYPE (coding_system);
1397 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1398 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1401 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1403 case CODESYS_PROP_ALL_OK:
1406 case CODESYS_PROP_ISO2022:
1407 if (type != CODESYS_ISO2022)
1409 ("Property only valid in ISO2022 coding systems",
1413 case CODESYS_PROP_CCL:
1414 if (type != CODESYS_CCL)
1416 ("Property only valid in CCL coding systems",
1426 signal_simple_error ("Unrecognized property", prop);
1428 if (EQ (prop, Qname))
1429 return XCODING_SYSTEM_NAME (coding_system);
1430 else if (EQ (prop, Qtype))
1431 return Fcoding_system_type (coding_system);
1432 else if (EQ (prop, Qdoc_string))
1433 return XCODING_SYSTEM_DOC_STRING (coding_system);
1434 else if (EQ (prop, Qmnemonic))
1435 return XCODING_SYSTEM_MNEMONIC (coding_system);
1436 else if (EQ (prop, Qeol_type))
1437 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1438 else if (EQ (prop, Qeol_lf))
1439 return XCODING_SYSTEM_EOL_LF (coding_system);
1440 else if (EQ (prop, Qeol_crlf))
1441 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1442 else if (EQ (prop, Qeol_cr))
1443 return XCODING_SYSTEM_EOL_CR (coding_system);
1444 else if (EQ (prop, Qpost_read_conversion))
1445 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1446 else if (EQ (prop, Qpre_write_conversion))
1447 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1450 else if (EQ (prop, Qdisable_composition))
1451 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1452 else if (EQ (prop, Quse_entity_reference))
1453 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1455 else if (type == CODESYS_ISO2022)
1457 if (EQ (prop, Qcharset_g0))
1458 return coding_system_charset (coding_system, 0);
1459 else if (EQ (prop, Qcharset_g1))
1460 return coding_system_charset (coding_system, 1);
1461 else if (EQ (prop, Qcharset_g2))
1462 return coding_system_charset (coding_system, 2);
1463 else if (EQ (prop, Qcharset_g3))
1464 return coding_system_charset (coding_system, 3);
1466 #define FORCE_CHARSET(charset_num) \
1467 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1468 (coding_system, charset_num) ? Qt : Qnil)
1470 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1471 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1472 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1473 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1475 #define LISP_BOOLEAN(prop) \
1476 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1478 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1479 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1480 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1481 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1482 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1483 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1484 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1486 else if (EQ (prop, Qinput_charset_conversion))
1488 unparse_charset_conversion_specs
1489 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1490 else if (EQ (prop, Qoutput_charset_conversion))
1492 unparse_charset_conversion_specs
1493 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1497 else if (type == CODESYS_CCL)
1499 if (EQ (prop, Qdecode))
1500 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1501 else if (EQ (prop, Qencode))
1502 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1510 return Qnil; /* not reached */
1514 /************************************************************************/
1515 /* Coding category functions */
1516 /************************************************************************/
/* Translate a coding category symbol into its index.  SYMBOL must be a
   symbol (CHECK_SYMBOL); it is compared against each entry of
   coding_category_symbol, and a symbol that matches no entry signals an
   unrecognized coding category error.
   NOTE(review): the return on a match is not present in this listing;
   callers use the result as an index into the category tables.  */
1519 decode_coding_category (Lisp_Object symbol)
1523 CHECK_SYMBOL (symbol);
1524 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1525 if (EQ (coding_category_symbol[i], symbol))
1528 signal_simple_error ("Unrecognized coding category", symbol);
1529 return 0; /* not reached */
1532 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1533 Return a list of all recognized coding categories.
1538 Lisp_Object list = Qnil;
1540 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1541 list = Fcons (coding_category_symbol[i], list);
/* Lisp primitive set-coding-priority-list.  A local table
   category_to_priority is filled with -1.  Each category named in LIST
   then receives the next priority number in list order, and a category
   named twice signals an error.  The categories still at -1 are numbered
   next, in their current order taken from
   fcd->coding_category_by_priority.  Finally the inverse mapping is
   written back to fcd->coding_category_by_priority.
   NOTE(review): the statement that initializes the priority counter
   before the list loop and the return are not present in this listing.  */
1545 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1546 Change the priority order of the coding categories.
1547 LIST should be list of coding categories, in descending order of
1548 priority. Unspecified coding categories will be lower in priority
1549 than all specified ones, in the same relative order they were in
1554 int category_to_priority[CODING_CATEGORY_LAST];
1558 /* First generate a list that maps coding categories to priorities. */
1560 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1561 category_to_priority[i] = -1;
1563 /* Highest priority comes from the specified list. */
1565 EXTERNAL_LIST_LOOP (rest, list)
1567 int cat = decode_coding_category (XCAR (rest));
1569 if (category_to_priority[cat] >= 0)
1570 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1571 category_to_priority[cat] = i++;
1574 /* Now go through the existing categories by priority to retrieve
1575 the categories not yet specified and preserve their priority
1577 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1579 int cat = fcd->coding_category_by_priority[j];
1580 if (category_to_priority[cat] < 0)
1581 category_to_priority[cat] = i++;
1584 /* Now we need to construct the inverse of the mapping we just
1587 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1588 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1590 /* Phew! That was confusing. */
1594 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1595 Return a list of coding categories in descending order of priority.
1600 Lisp_Object list = Qnil;
1602 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1603 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
/* Lisp primitive set-coding-category-system.  CODING-CATEGORY is
   translated to an index with decode_coding_category, CODING-SYSTEM is
   resolved with Fget_coding_system, and the resulting object is stored in
   fcd->coding_category_system at that index.
   NOTE(review): the return statement is not present in this listing.  */
1608 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1609 Change the coding system associated with a coding category.
1611 (coding_category, coding_system))
1613 int cat = decode_coding_category (coding_category);
1615 coding_system = Fget_coding_system (coding_system);
1616 fcd->coding_category_system[cat] = coding_system;
/* Lisp primitive coding-category-system.  CODING-CATEGORY is translated
   to an index with decode_coding_category and the entry of
   fcd->coding_category_system at that index is fetched; the visible
   return yields the name of that coding system.
   NOTE(review): presumably an unset (nil) entry is handled by lines that
   are not present in this listing - confirm.  */
1620 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1621 Return the coding system associated with a coding category.
1625 int cat = decode_coding_category (coding_category);
1626 Lisp_Object sys = fcd->coding_category_system[cat];
1629 return XCODING_SYSTEM_NAME (sys);
1634 /************************************************************************/
1635 /* Detecting the encoding of data */
1636 /************************************************************************/
/* State carried between successive calls of the detection routines.
   Only a few members are present in this listing.  Other code in this
   file also reads or writes seen_non_ascii, mask, the per-category mask
   members (iso2022, shift_jis, big5, utf8, ucs4) and the eol members
   just_saw_cr and seen_anything.  */
1638 struct detection_state
1640 eol_type_t eol_type;
1676 struct iso2022_decoder iso;
1678 int high_byte_count;
1679 unsigned int saw_single_shift:1;
/* Predicate on a character code C, used by detect_coding_type: a byte
   below 0x20 for which this predicate is false makes the text count as
   non-ASCII.
   NOTE(review): the switch header, some case labels and the return
   statements are not present in this listing.  */
1692 acceptable_control_char_p (int c)
1696 /* Allow and ignore control characters that you might
1697 reasonably see in a text file */
1702 case 8: /* backspace */
1703 case 11: /* vertical tab */
1704 case 12: /* form feed */
1705 case 26: /* MS-DOS C-z junk */
1706 case 31: /* '^_' -- for info */
/* Return nonzero when MASK has zero or one bit set, and zero otherwise.
   Subtracting one clears the lowest set bit and sets every bit below it,
   so the bitwise AND with the original value is zero exactly when no
   other bit was set.  The arithmetic is carried out on the unsigned
   representation so that a mask consisting of only the sign bit does not
   cause signed overflow in the subtraction.  */
static int
mask_has_at_most_one_bit_p (int mask)
{
  unsigned int bits = (unsigned int) mask;

  /* Perhaps the only thing useful you learn from intensive Microsoft
     technical interviews */
  return (bits & (bits - 1)) == 0;
}
/* Scan the bytes at SRC for line terminators.  The flags
   st->eol.just_saw_cr and st->eol.seen_anything are updated as bytes are
   consumed, so the scan state survives across calls.  EOL_AUTODETECT is
   returned when the visible final return is reached.
   NOTE(review): the loop header, the tests on the byte value and the
   other return statements are not present in this listing.  */
1722 detect_eol_type (struct detection_state *st, const Extbyte *src,
1727 unsigned char c = *(unsigned char *)src++;
1730 if (st->eol.just_saw_cr)
1732 else if (st->eol.seen_anything)
1735 else if (st->eol.just_saw_cr)
1738 st->eol.just_saw_cr = 1;
1740 st->eol.just_saw_cr = 0;
1741 st->eol.seen_anything = 1;
1744 return EOL_AUTODETECT;
1747 /* Attempt to determine the encoding and EOL type of the given text.
1748 Before calling this function for the first time, you must initialize
1749 st->eol_type as appropriate and initialize st->mask to ~0.
1751 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1754 st->mask holds the determined coding category mask, or ~0 if only
1755 ASCII has been seen so far.
Once a byte that is >= 0x80, or a control character rejected by
acceptable_control_char_p, has been seen, the per-category detectors
are run for every category whose mask is not yet narrowed to at most
one bit, and st->mask becomes the union of their results.
1759 0 == st->eol_type is EOL_AUTODETECT and/or more than one coding category
1760 is present in st->mask
1761 1 == definitive answers are here for both st->eol_type and st->mask
1765 detect_coding_type (struct detection_state *st, const Extbyte *src,
1766 size_t n, int just_do_eol)
1768 if (st->eol_type == EOL_AUTODETECT)
1769 st->eol_type = detect_eol_type (st, src, n);
1772 return st->eol_type != EOL_AUTODETECT;
1774 if (!st->seen_non_ascii)
1776 for (; n; n--, src++)
1778 unsigned char c = *(unsigned char *) src;
1779 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1781 st->seen_non_ascii = 1;
1783 st->shift_jis.mask = ~0;
1787 st->iso2022.mask = ~0;
1797 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1798 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1799 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1800 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1801 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1802 st->big5.mask = detect_coding_big5 (st, src, n);
1803 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1804 st->utf8.mask = detect_coding_utf8 (st, src, n);
1805 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1806 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1809 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1810 | st->utf8.mask | st->ucs4.mask;
1813 int retval = mask_has_at_most_one_bit_p (st->mask);
1814 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1815 return retval && st->eol_type != EOL_AUTODETECT;
/* Map a coding category MASK to a coding system object.  In the first
   visible branch (per its comment, text that was entirely or basically
   ASCII) the default buffer-file-coding-system stored in Vbuffer_defaults
   is resolved with Ffind_coding_system; the visible warning path resets
   that default to nil, and raw-text is fetched as a fallback.  Otherwise
   MASK is passed through postprocess_iso2022_mask and the categories are
   tried in priority order: the first one whose bit is set in MASK and
   that has a coding system assigned is returned.  If none qualifies,
   raw-text is returned.
   NOTE(review): the branch conditions are not present in this listing.  */
1820 coding_system_from_mask (int mask)
1824 /* If the file was entirely or basically ASCII, use the
1825 default value of `buffer-file-coding-system'. */
1826 Lisp_Object retval =
1827 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1830 retval = Ffind_coding_system (retval);
1834 (Qbad_variable, Qwarning,
1835 "Invalid `default-buffer-file-coding-system', set to nil");
1836 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1840 retval = Fget_coding_system (Qraw_text);
1848 mask = postprocess_iso2022_mask (mask);
1850 /* Look through the coding categories by priority and find
1851 the first one that is allowed. */
1852 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1854 cat = fcd->coding_category_by_priority[i];
1855 if ((mask & (1 << cat)) &&
1856 !NILP (fcd->coding_category_system[cat]))
1860 return fcd->coding_category_system[cat];
1862 return Fget_coding_system (Qraw_text);
/* Overview of the visible code: an EOL type of EOL_AUTODETECT is first
   replaced by the EOL type of the given coding system.  If the coding
   system type or the EOL type still calls for autodetection, a buffer is
   read from STREAM and searched for a mode line cookie of the form
   -*- ... coding: NAME ... -*-; a NAME found there is looked up with
   Ffind_coding_system.  Without such a coding system the stream is fed
   to detect_coding_type buffer by buffer; with one that leaves the EOL
   type open, only EOL detection is run.  The results are stored through
   the two in/out pointers, EOL_LF is used when the EOL type could not be
   determined, and STREAM is rewound before returning.
   NOTE(review): scan_end is computed as buf + nread minus a constant
   length; whether nread can be smaller than that length at this point is
   not visible in this listing - confirm that the scan loops cope.  */
1866 /* Given a seekable read stream and potential coding system and EOL type
1867 as specified, do any autodetection that is called for. If the
1868 coding system and/or EOL type are not `autodetect', they will be left
1869 alone; but this function will never return an autodetect coding system
1872 This function does not automatically fetch subsidiary coding systems;
1873 that should be unnecessary with the explicit eol-type argument. */
1875 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1878 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1879 eol_type_t *eol_type_in_out)
1881 struct detection_state decst;
1883 if (*eol_type_in_out == EOL_AUTODETECT)
1884 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1887 decst.eol_type = *eol_type_in_out;
1890 /* If autodetection is called for, do it now. */
1891 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1892 || *eol_type_in_out == EOL_AUTODETECT)
1895 Lisp_Object coding_system = Qnil;
1897 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1900 /* Look for initial "-*-"; mode line prefix */
1902 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1907 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1909 Extbyte *local_vars_beg = p + 3;
1910 /* Look for final "-*-"; mode line suffix */
1911 for (p = local_vars_beg,
1912 scan_end = buf + nread - LENGTH ("-*-");
1917 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1919 Extbyte *suffix = p;
1920 /* Look for "coding:" */
1921 for (p = local_vars_beg,
1922 scan_end = suffix - LENGTH ("coding:?");
1925 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1926 && (p == local_vars_beg
1927 || (*(p-1) == ' ' ||
1933 p += LENGTH ("coding:");
1934 while (*p == ' ' || *p == '\t') p++;
1936 /* Get coding system name */
1937 save = *suffix; *suffix = '\0';
1938 /* Characters valid in a MIME charset name (rfc 1521),
1939 and in a Lisp symbol name. */
1940 n = strspn ( (char *) p,
1941 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1942 "abcdefghijklmnopqrstuvwxyz"
1948 save = p[n]; p[n] = '\0';
1950 Ffind_coding_system (intern ((char *) p));
1960 if (NILP (coding_system))
1963 if (detect_coding_type (&decst, buf, nread,
1964 XCODING_SYSTEM_TYPE (*codesys_in_out)
1965 != CODESYS_AUTODETECT))
1967 nread = Lstream_read (stream, buf, sizeof (buf));
1973 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1974 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1977 if (detect_coding_type (&decst, buf, nread, 1))
1979 nread = Lstream_read (stream, buf, sizeof (buf));
1985 *eol_type_in_out = decst.eol_type;
1986 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1988 if (NILP (coding_system))
1989 *codesys_in_out = coding_system_from_mask (decst.mask);
1991 *codesys_in_out = coding_system;
1995 /* If we absolutely can't determine the EOL type, just assume LF. */
1996 if (*eol_type_in_out == EOL_AUTODETECT)
1997 *eol_type_in_out = EOL_LF;
1999 Lstream_rewind (stream);
/* Lisp primitive detect-coding-region.  The region is exposed as a Lisp
   buffer input stream, wrapped in an encoding stream that uses the binary
   coding system, and both stream objects are GC-protected.  Blocks of up
   to 4096 bytes are read and passed to detect_coding_type.  If the mask
   is still ~0 afterwards, the result is the subsidiary of the undecided
   coding system.  Otherwise, after postprocess_iso2022_mask, the
   categories are walked from lowest to highest priority and the coding
   system of every category present in the mask is consed onto the front
   of the result, so the highest priority comes first.  The streams are
   closed and deleted before returning.
   NOTE(review): the loop structure and the return statement are not
   present in this listing.  */
2002 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2003 Detect coding system of the text in the region between START and END.
2004 Return a list of possible coding systems ordered by priority.
2005 If only ASCII characters are found, return 'undecided or one of
2006 its subsidiary coding systems according to a detected end-of-line
2007 type. Optional arg BUFFER defaults to the current buffer.
2009 (start, end, buffer))
2011 Lisp_Object val = Qnil;
2012 struct buffer *buf = decode_buffer (buffer, 0);
2014 Lisp_Object instream, lb_instream;
2015 Lstream *istr, *lb_istr;
2016 struct detection_state decst;
2017 struct gcpro gcpro1, gcpro2;
2019 get_buffer_range_char (buf, start, end, &b, &e, 0);
2020 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2021 lb_istr = XLSTREAM (lb_instream);
2022 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2023 istr = XLSTREAM (instream);
2024 GCPRO2 (instream, lb_instream);
2026 decst.eol_type = EOL_AUTODETECT;
2030 Extbyte random_buffer[4096];
2031 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2035 if (detect_coding_type (&decst, random_buffer, nread, 0))
2039 if (decst.mask == ~0)
2040 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2048 decst.mask = postprocess_iso2022_mask (decst.mask);
2050 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2052 int sys = fcd->coding_category_by_priority[i];
2053 if (decst.mask & (1 << sys))
2055 Lisp_Object codesys = fcd->coding_category_system[sys];
2056 if (!NILP (codesys))
2057 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2058 val = Fcons (codesys, val);
2062 Lstream_close (istr);
2064 Lstream_delete (istr);
2065 Lstream_delete (lb_istr);
2070 /************************************************************************/
2071 /* Converting to internal Mule format ("decoding") */
2072 /************************************************************************/
2074 /* A decoding stream is a stream used for decoding text (i.e.
2075 converting from some external format to internal format).
2076 The decoding-stream object keeps track of the actual coding
2077 stream, the stream that is at the other end, and data that
2078 needs to be persistent across the lifetime of the stream. */
2080 /* Handle the EOL stuff related to just-read-in character C.
2081 EOL_TYPE is the EOL type of the coding stream.
2082 FLAGS is the current value of FLAGS in the coding stream, and may
2083 be modified by this macro. (The macro only looks at the
2084 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2085 bytes are to be written. You need to also define a local goto
2086 label "label_continue_loop" that is at the end of the main
2087 character-reading loop.
2089 If C is a CR character, then this macro handles it entirely and
2090 jumps to label_continue_loop. Otherwise, this macro does not add
2091 anything to DST, and continues normally. You should continue
2092 processing C normally after this macro. */
/* Usage note: eol_type and flags each appear several times in the
   expansion, so pass side-effect-free expressions; flags must be an
   lvalue because CODING_STATE_CR is set and cleared in it.  */
2094 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2098 if (eol_type == EOL_CR) \
2099 Dynarr_add (dst, '\n'); \
2100 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2101 Dynarr_add (dst, c); \
2103 flags |= CODING_STATE_CR; \
2104 goto label_continue_loop; \
2106 else if (flags & CODING_STATE_CR) \
2107 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2109 Dynarr_add (dst, '\r'); \
2110 flags &= ~CODING_STATE_CR; \
2114 /* C should be a binary character in the range 0 - 255; convert
2115 to internal format and add to Dynarr DST.
In this definition an ASCII byte is added unchanged and any other
byte is added as the two bytes (c >> 6) | 0xc0 and (c & 0x3f) | 0x80.
NOTE(review): a second definition of this macro appears further
down; presumably a preprocessor conditional that is not present in
this listing selects between them. */
2118 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2120 if (BYTE_ASCII_P (c)) \
2121 Dynarr_add (dst, c); \
2124 Dynarr_add (dst, (c >> 6) | 0xc0); \
2125 Dynarr_add (dst, (c & 0x3f) | 0x80); \
/* Append the character code C to DST as a multi-byte sequence whose lead
   byte carries the length marker (0xc0, 0xe0, 0xf0, 0xf8 or 0xfc) and
   whose trail bytes each carry six payload bits ORed with 0x80: two bytes
   for codes up to 0x7ff, three up to 0xffff, four up to 0x1fffff, five up
   to 0x3ffffff and six otherwise.  The first visible branch adds C as a
   single byte; its condition is not present in this listing.  */
2129 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2131 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2135 Dynarr_add (dst, c);
2137 else if ( c <= 0x7ff )
2139 Dynarr_add (dst, (c >> 6) | 0xc0);
2140 Dynarr_add (dst, (c & 0x3f) | 0x80);
2142 else if ( c <= 0xffff )
2144 Dynarr_add (dst, (c >> 12) | 0xe0);
2145 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2146 Dynarr_add (dst, (c & 0x3f) | 0x80);
2148 else if ( c <= 0x1fffff )
2150 Dynarr_add (dst, (c >> 18) | 0xf0);
2151 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2152 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2153 Dynarr_add (dst, (c & 0x3f) | 0x80);
2155 else if ( c <= 0x3ffffff )
2157 Dynarr_add (dst, (c >> 24) | 0xf8);
2158 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2159 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2160 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2161 Dynarr_add (dst, (c & 0x3f) | 0x80);
2165 Dynarr_add (dst, (c >> 30) | 0xfc);
2166 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2167 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2168 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2169 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2170 Dynarr_add (dst, (c & 0x3f) | 0x80);
/* Second definition of DECODE_ADD_BINARY_CHAR (presumably selected by a
   preprocessor conditional that is not present in this listing).  An
   ASCII byte is added unchanged, a C1 byte is added as
   LEADING_BYTE_CONTROL_1 followed by c + 0x20, and the remaining branch
   adds LEADING_BYTE_LATIN_ISO8859_1 followed by c.  */
2174 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2176 if (BYTE_ASCII_P (c)) \
2177 Dynarr_add (dst, c); \
2178 else if (BYTE_C1_P (c)) \
2180 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2181 Dynarr_add (dst, c + 0x20); \
2185 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2186 Dynarr_add (dst, c); \
/* Emit CH through DECODE_ADD_BINARY_CHAR.  The destination is a variable
   named dst that must be in scope at the expansion site; it is not a
   macro parameter.
   NOTE(review): any condition guarding the call is not present in this
   listing.  */
2191 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2195 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* When CODING_STATE_END is set in FLAGS, output the pending partial
   character CH via DECODE_OUTPUT_PARTIAL_CHAR and, if CODING_STATE_CR is
   also set, add a carriage return to DST.  */
2200 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2202 if (flags & CODING_STATE_END) \
2204 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2205 if (flags & CODING_STATE_CR) \
2206 Dynarr_add (dst, '\r'); \
/* Shorthand for LSTREAM_TYPE_DATA (stream, decoding); presumably yields
   the struct decoding_stream that belongs to STREAM.  */
2210 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Per-stream state of a decoding stream.
   NOTE(review): several member declarations that the comments below
   describe are not present in this listing.  */
2212 struct decoding_stream
2214 /* Coding system that governs the conversion. */
2215 Lisp_Coding_System *codesys;
2217 /* Stream that we read the encoded data from or
2218 write the decoded data to. */
2221 /* If we are reading, then we can return only a fixed amount of
2222 data, so if the conversion resulted in too much data, we store it
2223 here for retrieval the next time around. */
2224 unsigned_char_dynarr *runoff;
2226 /* FLAGS holds flags indicating the current state of the decoding.
2227 Some of these flags are dependent on the coding system. */
2230 /* CPOS holds a partially built-up code-point of character. */
2233 /* EOL_TYPE specifies the type of end-of-line conversion that
2234 currently applies. We need to keep this separate from the
2235 EOL type stored in CODESYS because the latter might indicate
2236 automatic EOL-type detection while the former will always
2237 indicate a particular EOL type. */
2238 eol_type_t eol_type;
2240 /* Additional ISO2022 information. We define the structure above
2241 because it's also needed by the detection routines. */
2242 struct iso2022_decoder iso2022;
2244 /* Additional information (the state of the running CCL program)
2245 used by the CCL decoder. */
2246 struct ccl_program ccl;
2248 /* counter for UTF-8 or UCS-4 */
2249 unsigned char counter;
/* Bytes of a pending entity reference collected by decode_add_er_char,
   and how many of them are currently held. */
2252 unsigned char er_counter;
2253 unsigned char er_buf[16];
/* Characters buffered for composition and their count; both are reset
   by COMPOSE_FLUSH_CHARS, which also sets combining_table to nil. */
2255 unsigned combined_char_count;
2256 Emchar combined_chars[16];
2257 Lisp_Object combining_table;
2259 struct detection_state decst;
2263 extern Lisp_Object Vcharacter_composition_table;
/* If bytes of a pending entity reference are buffered in str->er_buf,
   append them unchanged to DST with Dynarr_add_many and reset
   str->er_counter to zero.  */
2266 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
2268 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
2270 if ( str->er_counter > 0)
2272 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2273 str->er_counter = 0;
/* Feed character C through the entity reference recognizer of STR.
   Visible behavior:
   - With an empty buffer (er_counter == 0), an ampersand is stored as the
     first buffered byte when the coding system uses entity references;
     otherwise C is written to DST with DECODE_ADD_UCS_CHAR.
   - In the branch that completes a reference, the buffered bytes are
     turned into a Lisp string and matched against patterns built from
     Vcoded_charset_entity_reference_alist, with decimal, lower-case hex
     or upper-case hex digit classes.  On a match the number is decoded
     with DECODE_CHAR for the charset found by Ffind_charset and written
     to DST.  A reference of the form &MCS-HEX is written as the code
     itself.  In the remaining case the buffered bytes followed by a
     semicolon are written literally.  er_counter is then reset.
   - When 16 bytes are already buffered or C is 0x7F or above, the buffer
     is flushed to DST and C is written after it.
   - Otherwise C is appended to the buffer.
   NOTE(review): the conditions that select the first two branches
   (presumably tests of C against the ampersand and the semicolon) are
   not present in this listing - confirm.  */
2277 void decode_add_er_char (struct decoding_stream *str, Emchar character,
2278 unsigned_char_dynarr* dst);
2280 decode_add_er_char (struct decoding_stream *str, Emchar c,
2281 unsigned_char_dynarr* dst)
2283 if (str->er_counter == 0)
2285 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
2288 str->er_buf[0] = '&';
2292 DECODE_ADD_UCS_CHAR (c, dst);
2296 Lisp_Object string = make_string (str->er_buf,
2298 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
2305 while (!NILP (rest))
2309 if (NILP (ccs = Ffind_charset (ccs)))
2324 pat = concat3 (build_string ("^&"),
2325 pat, build_string ("\\([0-9]+\\)$"));
2328 else if (EQ (ret, Qx))
2330 pat = concat3 (build_string ("^&"),
2331 pat, build_string ("\\([0-9a-f]+\\)$"));
2334 else if (EQ (ret, QX))
2336 pat = concat3 (build_string ("^&"),
2337 pat, build_string ("\\([0-9A-F]+\\)$"));
2343 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
2346 = XINT (Fstring_to_number
2347 (Fsubstring (string,
2348 Fmatch_beginning (make_int (1)),
2349 Fmatch_end (make_int (1))),
2352 DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst);
2357 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
2358 string, Qnil, Qnil)))
2361 = XINT (Fstring_to_number
2362 (Fsubstring (string,
2363 Fmatch_beginning (make_int (1)),
2364 Fmatch_end (make_int (1))),
2367 DECODE_ADD_UCS_CHAR (code, dst);
2371 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2372 Dynarr_add (dst, ';');
2375 str->er_counter = 0;
2377 else if ( (str->er_counter >= 16) || (c >= 0x7F) )
2379 Dynarr_add_many (dst, str->er_buf, str->er_counter);
2380 str->er_counter = 0;
2381 DECODE_ADD_UCS_CHAR (c, dst);
2384 str->er_buf[str->er_counter++] = c;
2388 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2390 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2394 for (i = 0; i < str->combined_char_count; i++)
2395 decode_add_er_char (str, str->combined_chars[i], dst);
2396 str->combined_char_count = 0;
2397 str->combining_table = Qnil;
2400 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
2401 unsigned_char_dynarr* dst);
2403 COMPOSE_ADD_CHAR (struct decoding_stream *str,
2404 Emchar character, unsigned_char_dynarr* dst)
2406 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2407 decode_add_er_char (str, character, dst);
2408 else if (!CHAR_TABLEP (str->combining_table))
2411 = get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2415 decode_add_er_char (str, character, dst);
2418 str->combined_chars[0] = character;
2419 str->combined_char_count = 1;
2420 str->combining_table = ret;
2426 = get_char_id_table (XCHAR_TABLE(str->combining_table),
2431 Emchar char2 = XCHARVAL (ret);
2433 get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2437 decode_add_er_char (str, character, dst);
2438 str->combined_char_count = 0;
2439 str->combining_table = Qnil;
2443 str->combined_chars[0] = char2;
2444 str->combined_char_count = 1;
2445 str->combining_table = ret;
2448 else if (CHAR_TABLEP (ret))
2450 str->combined_chars[str->combined_char_count++] = character;
2451 str->combining_table = ret;
2455 COMPOSE_FLUSH_CHARS (str, dst);
2456 decode_add_er_char (str, character, dst);
2460 #else /* not UTF2000 */
2461 #define COMPOSE_FLUSH_CHARS(str, dst)
2462 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2463 #endif /* UTF2000 */
2465 static ssize_t decoding_reader (Lstream *stream,
2466 unsigned char *data, size_t size);
2467 static ssize_t decoding_writer (Lstream *stream,
2468 const unsigned char *data, size_t size);
2469 static int decoding_rewinder (Lstream *stream);
2470 static int decoding_seekable_p (Lstream *stream);
2471 static int decoding_flusher (Lstream *stream);
2472 static int decoding_closer (Lstream *stream);
2474 static Lisp_Object decoding_marker (Lisp_Object stream);
2476 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2477 sizeof (struct decoding_stream));
2480 decoding_marker (Lisp_Object stream)
2482 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2483 Lisp_Object str_obj;
2485 /* We do not need to mark the coding systems or charsets stored
2486 within the stream because they are stored in a global list
2487 and automatically marked. */
2489 XSETLSTREAM (str_obj, str);
2490 mark_object (str_obj);
2491 if (str->imp->marker)
2492 return (str->imp->marker) (str_obj);
2497 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2498 so we read data from the other end, decode it, and store it into DATA. */
2501 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2503 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2504 unsigned char *orig_data = data;
2506 int error_occurred = 0;
2508 /* We need to interface to mule_decode(), which expects to take some
2509 amount of data and store the result into a Dynarr. We have
2510 mule_decode() store into str->runoff, and take data from there
2513 /* We loop until we have enough data, reading chunks from the other
2514 end and decoding it. */
2517 /* Take data from the runoff if we can. Make sure to take at
2518 most SIZE bytes, and delete the data from the runoff. */
2519 if (Dynarr_length (str->runoff) > 0)
2521 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2522 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2523 Dynarr_delete_many (str->runoff, 0, chunk);
2529 break; /* No more room for data */
2531 if (str->flags & CODING_STATE_END)
2532 /* This means that on the previous iteration, we hit the EOF on
2533 the other end. We loop once more so that mule_decode() can
2534 output any final stuff it may be holding, or any "go back
2535 to a sane state" escape sequences. (This latter makes sense
2536 during encoding.) */
2539 /* Exhausted the runoff, so get some more. DATA has at least
2540 SIZE bytes left of storage in it, so it's OK to read directly
2541 into it. (We'll be overwriting above, after we've decoded it
2542 into the runoff.) */
2543 read_size = Lstream_read (str->other_end, data, size);
2550 /* There might be some more end data produced in the translation.
2551 See the comment above. */
2552 str->flags |= CODING_STATE_END;
2553 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2556 if (data - orig_data == 0)
2557 return error_occurred ? -1 : 0;
2559 return data - orig_data;
2563 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2565 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2568 /* Decode all our data into the runoff, and then attempt to write
2569 it all out to the other end. Remove whatever chunk we succeeded
2571 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2572 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2573 Dynarr_length (str->runoff));
2575 Dynarr_delete_many (str->runoff, 0, retval);
2576 /* Do NOT return retval. The return value indicates how much
2577 of the incoming data was written, not how many bytes were
2583 reset_decoding_stream (struct decoding_stream *str)
2586 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2588 Lisp_Object coding_system;
2589 XSETCODING_SYSTEM (coding_system, str->codesys);
2590 reset_iso2022 (coding_system, &str->iso2022);
2592 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2594 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2599 str->er_counter = 0;
2600 str->combined_char_count = 0;
2601 str->combining_table = Qnil;
2603 str->flags = str->cpos = 0;
2607 decoding_rewinder (Lstream *stream)
2609 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2610 reset_decoding_stream (str);
2611 Dynarr_reset (str->runoff);
2612 return Lstream_rewind (str->other_end);
2616 decoding_seekable_p (Lstream *stream)
2618 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2619 return Lstream_seekable_p (str->other_end);
2623 decoding_flusher (Lstream *stream)
2625 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2626 return Lstream_flush (str->other_end);
2630 decoding_closer (Lstream *stream)
2632 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2633 if (stream->flags & LSTREAM_FL_WRITE)
2635 str->flags |= CODING_STATE_END;
2636 decoding_writer (stream, 0, 0);
2638 Dynarr_free (str->runoff);
2640 #ifdef ENABLE_COMPOSITE_CHARS
2641 if (str->iso2022.composite_chars)
2642 Dynarr_free (str->iso2022.composite_chars);
2645 return Lstream_close (str->other_end);
2649 decoding_stream_coding_system (Lstream *stream)
2651 Lisp_Object coding_system;
2652 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2654 XSETCODING_SYSTEM (coding_system, str->codesys);
2655 return subsidiary_coding_system (coding_system, str->eol_type);
2659 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2661 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2662 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2664 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2665 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2666 reset_decoding_stream (str);
2669 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2670 stream for writing, no automatic code detection will be performed.
2671 The reason for this is that automatic code detection requires a
2672 seekable input. Things will also fail if you open a decoding
2673 stream for reading using a non-fully-specified coding system and
2674 a non-seekable input stream. */
2677 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2680 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2681 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2685 str->other_end = stream;
2686 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2687 str->eol_type = EOL_AUTODETECT;
2688 if (!strcmp (mode, "r")
2689 && Lstream_seekable_p (stream))
2690 /* We can determine the coding system now. */
2691 determine_real_coding_system (stream, &codesys, &str->eol_type);
2692 set_decoding_stream_coding_system (lstr, codesys);
2693 str->decst.eol_type = str->eol_type;
2694 str->decst.mask = ~0;
2695 XSETLSTREAM (obj, lstr);
2700 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2702 return make_decoding_stream_1 (stream, codesys, "r");
2706 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2708 return make_decoding_stream_1 (stream, codesys, "w");
2711 /* Note: the decode_coding_* functions all take the same
2712 arguments as mule_decode(), which is to say some SRC data of
2713 size N, which is to be stored into dynamic array DST.
2714 DECODING is the stream within which the decoding is
2715 taking place, but no data is actually read from or
2716 written to that stream; that is handled in decoding_reader()
2717 or decoding_writer(). This allows the same functions to
2718 be used for both reading and writing. */
2721 mule_decode (Lstream *decoding, const Extbyte *src,
2722 unsigned_char_dynarr *dst, size_t n)
2724 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2726 /* If necessary, do encoding-detection now. We do this when
2727 we're a writing stream or a non-seekable reading stream,
2728 meaning that we can't just process the whole input,
2729 rewind, and start over. */
2731 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2732 str->eol_type == EOL_AUTODETECT)
2734 Lisp_Object codesys;
2736 XSETCODING_SYSTEM (codesys, str->codesys);
2737 detect_coding_type (&str->decst, src, n,
2738 CODING_SYSTEM_TYPE (str->codesys) !=
2739 CODESYS_AUTODETECT);
2740 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2741 str->decst.mask != ~0)
2742 /* #### This is cheesy. What we really ought to do is
2743 buffer up a certain amount of data so as to get a
2744 less random result. */
2745 codesys = coding_system_from_mask (str->decst.mask);
2746 str->eol_type = str->decst.eol_type;
2747 if (XCODING_SYSTEM (codesys) != str->codesys)
2749 /* Preserve the CODING_STATE_END flag in case it was set.
2750 If we erase it, bad things might happen. */
2751 int was_end = str->flags & CODING_STATE_END;
2752 set_decoding_stream_coding_system (decoding, codesys);
2754 str->flags |= CODING_STATE_END;
2758 switch (CODING_SYSTEM_TYPE (str->codesys))
2761 case CODESYS_INTERNAL:
2762 Dynarr_add_many (dst, src, n);
2765 case CODESYS_AUTODETECT:
2766 /* If we got this far and still haven't decided on the coding
2767 system, then do no conversion. */
2768 case CODESYS_NO_CONVERSION:
2769 decode_coding_no_conversion (decoding, src, dst, n);
2772 case CODESYS_SHIFT_JIS:
2773 decode_coding_sjis (decoding, src, dst, n);
2776 decode_coding_big5 (decoding, src, dst, n);
2779 decode_coding_ucs4 (decoding, src, dst, n);
2782 decode_coding_utf8 (decoding, src, dst, n);
2785 str->ccl.last_block = str->flags & CODING_STATE_END;
2786 /* When applying ccl program to stream, MUST NOT set NULL
2788 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2789 dst, n, 0, CCL_MODE_DECODING);
2791 case CODESYS_ISO2022:
2792 decode_coding_iso2022 (decoding, src, dst, n);
2800 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2801 Decode the text between START and END which is encoded in CODING-SYSTEM.
2802 This is useful if you've read in encoded text from a file without decoding
2803 it (e.g. you read in a JIS-formatted file but used the `binary' or
2804 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2805 Return length of decoded text.
2806 BUFFER defaults to the current buffer if unspecified.
2808 (start, end, coding_system, buffer))
2811 struct buffer *buf = decode_buffer (buffer, 0);
2812 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2813 Lstream *istr, *ostr;
2814 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2816 get_buffer_range_char (buf, start, end, &b, &e, 0);
2818 barf_if_buffer_read_only (buf, b, e);
2820 coding_system = Fget_coding_system (coding_system);
2821 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2822 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2823 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2825 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2826 Fget_coding_system (Qbinary));
2827 istr = XLSTREAM (instream);
2828 ostr = XLSTREAM (outstream);
2829 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2831 /* The chain of streams looks like this:
2833 [BUFFER] <----- send through
2834 ------> [ENCODE AS BINARY]
2835 ------> [DECODE AS SPECIFIED]
2841 char tempbuf[1024]; /* some random amount */
2842 Bufpos newpos, even_newer_pos;
2843 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2844 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2848 newpos = lisp_buffer_stream_startpos (istr);
2849 Lstream_write (ostr, tempbuf, size_in_bytes);
2850 even_newer_pos = lisp_buffer_stream_startpos (istr);
2851 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2854 Lstream_close (istr);
2855 Lstream_close (ostr);
2857 Lstream_delete (istr);
2858 Lstream_delete (ostr);
2859 Lstream_delete (XLSTREAM (de_outstream));
2860 Lstream_delete (XLSTREAM (lb_outstream));
2865 /************************************************************************/
2866 /* Converting to an external encoding ("encoding") */
2867 /************************************************************************/
2869 /* An encoding stream is an output stream. When you create the
2870 stream, you specify the coding system that governs the encoding
2871 and another stream that the resulting encoded data is to be
2872 sent to, and then start sending data to it. */
2874 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2876 struct encoding_stream
2878 /* Coding system that governs the conversion. */
2879 Lisp_Coding_System *codesys;
2881 /* Stream that we read the unencoded data from or
2882 write the encoded data to. */
2885 /* If we are reading, then we can return only a fixed amount of
2886 data, so if the conversion resulted in too much data, we store it
2887 here for retrieval the next time around. */
2888 unsigned_char_dynarr *runoff;
2890 /* FLAGS holds flags indicating the current state of the encoding.
2891 Some of these flags are dependent on the coding system. */
2894 /* CH holds a partially built-up character. Since we only deal
2895 with one- and two-byte characters at the moment, we only use
2896 this to store the first byte of a two-byte character. */
2899 /* Additional information used by the ISO2022 encoder. */
2902 /* CHARSET holds the character sets currently assigned to the G0
2903 through G3 registers. It is initialized from the array
2904 INITIAL_CHARSET in CODESYS. */
2905 Lisp_Object charset[4];
2907 /* Which registers are currently invoked into the left (GL) and
2908 right (GR) halves of the 8-bit encoding space? */
2909 int register_left, register_right;
2911 /* Whether we need to explicitly designate the charset in the
2912 G? register before using it. It is initialized from the
2913 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2914 unsigned char force_charset_on_output[4];
2916 /* Other state variables that need to be preserved across
2918 Lisp_Object current_charset;
2920 int current_char_boundary;
2923 void (*encode_char) (struct encoding_stream *str, Emchar c,
2924 unsigned_char_dynarr *dst, unsigned int *flags);
2925 void (*finish) (struct encoding_stream *str,
2926 unsigned_char_dynarr *dst, unsigned int *flags);
2928 /* Additional information (the state of the running CCL program)
2929 used by the CCL encoder. */
2930 struct ccl_program ccl;
2934 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2935 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2937 static int encoding_rewinder (Lstream *stream);
2938 static int encoding_seekable_p (Lstream *stream);
2939 static int encoding_flusher (Lstream *stream);
2940 static int encoding_closer (Lstream *stream);
2942 static Lisp_Object encoding_marker (Lisp_Object stream);
2944 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2945 sizeof (struct encoding_stream));
2948 encoding_marker (Lisp_Object stream)
2950 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2951 Lisp_Object str_obj;
2953 /* We do not need to mark the coding systems or charsets stored
2954 within the stream because they are stored in a global list
2955 and automatically marked. */
2957 XSETLSTREAM (str_obj, str);
2958 mark_object (str_obj);
2959 if (str->imp->marker)
2960 return (str->imp->marker) (str_obj);
2965 /* Read SIZE bytes of data and store it into DATA. We are an encoding stream
2966 so we read data from the other end, encode it, and store it into DATA. */
2969 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2971 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2972 unsigned char *orig_data = data;
2974 int error_occurred = 0;
2976 /* We need to interface to mule_encode(), which expects to take some
2977 amount of data and store the result into a Dynarr. We have
2978 mule_encode() store into str->runoff, and take data from there
2981 /* We loop until we have enough data, reading chunks from the other
2982 end and encoding it. */
2985 /* Take data from the runoff if we can. Make sure to take at
2986 most SIZE bytes, and delete the data from the runoff. */
2987 if (Dynarr_length (str->runoff) > 0)
2989 int chunk = min ((int) size, Dynarr_length (str->runoff));
2990 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2991 Dynarr_delete_many (str->runoff, 0, chunk);
2997 break; /* No more room for data */
2999 if (str->flags & CODING_STATE_END)
3000 /* This means that on the previous iteration, we hit the EOF on
3001 the other end. We loop once more so that mule_encode() can
3002 output any final stuff it may be holding, or any "go back
3003 to a sane state" escape sequences. (This latter makes sense
3004 during encoding.) */
3007 /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes
3008 left of storage in it, so it's OK to read directly into it.
3009 (We'll be overwriting above, after we've encoded it into the
3011 read_size = Lstream_read (str->other_end, data, size);
3018 /* There might be some more end data produced in the translation.
3019 See the comment above. */
3020 str->flags |= CODING_STATE_END;
3021 mule_encode (stream, data, str->runoff, read_size);
3024 if (data == orig_data)
3025 return error_occurred ? -1 : 0;
3027 return data - orig_data;
3031 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
3033 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3036 /* Encode all our data into the runoff, and then attempt to write
3037 it all out to the other end. Remove whatever chunk we succeeded
3039 mule_encode (stream, data, str->runoff, size);
3040 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
3041 Dynarr_length (str->runoff));
3043 Dynarr_delete_many (str->runoff, 0, retval);
3044 /* Do NOT return retval. The return value indicates how much
3045 of the incoming data was written, not how many bytes were
3051 reset_encoding_stream (struct encoding_stream *str)
3054 switch (CODING_SYSTEM_TYPE (str->codesys))
3056 case CODESYS_ISO2022:
3060 str->encode_char = &char_encode_iso2022;
3061 str->finish = &char_finish_iso2022;
3062 for (i = 0; i < 4; i++)
3064 str->iso2022.charset[i] =
3065 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
3066 str->iso2022.force_charset_on_output[i] =
3067 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
3069 str->iso2022.register_left = 0;
3070 str->iso2022.register_right = 1;
3071 str->iso2022.current_charset = Qnil;
3072 str->iso2022.current_half = 0;
3076 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
3079 str->encode_char = &char_encode_utf8;
3080 str->finish = &char_finish_utf8;
3083 str->encode_char = &char_encode_ucs4;
3084 str->finish = &char_finish_ucs4;
3086 case CODESYS_SHIFT_JIS:
3087 str->encode_char = &char_encode_shift_jis;
3088 str->finish = &char_finish_shift_jis;
3091 str->encode_char = &char_encode_big5;
3092 str->finish = &char_finish_big5;
3098 str->iso2022.current_char_boundary = 0;
3099 str->flags = str->ch = 0;
3103 encoding_rewinder (Lstream *stream)
3105 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3106 reset_encoding_stream (str);
3107 Dynarr_reset (str->runoff);
3108 return Lstream_rewind (str->other_end);
3112 encoding_seekable_p (Lstream *stream)
3114 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3115 return Lstream_seekable_p (str->other_end);
3119 encoding_flusher (Lstream *stream)
3121 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3122 return Lstream_flush (str->other_end);
3126 encoding_closer (Lstream *stream)
3128 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3129 if (stream->flags & LSTREAM_FL_WRITE)
3131 str->flags |= CODING_STATE_END;
3132 encoding_writer (stream, 0, 0);
3134 Dynarr_free (str->runoff);
3135 return Lstream_close (str->other_end);
3139 encoding_stream_coding_system (Lstream *stream)
3141 Lisp_Object coding_system;
3142 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3144 XSETCODING_SYSTEM (coding_system, str->codesys);
3145 return coding_system;
3149 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3151 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3152 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3154 reset_encoding_stream (str);
3158 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3161 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3162 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3166 str->runoff = Dynarr_new (unsigned_char);
3167 str->other_end = stream;
3168 set_encoding_stream_coding_system (lstr, codesys);
3169 XSETLSTREAM (obj, lstr);
3174 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3176 return make_encoding_stream_1 (stream, codesys, "r");
3180 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3182 return make_encoding_stream_1 (stream, codesys, "w");
3185 /* Convert N bytes of internally-formatted data stored in SRC to an
3186 external format, according to the encoding stream ENCODING.
3187 Store the encoded data into DST. */
3190 mule_encode (Lstream *encoding, const Bufbyte *src,
3191 unsigned_char_dynarr *dst, size_t n)
3193 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3195 switch (CODING_SYSTEM_TYPE (str->codesys))
3198 case CODESYS_INTERNAL:
3199 Dynarr_add_many (dst, src, n);
3202 case CODESYS_AUTODETECT:
3203 /* If we got this far and still haven't decided on the coding
3204 system, then do no conversion. */
3205 case CODESYS_NO_CONVERSION:
3206 encode_coding_no_conversion (encoding, src, dst, n);
3210 str->ccl.last_block = str->flags & CODING_STATE_END;
3211 /* When applying ccl program to stream, MUST NOT set NULL
3213 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3214 dst, n, 0, CCL_MODE_ENCODING);
3218 text_encode_generic (encoding, src, dst, n);
3222 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3223 Encode the text between START and END using CODING-SYSTEM.
3224 This will, for example, convert Japanese characters into stuff such as
3225 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3226 text. BUFFER defaults to the current buffer if unspecified.
3228 (start, end, coding_system, buffer))
3231 struct buffer *buf = decode_buffer (buffer, 0);
3232 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3233 Lstream *istr, *ostr;
3234 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3236 get_buffer_range_char (buf, start, end, &b, &e, 0);
3238 barf_if_buffer_read_only (buf, b, e);
3240 coding_system = Fget_coding_system (coding_system);
3241 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3242 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3243 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3244 Fget_coding_system (Qbinary));
3245 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3247 istr = XLSTREAM (instream);
3248 ostr = XLSTREAM (outstream);
3249 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3250 /* The chain of streams looks like this:
3252 [BUFFER] <----- send through
3253 ------> [ENCODE AS SPECIFIED]
3254 ------> [DECODE AS BINARY]
3259 char tempbuf[1024]; /* some random amount */
3260 Bufpos newpos, even_newer_pos;
3261 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3262 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3266 newpos = lisp_buffer_stream_startpos (istr);
3267 Lstream_write (ostr, tempbuf, size_in_bytes);
3268 even_newer_pos = lisp_buffer_stream_startpos (istr);
3269 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3275 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3276 Lstream_close (istr);
3277 Lstream_close (ostr);
3279 Lstream_delete (istr);
3280 Lstream_delete (ostr);
3281 Lstream_delete (XLSTREAM (de_outstream));
3282 Lstream_delete (XLSTREAM (lb_outstream));
3283 return make_int (retlen);
3290 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3291 unsigned_char_dynarr *dst, size_t n)
3294 unsigned char char_boundary;
3295 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3296 unsigned int flags = str->flags;
3297 Emchar ch = str->ch;
3299 char_boundary = str->iso2022.current_char_boundary;
3305 if (char_boundary == 0)
3333 (*str->encode_char) (str, c, dst, &flags);
3335 else if (char_boundary == 1)
3337 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3343 ch = (ch << 6) | (c & 0x3f);
3348 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3350 (*str->finish) (str, dst, &flags);
3355 str->iso2022.current_char_boundary = char_boundary;
3359 /************************************************************************/
3360 /* Shift-JIS methods */
3361 /************************************************************************/
3363 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3364 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3365 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3366 encoded by "position-code + 0x80". A character of JISX0208
3367 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two
3368 position-codes are divided and shifted so that they fit in the range
3371 --- CODE RANGE of Shift-JIS ---
3372 (character set) (range)
3374 JISX0201-Kana 0xA0 .. 0xDF
3375 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3376 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3377 -------------------------------
3381 /* Is this the first byte of a Shift-JIS two-byte char? */
3383 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3384 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3386 /* Is this the second byte of a Shift-JIS two-byte char? */
3388 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3389 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3391 #define BYTE_SJIS_KATAKANA_P(c) \
3392 ((c) >= 0xA1 && (c) <= 0xDF)
3395 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3399 unsigned char c = *(unsigned char *)src++;
3400 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3402 if (st->shift_jis.in_second_byte)
3404 st->shift_jis.in_second_byte = 0;
3408 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3409 st->shift_jis.in_second_byte = 1;
3411 return CODING_CATEGORY_SHIFT_JIS_MASK;
3414 /* Convert Shift-JIS data to internal format. */
3417 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3418 unsigned_char_dynarr *dst, size_t n)
3420 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3421 unsigned int flags = str->flags;
3422 unsigned int cpos = str->cpos;
3423 eol_type_t eol_type = str->eol_type;
3427 unsigned char c = *(unsigned char *)src++;
3431 /* Previous character was first byte of Shift-JIS Kanji char. */
3432 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3434 unsigned char e1, e2;
3436 DECODE_SJIS (cpos, c, e1, e2);
3438 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3442 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3443 Dynarr_add (dst, e1);
3444 Dynarr_add (dst, e2);
3449 DECODE_ADD_BINARY_CHAR (cpos, dst);
3450 DECODE_ADD_BINARY_CHAR (c, dst);
3456 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3457 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3459 else if (BYTE_SJIS_KATAKANA_P (c))
3462 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3465 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3466 Dynarr_add (dst, c);
3471 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3475 DECODE_ADD_BINARY_CHAR (c, dst);
3477 label_continue_loop:;
3480 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3486 /* Convert internal character representation to Shift_JIS. */
/* Encode one internal character CH as Shift_JIS and append the
   resulting bytes to DST.

   STR supplies the coding system, from which the end-of-line
   convention is read.  FLAGS is not referenced in the lines shown
   here.

   Two-byte characters have the high bit set on both bytes of their
   code and are then converted by ENCODE_SJIS; a JIS X 0201 katakana
   code is written as one byte with 0x80 set; a character that matches
   none of the tested charsets is written as a question mark.

   NOTE(review): this listing omits short lines (braces, else,
   preprocessor conditionals and the test guarding the end-of-line
   code), so the exact branch structure should be confirmed against
   the full file.  Two lookup strategies are visible below, one built
   on charset_code_point and one built on BREAKUP_CHAR; presumably a
   build-time conditional selects between them.  */
3489 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3490 unsigned_char_dynarr *dst, unsigned int *flags)
3492 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* A carriage return is written for every EOL type except LF and
   autodetect; CH itself is written for every EOL type except CR.  */
3496 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3497 Dynarr_add (dst, '\r');
3498 if (eol_type != EOL_CR)
3499 Dynarr_add (dst, ch);
3503 unsigned int s1, s2;
/* Code-point variant: the charsets are tried in the order
   latin-jisx0201, japanese-jisx0208-1990, katakana-jisx0201,
   japanese-jisx0208, ascii.  */
3505 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3507 if (code_point >= 0)
3508 Dynarr_add (dst, code_point);
3509 else if ((code_point
3510 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3513 ENCODE_SJIS ((code_point >> 8) | 0x80,
3514 (code_point & 0xFF) | 0x80, s1, s2);
3515 Dynarr_add (dst, s1);
3516 Dynarr_add (dst, s2);
3518 else if ((code_point
3519 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3521 Dynarr_add (dst, code_point | 0x80);
3522 else if ((code_point
3523 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3526 ENCODE_SJIS ((code_point >> 8) | 0x80,
3527 (code_point & 0xFF) | 0x80, s1, s2);
3528 Dynarr_add (dst, s1);
3529 Dynarr_add (dst, s2);
3531 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3533 Dynarr_add (dst, code_point);
3535 Dynarr_add (dst, '?');
/* Charset variant: CH is split into its charset and position codes
   and dispatched on the charset.  */
3537 Lisp_Object charset;
3538 unsigned int c1, c2;
3540 BREAKUP_CHAR (ch, charset, c1, c2);
3542 if (EQ(charset, Vcharset_katakana_jisx0201))
3544 Dynarr_add (dst, c1 | 0x80);
3548 Dynarr_add (dst, c1);
3550 else if (EQ(charset, Vcharset_japanese_jisx0208))
3552 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3553 Dynarr_add (dst, s1);
3554 Dynarr_add (dst, s2);
3557 Dynarr_add (dst, '?');
/* End-of-stream hook of the Shift_JIS encoder.  Only the signature is
   visible in this listing; no statement appears between it and the
   next definition, so this is presumably a no-op.  NOTE(review):
   confirm against the full file.  */
3563 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3564 unsigned int *flags)
/* Lisp primitive decode-shift-jis-char.

   Both halves of the cons CODE are checked with CHECK_INT and stored
   into the unsigned chars S1 and S2.  When S1 satisfies
   BYTE_SJIS_TWO_BYTE_1_P and S2 satisfies BYTE_SJIS_TWO_BYTE_2_P, the
   pair is converted by DECODE_SJIS and returned as a character of
   Vcharset_japanese_jisx0208 built from the low seven bits of each
   decoded byte.  The result for a pair that fails the range test is
   on lines omitted from this listing.

   The docstring used to read cons of type bytes; it now reads cons of
   two bytes, matching the docstring of encode-shift-jis-char.  */
3568 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3569 Decode a JISX0208 character of Shift-JIS coding-system.
3570 CODE is the character code in Shift-JIS as a cons of two bytes.
3571 Return the corresponding character.
3575 unsigned char c1, c2, s1, s2;
3578 CHECK_INT (XCAR (code));
3579 CHECK_INT (XCDR (code));
3580 s1 = XINT (XCAR (code));
3581 s2 = XINT (XCDR (code));
3582 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3583 BYTE_SJIS_TWO_BYTE_2_P (s2))
3585 DECODE_SJIS (s1, s2, c1, c2);
3586 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3587 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive encode-shift-jis-char.

   CHARACTER is validated with CHECK_CHAR_COERCE_INT and split by
   BREAKUP_CHAR into its charset and two position codes.  When the
   charset is Vcharset_japanese_jisx0208, both position codes get
   0x80 set, are converted by ENCODE_SJIS, and the two resulting
   bytes are returned as a cons of integers.  The result for any
   other charset is on lines omitted from this listing.  */
3593 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3594 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3595 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3599 Lisp_Object charset;
3602 CHECK_CHAR_COERCE_INT (character);
3603 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3604 if (EQ (charset, Vcharset_japanese_jisx0208))
3606 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3607 return Fcons (make_int (s1), make_int (s2));
3614 /************************************************************************/
3616 /************************************************************************/
3618 /* BIG5 is a coding system encoding two character sets: ASCII and
3619 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3620 character set and is encoded in two bytes.
3622 --- CODE RANGE of BIG5 ---
3623 (character set) (range)
3625 Big5 (1st byte) 0xA1 .. 0xFE
3626 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3627 --------------------------
3629 Since the number of characters in Big5 is larger than maximum
3630 characters in Emacs' charset (96x96), it can't be handled as one
3631 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3632 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3633 contains frequently used characters and the latter contains less
3634 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char?  Two alternative
   definitions follow: the first accepts 0x81..0xFE, the second
   0xA1..0xFE.  The argument is expanded twice, so it must be free of
   side effects.  NOTE(review): the preprocessor conditional that
   selects between the two definitions is not visible in this
   listing.  */
3637 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3638 ((c) >= 0x81 && (c) <= 0xFE)
3640 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3641 ((c) >= 0xA1 && (c) <= 0xFE)
3644 /* Is this the second byte of a Big5 two-byte char?  Accepts
   0x40..0x7E and 0xA1..0xFE.  The argument is expanded up to four
   times, so it must be free of side effects. */
3646 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3647 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3649 /* Number of Big5 characters which have the same code in 1st byte:
   94 second-byte values in 0xA1..0xFE plus 63 in 0x40..0x7E,
   i.e. 157. */
3651 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3653 /* Code conversion macros. These are macros because they are used in
3654 inner loops during code conversion.
3656 Note that temporary variables in macros introduce the classic
3657 dynamic-scoping problems with variable names. We use capital-
3658 lettered variables in the assumption that XEmacs does not use
3659 capital letters in variables except in a very formalized way
3662 /* Convert Big5 code (b1, b2) into its internal string representation
3665 /* There is a much simpler way to split the Big5 charset into two.
3666 For the moment I'm going to leave the algorithm as-is because it
3667 claims to separate out the most-used characters into a single
3668 charset, which perhaps will lead to optimizations in various
3671 The way the algorithm works is something like this:
3673 Big5 can be viewed as a 94x157 charset, where the row is
3674 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3675 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3676 the split between low and high column numbers is apparently
3677 meaningless; ascending rows produce less and less frequent chars.
3678 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3679 the first charset, and the upper half (0xC9 .. 0xFE) to the
3680 second. To do the conversion, we convert the character into
3681 a single number where 0 .. 156 is the first row, 157 .. 313
3682 is the second, etc. That way, the characters are ordered by
3683 decreasing frequency. Then we just chop the space in two
3684 and coerce the result into a 94x94 space.
/* DECODE_BIG5 (b1, b2, lb, c1, c2): convert the Big5 byte pair
   (b1, b2) into a leading byte LB and two position bytes C1 and C2.

   b1 and b2 are copied into the temporaries B1 and B2, so each is
   evaluated once.  A linear index I is formed from
   (B1 - 0xA1) * BIG5_SAME_ROW plus the column of B2, where the column
   is B2 - 0x40 for B2 below 0x7F and B2 - 0x62 otherwise.  LB is set
   to LEADING_BYTE_CHINESE_BIG5_1 or LEADING_BYTE_CHINESE_BIG5_2; in
   the latter case I is rebased by BIG5_SAME_ROW * (0xC9 - 0xA1).  I
   is then split into C1 = I / 94 + 0xA1 and C2 = I % 94 + 0xA1.

   lb, c1 and c2 are assigned to and must be lvalues.
   NOTE(review): the declaration of I, the test that picks the
   leading byte and the end of the do block are on lines omitted from
   this listing.  */
3687 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3689 int B1 = b1, B2 = b2; \
3691 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3695 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3699 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3700 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3702 c1 = I / (0xFF - 0xA1) + 0xA1; \
3703 c2 = I % (0xFF - 0xA1) + 0xA1; \
3706 /* Convert the internal string representation of a Big5 character
3707 (lb, c1, c2) into Big5 code (b1, b2). */
/* ENCODE_BIG5 (lb, c1, c2, b1, b2): the inverse of DECODE_BIG5.

   A linear index I is built as (c1 - 0xA1) * 94 + (c2 - 0xA1); when
   LB equals LEADING_BYTE_CHINESE_BIG5_2 it is advanced by
   BIG5_SAME_ROW * (0xC9 - 0xA1).  b1 receives
   I / BIG5_SAME_ROW + 0xA1, and b2 receives the column
   I % BIG5_SAME_ROW shifted into 0x40..0x7E (columns below 0x3F) or
   into 0xA1..0xFE (all other columns).

   NOTE(review): lb is expanded without parentheses and b2 is named
   three times, so both should be simple expressions without side
   effects; b1 and b2 must be lvalues.  */
3709 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3711 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3713 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3715 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3717 b1 = I / BIG5_SAME_ROW + 0xA1; \
3718 b2 = I % BIG5_SAME_ROW; \
3719 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Detection helper for Big5: reads bytes from SRC and updates the
   Big5 part of the detection state ST.

   Each byte is first compared against ESC, SI, SO and the range
   0x80..0xA0.  st->big5.in_second_byte records that the previous byte
   opened a two-byte sequence; when it is set it is cleared and the
   current byte is tested against the values that cannot be a second
   byte (below 0x40, or within 0x80..0xA0).  The visible return value
   is CODING_CATEGORY_BIG5_MASK.

   NOTE(review): the loop over N, the result produced when a byte is
   rejected and the condition that sets in_second_byte are on lines
   omitted from this listing.  */
3723 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3727 unsigned char c = *(unsigned char *)src++;
3728 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3730 || (c >= 0x80 && c <= 0xA0)
3734 if (st->big5.in_second_byte)
3736 st->big5.in_second_byte = 0;
3737 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3747 st->big5.in_second_byte = 1;
3749 return CODING_CATEGORY_BIG5_MASK;
3752 /* Convert Big5 data to internal format. */
/* Decode Big5 bytes from SRC and append the result to DST.

   The conversion state (flags, cpos, eol_type) is loaded from the
   decoding stream behind DECODING.  Going by the comment inside the
   body, cpos holds a pending first byte of a two-byte character.

   When the byte after a pending first byte satisfies
   BYTE_BIG5_TWO_BYTE_2_P, two decoding variants are visible: one
   forms the code point (cpos << 8) | c, looks it up with
   decode_defined_char in the charset taken from slot 1 of the coding
   system, has a DECODE_CHAR step on Vcharset_chinese_big5, and emits
   the character with DECODE_ADD_UCS_CHAR; the other runs DECODE_BIG5
   and appends its three output bytes.  A pair whose second byte is
   not acceptable is emitted byte by byte through
   DECODE_ADD_BINARY_CHAR.  Bytes seen with no pending first byte go
   through DECODE_HANDLE_EOL_TYPE and are then tested with
   BYTE_BIG5_TWO_BYTE_1_P.  DECODE_HANDLE_END_OF_CONVERSION runs after
   the input is consumed.

   NOTE(review): the loop over N, several conditions, the choice
   between the two variants (presumably a preprocessor conditional)
   and the write-back of the state are on lines omitted from this
   listing.  */
3755 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3756 unsigned_char_dynarr *dst, size_t n)
3758 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3759 unsigned int flags = str->flags;
3760 unsigned int cpos = str->cpos;
3761 eol_type_t eol_type = str->eol_type;
3764 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3765 (decoding)->codesys, 1);
3770 unsigned char c = *(unsigned char *)src++;
3773 /* Previous character was first byte of Big5 char. */
3774 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3777 int code_point = (cpos << 8) | c;
3778 Emchar char_id = decode_defined_char (ccs, code_point);
3781 char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
3782 DECODE_ADD_UCS_CHAR (char_id, dst);
3784 unsigned char b1, b2, b3;
3785 DECODE_BIG5 (cpos, c, b1, b2, b3);
3786 Dynarr_add (dst, b1);
3787 Dynarr_add (dst, b2);
3788 Dynarr_add (dst, b3);
3793 DECODE_ADD_BINARY_CHAR (cpos, dst);
3794 DECODE_ADD_BINARY_CHAR (c, dst);
3800 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3801 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3804 DECODE_ADD_BINARY_CHAR (c, dst);
3806 label_continue_loop:;
3809 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3815 /* Convert internally-formatted data to Big5. */
/* Encode one internal character CH as Big5 and append the resulting
   bytes to DST.

   STR supplies the coding system, from which the end-of-line
   convention and the charset in initial-charset slot 1 are read.
   FLAGS is not referenced in the lines shown here.

   The charsets are tried in the order ascii, the slot 1 charset,
   chinese-big5, chinese-big5-1, chinese-big5-2.  For the first three
   the code point is written directly (one byte for ascii, high byte
   then low byte otherwise).  For chinese-big5-1 and chinese-big5-2
   each half of the code point has 33 subtracted and the pair is
   turned into a linear index, which is converted with the same
   arithmetic as ENCODE_BIG5.  A character found in none of the
   charsets is written as a question mark.

   NOTE(review): short lines (braces, else, the declarations of
   code_point, ccs and I, and the test guarding the end-of-line code)
   are omitted from this listing.  */
3818 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3819 unsigned_char_dynarr *dst, unsigned int *flags)
3821 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* A carriage return is written for every EOL type except LF and
   autodetect; CH itself is written for every EOL type except CR.  */
3825 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3826 Dynarr_add (dst, '\r');
3827 if (eol_type != EOL_CR)
3828 Dynarr_add (dst, ch);
3835 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
3837 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
3838 Dynarr_add (dst, code_point);
3839 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
3841 Dynarr_add (dst, code_point >> 8);
3842 Dynarr_add (dst, code_point & 0xFF);
3844 else if ((code_point
3845 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
3847 Dynarr_add (dst, code_point >> 8);
3848 Dynarr_add (dst, code_point & 0xFF);
3850 else if ((code_point
3851 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
3854 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3855 + ((code_point & 0xFF) - 33);
3856 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3857 unsigned char b2 = I % BIG5_SAME_ROW;
3859 b2 += b2 < 0x3F ? 0x40 : 0x62;
3860 Dynarr_add (dst, b1);
3861 Dynarr_add (dst, b2);
3863 else if ((code_point
3864 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
3867 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3868 + ((code_point & 0xFF) - 33);
3869 unsigned char b1, b2;
3871 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3872 b1 = I / BIG5_SAME_ROW + 0xA1;
3873 b2 = I % BIG5_SAME_ROW;
3874 b2 += b2 < 0x3F ? 0x40 : 0x62;
3875 Dynarr_add (dst, b1);
3876 Dynarr_add (dst, b2);
3879 Dynarr_add (dst, '?');
/* End-of-stream hook of the Big5 encoder.  Only the signature is
   visible in this listing; no statement appears between it and the
   next definition, so this is presumably a no-op.  NOTE(review):
   confirm against the full file.  */
3886 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3887 unsigned int *flags)
/* Lisp primitive decode-big5-char.

   Both halves of the cons CODE are checked with CHECK_INT and stored
   into the unsigned chars B1 and B2.  When B1 satisfies
   BYTE_BIG5_TWO_BYTE_1_P and B2 satisfies BYTE_BIG5_TWO_BYTE_2_P, the
   pair is converted by DECODE_BIG5, the charset is looked up from the
   resulting leading byte with CHARSET_BY_LEADING_BYTE, and a character
   of that charset is returned, built from the low seven bits of each
   decoded position byte.  The result for a pair that fails the range
   test is on lines omitted from this listing.  */
3892 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3893 Decode a Big5 character CODE of BIG5 coding-system.
3894 CODE is the character code in BIG5, a cons of two integers.
3895 Return the corresponding character.
3899 unsigned char c1, c2, b1, b2;
3902 CHECK_INT (XCAR (code));
3903 CHECK_INT (XCDR (code));
3904 b1 = XINT (XCAR (code));
3905 b2 = XINT (XCDR (code));
3906 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3907 BYTE_BIG5_TWO_BYTE_2_P (b2))
3909 Charset_ID leading_byte;
3910 Lisp_Object charset;
3911 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3912 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3913 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive encode-big5-char.

   CHARACTER is validated with CHECK_CHAR_COERCE_INT and split by
   BREAKUP_CHAR into its charset and two position codes.  When the
   charset is Vcharset_chinese_big5_1 or Vcharset_chinese_big5_2, the
   position codes get 0x80 set and are converted by ENCODE_BIG5
   together with the leading byte of the charset; the two resulting
   bytes are returned as a cons of integers.  The result for any other
   charset is on lines omitted from this listing.  */
3919 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3920 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3921 Return the corresponding character code in Big5.
3925 Lisp_Object charset;
3928 CHECK_CHAR_COERCE_INT (character);
3929 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3930 if (EQ (charset, Vcharset_chinese_big5_1) ||
3931 EQ (charset, Vcharset_chinese_big5_2))
3933 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3935 return Fcons (make_int (b1), make_int (b2));
3942 /************************************************************************/
3944 /************************************************************************/
/* Detection helper for UCS-4: reads bytes from SRC and dispatches on
   the byte position kept in st->ucs4.in_byte, which is reset to 0 on
   one of the paths.  The visible return value is
   CODING_CATEGORY_UCS4_MASK.

   NOTE(review): the loop over N, the individual switch cases and any
   rejecting return are on lines omitted from this listing.  */
3947 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3951 unsigned char c = *(unsigned char *)src++;
3952 switch (st->ucs4.in_byte)
3961 st->ucs4.in_byte = 0;
3967 return CODING_CATEGORY_UCS4_MASK;
/* Decode UCS-4 bytes from SRC and append the result to DST.

   The conversion state (flags, cpos, counter) is loaded from the
   decoding stream behind DECODING.  Input bytes are shifted into cpos
   eight bits at a time; a completed value is handed to
   DECODE_ADD_UCS_CHAR.  counter is written back to the stream on
   exit.

   A pending partial character is flushed through
   DECODE_OUTPUT_PARTIAL_CHAR only when the END bit is set in FLAGS.
   The test used to read COUNTER, which is the byte counter and not a
   flag word; FLAGS is the variable loaded from str->flags above and
   is what decode_coding_utf8 tests CODING_STATE_END against.

   NOTE(review): the loop over N, the dispatch that decides when a
   character is complete and the write-back of flags and cpos are on
   lines omitted from this listing.  */
3971 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3972 unsigned_char_dynarr *dst, size_t n)
3974 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3975 unsigned int flags = str->flags;
3976 unsigned int cpos = str->cpos;
3977 unsigned char counter = str->counter;
3981 unsigned char c = *(unsigned char *)src++;
3989 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3994 cpos = ( cpos << 8 ) | c;
3998 if (flags & CODING_STATE_END)
3999 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4003 str->counter = counter;
/* Encode one internal character CH as UCS-4 by appending four values
   to DST, most significant first: CH shifted right by 24, 16 and 8
   bits, then CH itself.  The value of CH is used directly, with no
   charset lookup.  STR and FLAGS are not referenced in the lines
   shown here.

   NOTE(review): the shifted values are passed unmasked; presumably
   Dynarr_add keeps only the low byte because DST is an unsigned char
   dynarr.  */
4007 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4008 unsigned_char_dynarr *dst, unsigned int *flags)
4010 Dynarr_add (dst, ch >> 24);
4011 Dynarr_add (dst, ch >> 16);
4012 Dynarr_add (dst, ch >> 8);
4013 Dynarr_add (dst, ch );
/* End-of-stream hook of the UCS-4 encoder.  Only the signature is
   visible in this listing; no statement appears between it and the
   next definition, so this is presumably a no-op.  NOTE(review):
   confirm against the full file.  */
4017 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4018 unsigned int *flags)
4023 /************************************************************************/
4025 /************************************************************************/
/* Detection helper for UTF-8: reads bytes from SRC and dispatches on
   st->utf8.in_byte.

   In one state the byte is compared against ESC, SI and SO, and
   in_byte is set to a value between 5 and 1.  In another state the
   byte is checked for the continuation pattern (top two bits equal
   to binary 10).  The visible return value is
   CODING_CATEGORY_UTF8_MASK.

   NOTE(review): the loop over N, the case labels, the lead-byte tests
   that choose among the values 5 to 1 and any rejecting return are
   on lines omitted from this listing.  */
4028 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
4032 unsigned char c = *(unsigned char *)src++;
4033 switch (st->utf8.in_byte)
4036 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4039 st->utf8.in_byte = 5;
4041 st->utf8.in_byte = 4;
4043 st->utf8.in_byte = 3;
4045 st->utf8.in_byte = 2;
4047 st->utf8.in_byte = 1;
4052 if ((c & 0xc0) != 0x80)
4058 return CODING_CATEGORY_UTF8_MASK;
/* Write the bytes of an unfinished UTF-8 sequence to DST as binary
   characters.

   COUNTER selects the outer branch (tests for 4, 3 and 2 are visible,
   with one branch before and one after them).  Within a branch the
   magnitude of cpos, compared against 1 << 6, 1 << 12, 1 << 18 and
   1 << 24, selects how many bytes are written.  The first byte
   combines the most significant bits of cpos with one of the markers
   0xC0, 0xE0, 0xF0, 0xF8 or 0xFC; every following byte carries six
   bits of cpos tagged with 0x80.  The marker chosen in each case is
   consistent with COUNTER being the number of continuation bytes
   still missing.

   NOTE(review): the number of continuation bytes already received is
   inferred from the magnitude of cpos, so a sequence whose leading
   payload bits are zero is reproduced with fewer bytes than were
   read.  The cpos parameter declaration, the first test on COUNTER
   and the else lines are omitted from this listing.  */
4062 decode_output_utf8_partial_char (unsigned char counter,
4064 unsigned_char_dynarr *dst)
4067 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4068 else if (counter == 4)
4070 if (cpos < (1 << 6))
4071 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4074 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4075 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4078 else if (counter == 3)
4080 if (cpos < (1 << 6))
4081 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4082 else if (cpos < (1 << 12))
4084 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4085 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4089 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4090 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4091 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4094 else if (counter == 2)
4096 if (cpos < (1 << 6))
4097 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4098 else if (cpos < (1 << 12))
4100 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4101 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4103 else if (cpos < (1 << 18))
4105 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4106 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4107 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4111 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4112 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4113 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4114 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4119 if (cpos < (1 << 6))
4120 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4121 else if (cpos < (1 << 12))
4123 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4124 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4126 else if (cpos < (1 << 18))
4128 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4129 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4130 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4132 else if (cpos < (1 << 24))
4134 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4135 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4136 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4137 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4141 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4142 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4143 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4144 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4145 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
/* Decode UTF-8 bytes from SRC and append the result to DST.

   The conversion state (flags, cpos, eol_type, counter) is loaded
   from the decoding stream behind DECODING, and counter is written
   back on exit.

   Visible handling per input byte:
   - one path flushes pending composition and entity-reference state,
     applies DECODE_HANDLE_EOL_TYPE and emits the byte with
     DECODE_ADD_UCS_CHAR;
   - a byte below 0xC0 on another path is passed to COMPOSE_ADD_CHAR;
   - the thresholds 0xF0, 0xF8 and 0xFC classify lead bytes;
   - a continuation byte (top two bits equal to binary 10) contributes
     its low six bits to cpos, and a completed cpos is passed to
     COMPOSE_ADD_CHAR;
   - otherwise pending state is flushed, the unfinished sequence is
     written by decode_output_utf8_partial_char and the byte is
     emitted with DECODE_ADD_BINARY_CHAR.
   When CODING_STATE_END is set in flags after the input is consumed,
   pending state is flushed and an unfinished sequence is written by
   decode_output_utf8_partial_char.

   NOTE(review): the loop over N, the tests on counter, the
   assignments that start a sequence and the write-back of flags and
   cpos are on lines omitted from this listing.  */
4151 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4152 unsigned_char_dynarr *dst, size_t n)
4154 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4155 unsigned int flags = str->flags;
4156 unsigned int cpos = str->cpos;
4157 eol_type_t eol_type = str->eol_type;
4158 unsigned char counter = str->counter;
4162 unsigned char c = *(unsigned char *)src++;
4167 COMPOSE_FLUSH_CHARS (str, dst);
4168 decode_flush_er_chars (str, dst);
4169 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4170 DECODE_ADD_UCS_CHAR (c, dst);
4172 else if ( c < 0xC0 )
4173 /* decode_add_er_char (str, c, dst); */
4174 COMPOSE_ADD_CHAR (str, c, dst);
4177 /* decode_flush_er_chars (str, dst); */
4183 else if ( c < 0xF0 )
4188 else if ( c < 0xF8 )
4193 else if ( c < 0xFC )
4205 else if ( (c & 0xC0) == 0x80 )
4207 cpos = ( cpos << 6 ) | ( c & 0x3f );
4210 /* DECODE_ADD_UCS_CHAR (cpos, dst); */
4211 COMPOSE_ADD_CHAR (str, cpos, dst);
4220 COMPOSE_FLUSH_CHARS (str, dst);
4221 decode_flush_er_chars (str, dst);
4222 decode_output_utf8_partial_char (counter, cpos, dst);
4223 DECODE_ADD_BINARY_CHAR (c, dst);
4227 label_continue_loop:;
4230 if (flags & CODING_STATE_END)
4232 COMPOSE_FLUSH_CHARS (str, dst);
4233 decode_flush_er_chars (str, dst);
4236 decode_output_utf8_partial_char (counter, cpos, dst);
4243 str->counter = counter;
/* Encode one internal character CH as UTF-8 and append the resulting
   bytes to DST.

   STR supplies the coding system, from which the end-of-line
   convention and the use-entity-reference setting are read.  FLAGS is
   not referenced in the lines shown here.

   A character up to 0x7f is written as is.  Otherwise its code point
   in Vcharset_ucs is looked up.  When that lookup fails or yields a
   value above 0x10FFFF and the coding system uses entity references,
   Vcoded_charset_entity_reference_alist is searched for a charset
   that contains CH; a match is written as an ampersand, a formatted
   name and a semicolon, and with no match the character is written
   in the MCS fallback form.  A usable code point is written as a
   UTF-8 sequence of two to six bytes chosen by the thresholds 0x7ff,
   0xffff, 0x1fffff and 0x3ffffff.

   NOTE(review): the prefix copied from the alist entry becomes part
   of a sprintf format string, so a percent sign in that prefix would
   be interpreted as a conversion; consider validating it.  Short
   lines (braces, else, several declarations and conditions, and the
   terminator stored into format) are omitted from this listing.  */
4247 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4248 unsigned_char_dynarr *dst, unsigned int *flags)
4250 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4254 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4255 Dynarr_add (dst, '\r');
4256 if (eol_type != EOL_CR)
4257 Dynarr_add (dst, ch);
4259 else if (ch <= 0x7f)
4261 Dynarr_add (dst, ch);
4265 int code_point = charset_code_point (Vcharset_ucs, ch, 0);
4267 if ( (code_point < 0) || (code_point > 0x10FFFF) )
4269 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4271 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
4275 int format_columns, idx;
/* buf receives a prefix of at most 6 bytes followed by a non-negative
   int, which can take 10 decimal digits: 17 bytes with the
   terminator.  The former size of 16 was one byte short of that
   worst case.  */
4276 char buf[32], format[16];
4278 while (!NILP (rest))
4282 if (!NILP (ccs = Ffind_charset (ccs)))
4285 = charset_code_point (ccs, ch, 0)) >= 0 )
4290 && ((idx =XSTRING_LENGTH (ret)) <= 6))
4292 strncpy (format, XSTRING_DATA (ret), idx);
4301 format [idx++] = '%';
4302 format_columns = XINT (ret);
4303 if ( (2 <= format_columns)
4304 && (format_columns <= 8) )
4306 format [idx++] = '0';
4307 format [idx++] = '0' + format_columns;
4314 format [idx++] = 'd';
4315 else if (EQ (ret, Qx))
4316 format [idx++] = 'x';
4317 else if (EQ (ret, QX))
4318 format [idx++] = 'X';
4323 sprintf (buf, format, code_point);
4324 Dynarr_add (dst, '&');
4325 Dynarr_add_many (dst, buf, strlen (buf));
4326 Dynarr_add (dst, ';');
4332 sprintf (buf, "&MCS-%08X;", ch);
4333 Dynarr_add_many (dst, buf, strlen (buf));
4339 if (code_point <= 0x7ff)
4341 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4342 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4344 else if (code_point <= 0xffff)
4346 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4347 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4348 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4350 else if (code_point <= 0x1fffff)
4352 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4353 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4354 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4355 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4357 else if (code_point <= 0x3ffffff)
4359 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4360 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4361 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4362 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4363 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4367 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4368 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4369 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4370 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4371 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4372 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
/* End-of-stream hook of the UTF-8 encoder.  Only the signature is
   visible in this listing; no statement appears between it and the
   next section banner, so this is presumably a no-op.  NOTE(review):
   confirm against the full file.  */
4378 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4379 unsigned int *flags)
4384 /************************************************************************/
4385 /* ISO2022 methods */
4386 /************************************************************************/
4388 /* The following note describes the coding system ISO2022 briefly.
4389 Since the intention of this note is to help understand the
4390 functions in this file, some parts are NOT ACCURATE or OVERLY
4391 SIMPLIFIED. For thorough understanding, please refer to the
4392 original document of ISO2022.
4394 ISO2022 provides many mechanisms to encode several character sets
4395 in 7-bit and 8-bit environments. For 7-bit environments, all text
4396 is encoded using bytes less than 128. This may make the encoded
4397 text a little bit longer, but the text passes more easily through
4398 several gateways, some of which strip off MSB (Most Significant Bit).
4400 There are two kinds of character sets: control character set and
4401 graphic character set. The former contains control characters such
4402 as `newline' and `escape' to provide control functions (control
4403 functions are also provided by escape sequences). The latter
4404 contains graphic characters such as 'A' and '-'. Emacs recognizes
4405 two control character sets and many graphic character sets.
4407 Graphic character sets are classified into one of the following
4408 four classes, according to the number of bytes (DIMENSION) and
4409 number of characters in one dimension (CHARS) of the set:
4410 - DIMENSION1_CHARS94
4411 - DIMENSION1_CHARS96
4412 - DIMENSION2_CHARS94
4413 - DIMENSION2_CHARS96
4415 In addition, each character set is assigned an identification tag,
4416 unique for each set, called "final character" (denoted as <F>
4417 hereafter). The <F> of each character set is decided by ECMA(*)
4418 when it is registered in ISO. The code range of <F> is 0x30..0x7E
4419 (0x30..0x3F are for private use only).
4421 Note (*): ECMA = European Computer Manufacturers Association
4423 Here are examples of graphic character set [NAME(<F>)]:
4424 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4425 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4426 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4427 o DIMENSION2_CHARS96 -- none for the moment
4429 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4430 C0 [0x00..0x1F] -- control character plane 0
4431 GL [0x20..0x7F] -- graphic character plane 0
4432 C1 [0x80..0x9F] -- control character plane 1
4433 GR [0xA0..0xFF] -- graphic character plane 1
4435 A control character set is directly designated and invoked to C0 or
4436 C1 by an escape sequence. The most common case is that:
4437 - ISO646's control character set is designated/invoked to C0, and
4438 - ISO6429's control character set is designated/invoked to C1,
4439 and usually these designations/invocations are omitted in encoded
4440 text. In a 7-bit environment, only C0 can be used, and a control
4441 character for C1 is encoded by an appropriate escape sequence to
4442 fit into the environment. All control characters for C1 are
4443 defined to have corresponding escape sequences.
4445 A graphic character set is at first designated to one of four
4446 graphic registers (G0 through G3), then these graphic registers are
4447 invoked to GL or GR. These designations and invocations can be
4448 done independently. The most common case is that G0 is invoked to
4449 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4450 these invocations and designations are omitted in encoded text.
4451 In a 7-bit environment, only GL can be used.
4453 When a graphic character set of CHARS94 is invoked to GL, codes
4454 0x20 and 0x7F of the GL area work as control characters SPACE and
4455 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4458 There are two ways of invocation: locking-shift and single-shift.
4459 With locking-shift, the invocation lasts until the next different
4460 invocation, whereas with single-shift, the invocation affects the
4461 following character only and doesn't affect the locking-shift
4462 state. Invocations are done by the following control characters or
4465 ----------------------------------------------------------------------
4466 abbrev function cntrl escape seq description
4467 ----------------------------------------------------------------------
4468 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4469 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4470 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4471 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4472 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4473 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4474 LS3R (locking-shift-3 right) none ESC '|' invoke G3 into GR (*)
4475 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4476 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4477 ----------------------------------------------------------------------
4478 (*) These are not used by any known coding system.
4480 Control characters for these functions are defined by macros
4481 ISO_CODE_XXX in `coding.h'.
4483 Designations are done by the following escape sequences:
4484 ----------------------------------------------------------------------
4485 escape sequence description
4486 ----------------------------------------------------------------------
4487 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4488 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4489 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4490 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4491 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4492 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4493 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4494 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4495 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4496 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4497 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4498 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4499 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4500 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4501 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4502 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4503 ----------------------------------------------------------------------
4505 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4506 of dimension 1, chars 94, and final character <F>, etc...
4508 Note (*): Although these designations are not allowed in ISO2022,
4509 Emacs accepts them on decoding, and produces them on encoding
4510 CHARS96 character sets in a coding system which is characterized as
4511 7-bit environment, non-locking-shift, and non-single-shift.
4513 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4514 '(' can be omitted. We refer to this as "short-form" hereafter.
4516 Now you may notice that there are a lot of ways for encoding the
4517 same multilingual text in ISO2022. Actually, there exist many
4518 coding systems such as Compound Text (used in X11's inter client
4519 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4520 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4521 localized platforms), and all of these are variants of ISO2022.
4523 In addition to the above, Emacs handles two more kinds of escape
4524 sequences: ISO6429's direction specification and Emacs' private
4525 sequence for specifying character composition.
4527 ISO6429's direction specification takes the following form:
4528 o CSI ']' -- end of the current direction
4529 o CSI '0' ']' -- end of the current direction
4530 o CSI '1' ']' -- start of left-to-right text
4531 o CSI '2' ']' -- start of right-to-left text
4532 The control character CSI (0x9B: control sequence introducer) is
4533 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4535 Character composition specification takes the following form:
4536 o ESC '0' -- start character composition
4537 o ESC '1' -- end character composition
4538 Since these are not standard escape sequences of any ISO standard,
4539 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its initial condition.

   For each of the four graphic registers the designated charset is
   reset and the invalid-designation mark is cleared; when
   CODING_SYSTEM is non-nil the charset comes from the initial charset
   of that coding system for the register, and the value Qt is also
   stored on a visible path (presumably the nil case).  The escape
   state becomes ISO_ESC_NOTHING with no collected escape bytes,
   register 0 is invoked on the left half and register 1 on the right
   half, and the direction-switch and literal-output markers are
   cleared.  When ENABLE_COMPOSITE_CHARS is defined and a
   composite-character dynarr exists, it is emptied.

   NOTE(review): the declaration of i, the else line and the closing
   endif are omitted from this listing.  */
4542 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4546 for (i = 0; i < 4; i++)
4548 if (!NILP (coding_system))
4550 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4552 iso->charset[i] = Qt;
4553 iso->invalid_designated[i] = 0;
4555 iso->esc = ISO_ESC_NOTHING;
4556 iso->esc_bytes_index = 0;
4557 iso->register_left = 0;
4558 iso->register_right = 1;
4559 iso->switched_dir_and_no_valid_charset_yet = 0;
4560 iso->invalid_switch_dir = 0;
4561 iso->output_direction_sequence = 0;
4562 iso->output_literally = 0;
4563 #ifdef ENABLE_COMPOSITE_CHARS
4564 if (iso->composite_chars)
4565 Dynarr_reset (iso->composite_chars);
/* Predicate on a single byte C.  Only the signature is visible in
   this listing.  It is used as a condition in parse_iso2022_esc,
   where a true result, together with the escape-quoted setting of the
   coding system, switches the escape state to ISO_ESC_LITERAL.
   NOTE(review): the set of bytes accepted is defined on lines omitted
   here.  */
4570 fit_to_be_escape_quoted (unsigned char c)
4587 /* Parse one byte of an ISO2022 escape sequence.
4588 If the result is an invalid escape sequence, return 0 and
4589 do not change anything in STR. Otherwise, if the result is
4590 an incomplete escape sequence, update ISO2022.ESC and
4591 ISO2022.ESC_BYTES and return -1. Otherwise, update
4592 all the state variables (but not ISO2022.ESC_BYTES) and
4595 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4596 or invocation of an invalid character set and treat that as
4597 an unrecognized escape sequence. */
4600 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4601 unsigned char c, unsigned int *flags,
4602 int check_invalid_charsets)
4604 /* (1) If we're at the end of a designation sequence, CS is the
4605 charset being designated and REG is the register to designate
4608 (2) If we're at the end of a locking-shift sequence, REG is
4609 the register to invoke and HALF (0 == left, 1 == right) is
4610 the half to invoke it into.
4612 (3) If we're at the end of a single-shift sequence, REG is
4613 the register to invoke. */
4614 Lisp_Object cs = Qnil;
4617 /* NOTE: This code does goto's all over the fucking place.
4618 The reason for this is that we're basically implementing
4619 a state machine here, and hierarchical languages like C
4620 don't really provide a clean way of doing this. */
4622 if (! (*flags & CODING_STATE_ESCAPE))
4623 /* At beginning of escape sequence; we need to reset our
4624 escape-state variables. */
4625 iso->esc = ISO_ESC_NOTHING;
4627 iso->output_literally = 0;
4628 iso->output_direction_sequence = 0;
4632 case ISO_ESC_NOTHING:
4633 iso->esc_bytes_index = 0;
4636 case ISO_CODE_ESC: /* Start escape sequence */
4637 *flags |= CODING_STATE_ESCAPE;
4641 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4642 *flags |= CODING_STATE_ESCAPE;
4643 iso->esc = ISO_ESC_5_11;
4646 case ISO_CODE_SO: /* locking shift 1 */
4649 case ISO_CODE_SI: /* locking shift 0 */
4653 case ISO_CODE_SS2: /* single shift */
4656 case ISO_CODE_SS3: /* single shift */
4660 default: /* Other control characters */
4667 /**** single shift ****/
4669 case 'N': /* single shift 2 */
4672 case 'O': /* single shift 3 */
4676 /**** locking shift ****/
4678 case '~': /* locking shift 1 right */
4681 case 'n': /* locking shift 2 */
4684 case '}': /* locking shift 2 right */
4687 case 'o': /* locking shift 3 */
4690 case '|': /* locking shift 3 right */
4694 #ifdef ENABLE_COMPOSITE_CHARS
4695 /**** composite ****/
4698 iso->esc = ISO_ESC_START_COMPOSITE;
4699 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4700 CODING_STATE_COMPOSITE;
4704 iso->esc = ISO_ESC_END_COMPOSITE;
4705 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4706 ~CODING_STATE_COMPOSITE;
4708 #endif /* ENABLE_COMPOSITE_CHARS */
4710 /**** directionality ****/
4713 iso->esc = ISO_ESC_5_11;
4716 /**** designation ****/
4718 case '$': /* multibyte charset prefix */
4719 iso->esc = ISO_ESC_2_4;
4723 if (0x28 <= c && c <= 0x2F)
4725 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4729 /* This function is called with CODESYS equal to nil when
4730 doing coding-system detection. */
4732 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4733 && fit_to_be_escape_quoted (c))
4735 iso->esc = ISO_ESC_LITERAL;
4736 *flags &= CODING_STATE_ISO2022_LOCK;
4746 /**** directionality ****/
4748 case ISO_ESC_5_11: /* ISO6429 direction control */
4751 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4752 goto directionality;
4754 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4755 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4756 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4760 case ISO_ESC_5_11_0:
4763 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4764 goto directionality;
4768 case ISO_ESC_5_11_1:
4771 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4772 goto directionality;
4776 case ISO_ESC_5_11_2:
4779 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4780 goto directionality;
4785 iso->esc = ISO_ESC_DIRECTIONALITY;
4786 /* Various junk here to attempt to preserve the direction sequences
4787 literally in the text if they would otherwise be swallowed due
4788 to invalid designations that don't show up as actual charset
4789 changes in the text. */
4790 if (iso->invalid_switch_dir)
4792 /* We already inserted a direction switch literally into the
4793 text. We assume (#### this may not be right) that the
4794 next direction switch is the one going the other way,
4795 and we need to output that literally as well. */
4796 iso->output_literally = 1;
4797 iso->invalid_switch_dir = 0;
4803 /* If we are in the thrall of an invalid designation,
4804 then stick the directionality sequence literally into the
4805 output stream so it ends up in the original text again. */
4806 for (jj = 0; jj < 4; jj++)
4807 if (iso->invalid_designated[jj])
4811 iso->output_literally = 1;
4812 iso->invalid_switch_dir = 1;
4815 /* Indicate that we haven't yet seen a valid designation,
4816 so that if a switch-dir is directly followed by an
4817 invalid designation, both get inserted literally. */
4818 iso->switched_dir_and_no_valid_charset_yet = 1;
4823 /**** designation ****/
4826 if (0x28 <= c && c <= 0x2F)
4828 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4831 if (0x40 <= c && c <= 0x42)
4834 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4835 *flags & CODING_STATE_R2L ?
4836 CHARSET_RIGHT_TO_LEFT :
4837 CHARSET_LEFT_TO_RIGHT);
4848 if (c < '0' || c > '~')
4849 return 0; /* bad final byte */
4851 if (iso->esc >= ISO_ESC_2_8 &&
4852 iso->esc <= ISO_ESC_2_15)
4854 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4855 single = 1; /* single-byte */
4856 reg = (iso->esc - ISO_ESC_2_8) & 3;
4858 else if (iso->esc >= ISO_ESC_2_4_8 &&
4859 iso->esc <= ISO_ESC_2_4_15)
4861 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4862 single = -1; /* multi-byte */
4863 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4867 /* Can this ever be reached? -slb */
4871 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4872 *flags & CODING_STATE_R2L ?
4873 CHARSET_RIGHT_TO_LEFT :
4874 CHARSET_LEFT_TO_RIGHT);
4880 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4884 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4885 /* can't invoke something that ain't there. */
4887 iso->esc = ISO_ESC_SINGLE_SHIFT;
4888 *flags &= CODING_STATE_ISO2022_LOCK;
4890 *flags |= CODING_STATE_SS2;
4892 *flags |= CODING_STATE_SS3;
4896 if (check_invalid_charsets &&
4897 !CHARSETP (iso->charset[reg]))
4898 /* can't invoke something that ain't there. */
4901 iso->register_right = reg;
4903 iso->register_left = reg;
4904 *flags &= CODING_STATE_ISO2022_LOCK;
4905 iso->esc = ISO_ESC_LOCKING_SHIFT;
4909 if (NILP (cs) && check_invalid_charsets)
4911 iso->invalid_designated[reg] = 1;
4912 iso->charset[reg] = Vcharset_ascii;
4913 iso->esc = ISO_ESC_DESIGNATE;
4914 *flags &= CODING_STATE_ISO2022_LOCK;
4915 iso->output_literally = 1;
4916 if (iso->switched_dir_and_no_valid_charset_yet)
4918 /* We encountered a switch-direction followed by an
4919 invalid designation. Ensure that the switch-direction
4920 gets outputted; otherwise it will probably get eaten
4921 when the text is written out again. */
4922 iso->switched_dir_and_no_valid_charset_yet = 0;
4923 iso->output_direction_sequence = 1;
4924 /* And make sure that the switch-dir going the other
4925 way gets outputted, as well. */
4926 iso->invalid_switch_dir = 1;
4930 /* This function is called with CODESYS equal to nil when
4931 doing coding-system detection. */
4932 if (!NILP (codesys))
4934 charset_conversion_spec_dynarr *dyn =
4935 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4941 for (i = 0; i < Dynarr_length (dyn); i++)
4943 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4944 if (EQ (cs, spec->from_charset))
4945 cs = spec->to_charset;
4950 iso->charset[reg] = cs;
4951 iso->esc = ISO_ESC_DESIGNATE;
4952 *flags &= CODING_STATE_ISO2022_LOCK;
4953 if (iso->invalid_designated[reg])
4955 iso->invalid_designated[reg] = 0;
4956 iso->output_literally = 1;
4958 if (iso->switched_dir_and_no_valid_charset_yet)
4959 iso->switched_dir_and_no_valid_charset_yet = 0;
/* ISO2022 detector: examine bytes read from SRC and narrow down which
   ISO2022 coding categories remain plausible for the data.
   (N is presumably the number of bytes available at SRC -- confirm.)

   ST carries the detector state from one call to the next.  While
   st->iso2022.initted is zero the escape-parser state is reset and all
   five ISO categories (7, 8-designate, 8-1, 8-2, lock-shift) are put
   into the candidate mask.

   Evidence that removes candidates:
   - the ISO_7 bit is cleared where a high byte is counted;
   - an odd count of high bytes, with no single shift seen, clears
     ISO_8_2;
   - a designation escape clears ISO_8_1 and ISO_8_2;
   - a locking shift makes ISO_LOCK_SHIFT the only candidate and jumps
     to ran_out_of_chars;
   - a single shift clears ISO_8_DESIGNATE and is remembered in
     saw_single_shift.

   Escape sequences are parsed by parse_iso2022_esc with a nil coding
   system and with its last argument (check_invalid_charsets) zero.  */
4964 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4968 /* #### There are serious deficiencies in the recognition mechanism
4969 here. This needs to be much smarter if it's going to cut it.
4970 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4971 it should be detected as Latin-1.
4972 All the ISO2022 stuff in this file should be synced up with the
4973 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4974 Perhaps we should wait till R2L works in FSF Emacs? */
/* One-time setup of the per-detection ISO2022 state. */
4976 if (!st->iso2022.initted)
4978 reset_iso2022 (Qnil, &st->iso2022.iso);
4979 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4980 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4981 CODING_CATEGORY_ISO_8_1_MASK |
4982 CODING_CATEGORY_ISO_8_2_MASK |
4983 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4984 st->iso2022.flags = 0;
4985 st->iso2022.high_byte_count = 0;
4986 st->iso2022.saw_single_shift = 0;
4987 st->iso2022.initted = 1;
4990 mask = st->iso2022.mask;
4994 unsigned char c = *(unsigned char *)src++;
4997 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4998 st->iso2022.high_byte_count++;
5002 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5004 if (st->iso2022.high_byte_count & 1)
5005 /* odd number of high bytes; assume not iso-8-2 */
5006 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5008 st->iso2022.high_byte_count = 0;
5009 st->iso2022.saw_single_shift = 0;
5011 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5013 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5014 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5015 { /* control chars */
5018 /* Allow and ignore control characters that you might
5019 reasonably see in a text file */
5024 case 8: /* backspace */
5025 case 11: /* vertical tab */
5026 case 12: /* form feed */
5027 case 26: /* MS-DOS C-z junk */
5028 case 31: /* '^_' -- for info */
5029 goto label_continue_loop;
5036 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5039 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5040 &st->iso2022.flags, 0))
5042 switch (st->iso2022.iso.esc)
5044 case ISO_ESC_DESIGNATE:
5045 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5046 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5048 case ISO_ESC_LOCKING_SHIFT:
5049 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5050 goto ran_out_of_chars;
5051 case ISO_ESC_SINGLE_SHIFT:
5052 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5053 st->iso2022.saw_single_shift = 1;
5062 goto ran_out_of_chars;
5065 label_continue_loop:;
/* Resolve ambiguity in an ISO2022 detection MASK: when the seven-bit
   category is still a candidate, drop the eight-bit-designate, 8-1 and
   8-2 categories so that seven-bit wins.  Other bits in MASK are left
   untouched.
   NOTE(review): the adjusted mask is presumably handed back as the
   return value -- confirm against the full definition.  */
5074 postprocess_iso2022_mask (int mask)
5076 /* #### kind of cheesy */
5077 /* If seven-bit ISO is allowed, then assume that the encoding is
5078 entirely seven-bit and turn off the eight-bit ones. */
5079 if (mask & CODING_CATEGORY_ISO_7_MASK)
5080 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5081 CODING_CATEGORY_ISO_8_1_MASK |
5082 CODING_CATEGORY_ISO_8_2_MASK);
5086 /* If FLAGS is a null pointer or specifies right-to-left motion,
5087 output a switch-dir-to-left-to-right sequence to DST.
5088 Also update FLAGS if it is not a null pointer.
5089 If INTERNAL_P is set, we are outputting in internal format and
5090 need to handle the CSI differently. */
/* The sequence written to DST is an introducer followed by '0' ']'.
   The introducer is ESC '[' when CODING_SYSTEM_ISO2022_SEVEN holds for
   CODESYS; otherwise it is the CSI byte, which goes through
   DECODE_ADD_BINARY_CHAR when INTERNAL_P is set and is added as a raw
   byte when it is not.  Afterwards the R2L bit is cleared in *FLAGS.  */
5093 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5094 unsigned_char_dynarr *dst,
5095 unsigned int *flags,
5098 if (!flags || (*flags & CODING_STATE_R2L))
/* Introducer: seven-bit form, internal-format CSI, or raw CSI. */
5100 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5102 Dynarr_add (dst, ISO_CODE_ESC);
5103 Dynarr_add (dst, '[');
5105 else if (internal_p)
5106 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5108 Dynarr_add (dst, ISO_CODE_CSI);
/* Parameter '0' and final ']' select left-to-right. */
5109 Dynarr_add (dst, '0');
5110 Dynarr_add (dst, ']');
5112 *flags &= ~CODING_STATE_R2L;
5116 /* If FLAGS is a null pointer or specifies a direction different from
5117 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5118 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5119 sequence to DST. Also update FLAGS if it is not a null pointer.
5120 If INTERNAL_P is set, we are outputting in internal format and
5121 need to handle the CSI differently. */
/* Two transitions are handled here:
   - R2L is in effect (or FLAGS is null) and DIRECTION is
     CHARSET_LEFT_TO_RIGHT: the work is delegated to
     restore_left_to_right_direction.
   - R2L is not in effect (or FLAGS is null), DIRECTION is
     CHARSET_RIGHT_TO_LEFT, and CODING_SYSTEM_ISO2022_NO_ISO6429 is
     false for CODESYS: an introducer (ESC '[' for seven-bit coding
     systems, otherwise CSI in internal or raw form depending on
     INTERNAL_P) followed by '2' ']' is written to DST and the R2L bit
     is set in *FLAGS.  */
5124 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5125 unsigned_char_dynarr *dst, unsigned int *flags,
5128 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5129 direction == CHARSET_LEFT_TO_RIGHT)
5130 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5131 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5132 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5133 direction == CHARSET_RIGHT_TO_LEFT)
5135 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5137 Dynarr_add (dst, ISO_CODE_ESC);
5138 Dynarr_add (dst, '[');
5140 else if (internal_p)
5141 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5143 Dynarr_add (dst, ISO_CODE_CSI);
/* Parameter '2' and final ']' select right-to-left. */
5144 Dynarr_add (dst, '2');
5145 Dynarr_add (dst, ']');
5147 *flags |= CODING_STATE_R2L;
5151 /* Convert ISO2022-format data to internal format. */
/* Decoder state (FLAGS, CPOS, COUNTER, EOL_TYPE) is loaded from the
   decoding stream at entry.  Each input byte then takes one of three
   paths:

   1. CODING_STATE_ESCAPE is set: the byte is fed to parse_iso2022_esc.
      Composite start/end and literal escapes get extra handling.  As
      error recovery, when output_direction_sequence is set a direction
      sequence is emitted through ensure_correct_direction (null FLAGS,
      internal format), and when parsing failed or output_literally is
      set the bytes saved in esc_bytes are written out as binary
      characters.
   2. The byte is a C0 or C1 control: a partially accumulated character
      and any pending SS2/SS3 are dumped as binary characters,
      end-of-line conversion is applied, and the byte is offered to
      parse_iso2022_esc; when that returns zero the byte itself is
      written out as a binary character.
   3. Otherwise (graphic byte): the register is selected (SS2 -> 2,
      SS3 -> 3, non-ASCII byte -> register_right, else register_left).
      If the register holds no charset, holds an invalid designation,
      or the byte is at the SPACE/DEL position of a 94-character set,
      the byte is inserted literally.  Otherwise it is accumulated into
      or emitted as a character of that charset; the reverse-direction
      charset is used instead when one exists and the charset's
      direction differs from the current R2L state.

   Before output on most paths COMPOSE_FLUSH_CHARS and
   decode_flush_er_chars are run.  When CODING_STATE_END is set the
   same flushing happens and any partial character held in CPOS is
   output.  */
5154 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5155 unsigned_char_dynarr *dst, size_t n)
5157 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5158 unsigned int flags = str->flags;
5159 unsigned int cpos = str->cpos;
5160 unsigned char counter = str->counter;
5161 eol_type_t eol_type = str->eol_type;
5162 #ifdef ENABLE_COMPOSITE_CHARS
5163 unsigned_char_dynarr *real_dst = dst;
5165 Lisp_Object coding_system;
5167 XSETCODING_SYSTEM (coding_system, str->codesys);
5169 #ifdef ENABLE_COMPOSITE_CHARS
5170 if (flags & CODING_STATE_COMPOSITE)
5171 dst = str->iso2022.composite_chars;
5172 #endif /* ENABLE_COMPOSITE_CHARS */
5176 unsigned char c = *(unsigned char *)src++;
5177 if (flags & CODING_STATE_ESCAPE)
5178 { /* Within ESC sequence */
5179 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5184 switch (str->iso2022.esc)
5186 #ifdef ENABLE_COMPOSITE_CHARS
5187 case ISO_ESC_START_COMPOSITE:
5188 if (str->iso2022.composite_chars)
5189 Dynarr_reset (str->iso2022.composite_chars);
5191 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5192 dst = str->iso2022.composite_chars;
5194 case ISO_ESC_END_COMPOSITE:
5196 Bufbyte comstr[MAX_EMCHAR_LEN];
5198 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5199 Dynarr_length (dst));
5201 len = set_charptr_emchar (comstr, emch);
5202 Dynarr_add_many (dst, comstr, len);
5205 #endif /* ENABLE_COMPOSITE_CHARS */
5207 case ISO_ESC_LITERAL:
5208 COMPOSE_FLUSH_CHARS (str, dst);
5209 decode_flush_er_chars (str, dst);
5210 DECODE_ADD_BINARY_CHAR (c, dst);
5214 /* Everything else handled already */
5219 /* Attempted error recovery. */
5220 if (str->iso2022.output_direction_sequence)
5221 ensure_correct_direction (flags & CODING_STATE_R2L ?
5222 CHARSET_RIGHT_TO_LEFT :
5223 CHARSET_LEFT_TO_RIGHT,
5224 str->codesys, dst, 0, 1);
5225 /* More error recovery. */
5226 if (!retval || str->iso2022.output_literally)
5228 /* Output the (possibly invalid) sequence */
5230 COMPOSE_FLUSH_CHARS (str, dst);
5231 decode_flush_er_chars (str, dst);
5232 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5233 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5234 flags &= CODING_STATE_ISO2022_LOCK;
5236 n++, src--;/* Repeat the loop with the same character. */
5239 /* No sense in reprocessing the final byte of the
5240 escape sequence; it could mess things up anyway.
5242 COMPOSE_FLUSH_CHARS (str, dst);
5243 decode_flush_er_chars (str, dst);
5244 DECODE_ADD_BINARY_CHAR (c, dst);
5250 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5251 { /* Control characters */
5253 /***** Error-handling *****/
5255 /* If we were in the middle of a character, dump out the
5256 partial character. */
5259 COMPOSE_FLUSH_CHARS (str, dst);
5260 decode_flush_er_chars (str, dst);
5264 DECODE_ADD_BINARY_CHAR
5265 ((unsigned char)(cpos >> (counter * 8)), dst);
5270 /* If we just saw a single-shift character, dump it out.
5271 This may dump out the wrong sort of single-shift character,
5272 but at least it will give an indication that something went
5274 if (flags & CODING_STATE_SS2)
5276 COMPOSE_FLUSH_CHARS (str, dst);
5277 decode_flush_er_chars (str, dst);
5278 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5279 flags &= ~CODING_STATE_SS2;
5281 if (flags & CODING_STATE_SS3)
5283 COMPOSE_FLUSH_CHARS (str, dst);
5284 decode_flush_er_chars (str, dst);
5285 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5286 flags &= ~CODING_STATE_SS3;
5289 /***** Now handle the control characters. *****/
5295 COMPOSE_FLUSH_CHARS (str, dst);
5296 decode_flush_er_chars (str, dst);
5297 if (eol_type == EOL_CR)
5298 Dynarr_add (dst, '\n');
5299 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5300 Dynarr_add (dst, c);
5302 flags |= CODING_STATE_CR;
5303 goto label_continue_loop;
5305 else if (flags & CODING_STATE_CR)
5306 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5308 Dynarr_add (dst, '\r');
5309 flags &= ~CODING_STATE_CR;
5312 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5315 flags &= CODING_STATE_ISO2022_LOCK;
5317 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5319 COMPOSE_FLUSH_CHARS (str, dst);
5320 decode_flush_er_chars (str, dst);
5321 DECODE_ADD_BINARY_CHAR (c, dst);
5325 { /* Graphic characters */
5326 Lisp_Object charset;
5335 COMPOSE_FLUSH_CHARS (str, dst);
5336 decode_flush_er_chars (str, dst);
5337 if (eol_type == EOL_CR)
5338 Dynarr_add (dst, '\n');
5339 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5340 Dynarr_add (dst, c);
5342 flags |= CODING_STATE_CR;
5343 goto label_continue_loop;
5345 else if (flags & CODING_STATE_CR)
5346 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5348 Dynarr_add (dst, '\r');
5349 flags &= ~CODING_STATE_CR;
5352 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5355 /* Now determine the charset. */
5356 reg = ((flags & CODING_STATE_SS2) ? 2
5357 : (flags & CODING_STATE_SS3) ? 3
5358 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5359 : str->iso2022.register_left);
5360 charset = str->iso2022.charset[reg];
5362 /* Error checking: */
5363 if (! CHARSETP (charset)
5364 || str->iso2022.invalid_designated[reg]
5365 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5366 && XCHARSET_CHARS (charset) == 94))
5367 /* Mrmph. We are trying to invoke a register that has no
5368 or an invalid charset in it, or trying to add a character
5369 outside the range of the charset. Insert that char literally
5370 to preserve it for the output. */
5372 COMPOSE_FLUSH_CHARS (str, dst);
5373 decode_flush_er_chars (str, dst);
5377 DECODE_ADD_BINARY_CHAR
5378 ((unsigned char)(cpos >> (counter * 8)), dst);
5381 DECODE_ADD_BINARY_CHAR (c, dst);
5386 /* Things are probably hunky-dory. */
5388 /* Fetch reverse charset, maybe. */
5389 if (((flags & CODING_STATE_R2L) &&
5390 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5392 (!(flags & CODING_STATE_R2L) &&
5393 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5395 Lisp_Object new_charset =
5396 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5397 if (!NILP (new_charset))
5398 charset = new_charset;
5403 if (XCHARSET_DIMENSION (charset) == counter)
5405 COMPOSE_ADD_CHAR (str,
5406 DECODE_CHAR (charset,
5407 ((cpos & 0x7F7F7F) << 8)
5414 cpos = (cpos << 8) | c;
5416 lb = XCHARSET_LEADING_BYTE (charset);
5417 switch (XCHARSET_REP_BYTES (charset))
5420 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5421 Dynarr_add (dst, c & 0x7F);
5424 case 2: /* one-byte official */
5425 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5426 Dynarr_add (dst, lb);
5427 Dynarr_add (dst, c | 0x80);
5430 case 3: /* one-byte private or two-byte official */
5431 if (XCHARSET_PRIVATE_P (charset))
5433 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5434 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5435 Dynarr_add (dst, lb);
5436 Dynarr_add (dst, c | 0x80);
5442 Dynarr_add (dst, lb);
5443 Dynarr_add (dst, ch | 0x80);
5444 Dynarr_add (dst, c | 0x80);
5452 default: /* two-byte private */
5455 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5456 Dynarr_add (dst, lb);
5457 Dynarr_add (dst, ch | 0x80);
5458 Dynarr_add (dst, c | 0x80);
5468 flags &= CODING_STATE_ISO2022_LOCK;
5471 label_continue_loop:;
5474 if (flags & CODING_STATE_END)
5476 COMPOSE_FLUSH_CHARS (str, dst);
5477 decode_flush_er_chars (str, dst);
5478 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5482 str->counter = counter;
5486 /***** ISO2022 encoder *****/
5488 /* Designate CHARSET into register REG. */
/* Steps, in order:
   1. Record CHARSET in str->iso2022.charset[REG].  A value that is not
      a charset (an initial nil or t) gets no further treatment.
   2. When force_charset_on_output[REG] is clear and the charset that
      was previously in REG has the same chars, dimension and final
      byte, the designation is treated as redundant.
      NOTE(review): steps 1 and 2 presumably return early -- confirm.
   3. Clear force_charset_on_output[REG].
   4. Walk the coding system's output_conv specs; whenever CHARSET is
      EQ to a spec's from_charset it is replaced by that spec's
      to_charset.
   5. Emit the designation: ESC, then '$' and/or an intermediate byte
      taken from inter94 ("()*+") or inter96 (",-./") indexed by REG,
      then the final byte.

   NOTE(review): CHARS, DIMENSION and FINAL are read in the lines before
   step 4 and FINAL is what step 5 emits, so a substitution made in
   step 4 does not appear to influence the bytes written -- confirm
   whether that is intended.  */
5491 iso2022_designate (Lisp_Object charset, unsigned char reg,
5492 struct encoding_stream *str, unsigned_char_dynarr *dst)
5494 static const char inter94[] = "()*+";
5495 static const char inter96[] = ",-./";
5496 unsigned short chars;
5497 unsigned char dimension;
5498 unsigned char final;
5499 Lisp_Object old_charset = str->iso2022.charset[reg];
5501 str->iso2022.charset[reg] = charset;
5502 if (!CHARSETP (charset))
5503 /* charset might be an initial nil or t. */
5505 chars = XCHARSET_CHARS (charset);
5506 dimension = XCHARSET_DIMENSION (charset);
5507 final = XCHARSET_FINAL (charset);
5508 if (!str->iso2022.force_charset_on_output[reg] &&
5509 CHARSETP (old_charset) &&
5510 XCHARSET_CHARS (old_charset) == chars &&
5511 XCHARSET_DIMENSION (old_charset) == dimension &&
5512 XCHARSET_FINAL (old_charset) == final)
5515 str->iso2022.force_charset_on_output[reg] = 0;
5518 charset_conversion_spec_dynarr *dyn =
5519 str->codesys->iso2022.output_conv;
5525 for (i = 0; i < Dynarr_length (dyn); i++)
5527 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5528 if (EQ (charset, spec->from_charset))
5529 charset = spec->to_charset;
5534 Dynarr_add (dst, ISO_CODE_ESC);
5539 Dynarr_add (dst, inter94[reg]);
5542 Dynarr_add (dst, '$');
5544 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5547 Dynarr_add (dst, inter94[reg]);
5552 Dynarr_add (dst, inter96[reg]);
5555 Dynarr_add (dst, '$');
5556 Dynarr_add (dst, inter96[reg]);
5560 Dynarr_add (dst, final);
/* Make register 0 the one recorded in str->iso2022.register_left.
   If some other register is recorded there, write ISO_CODE_SI to DST
   and update the encoder state; otherwise nothing is written.  */
5564 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5566 if (str->iso2022.register_left != 0)
5568 Dynarr_add (dst, ISO_CODE_SI);
5569 str->iso2022.register_left = 0;
/* Make register 1 the one recorded in str->iso2022.register_left.
   If some other register is recorded there, write ISO_CODE_SO to DST
   and update the encoder state; otherwise nothing is written.  */
5574 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5576 if (str->iso2022.register_left != 1)
5578 Dynarr_add (dst, ISO_CODE_SO);
5579 str->iso2022.register_left = 1;
/* Encode the single character CH into DST as ISO2022, using and
   updating the encoder state in STR and the direction bit in *FLAGS.

   Control/ASCII path: direction is restored to left-to-right; ASCII is
   designated into G0 (after ensure_normal_shift) when CH is a printing
   ASCII character or CODING_SYSTEM_ISO2022_NO_ASCII_CNTL is false.  At
   a newline, unless CODING_SYSTEM_ISO2022_NO_ASCII_EOL is set, the
   direction, shift state and all four registers are put back to the
   coding system's initial charsets.  A newline is written as CR and/or
   LF according to the EOL type; other characters are written directly,
   preceded by ESC when the coding system is escape-quoted and
   fit_to_be_escape_quoted accepts the character.

   Bytes 0x80..0x9f are written directly, with the same escape-quoting
   rule.

   Remaining characters: a register is looked for among the four
   currently designated charsets and the coding system's initial
   charsets, accepting the first for which charset_code_point yields a
   non-negative code point.  Failing that, charsets are tried via
   Vdefault_coded_charset_priority_list, falling back to
   Vcharset_ascii.  The direction is then corrected, the charset is
   designated into the chosen register, that register is invoked (SI,
   SO, or a single shift: ESC 'N' / ESC 'O' for seven-bit coding
   systems, otherwise ISO_CODE_SS2 / ISO_CODE_SS3), and the code point
   is written one byte per charset dimension, each byte OR-ed with
   CHARMASK (0x80 when the right half is in use, else 0).

   On exit the charset and half used are remembered in
   str->iso2022.current_charset and current_half.

   NOTE(review): the fallback search temporarily assigns to the global
   Vdefault_coded_charset_priority_list and restores the saved value
   afterwards; verify that nothing reachable from ENCODE_CHAR can exit
   non-locally in between.  */
5584 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5585 unsigned_char_dynarr *dst, unsigned int *flags)
5587 unsigned char charmask;
5588 Lisp_Coding_System* codesys = str->codesys;
5589 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5591 Lisp_Object charset = str->iso2022.current_charset;
5592 int half = str->iso2022.current_half;
5593 int code_point = -1;
5597 restore_left_to_right_direction (codesys, dst, flags, 0);
5599 /* Make sure G0 contains ASCII */
5600 if ((ch > ' ' && ch < ISO_CODE_DEL)
5601 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5603 ensure_normal_shift (str, dst);
5604 iso2022_designate (Vcharset_ascii, 0, str, dst);
5607 /* If necessary, restore everything to the default state
5609 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5611 restore_left_to_right_direction (codesys, dst, flags, 0);
5613 ensure_normal_shift (str, dst);
5615 for (i = 0; i < 4; i++)
5617 Lisp_Object initial_charset =
5618 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5619 iso2022_designate (initial_charset, i, str, dst);
5624 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5625 Dynarr_add (dst, '\r');
5626 if (eol_type != EOL_CR)
5627 Dynarr_add (dst, ch);
5631 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5632 && fit_to_be_escape_quoted (ch))
5633 Dynarr_add (dst, ISO_CODE_ESC);
5634 Dynarr_add (dst, ch);
5637 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5639 charmask = (half == 0 ? 0x00 : 0x80);
5641 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5642 && fit_to_be_escape_quoted (ch))
5643 Dynarr_add (dst, ISO_CODE_ESC);
5644 /* you asked for it ... */
5645 Dynarr_add (dst, ch);
5651 /* Now determine which register to use. */
5653 for (i = 0; i < 4; i++)
5655 if ((CHARSETP (charset = str->iso2022.charset[i])
5656 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
5660 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5661 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
5669 Lisp_Object original_default_coded_charset_priority_list
5670 = Vdefault_coded_charset_priority_list;
5672 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5674 code_point = ENCODE_CHAR (ch, charset);
5675 if (XCHARSET_FINAL (charset))
5677 Vdefault_coded_charset_priority_list
5678 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5679 Vdefault_coded_charset_priority_list));
5681 code_point = ENCODE_CHAR (ch, charset);
5682 if (!XCHARSET_FINAL (charset))
5684 charset = Vcharset_ascii;
5688 Vdefault_coded_charset_priority_list
5689 = original_default_coded_charset_priority_list;
5691 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5692 codesys, dst, flags, 0);
5696 if (XCHARSET_GRAPHIC (charset) != 0)
5698 if (!NILP (str->iso2022.charset[1]) &&
5699 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5700 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5702 else if (!NILP (str->iso2022.charset[2]))
5704 else if (!NILP (str->iso2022.charset[3]))
5713 iso2022_designate (charset, reg, str, dst);
5715 /* Now invoke that register. */
5719 ensure_normal_shift (str, dst);
5723 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5725 ensure_shift_out (str, dst);
5732 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5734 Dynarr_add (dst, ISO_CODE_ESC);
5735 Dynarr_add (dst, 'N');
5740 Dynarr_add (dst, ISO_CODE_SS2);
5745 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5747 Dynarr_add (dst, ISO_CODE_ESC);
5748 Dynarr_add (dst, 'O');
5753 Dynarr_add (dst, ISO_CODE_SS3);
5761 charmask = (half == 0 ? 0x00 : 0x80);
5763 switch (XCHARSET_DIMENSION (charset))
5766 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5769 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5770 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5773 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5774 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5775 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5778 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5779 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5780 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5781 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5787 str->iso2022.current_charset = charset;
5788 str->iso2022.current_half = half;
/* Finish an ISO2022 encoding run by returning the output stream to the
   coding system's starting state: switch back to left-to-right
   direction, make sure register 0 is shifted in, and designate the
   coding system's initial charset into each of the four registers.
   Whatever escape sequences and shift bytes this requires are written
   to DST by the helpers called.  */
5792 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5793 unsigned int *flags)
5795 Lisp_Coding_System* codesys = str->codesys;
5798 restore_left_to_right_direction (codesys, dst, flags, 0);
5799 ensure_normal_shift (str, dst);
5800 for (i = 0; i < 4; i++)
5802 Lisp_Object initial_charset
5803 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5804 iso2022_designate (initial_charset, i, str, dst);
5809 /************************************************************************/
5810 /* No-conversion methods */
5811 /************************************************************************/
5813 /* This is used when reading in "binary" files -- i.e. files that may
5814 contain all 256 possible byte values and that are not to be
5815 interpreted as being in any particular decoding. */
/* Decode bytes from SRC into DST without charset interpretation.
   Each byte first passes through DECODE_HANDLE_EOL_TYPE (end-of-line
   handling according to the stream's EOL type) and is then appended
   with DECODE_ADD_BINARY_CHAR.  After the input has been consumed,
   DECODE_HANDLE_END_OF_CONVERSION is applied to FLAGS and CPOS.
   label_continue_loop is the per-byte continuation point; it is
   presumably a jump target of DECODE_HANDLE_EOL_TYPE -- confirm against
   the macro definition.  */
5817 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5818 unsigned_char_dynarr *dst, size_t n)
5820 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5821 unsigned int flags = str->flags;
5822 unsigned int cpos = str->cpos;
5823 eol_type_t eol_type = str->eol_type;
5827 unsigned char c = *(unsigned char *)src++;
5829 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5830 DECODE_ADD_BINARY_CHAR (c, dst);
5831 label_continue_loop:;
5834 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
/* Encode internal-format bytes from SRC into DST with no charset
   conversion.  The body has two preprocessor variants, separated by
   the "#else / * not UTF2000 * /" line below.

   UTF2000 variant: str->iso2022.current_char_boundary is used as a
   count of bytes still expected for the current character.  At a
   boundary (count 0) the lead byte is classified against the
   thresholds 0xf8, 0xf0, 0xe0 and 0xc0; a newline is written as CR
   and/or LF according to the EOL type, and other single bytes are
   copied.  Continuation bytes are folded into CH six bits at a time,
   and at the last one (count 1) the low eight bits of CH are written.

   Non-UTF2000 variant: a newline is written as CR and/or LF according
   to the EOL type and ASCII bytes are copied.  For a leading byte,
   LEADING_BYTE_LATIN_ISO8859_1 and LEADING_BYTE_CONTROL_1 are
   distinguished from the rest, for which '~' is written as a
   placeholder.  For a following byte, it is copied unchanged when the
   remembered leading byte is Latin-1, written as C - 0x20 when it is
   Control-1, and ignored otherwise.

   The character-boundary count is stored back into STR on exit.  */
5841 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5842 unsigned_char_dynarr *dst, size_t n)
5845 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5846 unsigned int flags = str->flags;
5847 unsigned int ch = str->ch;
5848 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5850 unsigned char char_boundary = str->iso2022.current_char_boundary;
5857 if (char_boundary == 0)
5863 else if ( c >= 0xf8 )
5868 else if ( c >= 0xf0 )
5873 else if ( c >= 0xe0 )
5878 else if ( c >= 0xc0 )
5888 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5889 Dynarr_add (dst, '\r');
5890 if (eol_type != EOL_CR)
5891 Dynarr_add (dst, c);
5894 Dynarr_add (dst, c);
5897 else if (char_boundary == 1)
5899 ch = ( ch << 6 ) | ( c & 0x3f );
5900 Dynarr_add (dst, ch & 0xff);
5905 ch = ( ch << 6 ) | ( c & 0x3f );
5908 #else /* not UTF2000 */
5911 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5912 Dynarr_add (dst, '\r');
5913 if (eol_type != EOL_CR)
5914 Dynarr_add (dst, '\n');
5917 else if (BYTE_ASCII_P (c))
5920 Dynarr_add (dst, c);
5922 else if (BUFBYTE_LEADING_BYTE_P (c))
5925 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5926 c == LEADING_BYTE_CONTROL_1)
5929 Dynarr_add (dst, '~'); /* untranslatable character */
5933 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5934 Dynarr_add (dst, c);
5935 else if (ch == LEADING_BYTE_CONTROL_1)
5938 Dynarr_add (dst, c - 0x20);
5940 /* else it should be the second or third byte of an
5941 untranslatable character, so ignore it */
5944 #endif /* not UTF2000 */
5950 str->iso2022.current_char_boundary = char_boundary;
5956 /************************************************************************/
5957 /* Initialization */
5958 /************************************************************************/
/* Symbol-level initialization for the file-coding module.  In order:
   - set up the lrecord implementation for coding_system objects;
   - define the `coding-system-error' error, with Qio_error as parent;
   - register the Lisp primitives (DEFSUBR) for querying, creating,
     aliasing and prioritizing coding systems, for detecting, decoding
     and encoding regions, and for Shift-JIS / Big5 character
     conversion;
   - intern the symbols used for coding-system types, properties, EOL
     types and ISO2022 options;
   - intern one symbol per coding category into the
     coding_category_symbol table.  */
5961 syms_of_file_coding (void)
5963 INIT_LRECORD_IMPLEMENTATION (coding_system);
5965 deferror (&Qcoding_system_error, "coding-system-error",
5966 "Coding-system error", Qio_error);
5968 DEFSUBR (Fcoding_system_p);
5969 DEFSUBR (Ffind_coding_system);
5970 DEFSUBR (Fget_coding_system);
5971 DEFSUBR (Fcoding_system_list);
5972 DEFSUBR (Fcoding_system_name);
5973 DEFSUBR (Fmake_coding_system);
5974 DEFSUBR (Fcopy_coding_system);
5975 DEFSUBR (Fcoding_system_canonical_name_p);
5976 DEFSUBR (Fcoding_system_alias_p);
5977 DEFSUBR (Fcoding_system_aliasee);
5978 DEFSUBR (Fdefine_coding_system_alias);
5979 DEFSUBR (Fsubsidiary_coding_system);
5981 DEFSUBR (Fcoding_system_type);
5982 DEFSUBR (Fcoding_system_doc_string);
5984 DEFSUBR (Fcoding_system_charset);
5986 DEFSUBR (Fcoding_system_property);
5988 DEFSUBR (Fcoding_category_list);
5989 DEFSUBR (Fset_coding_priority_list);
5990 DEFSUBR (Fcoding_priority_list);
5991 DEFSUBR (Fset_coding_category_system);
5992 DEFSUBR (Fcoding_category_system);
5994 DEFSUBR (Fdetect_coding_region);
5995 DEFSUBR (Fdecode_coding_region);
5996 DEFSUBR (Fencode_coding_region);
5998 DEFSUBR (Fdecode_shift_jis_char);
5999 DEFSUBR (Fencode_shift_jis_char);
6000 DEFSUBR (Fdecode_big5_char);
6001 DEFSUBR (Fencode_big5_char);
6003 defsymbol (&Qcoding_systemp, "coding-system-p");
6004 defsymbol (&Qno_conversion, "no-conversion");
6005 defsymbol (&Qraw_text, "raw-text");
6007 defsymbol (&Qbig5, "big5");
6008 defsymbol (&Qshift_jis, "shift-jis");
6009 defsymbol (&Qucs4, "ucs-4");
6010 defsymbol (&Qutf8, "utf-8");
6011 defsymbol (&Qccl, "ccl");
6012 defsymbol (&Qiso2022, "iso2022");
6014 defsymbol (&Qmnemonic, "mnemonic");
6015 defsymbol (&Qeol_type, "eol-type");
6016 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6017 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6019 defsymbol (&Qcr, "cr");
6020 defsymbol (&Qlf, "lf");
6021 defsymbol (&Qcrlf, "crlf");
6022 defsymbol (&Qeol_cr, "eol-cr");
6023 defsymbol (&Qeol_lf, "eol-lf");
6024 defsymbol (&Qeol_crlf, "eol-crlf");
6026 defsymbol (&Qcharset_g0, "charset-g0");
6027 defsymbol (&Qcharset_g1, "charset-g1");
6028 defsymbol (&Qcharset_g2, "charset-g2");
6029 defsymbol (&Qcharset_g3, "charset-g3");
6030 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6031 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6032 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6033 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6034 defsymbol (&Qno_iso6429, "no-iso6429");
6035 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6036 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6038 defsymbol (&Qshort, "short");
6039 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6040 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6041 defsymbol (&Qseven, "seven");
6042 defsymbol (&Qlock_shift, "lock-shift");
6043 defsymbol (&Qescape_quoted, "escape-quoted");
6046 defsymbol (&Qdisable_composition, "disable-composition");
6047 defsymbol (&Quse_entity_reference, "use-entity-reference");
6048 defsymbol (&Qd, "d");
6049 defsymbol (&Qx, "x");
6050 defsymbol (&QX, "X");
6052 defsymbol (&Qencode, "encode");
6053 defsymbol (&Qdecode, "decode");
6056 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6058 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6060 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6062 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6064 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6066 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6068 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6070 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6072 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6075 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare the method set of the two lstream types defined by this
   module.  Both the `decoding' and the `encoding' stream types are
   given the same seven methods: reader, writer, rewinder, seekable_p,
   flusher, closer and marker.  */
6080 lstream_type_create_file_coding (void)
6082 LSTREAM_HAS_METHOD (decoding, reader);
6083 LSTREAM_HAS_METHOD (decoding, writer);
6084 LSTREAM_HAS_METHOD (decoding, rewinder);
6085 LSTREAM_HAS_METHOD (decoding, seekable_p);
6086 LSTREAM_HAS_METHOD (decoding, flusher);
6087 LSTREAM_HAS_METHOD (decoding, closer);
6088 LSTREAM_HAS_METHOD (decoding, marker);
6090 LSTREAM_HAS_METHOD (encoding, reader);
6091 LSTREAM_HAS_METHOD (encoding, writer);
6092 LSTREAM_HAS_METHOD (encoding, rewinder);
6093 LSTREAM_HAS_METHOD (encoding, seekable_p);
6094 LSTREAM_HAS_METHOD (encoding, flusher);
6095 LSTREAM_HAS_METHOD (encoding, closer);
6096 LSTREAM_HAS_METHOD (encoding, marker);
/* Variable-level initialization for the file-coding module.  In order:
   - allocate the file_coding_dump structure (fcd) and register it as a
     dump root;
   - give every coding category a nil coding system and put the
     priority table in identity order (category i at position i);
   - provide the `file-coding' feature;
   - define the Lisp variables keyboard-coding-system,
     terminal-coding-system, coding-system-for-read,
     coding-system-for-write, file-name-coding-system and
     coded-charset-entity-reference-alist, each starting out as nil;
   - define the boolean enable-multibyte-characters, starting out as 1.
   The text following each DEFVAR name is that variable's
   documentation string and is left as written.  */
6100 vars_of_file_coding (void)
6104 fcd = xnew (struct file_coding_dump);
6105 dump_add_root_struct_ptr (&fcd, &fcd_description);
6107 /* Initialize to something reasonable ... */
6108 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6110 fcd->coding_category_system[i] = Qnil;
6111 fcd->coding_category_by_priority[i] = i;
6114 Fprovide (intern ("file-coding"));
6116 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6117 Coding system used for TTY keyboard input.
6118 Not used under a windowing system.
6120 Vkeyboard_coding_system = Qnil;
6122 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6123 Coding system used for TTY display output.
6124 Not used under a windowing system.
6126 Vterminal_coding_system = Qnil;
6128 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6129 Overriding coding system used when reading from a file or process.
6130 You should bind this variable with `let', but do not set it globally.
6131 If this is non-nil, it specifies the coding system that will be used
6132 to decode input on read operations, such as from a file or process.
6133 It overrides `buffer-file-coding-system-for-read',
6134 `insert-file-contents-pre-hook', etc. Use those variables instead of
6135 this one for permanent changes to the environment. */ );
6136 Vcoding_system_for_read = Qnil;
6138 DEFVAR_LISP ("coding-system-for-write",
6139 &Vcoding_system_for_write /*
6140 Overriding coding system used when writing to a file or process.
6141 You should bind this variable with `let', but do not set it globally.
6142 If this is non-nil, it specifies the coding system that will be used
6143 to encode output for write operations, such as to a file or process.
6144 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6145 Use those variables instead of this one for permanent changes to the
6147 Vcoding_system_for_write = Qnil;
6149 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6150 Coding system used to convert pathnames when accessing files.
6152 Vfile_name_coding_system = Qnil;
6154 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6155 &Vcoded_charset_entity_reference_alist /*
6156 Alist of coded-charset vs corresponding entity-reference.
6157 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6158 CCS is coded-charset.
6159 CODE-COLUMNS is columns of code-point of entity-reference.
6160 CODE-TYPE is format type of code-point of entity-reference.
6161 `d' means decimal value and `x' means hexadecimal value.
6163 Vcoded_charset_entity_reference_alist = Qnil;
6165 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6166 Non-nil means the buffer contents are regarded as multi-byte form
6167 of characters, not a binary code. This affects the display, file I/O,
6168 and behaviors of various editing commands.
6170 Setting this to nil does not do anything.
6172 enable_multibyte_characters = 1;
6176 complex_vars_of_file_coding (void)
6178 staticpro (&Vcoding_system_hash_table);
6179 Vcoding_system_hash_table =
6180 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6182 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6183 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6185 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6187 struct codesys_prop csp; \
6189 csp.prop_type = (Prop_Type); \
6190 Dynarr_add (the_codesys_prop_dynarr, csp); \
6193 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6194 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6195 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6196 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6197 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6198 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6199 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6201 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6202 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6203 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6204 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6205 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6206 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6207 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6208 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6209 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6210 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6211 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6212 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6213 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6214 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6215 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6216 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6217 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6219 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6220 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6222 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6223 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6226 /* Need to create this here or we're really screwed. */
6228 (Qraw_text, Qno_conversion,
6229 build_string ("Raw text, which means it converts only line-break-codes."),
6230 list2 (Qmnemonic, build_string ("Raw")));
6233 (Qbinary, Qno_conversion,
6234 build_string ("Binary, which means it does not convert anything."),
6235 list4 (Qeol_type, Qlf,
6236 Qmnemonic, build_string ("Binary")));
6241 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6242 list2 (Qmnemonic, build_string ("UTF8")));
6245 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6247 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6249 Fdefine_coding_system_alias (Qterminal, Qbinary);
6250 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6252 /* Need this for bootstrapping */
6253 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6254 Fget_coding_system (Qraw_text);
6257 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6258 = Fget_coding_system (Qutf8);
6261 #if defined(MULE) && !defined(UTF2000)
6265 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6266 fcd->ucs_to_mule_table[i] = Qnil;
6268 staticpro (&mule_to_ucs_table);
6269 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6270 #endif /* defined(MULE) && !defined(UTF2000) */